Compare commits


11 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| bolun | 608f997042 | Add comments in conv benchmark | 2023-07-20 16:57:27 +08:00 |
| bolun | 97970c5d94 | Add benchmark for conv | 2023-07-20 16:49:14 +08:00 |
| bolun | 4b762cc8d9 | Merge branch 'benchmark_softmax' of github.com:InfiniTensor/InfiniTensor into benchmark_conv | 2023-07-20 14:15:37 +08:00 |
| bolun | 1c55c74151 | Add comments | 2023-07-20 14:14:06 +08:00 |
| bolun | ddaddf375e | Init benchmark for conv | 2023-07-20 14:03:05 +08:00 |
| bolun | 7945693131 | Add: benchmark for cudnn softmax | 2023-07-20 11:03:07 +08:00 |
| bolun | fdb2d30868 | fix: Makefile | 2023-07-19 11:29:15 +08:00 |
| zhangyue207 | f532784d4f | update CMakeList.txt Makefile for micro benchmark | 2023-07-19 11:10:18 +08:00 |
| zhangyue207 | 454b7651a8 | update CMakeList.txt Makefile for micro benchmark | 2023-07-19 10:55:51 +08:00 |
| zhangyue207 | 48322dbf27 | Conv Kernel FP16 | 2023-07-19 10:06:47 +08:00 |
| zhangyue207 | 523946cb8b | add conv_half kernel | 2023-07-18 16:55:00 +08:00 |
455 changed files with 4246 additions and 23082 deletions


@ -1,11 +1,12 @@
name: Build and test cpu
on:
push:
branch: 'master'
paths-ignore:
- '**.md'
- 'LICENSE'
pull_request:
paths:
paths-ignore:
- '**.md'
- 'LICENSE'
@ -14,10 +15,10 @@ env:
protobuf-version: "3.21.12"
python-version: "3.10"
resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
jobs:
build:


@ -1,11 +1,12 @@
name: clang-format Check
on:
push:
branch: 'master'
paths-ignore:
- '**.md'
- 'LICENSE'
pull_request:
paths:
paths-ignore:
- '**.md'
- 'LICENSE'

.gitignore

@ -42,5 +42,3 @@ build_debug/
# onnx model
*.onnx
*.pb
*.npy

.gitmodules

@ -11,8 +11,5 @@
path = 3rd-party/backward-cpp
url = git@github.com:bombela/backward-cpp.git
[submodule "example"]
path = examples/NNmodel
path = example
url = git@github.com:wanghailu0717/NNmodel.git
[submodule "examples/distributed/onnxsim_large_model"]
path = examples/distributed/onnxsim_large_model
url = git@github.com:luchangli03/onnxsim_large_model.git


@ -1,45 +1,20 @@
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
option(USE_CUDA "Support CUDA GPU" OFF)
option(USE_BANG "Support BANG MLU" OFF)
option(USE_KUNLUN "Support KUNLUN XPU" OFF)
option(USE_INTELCPU "Support INTELCPU" OFF)
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
option(BUILD_NNET "Build nnet" OFF)
option(BUILD_DIST "Build project for distributed running" OFF)
option(BUILD_TEST "Build tests" OFF)
if(USE_CUDA)
message("CMake 3.18 or higher is required for setting CUDAToolkit")
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
else()
cmake_minimum_required(VERSION 3.17)
endif()
cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
include(CMakeDependentOption)
project(InfiniTensor C CXX)
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
option(USE_CUDA "Support CUDA GPU" OFF)
option(USE_BANG "Support BANG MLU" OFF)
option(USE_INTELCPU "Support INTELCPU" OFF)
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
option(BUILD_TEST "Build tests" OFF)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Build Type
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message("Configuring for Debug build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
add_compile_definitions(DEBUG_MODE)
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
message("Configuring for Release build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
add_compile_definitions(NDEBUG)
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
message("Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
else()
message("Build type not specified. Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
endif()
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
@ -53,13 +28,11 @@ endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
add_compile_options(-Wno-error=unused-variable)
find_package(
Python
COMPONENTS Interpreter Development
REQUIRED)
# OpenMP
find_package(OpenMP)
if(OpenMP_C_FOUND)
@ -96,17 +69,16 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
# TVM backend
if(BUILD_NNET AND BUILD_TEST)
if(BUILD_TEST_EINNET)
if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
endif()
# TVM and DMLC for invoking TVM packed functions
include_directories(${TVM_INCLUDE_DIR})
include_directories(${DMLC_INCLUDE_DIR})
include_directories(${DLPACK_INCLUDE_DIR})
if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND DLPACK_INCLUDE_DIR)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
else()
# message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
endif()
if(BUILD_TEST)
@ -120,21 +92,13 @@ if(BUILD_TEST)
include_directories(3rd-party/googletest/googletest/include)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
# Source files
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
if(BUILD_NNET)
add_compile_definitions(BUILD_NNET=1)
file(GLOB_RECURSE SRC_NNET src/nnet/*.cc)
list (APPEND SRC ${SRC_NNET})
# For locating resource files
set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}")
endif()
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
if(USE_CUDA)
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -146,11 +110,6 @@ if(USE_BANG)
list (APPEND SRC ${SRC_BANG})
endif()
if(USE_KUNLUN)
file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
list (APPEND SRC ${SRC_KUNLUN})
endif()
if(USE_INTELCPU)
file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
list (APPEND SRC ${SRC_INTELCPU})
@ -165,7 +124,7 @@ endif()
target_link_libraries(InfiniTensor pybind11::embed)
# TVM backend
if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR)
if(BUILD_TEST_EINNET)
target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
endif()
@ -218,13 +177,6 @@ if(USE_CUDA)
enable_language(CUDA)
find_package(CUDAToolkit) # For nvrtc and cuda driver
target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
if (BUILD_DIST)
message(STATUS "Add BUILD_DIST, use NCCL with CUDA")
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(NCCL REQUIRED)
add_compile_definitions(INFINI_USE_NCCL=1)
target_link_libraries(InfiniTensor nccl)
endif()
endif()
if(USE_BANG)
@ -263,50 +215,7 @@ if(USE_BANG)
# BangC Kernels
################################################################################
if (BUILD_DIST)
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
add_compile_definitions(INFINI_USE_CNCL=1)
else()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
endif()
if(USE_KUNLUN)
add_compile_definitions(USE_KUNLUN=1)
if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME}))
message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
elseif (DEFINED KUNLUN_HOME)
set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
else()
set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
endif()
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
include_directories("${KUNLUN_HOME}/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
elseif(DEFINED TARGET_CPU_ARCH)
set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
else()
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
if (BUILD_DIST)
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(XCCL REQUIRED)
add_compile_definitions(INFINI_USE_XCCL=1)
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
endif()
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
# # Python bindings
@ -325,7 +234,6 @@ function(build_test files)
endfunction()
if(BUILD_TEST)
add_compile_definitions(BUILD_TEST=1)
enable_testing()
if(USE_TRACE)
build_test(test/trace/*.cc)
@ -333,18 +241,11 @@ if(BUILD_TEST)
if(BUILD_TEST_CORE)
build_test(test/core/*.cc)
build_test(test/operators/*.cc)
build_test(test/kernels/nativecpu/*.cc)
if (USE_CUDA)
build_test(test/kernels/cuda/*.cc)
build_test(test/cuda/*.cc)
endif()
if (USE_BANG)
build_test(test/kernels/bang/*.cc)
build_test(test/bang/*.cc)
endif()
if (USE_KUNLUN)
build_test(test/kernels/kunlun/*.cc)
build_test(test/kunlun/*.cc)
endif()
if (USE_INTELCPU)
build_test(test/kernels/intelcpu/*.cc)
@ -353,7 +254,7 @@ if(BUILD_TEST)
if(BUILD_TEST_PET)
build_test(test/pet/*.cc)
endif()
if(BUILD_NNET AND BUILD_TEST)
if(BUILD_TEST_EINNET)
build_test(test/nnet/test_*.cc)
# Build expression reader
@ -361,3 +262,19 @@ if(BUILD_TEST)
target_link_libraries(nnet_reader InfiniTensor)
endif()
endif()
function(build_bench files)
file(GLOB BENCH_SOURCES ${files})
foreach(benchsourcefile ${BENCH_SOURCES})
get_filename_component(benchname ${benchsourcefile} NAME_WE)
add_executable("benchmark_${benchname}" ${benchsourcefile})
target_link_libraries("benchmark_${benchname}" InfiniTensor)
# add_custom_target(NAME ${benchname} COMMAND ${benchname})
endforeach(benchsourcefile ${BENCH_SOURCES})
endfunction()
if (BENCH)
if (USE_CUDA)
build_bench(benchmark/kernels/cuda/*.cc)
endif()
endif()
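An editorial sketch (not part of the diff above) of how these benchmark targets might be built and run, relying on the Makefile changes later in this comparison, which default `BENCH` to `ON` and forward it as `-DBENCH`. The executable names derive from the benchmark source file names, which are not shown in this view, so `benchmark_conv` below is only an assumed example.

```bash
# Assumes a CUDA-capable machine; BENCH already defaults to ON in the Makefile.
make build CUDA=ON BENCH=ON
# build_bench() turns each benchmark/kernels/cuda/<name>.cc into an executable
# named benchmark_<name> in the build directory (build/Release with the default TYPE).
./build/Release/benchmark_conv   # hypothetical target name for the conv benchmark source
```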


@ -1,38 +1,19 @@
.PHONY : build clean format install-python test-cpp test-onnx
.PHONY : build clean install-python test-cpp test-onnx
TYPE ?= Release
TYPE ?= release
CUDA ?= OFF
BANG ?= OFF
KUNLUN ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
DIST ?= OFF
NNET ?= OFF
DIST ?= OFF
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
DOCKER_IMAGE_NAME ?= infinitensor
DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
DOCKER_RUN_OPTION ?=
# CUDA option.
ifeq ($(CUDA), ON)
DOCKER_IMAGE_NAME = infinitensor_cuda
DOCKER_NAME = infinitensor_cuda
DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
endif
BENCH ?= ON
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBUILD_DIST=$(DIST)
CMAKE_OPT += -DBUILD_NNET=$(NNET)
CMAKE_OPT += -DBENCH=$(BENCH)
ifeq ($(INTELCPU), ON)
CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@ -45,12 +26,9 @@ build:
clean:
rm -rf build
format:
@python3 scripts/format.py $(FORMAT_ORIGIN)
install-python: build
cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
pip install -e pyinfinitensor/
pip install pyinfinitensor/
test-cpp:
@echo
@ -59,19 +37,3 @@ test-cpp:
test-onnx:
@echo
python3 pyinfinitensor/tests/test_onnx.py
test-api:
@echo
python3 pyinfinitensor/tests/test_api.py
docker-build:
docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .
docker-run:
docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)
docker-start:
docker start $(DOCKER_IMAGE_NAME)
docker-exec:
docker exec -it $(DOCKER_IMAGE_NAME) bash


@ -33,14 +33,13 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
## Roadmap
- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
- Supported hardware
- ✔ NVIDIA GPU
- ✔ Cambricon MLU
- ✔ Kunlunxin XPU
- ⬜ Ascend NPU
- ⬜ Kunlunxin XPU
## Contributor Guide


@ -0,0 +1,243 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include "benchmark.h"
#include <iostream>
#include <cmath>
#include <chrono>
#include <sys/time.h>
using namespace infini;
#define M 1048576
const char algo_name[8][50] = {
"CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM",
"CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM",
"CUDNN_CONVOLUTION_FWD_ALGO_GEMM",
"CUDNN_CONVOLUTION_FWD_ALGO_DIRECT",
"CUDNN_CONVOLUTION_FWD_ALGO_FFT",
"CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING",
"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD",
"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED",
};
const char mode_name[2][50] = {
"CUDNN_CONVOLUTION",
"CUDNN_CROSS_CORRELATION"
};
int main() {
// Benchmark Settings
int warmupRounds = 50;
int timingRounds = 100;
DataType dtype = DataType::Float32;
// cudnn Conv Configurations
cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
float alpha = 1.f, beta = 0.f;
int n, c, h, w, f, r, s;
int INPUT_BATCH_SIZE = n = 16;
int INPUT_CHANNELS = c = 128;
int INPUT_HEIGHT = h = 128;
int INPUT_WIDTH = w = 128;
Shape INPUT_SHAPE = {INPUT_BATCH_SIZE, INPUT_CHANNELS, \
INPUT_HEIGHT, INPUT_WIDTH};
int OUTPUT_CHANNELS = f = 256;
int KERNEL_HEIGHT = r = 3;
int KERNEL_WIDTH = s = 3;
Shape KERNEL_SHAPE = {INPUT_CHANNELS, OUTPUT_CHANNELS, \
KERNEL_HEIGHT, KERNEL_WIDTH};
int NUM_GROUPS = 1;
int PAD_HEIGHT = 0;
int PAD_WIDTH = 0;
int VERTICAL_STRIDE = 1;
int HORIZONTAL_STRIDE = 1;
int DILATION_HEIGHT = 1;
int DILATION_WIDTH = 1;
// Get input size
size_t inputSize = 1;
for (auto dim: INPUT_SHAPE) {
inputSize *= dim;
}
size_t inputSizeInBytes = inputSize * sizeof(dtype);
// Get kernel size
size_t kernelSize = 1;
for (auto dim: KERNEL_SHAPE) {
kernelSize *= dim;
}
size_t kernelSizeInBytes = kernelSize * sizeof(dtype);
// Init time variables
double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
double time_op = 0.0;
// Create runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data and kernel on CPU
Tensor inputCpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(RandomGenerator());
Tensor kernelCpu =
make_ref<TensorObj>(KERNEL_SHAPE, dtype, cpuRuntime);
kernelCpu->dataMalloc();
kernelCpu->setData(RandomGenerator());
// Build input data and kernel on GPU
Tensor inputGpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
inputGpu->dataMalloc();
Tensor kernelGpu =
make_ref<TensorObj>(KERNEL_SHAPE, dtype, cudaRuntime);
kernelGpu->dataMalloc();
// Do memcpy host to device
time_memcpy_htod += timeit(
[&]() {
inputGpu = inputCpu->clone(cudaRuntime);
kernelGpu = kernelCpu->clone(cudaRuntime);
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS;
// Build cudnn descriptors
// input descriptor
cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
// kernel descriptor
cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, f,
channelsPerGrp, r, s));
// bias descriptor
// cudnnTensorDescriptor_t biasDesc;
// checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
// checkCudnnError(cudnnSetTensor4dDescriptor(
// biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
// convolution descriptor
cudnnConvolutionDescriptor_t convDesc;
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
checkCudnnError(cudnnSetConvolution2dDescriptor(
convDesc, PAD_HEIGHT, PAD_WIDTH, VERTICAL_STRIDE, HORIZONTAL_STRIDE,
DILATION_HEIGHT, DILATION_WIDTH, convMode, CUDNN_DATA_FLOAT));
if (NUM_GROUPS > 1) {
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS));
}
// Get output shape
int outn, outc, outh, outw;
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
// Build output descriptor
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outn, outc,
outh, outw));
// Get output size
Shape OUTPUT_SHAPE = {outn, outc, outh, outw};
size_t outputSize = 1;
for (auto dim: OUTPUT_SHAPE) {
outputSize *= dim;
}
size_t outputSizeInBytes = outputSize * sizeof(dtype);
// Build output data on CPU
Tensor outputCpu =
make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cpuRuntime);
outputCpu->dataMalloc();
// Build output data on GPU
Tensor outputGpu =
make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime);
outputGpu->dataMalloc();
// Get workspace size
size_t workspaceSize = 0;
checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc,
outDesc, convAlgo, &workspaceSize));
CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize);
// Do forward
time_op += timeit(
[&]() {
cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha,
inDesc, inputGpu->getRawDataPtr<void *>(),
knDesc, kernelGpu->getRawDataPtr<void *>(),
convDesc, convAlgo, workspace,
workspaceSize, &beta,
outDesc, outputGpu->getRawDataPtr<void *>());
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
// checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
// Do memcpy device to host
time_memcpy_dtoh += timeit(
[&]() {
outputCpu = outputGpu->clone(cpuRuntime);
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
// Print Results
printf("Operator - Convolution:\n");
printf("Conv Algo: %s\n", algo_name[convAlgo]);
printf("Conv Mode: %s\n", mode_name[convMode]);
printf("Input shape: (%d, %d, %d, %d)\n",
INPUT_SHAPE[0], INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]);
printf("Kernel shape: (%d, %d, %d, %d)\n",
KERNEL_SHAPE[0], KERNEL_SHAPE[1], KERNEL_SHAPE[2], KERNEL_SHAPE[3]);
printf("Output shape: (%d, %d, %d, %d)\n",
OUTPUT_SHAPE[0], OUTPUT_SHAPE[1], OUTPUT_SHAPE[2], OUTPUT_SHAPE[3]);
printf("Workspace size: %ld Bytes, dtype: %s\n",
workspaceSize, dtype.toString().c_str());
printf("TFlops: %.5lf tflops\n",
2.0 * INPUT_BATCH_SIZE * channelsPerGrp * outh * outw * \
OUTPUT_CHANNELS * KERNEL_HEIGHT * KERNEL_WIDTH / \
VERTICAL_STRIDE / HORIZONTAL_STRIDE / 1e9 / time_op);
printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n",
time_memcpy_htod, time_memcpy_dtoh);
printf("Memcpy throughput: h2d - %.6lf MB/ms, d2h: %.6lf MB/ms\n",
(inputSizeInBytes + kernelSizeInBytes) / M / time_memcpy_htod,
outputSizeInBytes / M / time_memcpy_dtoh);
printf("Operation: %.6lf ms\n", time_op);
return 0;
}
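An editorial sanity check of the TFlops line above: with the defaults (N=16, C=128, 128×128 input, F=256, 3×3 kernel, unit stride, no padding, one group), cuDNN reports a 126×126 output, so the per-forward FLOP count is 2·16·128·126·126·256·3·3 ≈ 1.5×10^11. Dividing by 1e9 and then by the measured time (in milliseconds, as the printf labels suggest) yields TFLOP/s; a 1 ms convolution would therefore print roughly 150.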


@ -0,0 +1,126 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/softmax.h"
#include "benchmark.h"
#include <iostream>
#include <cmath>
#include <chrono>
#include <sys/time.h>
using namespace infini;
#define M 1048576
int main() {
// Benchmark Settings
int warmupRounds = 200;
int timingRounds = 200;
Shape INPUT_SHAPE = {16, 3, 128, 128};
DataType dtype = DataType::Float32;
// Get data size
size_t size = 1;
for (auto dim: INPUT_SHAPE) {
size *= dim;
}
size_t sizeInBytes = size * sizeof(dtype);
// Init time variables
double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
double time_op = 0.0;
// Create runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(RandomGenerator());
// Build input data on GPU
Tensor inputGpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
inputGpu->dataMalloc();
// Do memcpy host to device
time_memcpy_htod += timeit(
[&]() {
inputGpu = inputCpu->clone(cudaRuntime);
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
// Build output data on GPU
auto outputGpu = inputGpu->clone(cudaRuntime);
// Build output data on CPU
Tensor outputCpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
outputCpu->dataMalloc();
// Build cudnn descriptors
cudnnTensorDescriptor_t inputDesc, outputDesc;
// input descriptor
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));
// output descriptor
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));
// cudnn operator settings
float alpha = 1.0, beta = 0.0;
cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;
// Do forward
time_op += timeit(
[&]() {
cudnnSoftmaxForward(cudaRuntime->cudnnHandle(), algo, mode,
&alpha, inputDesc, inputGpu->getRawDataPtr<void *>(),
&beta, outputDesc, outputGpu->getRawDataPtr<void *>());
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));
// Do memcpy device to host
time_memcpy_dtoh += timeit(
[&]() {
outputCpu = outputGpu->clone(cpuRuntime);
},
[&]() { cudaRuntime->sync(); },
warmupRounds, timingRounds
);
// Print Results
printf("Operator - Softmax:\n");
printf("Input shape: (%d, %d, %d, %d)\n",
INPUT_SHAPE[0], INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]);
printf("Input size: %ld, dtype: %s, size in bytes: %ld\n",
size, dtype.toString().c_str(), sizeInBytes);
printf("TFlops: %.5lf tflops\n", 5 * size / 1e9 / time_op);
printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n",
time_memcpy_htod, time_memcpy_dtoh);
printf("Memcpy throughput: h2d - %.6lf MB/ms, d2h: %.6lf MB/ms\n",
sizeInBytes / M / time_memcpy_htod, sizeInBytes / M / time_memcpy_dtoh);
printf("Operation: %.6lf ms\n", time_op);
return 0;
}
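An editorial note on the TFlops line above: the expression `5 * size / 1e9 / time_op` assumes roughly five floating-point operations per softmax element, an approximation rather than an exact count for `CUDNN_SOFTMAX_FAST`. With the 16×3×128×128 input (786,432 elements) that is only about 3.9×10^6 FLOPs per call, so the reported figure mainly reflects launch and memory-access cost rather than arithmetic throughput.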


@ -1,76 +0,0 @@
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
if ($ENV{CNCL_ROOT_DIR})
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
endif()
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
find_path(CNCL_INCLUDE_DIRS
NAMES cncl.h
HINTS ${CNCL_INCLUDE_DIR})
if (USE_STATIC_CNCL)
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
SET(CNCL_LIBNAME "CNCL_static")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(CNCL_LIBNAME "cncl")
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(CNCL_LIBRARIES
NAMES ${CNCL_LIBNAME}
HINTS ${CNCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
if (CNCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <cncl.h>
int main()
{
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
int x;
CNCLGetVersion(&x);
return x == CNCL_VERSION_CODE;
}
")
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${CNCL_LIBRARIES})
if (NOT CNCL_VERSION_MATCHED)
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
endif()
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "CNCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
endif()


@ -1,165 +0,0 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})
if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()


@ -1,27 +0,0 @@
# Find the xccl libraries
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
NAMES xpu/bkcl.h
HINTS XCCL_INCLUDE_DIR)
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
NAMES lib64/libbkcl.so
HINTS XCCL_LIB_DIR)
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
if (XCCL_FOUND)
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
endif()


@ -133,40 +133,10 @@
make install-python BANG=ON
```
Build the CPU part together with the Kunlun XPU part:
```bash
export KUNLUN_HOME=/path/to/your/kunlun_home
make install-python KUNLUN=ON
```
3. Usage
After a successful installation, you can write and run code using the project's Python interface. For concrete usage, refer to the project's sample code example/Resnet/resnet.py and the user manual (a minimal sketch follows below).
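A minimal editorial sketch of that Python interface, assuming the `OnnxStub`/`backend` API that appears in the distributed scripts later in this comparison (the model file name and the float32 input assumption are illustrative, not taken from the manual):

```python
import numpy as np
import onnx
from pyinfinitensor.onnx import OnnxStub, backend

model = onnx.load("resnet18-v2-7.onnx")           # any model from the supported list
stub = OnnxStub(model, backend.CudaRuntime(0))    # device 0; backend.BangRuntime(0) on MLU
for name, tensor in stub.inputs.items():          # feed every graph input with host data
    tensor.copyin_numpy(np.random.rand(*tensor.shape()).astype(np.float32))
stub.run()
output = next(iter(stub.outputs.values())).copyout_numpy()
print(output.shape)
```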
## Docker
This project also provides a Docker environment. You can use `make docker-build` or `make docker-build CUDA=ON` to build the Dockerfile, and you can change the Docker image name or the chosen Dockerfile by adding build options or modifying Makefile variables.
Because pulling the GitHub repo requires adding an SSH key to your GitHub profile, the steps that pull the repo and build the project are commented out for now; users maintain the SSH key themselves after entering Docker (copying the host's SSH key into Docker may run into environment-mismatch problems).
```shell
# Build docker container.
make docker-build
# Run docker image.
make docker-run
# Execute docker image.
make docker-exec
```
To build the CUDA version, use the following commands:
```shell
# Build docker container.
make docker-build CUDA=ON
# Run docker image.
make docker-run CUDA=ON
```
## Technical Support
If you run into problems, please contact our technical support team.


@ -2,7 +2,6 @@
## Contents
- [Supported environments](#环境支持)
- [Supported neural networks](#神经网络支持)
- [Technical support](#技术支持)
@ -20,10 +19,10 @@
The neural network models that have been verified so far are:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
## Technical Support


@ -3,10 +3,9 @@
## Contents
- [Usage](#使用方法)
- [Python frontend guide](#python-前端应用指南)
- [Importing an ONNX model](#导入-onnx-模型)
- [Optimization](#优化)
- [Exporting an ONNX model](#导出-onnx-模型)
- [Python frontend guide](#python-前端应用指南)
- [Importing an ONNX model](#导入-onnx-模型)
- [Exporting an ONNX model](#导出-onnx-模型)
- [Running inference](#执行推理)
- [Sample code](#样例代码)
- [Technical support](#技术支持)
@ -14,7 +13,7 @@
## Usage
Project management functionality lives in the [Makefile](../Makefile) and supports the following:
Project management functionality lives in the [Makefile](Makefile) and supports the following:
- Build the project: `make`/`make build`
- Clean generated files: `make clean`
@ -27,7 +26,6 @@
- `TYPE`: build mode (`debug`/`release`), default `release`
- `CUDA`: whether to build the CUDA backend, `OFF` by default, set `ON` to enable
- `BANG`: whether to build the Cambricon backend, `OFF` by default, set `ON` to enable
- `KUNLUN`: whether to build the Kunlun backend, `OFF` by default, set `ON` to enable
- `BACKTRACE`: whether to enable stack backtraces, `ON` by default, set `OFF` to disable; recommended during debugging
- `TEST`: whether to build `googletest`, `ON` by default, set `OFF` to disable; only needed for `test-cpp` (a combined example follows this list)
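As a combined editorial illustration (the specific values are hypothetical), these variables can be mixed on a single command line:

```bash
# A debug-mode build of the CUDA backend, installed as a Python package.
make install-python CUDA=ON TYPE=debug
# A Cambricon build without googletest.
make install-python BANG=ON TEST=OFF
```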
@ -39,10 +37,10 @@
Supported models:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
```python
import onnx
@ -97,7 +95,7 @@ for name, tensor in stub.inputs.items():
print(name, tensor.shape(), tensor)
```
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
```plaintext
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -138,7 +136,7 @@ for name, tensor in stub.outputs.items():
### Sample Code
You can look at the sample code in [resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py) and try running it. In that file we build the ResNet network with PyTorch. You can check the script's usage with:
You can look at the sample code in [./example/Resnet/resnet.py](./example/ResNet/resnet.py) and try running it. In that file we build the ResNet network with PyTorch. You can check the script's usage with:
```python
python resnet.py -h

env.sh

@ -35,4 +35,4 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
# ├── tools
# ├── version
# └── XTDK
export KUNLUN_HOME=/usr/local/xpu
export XPU_HOME=/usr/local/xpu

example Submodule

@ -0,0 +1 @@
Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9

@ -1 +0,0 @@
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77


@ -1,39 +0,0 @@
# Distributed Scripts
## Running on the NVIDIA Platform
#### 1. Run the PyTorch model and generate inputs and reference outputs (optionally exporting ONNX)
Use `--export_onnx` to set the directory for the exported ONNX model (defaults to the current path `./`); without this flag, the script only runs the computation and generates the inputs and outputs.
```bash
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
This generates the input/output files `test_inputs.npy` and `test_results.npy` in the current directory; currently only a single input and a single output are supported.
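For instance (an editorial note, not part of the original document), the generated files can be inspected directly:

```python
import numpy as np

inputs = np.load("test_inputs.npy")    # the single model input saved by run_pytorch.py
results = np.load("test_results.npy")  # the reference output from the PyTorch run
print(inputs.shape, results.shape)
```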
#### 2. Run the InfiniTensor distributed script
```bash
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```
## Running on the Cambricon Platform
**The scripts `run_pytorch.py` and `cuda_launch.py` above have been adapted for the Cambricon platform; see `run_pytorch_mlu.py` and `bang_launch.py`.**
#### 1. Run the PyTorch model and generate inputs and reference outputs (optionally exporting ONNX)
Use `--export_onnx` to set the directory for the exported ONNX model (defaults to the current path `./`); without this flag, the script only runs the computation and generates the inputs and outputs.
```bash
python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
This generates the input/output files `test_inputs.npy` and `test_results.npy` in the current directory; currently only a single input and a single output are supported.
#### 2. Run the InfiniTensor distributed script
```bash
python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```


@ -1,187 +0,0 @@
import sys
sys.path.append('../')
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for _ in range(n):
stub.run()
begin = time.time()
for _ in range(n * 2):
stub.run()
end = time.time()
avg_time = (end - begin) / (n * 2)
print(f"average time: {avg_time}")
return outputs
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(f"./data/input_{i}.npy")
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
results = np.load(f"./data/output.npy")
outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.BangRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank, data_type)
def start_single(name, model, data_type):
runtime = backend.BangRuntime(0)
run_and_compare(name, model, runtime, data_type=data_type)
def generate_input_output(model):
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
runtime = backend.BangRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
# input = np.array([position_id])
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(f"./data/input_{i}", input)
stub.run()
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
if np.isnan(output).any():
print("Nan in output")
np.save(f"./data/output", output)
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
model = onnx.load(model_path)
# generate standard output
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size to fit all LLM.
generate_input_output(model)
return
if nproc_per_node == 1:
# run single process.
# use standalone process to isolate bang.
print("run model by single MLU.")
# p = mp.Process(target=start_single, args=(name, model, data_type))
# p.start()
# p.join()
start_single(name, model, data_type)
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} MLU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()


@ -1,249 +0,0 @@
import argparse
import torch
import torch_mlu
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
from transformers import AlbertModel, AlbertConfig
from transformers import LlamaModel, LlamaConfig
import time
import numpy as np
import onnx
import sys
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type
)
def get_model(modelname):
match modelname:
case "albert":
model = AlbertModel.from_pretrained("albert/albert-base-v2")
voc_size = AlbertConfig().vocab_size
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("GPT2")
voc_size = GPT2Config().vocab_size
case "opt":
model = OPTModel.from_pretrained("facebook/opt-125m")
voc_size = OPTConfig().vocab_size
case "llama":
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
voc_size = LlamaConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
np.save("./data/input_0", data)
inputs = torch.from_numpy(data).to("mlu")
torch_model = torch_model.to("mlu")
if dtype == "fp16":
torch_model = torch_model.half()
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.mlu.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.mlu.synchronize()
outputs = torch_model(inputs)
torch.mlu.synchronize()
torch.mlu.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
# torch.mlu.memory.empty_cache()
np.save("./data/output", np.array(outputs))
print("Save input & output into ./data.")
def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
data = data.to("mlu")
model = model.to("mlu")
if dtype == "fp16":
model = model.half()
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if modelname != "llama":
# use onnxsim to simplify
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
# onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
extern_path = extern_path.split("/")[-1]
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
# use third party tool to simplify llama
# reference: https://github.com/luchangli03/onnxsim_large_model/
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = path
if not out_model_path:
out_model_path = in_model_path[:-5] + ".sim.onnx"
if os.path.isdir(out_model_path):
out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print(f"compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
torch.backends.mlu.matmul.allow_tf32 = False
torch.backends.cnnl.allow_tf32 = False
modelname, batchsize, seqlen, export_path, dtype = parse_args()
if dtype == "tf32":
torch.backends.mlu.matmul.allow_tf32 = True
else:
os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
path = os.path.join(export_path, filename)
if not os.path.exists(path):
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(modelname, model, param, path, True, dtype)
else:
print("Onnx path exists, skipping export.")
run_pytorch(model, voc_size, batchsize, seqlen, dtype)
if __name__ == "__main__":
main()


@ -1,161 +0,0 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, inputs, n=10, data_type = "default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
# stub.tune()
stub.run()
# get outputs
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
begin = time.time()
for _ in range(n):
stub.run()
end = time.time()
avg_time = (end - begin) / n
print(f"average time: {avg_time}")
return outputs
def run_and_compare(name, model, runtime, data_type):
input_ids = np.load(f"{name}_inputs.npy")
position_ids = np.arange(input_ids.shape[-1])
results = np.load(f"{name}_results.npy")
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.CudaRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, data_type)
def start_single(name, model, data_type):
runtime = backend.CudaRuntime(0)
run_and_compare(name, model, runtime, data_type)
def gen_standard(name, model, voc_size, bs, len):
# generate standard results
input_ids = np.random.randint(0, voc_size, (bs, len))
position_ids = np.arange(len)
np.save(f"{name}_inputs", input_ids)
runtime = backend.CudaRuntime(0)
outputs = run_model(model, runtime, (input_ids, position_ids), 1)
print("outputs abs mean:", abs(outputs).mean())
np.save(f"{name}_results", outputs)
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
if data_type != "tf32":
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model = onnx.load(model_path)
# generate standard output
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size to fit all LLM.
voc_size = 1000
gen_standard(name, model, voc_size, bs, length)
return
# run single process.
# use standalone process to isolate cuda.
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model, data_type))
p.start()
p.join()
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} GPU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()

View File

@ -1,245 +0,0 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model1", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument(
"--model2", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
action="store_true",
help="whether to generate the standard results.",
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model1,
args.model2,
args.batch_size,
args.length,
args.gen_std,
)
def run_model(model1, model2, runtime1, runtime2, inputs1: np.array, inputs2: np.array, n=20):
####################################
# run the first graph without kvcache
####################################
stub1 = OnnxStub(model1, runtime1)
stub1.inputs['onnx::Reshape_0'].copyin_int32(inputs1.reshape(-1).tolist())
stub1.tune()
stub1.run()
kvcache_it1 = []
count = 0
for output in stub1.outputs.items().__iter__():
if count == 0:
logits_it1 = np.array(output[1].copyout_float(), dtype=np.float32)
else:
kvcache_it1.append(np.array(output[1].copyout_float(), dtype=np.float32))
count = count + 1
# bench for stub1
next(stub1.inputs.items().__iter__())[1].copyin_int32(inputs1.reshape(-1).tolist())
begin = time.time()
for _ in range(n):
stub1.run()
end = time.time()
avg_time = (end - begin) / n
print(f"stub1 average time: {avg_time}")
####################################
# run the second graph with kvcache
####################################
i = 0
batchsize = 1
stub2 = OnnxStub(model2, runtime2)
past_kvcache_length = (i+2)*np.ones((batchsize, 1), dtype=np.int32)
# copyin input
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
count = -1
for input in stub2.inputs.items().__iter__():
if count in range(24):
# print(count, input[0])
# print(np.dtype(kvcache_it1[count][0]), kvcache_it1[count].shape)
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
count = count + 1
stub2.tune()
stub2.run()
# copyout output
count = 0
kvcache_it2 = []
for output in stub2.outputs.items().__iter__():
if count == 0:
logits_it2 = np.array(output[1].copyout_float(), dtype=np.float32)
else:
kvcache_it2.append(np.array(output[1].copyout_float(), dtype=np.float32))
count = count + 1
# bench for stub2
# copyin input
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
count = -1
for input in stub2.inputs.items().__iter__():
if count in range(24):
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
count = count + 1
begin = time.time()
for _ in range(n):
stub2.run()
end = time.time()
avg_time = (end - begin) / n
print(f"stub2 average time: {avg_time}")
return logits_it2
def run_and_compare(name, model1, model2, runtime1, runtime2):
data1 = np.load(f"{name}_inputs1.npy")
data2 = np.load(f"{name}_inputs2.npy")
results = np.load(f"{name}_results.npy")
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2)
print("outputs sum:", outputs.sum())
print("max abs diff:", abs(outputs - results).max())
print("max rel diff:", abs((outputs - results) / results).max())
# assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model1: onnx.ModelProto, model2: onnx.ModelProto
):
dist_name = name + "_dist"
####################################
# shard the first graph
####################################
model1 = parallel_model(model1, world_size, rank)
extern_path = f"./{dist_name}_stub1_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
model1,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(model1, f"./{dist_name}_stub1_rank{rank}.onnx")
runtime1 = backend.CudaRuntime(local_rank)
runtime1.init_comm(
dist_name,
world_size,
rank,
)
####################################
# shard the second graph
####################################
model2 = parallel_model(model2, world_size, rank)
extern_path = f"./{dist_name}_stub2_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
model2,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(model2, f"./{dist_name}_stub2_rank{rank}.onnx")
runtime2 = backend.CudaRuntime(local_rank)
# print("init comm")
runtime2.init_comm(
dist_name,
world_size,
rank,
)
# run the two graphs
run_and_compare(name, model1, model2, runtime1, runtime2)
def start_single(name, model1, model2):
runtime1 = backend.CudaRuntime(0)
runtime2 = backend.CudaRuntime(0)
run_and_compare(name, model1, model2, runtime1, runtime2)
def gen_standard(name, model1, model2, voc_size, bs, len):
# generate standard results
data1 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
data2 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
np.save(f"{name}_inputs1", data1)
np.save(f"{name}_inputs2", data2)
runtime1 = backend.CudaRuntime(0)
runtime2 = backend.CudaRuntime(0)
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2, 1)
np.save(f"{name}_results", outputs)
def main():
nnodes, nproc_per_node, name, model1_path, model2_path, bs, length, gen_std = parse_args()
model1 = onnx.load(model1_path)
model2 = onnx.load(model2_path)
# generate standard output
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size to fit all LLM.
voc_size = 1000
gen_standard(name, model1, model2, voc_size, bs, length)
return
# run single process.
# use standalone process to isolate cuda.
p = mp.Process(target=start_single, args=(name, model1, model2))
p.start()
p.join()
# run distributed parallel.
world_size = nnodes * nproc_per_node
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model1, model2),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()

View File

@ -1,188 +0,0 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
import time
import numpy as np
import onnx
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type,
)
def get_model(modelname):
match modelname:
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
case "opt":
model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save("test_inputs", data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save("test_results", np.array(outputs, dtype=np.float32))
print("Save input & output as test_inputs.npy and test_results.npy")
def export_onnx(model, data, path, extern=False):
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
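# Illustrative sketch (not part of the original script): a typical standalone use of
# the helper above, assuming "model.onnx" is a valid path on disk.
def annotate_initializer_shapes(path="model.onnx"):
    m = onnx.load(path)
    add_value_info_for_constants(m)  # record initializer shapes as value_info
    return onnx.shape_inference.infer_shapes(m)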
def main():
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
modelname, batchsize, seqlen, export_path, data_type = parse_args()
if data_type == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
else:
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(model, param, path, True)
if data_type == "fp16":
model = model.half()
run_pytorch(model, voc_size, batchsize, seqlen)
if __name__ == "__main__":
main()

View File

@ -1,14 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
models=("bert" "gpt2" "llama")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
done
done
done

View File

@ -1,280 +0,0 @@
import sys
sys.path.append('../')
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
from functools import wraps
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
)
parser.add_argument(
"--model", type=str, default="", help="path to the ONNX model file."
)
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--run_single",
default=False,
action="store_true",
help="whether run model with single process with standard inputs"
)
parser.add_argument(
"--input_dir",
default="./",
help="path to save model input data"
)
parser.add_argument(
"--result_dir",
default="./",
help="path to save model standard output"
)
parser.add_argument(
"--internal_model_dir",
default="./",
help="path to save internal onnx model for parallel run"
)
args = parser.parse_args()
# check path, mkdir if not exist
check_exists(args.input_dir)
check_exists(args.result_dir)
check_exists(args.internal_model_dir)
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.gen_std,
args.run_single,
args.input_dir,
args.result_dir,
args.internal_model_dir
)
"""
Utility functions for this script
"""
def check_exists(path: str):
if not os.path.exists(path):
os.makedirs(path)
def np_assert(base, test, rtol=1e-2, atol=1e-1):
# np.testing.assert_allclose(test, base, rtol, atol)
print("max abs diff:", abs(base - test).max())
"""
Perf wrapper: run a function n times (after an equal number of warmup runs)
and report the average runtime.
"""
def perf_it(n):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
# warmup
for _ in range(n):
func(*args, **kwargs)
t_total = 0
for _ in range(n):
t0 = time.time()
func(*args, **kwargs)
t1 = time.time()
t_total += t1 - t0
avg_time = (t_total) / n
print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
return avg_time
return wrapper
return decorator
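# Illustrative usage sketch (not part of the original script): decorating a callable
# gives it n warmup runs plus n timed runs, printing the average, when it is invoked.
@perf_it(5)
def _bench_example():
    time.sleep(0.001)  # stand-in for real work
# calling _bench_example() would print the averaged runtime over 5 runs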
"""
Run InfiniTensor model with Standard input
check=True: check with standard output gen by pytorch
perf=True: run n times to get avg time
"""
def run_model(task_name,
model,
runtime,
world_size=1,
rank=0,
n=10,
check=True,
perf=True):
stub = OnnxStub(model, runtime,
use_naive_allocator=True \
if task_name == "llama" else False)
# load in Onnx model inputs
def load_inputs(stub: OnnxStub):
# check exists
inputs = []
for i, (name, tensor) in enumerate(stub.inputs.items()):
input_path = os.path.join(input_dir, \
f"{task_name}_input_{i}.npy")
print(input_path)
if os.path.exists(input_path):
input = np.load(input_path)
else :
raise KeyError(f"the {i}-th input of the model does not exist")
# check shape
if all(x == y for x,y in zip(input.shape, tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
load_inputs(stub)
# stub.tune()
stub.run()
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
# check output results with standard output
if check:
st_output_path = os.path.join(result_dir, \
f"{task_name}_output.npy")
assert os.path.exists(st_output_path) , \
"standard output not exists"
st_output = np.load(st_output_path)
if np.isnan(output).any():
print("Nan in output")
exit()
np_assert(st_output, output)
# perf
if perf:
@perf_it(n)
def perf_infinitensor(stub: OnnxStub):
stub.run()
perf_infinitensor(stub)
return output
"""
Start a worker in Parallel
"""
def start_worker(name: str,
world_size: int,
rank: int,
local_rank: int,
model: onnx.ModelProto):
dist_name = name + "_dist"
# partial a onnx model to world_size part
model = parallel_model(model, world_size, rank)
onnx.save(model, os.path.join(internal_model_dir, \
f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
runtime = backend.KUNLUNRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_model(name, model, runtime, world_size, rank)
"""
generate standard input/output with
single-card run
"""
def gen_standard(task_name: str, model: onnx.ModelProto):
runtime = backend.KUNLUNRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
# generate random input for model
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(os.path.join(input_dir, \
f"{task_name}_input_{i}.npy"), input)
stub.run()
# print(stub.outputs)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
if np.isnan(output).any():
print("Nan in output")
exit()
np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
def main():
global input_dir, result_dir, internal_model_dir
nnodes, nproc_per_node, task_name, \
model_path, gen_std, run_single, \
input_dir, result_dir, internal_model_dir = parse_args()
# load input onnx model
model = onnx.load(model_path)
# generate standard output
if gen_std:
print("Generate inputs and outputs.")
gen_standard(task_name, model)
return
if run_single:
print("Run model by one GPU card.")
runtime = backend.KUNLUNRuntime(0)
run_model(task_name, model, runtime)
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"Run model by {world_size} GPU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(task_name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()

View File

@ -1,36 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("bert" "gpt2")
batch_size=(1 32)
seq_len=(100 500)
nproc=(1 2 4)
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
# run pytorch model
echo "Run pytorch $model with batch_size=$bs length=$len ."
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
for n in "${nproc[@]}"; do
# run infinitensor
echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
# delete internal files
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*.pb" -delete
done
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -1,35 +0,0 @@
export HF_ENDPOINT=https://hf-mirror.com
# models=("bert" "gpt2" "llama")
models=("llama")
batch_size=(1 )
seq_len=(100 500)
nproc=(1 2 4)
results_dir="results"
if [ -d "$results_dir" ]; then
echo "directory ./$results_dir exists"
else
mkdir -p "$results_dir"
echo "mkdir $results_dir, logs saved there"
fi
for model in "${models[@]}"; do
for bs in "${batch_size[@]}"; do
for len in "${seq_len[@]}"; do
echo "Run pytorch llama with batch_size="$bs" and length="$len""
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
for n in "${nproc[@]}"; do
# run pytorch model
echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
# delete internal files
find ./ -type f -name "*.onnx" -delete
find ./ -type f -name "*0c" -delete
done
find ./ -type f -name "*.npy" -delete
done
done
done

View File

@ -1,245 +0,0 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
from transformers import LlamaModel, LlamaConfig
import time
import numpy as np
import onnx
import os
import sys
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--input_dir",
type=str,
default="./",
help="path to save pytorch model input data"
)
parser.add_argument(
"--result_dir",
type=str,
default="./",
help="path to save pytorch model output data"
)
parser.add_argument(
"--export_only",
action="store_true"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.input_dir,
args.result_dir,
args.export_only
)
def get_model(modelname):
if modelname == "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
elif modelname == "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
elif modelname == "opt":
model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
elif modelname == "llama":
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
voc_size = LlamaConfig().vocab_size
else :
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 10
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
np.array(outputs))
print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
def export_onnx(model_name, model, data, path, extern=False):
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
if model_name != "llama":
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model,
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
# skipped_optimizers=['fuse_qkv'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path.split("/")[-1],
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
else:
sys.path.append("onnxsim_large_model")
from onnx_utils import set_onnx_input_shape
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
in_model_path = path
out_model_path = in_model_path[:-5] + ".sim.onnx"
onnx_model = onnx.load(in_model_path)
print(f"load model from {in_model_path} success")
size_th_bytes = 1024 * 1024
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
print("compress model success")
onnx_model = set_onnx_input_shape(onnx_model, "")
tensor_size_threshold = f"1024KB"
skipped_optimizers = []
skipped_optimizers.append("eliminate_duplicate_initializer")
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
tensor_size_threshold=tensor_size_threshold)
if not check:
raise ValueError(f"simplify compressed model {in_model_path} failed")
print(f"simplify model success")
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
print(f"uncompress model success")
add_value_info_for_constants(onnx_model)
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
global input_dir, result_dir
modelname, batchsize, seqlen, \
export_path, input_dir, result_dir, export_only = parse_args()
model, voc_size = get_model(modelname) # pytorch model
if export_path is not None:
os.makedirs(export_path, exist_ok=True)
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
if export_only:
return
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
if __name__ == "__main__":
main()

@ -1 +0,0 @@
Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65

View File

@ -1,103 +0,0 @@
import onnx
from onnx import (
ModelProto,
TensorProto,
NodeProto,
AttributeProto,
)
from onnx import helper, numpy_helper
from typing import Dict, Any
def parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
for attr in node.attribute:
if attr.name in attrs:
if attr.type == AttributeProto.INT:
attrs[attr.name] = attr.i
elif attr.type == AttributeProto.INTS:
attrs[attr.name] = attr.ints
elif attr.type == AttributeProto.FLOAT:
attrs[attr.name] = attr.f
elif attr.type == AttributeProto.STRING:
attrs[attr.name] = attr.s
elif attr.type == AttributeProto.TENSOR:
attrs[attr.name] = attr.t
else:
assert False, "Unsupported Attribute Type: {}".format(attr.type)
return attrs
def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
data = {init.name: init for init in model.graph.initializer}
nodes = list(model.graph.node)
def shard_tensor(tensor: TensorProto, dim: int):
array = numpy_helper.to_array(tensor)
if dim >= array.ndim:
dim = array.ndim - 1
assert array.shape[dim] % tp_world_size == 0
seg = array.shape[dim] // tp_world_size
array = array[tp_rank * seg : (tp_rank + 1) * seg]
return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")
def shard_gemm(node: NodeProto):
attrs = parse_attribute(
node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
)
trans = [attrs["transA"], attrs["transB"]]
dim = 0
for i, (input, t) in enumerate(zip(node.input, trans)):
if input in data:
dim = i
sharded = shard_tensor(data[input], dim ^ t)
node.input[i] = sharded.name
data[input] = sharded
if len(node.input) > 2:
input = node.input[2]
sharded = shard_tensor(data[input], dim)
node.input[2] = sharded.name
data[input] = sharded
node.output[0] += f":sharded({dim})"
return dim
for i, node in enumerate(nodes):
if node.op_type == "Gemm":
output = node.output[0]
dim = shard_gemm(node)
gathered = [node.output[0] + f".{i}" for i in range(tp_world_size)]
# all_gather
nodes.insert(
i + 1,
helper.make_node(
op_type="AllGather",
inputs=[node.output[0]],
outputs=gathered,
name=node.name + "/allgather",
# domain="infini", # shape inference fails for custom domain
),
)
# concat
nodes.insert(
i + 2,
helper.make_node(
op_type="Concat",
inputs=gathered,
outputs=[output],
name=node.name + "/concat",
axis=dim,
),
)
graph = helper.make_graph(
nodes,
model.graph.name + f"_{tp_rank}",
model.graph.input,
model.graph.output,
data.values(),
doc_string=model.graph.doc_string,
value_info=model.graph.value_info,
)
model = helper.make_model(graph)
onnx.shape_inference.infer_shapes(model)
return model

View File

@ -1,247 +0,0 @@
import onnx
from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto
from onnx import helper, numpy_helper
from typing import Dict, List
from placement import Placement, Replicate, Shard, _Partial
import numpy as np
def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
data = {init.name: init for init in model.graph.initializer}
vinfo = {info.name: info for info in model.graph.value_info}
vinfo.update({info.name: info for info in model.graph.input})
vinfo.update({info.name: info for info in model.graph.output})
output = {info.name: info for info in model.graph.output}
place: Dict[str, Placement] = {}
nodes: List[NodeProto] = []
def is_sharded(name: str):
return place[name].is_shard()
def shard_tensor(tensor: TensorProto, plc: Shard, groups: int = 1):
# print(f"shard {tensor.name} at dim {dim}")
assert plc.is_shard(), plc
ndim = len(tensor.dims)
if plc.dim < 0:
plc.dim += ndim
if tensor.dims[plc.dim] == 1: # broadcast dim, no need to shard.
return tensor
array = numpy_helper.to_array(tensor)
assert array.shape[plc.dim] % tp_world_size == 0, array.shape[plc.dim]
dims = list(tensor.dims)
dims.insert(plc.dim, groups)
dims[plc.dim + 1] //= groups
array = array.reshape(dims)
seg = array.shape[plc.dim + 1] // tp_world_size
array = array.take(
indices=range(tp_rank * seg, (tp_rank + 1) * seg), axis=plc.dim + 1
)
dims = list(tensor.dims)
dims[plc.dim] //= tp_world_size
array = array.reshape(dims)
tensor = numpy_helper.from_array(array, name=tensor.name)
place[tensor.name] = plc
return tensor
def shard_gemm(node: NodeProto, groups: int = 1):
# print("gemm", node.name)
in_plc = place[node.input[0]]
w_plc = Shard(-1) if in_plc.is_replicate() else Shard(0)
transB = next((attr.i for attr in node.attribute if attr.name == "transB"), 0)
if transB:
w_plc.dim = ~w_plc.dim
input = node.input[1]
data[input] = shard_tensor(data[input], w_plc, groups)
output = node.output[0]
ndim = len(vinfo[output].type.tensor_type.shape.dim)
out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
place[node.output[0]] = out_plc
def shard_concat(node: NodeProto):
# hack for kvcache
in_plc = place[node.input[1]]
if in_plc.is_shard():
seq_len_dim = vinfo[node.input[0]].type.tensor_type.shape.dim.pop(1)
seq_len_dim.dim_value //= tp_world_size
vinfo[node.input[0]].type.tensor_type.shape.dim.insert(1, seq_len_dim)
place[node.input[0]] = in_plc
place[node.output[0]] = in_plc
def shard_binary(node: NodeProto, groups: int = 1):
# print("binary", node.name, node.input[0], place[node.input[0]])
a = node.input[0]
b = node.input[1]
if a in data:
a, b = b, a
place[node.output[0]] = place[a]
if is_sharded(a) and b in data and len(data[b].dims) == 1: # broadcast
data[b] = shard_tensor(data[b], Shard(0), groups)
def shard_reshape(node: NodeProto):
# print("reshape", node.name, node.input[0], place[node.input[0]])
if not is_sharded(node.input[0]):
return
in_plc = place[node.input[0]]
s_dim = -1
in_dims = [d.dim_value for d in vinfo[node.input[0]].type.tensor_type.shape.dim]
tensor = data[node.input[1]]
out_dims = numpy_helper.to_array(tensor).copy()
if len(in_dims) == 3 and len(out_dims) == 4:
if in_plc.dim == 0:
s_dim = 1
elif in_plc.dim == 2:
s_dim = 2
if len(in_dims) == 4 and len(out_dims) == 3:
if in_plc.dim == 1:
s_dim = 0
elif in_plc.dim == 2:
s_dim = 2
if len(in_dims) == 2 and len(out_dims) == 3:
if in_plc.dim == 1:
s_dim = 2
if len(in_dims) == 4 and len(out_dims) == 2:
if in_plc.dim == 1:
s_dim = 0
elif in_plc.dim == 2:
s_dim = 1
if len(in_dims) == 3 and len(out_dims) == 2:
if in_plc.dim == 1:
s_dim = 0
elif in_plc.dim == 2:
s_dim = 1
assert s_dim != -1
assert out_dims[s_dim] % tp_world_size == 0, out_dims
out_dims[s_dim] //= tp_world_size
# if ONNX uses the same tensor for multiple Reshape nodes, then rename it to distinguish it from the others.
node.input[1] = node.output[0] + "_shape"
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
place[node.output[0]] = Shard(s_dim)
def shard_split(node: NodeProto):
if not is_sharded(node.input[0]):
return
in_plc = place[node.input[0]]
split_tensor = data[node.input[1]]
split = numpy_helper.to_array(split_tensor).copy()
split //= tp_world_size
data[node.input[1]] = numpy_helper.from_array(split, name=node.input[1])
for output in node.output:
place[output] = in_plc
def shard_transpose(node: NodeProto):
plc = place[node.input[0]]
if plc.is_shard():
perm = next(attr.ints for attr in node.attribute if attr.name == "perm")
place[node.output[0]] = Shard(list(perm).index(plc.dim))
def shard_node(node: NodeProto):
if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
place[node.output[0]] = place[node.input[0]]
elif node.op_type in ["Where"]:
place[node.output[0]] = place[node.input[1]]
if node.op_type in {"Add", "Mul", "Div", "Max"}:
shard_binary(node)
elif node.op_type == "Reshape":
shard_reshape(node)
elif node.op_type == "Transpose":
shard_transpose(node)
elif node.op_type == "Split":
shard_split(node)
elif node.op_type == "MatMul":
assert (
place[node.input[0]] == place[node.input[1]]
), f"{place[node.input[0]]} != {place[node.input[1]]}"
place[node.output[0]] = place[node.input[0]]
elif node.op_type == "Concat":
shard_concat(node)
def find_successor(op_type: str, idx: int, search_limit: int = 1):
for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
if node.op_type == op_type:
return node
return None
# all tensors are initially replicated.
for v in vinfo:
place[v] = Replicate()
for t in data:
place[t] = Replicate()
for index, node in enumerate(model.graph.node):
nodes.append(node)
# linear
if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
input in data for input in node.input
):
# FIXME(constroy): the last MatMul should not be sharded as TP.
if (
node.output[0] in output
or (
index + 1 < len(model.graph.node)
and model.graph.node[index + 1].output[0]
)
in output
):
continue
groups = 1
# If the Gemm or MatMul is followed by a Split, then the inputs are concatenated by groups
split_node = find_successor("Split", index, search_limit=2)
if split_node is not None:
groups = len(split_node.output)
shard_gemm(node, groups)
plc = place[node.output[0]]
if plc.is_partial():
new_name = node.output[0] + f":{plc}"
place[new_name] = place[node.output[0]]
# insert all_reduce
nodes.append(
helper.make_node(
op_type="ReduceSum",
inputs=[new_name],
outputs=[node.output[0]],
name=node.name + "/all_reduce",
noop_with_empty_axes=1,
communicator=0, # hack to treat ReduceSum as AllReduceSum
)
)
place[node.output[0]] = Replicate()
node.output[0] = new_name
if len(node.input) > 2: # split bias to add
prev = nodes[-1]
new_name = prev.output[0] + "_no_bias"
place[new_name] = place[node.output[0]]
bias = helper.make_node(
op_type="Add",
inputs=[new_name, node.input[2]],
outputs=[prev.output[0]],
name=node.name + "/bias",
)
node.input.pop()
prev.output[0] = new_name
shard_binary(bias, groups)
nodes.append(bias)
continue
shard_node(node)
new_input = []
for info in model.graph.input:
new_input.append(vinfo[info.name])
graph = helper.make_graph(
nodes,
model.graph.name + f"_{tp_rank}",
new_input,
model.graph.output,
data.values(),
doc_string=model.graph.doc_string,
# value_info=vinfo.values(),
)
for output in graph.output:
tt = output.type.tensor_type
if tt.HasField("shape"):
tt.ClearField("shape")
model = helper.make_model(graph)
#model = onnx.shape_inference.infer_shapes(model)
return model

View File

@ -1,64 +0,0 @@
from typing import Optional
class Placement:
# base class Placement type
# convenient utils to check for placement types
def is_shard(self, dim: Optional[int] = None) -> bool:
if dim is not None and isinstance(self, Shard):
return self.dim == dim
else:
return isinstance(self, Shard)
def is_replicate(self) -> bool:
return isinstance(self, Replicate)
def is_partial(self) -> bool:
return isinstance(self, _Partial)
class Replicate(Placement):
def __eq__(self, other: object) -> bool:
if not isinstance(other, Replicate):
return False
return True
def __repr__(self) -> str:
"""
machine readable representation of the Replicate placement
"""
return "Replicate()"
class Shard(Placement):
# shard placement, shard on a dim
def __init__(self, dim):
self.dim = dim
def __eq__(self, other: object) -> bool:
if not isinstance(other, Shard):
return False
return self.dim == other.dim
def __repr__(self) -> str:
"""
machine readable representation of the Shard placement
"""
return f"Shard(dim={self.dim})"
class _Partial(Placement):
def __init__(self, reduce_op: str = "sum"):
self.reduce_op: str = reduce_op
def __eq__(self, other: object) -> bool:
if not isinstance(other, _Partial):
return False
return self.reduce_op == other.reduce_op
def __repr__(self) -> str:
"""
machine readable representation of the Partial placement
"""
return f"_Partial(reduce_op={self.reduce_op})"

View File

@ -1,145 +0,0 @@
import os
from pyinfinitensor.onnx import OnnxStub, backend
import numpy as np
import onnx
import torch
from transformers import LlamaModel, LlamaForCausalLM
from tqdm import tqdm
import onnx_graphsurgeon as gs
from onnxsim import simplify
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
default="/data1/shared/llama")
args = parser.parse_args()
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
param = torch.zeros(
(args.batchsize, 1024), dtype=torch.long)
logits = model(param, past_key_values=None)
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
do_constant_folding=True,)
onnx_model = onnx.load(ONNX_MODEL_PATH)
print("simplifing onnx model")
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
'eliminate_duplicate_initializer'])
assert check
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
print("simlifing finished.")
@gs.Graph.register()
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
for inp in inputs:
inp.outputs.clear()
for out in outputs:
out.inputs.clear()
for inp in inputs_added:
inputs.append(inp)
for out in outputs_removed:
out.inputs.clear()
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
def replace_onnx_with_attention_op():
graph = gs.import_onnx(
onnx.load(ONNX_MODEL_PATH))
tmap = graph.tensors()
for i in range(args.n_layers):
inputs = [
tmap["onnx::Concat_" + str((i+1)*2)],
tmap["onnx::Concat_" + str((i+1)*2+1)],
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
outputs = [
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
inputs_added = [graph.inputs[1]]
outputs_removed = []
graph.replace_with_attention(
inputs, outputs, inputs_added, outputs_removed)
graph.outputs = [tmap[graph.outputs[0].name]]
graph.cleanup(True).toposort()
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
if __name__ == "__main__":
kvcache_torch = None
torch_model = LlamaForCausalLM.from_pretrained(
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
n_heads = torch_model.config.num_attention_heads
n_dims = torch_model.config.hidden_size // n_heads
if not os.path.exists(ONNX_MODEL_PATH):
print("exporting onnx graph")
export_onnx(torch_model, ONNX_MODEL_PATH)
replace_onnx_with_attention_op()
else:
print("will use exsiting onnx graph")
onnx_model = onnx.load(ONNX_MODEL_PATH)
stub = OnnxStub(onnx_model, backend.cuda_runtime())
count_wrong = 0
for i in tqdm(range(0, args.n_max_length)):
query = np.random.randint(
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
####################################
# pytorch
####################################
outputs_torch = torch_model(
torch.tensor(query), past_key_values=kvcache_torch)
logit_torch = outputs_torch['logits']
kvcache_torch = outputs_torch['past_key_values']
####################################
# infinitensor
####################################
# copyin input
(list(stub.inputs.items()))[0][1].copyin_int64(
query.reshape(-1).tolist())
(list(stub.inputs.items()))[1][1].copyin_int64(
position_id.reshape(-1).tolist())
stub.run()
####################################
# validation
####################################
# copyout output
logits_it = np.array((list(stub.outputs.items()))
[0][1].copyout_float())
try:
np.testing.assert_allclose(
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
except Exception as e:
try:
np.testing.assert_allclose(
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
except:
count_wrong = count_wrong + 1
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
print(result)
del stub

View File

@ -1,29 +0,0 @@
import sys
import onnx
import torch
import numpy as np
from pyinfinitensor.onnx import OnnxStub, backend
if __name__ == '__main__':
args = sys.argv
if len(sys.argv) != 2:
print("Usage: python onnx_inference.py model_name.onnx")
exit()
model_path = sys.argv[1]
# print(model_path)
onnx_model = onnx.load(model_path)
onnx_input = onnx_model.graph.input[0]
input_shape = [[d.dim_value for d in _input.type.tensor_type.shape.dim]
for _input in onnx_model.graph.input]
# Assume that there is only one input tensor
input_shape = input_shape[0]
# print(input_shape)
input_data = np.random.random(input_shape).astype(np.float32)
model = OnnxStub(onnx_model, backend.cuda_runtime())
next(iter(model.inputs.values())).copyin_numpy(input_data)
model.run()
outputs = next(iter(model.outputs.values())).copyout_numpy()
outputs = torch.tensor(outputs)
print(outputs.shape)

View File

@ -1,80 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build the model and initialize the network
densenet = paddle.vision.models.DenseNet(num_classes=10)
model = paddle.Model(densenet)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/densenet' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; a Tensor or an InputSpec is accepted (InputSpec supports dynamic shapes)
paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
# Load the onnx model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()

View File

@ -1,80 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build the model and initialize the network
inception = paddle.vision.models.InceptionV3(num_classes=10)
model = paddle.Model(inception)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/inception' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; a Tensor or an InputSpec is accepted (InputSpec supports dynamic shapes)
paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
# Load the onnx model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()

View File

@ -1,31 +0,0 @@
## Description
This doc explains how to run the paddle*.py scripts on your machine. If you run on hardware other than Nvidia GPUs, you may need to make some changes.
## What do we do in paddle*.py files?
1. Train the model and evaluate it with the Cifar10 dataset
2. Export the paddle model to an onnx model
3. Load the onnx model, infer with InfiniTensor and calculate the inference accuracy (a minimal sketch follows below)
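For reference, step 3 boils down to a few calls. This is a minimal sketch, assuming the model exported in step 2 lives at `onnx.save/resnet.onnx` and a CUDA runtime; see `paddle_resnet.py` for the full evaluation loop.
```
import onnx
from pyinfinitensor.onnx import OnnxStub, backend

onnx_model = onnx.load("onnx.save/resnet.onnx")      # produced by step 2
model = OnnxStub(onnx_model, backend.cuda_runtime())
model.init()
# copy a preprocessed image into model.inputs, call model.run(),
# then read the class scores back from model.outputs
```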
## Command
1. Go to the `/examples/python` folder
2. Run the following commands
1. ```
python paddle_resnet.py
python paddle_densenet.py
python paddle_inception.py
```
## What should I do if I use another device (MLU, XPU, NPU)?
You need to change this code:
```
paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
```

View File

@ -1,81 +0,0 @@
import paddle
import paddle.vision.transforms as T
from paddle.vision.datasets import Cifar10
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import itertools
from paddle.vision.models.resnet import BasicBlock
def run_cifar_train_and_infer():
paddle.device.set_device("gpu")
transform = T.Compose(
[
T.Resize(224),
T.ToTensor(),
T.Normalize(
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5],
to_rgb=True,
),
]
)
# Download the dataset and initialize the DataSet
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
# Build the model and initialize the network
resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
model = paddle.Model(resnet)
# Prepare the training configuration: loss function, optimizer and metric
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy())
# Train the model
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
# Evaluate the model
model.evaluate(test_dataset, batch_size=64, verbose=1)
# export to ONNX
save_path = 'onnx.save/resnet' # path to save the exported model
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the input shape and dtype; a Tensor or an InputSpec is accepted (InputSpec supports dynamic shapes)
paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
# Load the onnx model into InfiniTensor
model_path = save_path + ".onnx"
onnx_model = onnx.load(model_path)
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
model = gofusion_model
model.init()
# Run inference
cifar10_test = Cifar10(
mode="test",
transform=transform, # apply transform to every image
backend="cv2", # use OpenCV as image transform backend
)
batch_size = 1
total_size = 0
total_acc = 0.0
for data in itertools.islice(iter(cifar10_test), 10000):
images, labels = data
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
model.run()
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
outputs = paddle.to_tensor(outputs)
outputs = paddle.reshape(outputs, (1, 10))
labels = paddle.to_tensor(labels)
labels = paddle.reshape(labels, (1,1))
acc = paddle.metric.accuracy(outputs, labels)
total_acc += acc
total_size += batch_size
print("test acc: {}".format(total_acc.numpy() / total_size))
if __name__ == "__main__":
run_cifar_train_and_infer()

View File

@ -1,24 +0,0 @@
import sys
import onnx
import torch
import numpy as np
from pyinfinitensor.onnx import OnnxStub, backend
import torchvision.models as models
if __name__ == '__main__':
model_path = './resnet18.onnx'
tv_model = models.resnet50(weights=None)
input_shape = (1, 3, 224, 224)
param = torch.rand(input_shape)
torch.onnx.export(tv_model, param, model_path, verbose=False)
onnx_model = onnx.load(model_path)
model = OnnxStub(onnx_model, backend.cuda_runtime())
images = np.random.random(input_shape).astype(np.float32)
next(iter(model.inputs.values())).copyin_numpy(images)
model.run()
outputs = next(iter(model.outputs.values())).copyout_numpy()
outputs = torch.tensor(outputs)
outputs = torch.reshape(outputs, (1, 1000))
_, predicted = torch.max(outputs, 1)
print(predicted)

View File

@ -2,10 +2,6 @@
#include "cnnl.h"
#include "cnrt.h"
#include "core/common.h"
#include "core/data_type.h"
#ifdef INFINI_USE_CNCL
#include "cncl.h"
#endif
#define checkBangError(call) \
{ \
@ -31,70 +27,4 @@ namespace infini {
using BangPtr = void *;
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CNNL_DTYPE_FLOAT;
}
if (dataType == DataType::Float16) {
return CNNL_DTYPE_HALF;
}
if (dataType == DataType::Double) {
return CNNL_DTYPE_DOUBLE;
}
if (dataType == DataType::Int8) {
return CNNL_DTYPE_INT8;
}
if (dataType == DataType::Int32) {
return CNNL_DTYPE_INT32;
}
if (dataType == DataType::UInt8) {
return CNNL_DTYPE_UINT8;
}
if (dataType == DataType::BFloat16) {
return CNNL_DTYPE_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CNNL_DTYPE_INT64;
}
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
} // namespace infini

View File

@ -7,19 +7,16 @@ namespace infini {
class BangRuntimeObj : public RuntimeObj {
private:
cnnlHandle_t cnnl;
cnrtQueue_t queue;
std::unique_ptr<CommunicatorObj> comm;
BangPtr workspace;
size_t workspaceSize;
mutable size_t cursor;
public:
explicit BangRuntimeObj(int deviceId = 0)
: RuntimeObj(Device::BANG, deviceId) {
BangRuntimeObj() : RuntimeObj(Device::BANG) {
cnInit(0);
CNdev dev;
cnDeviceGet(&dev, deviceId);
cnDeviceGet(&dev, 0);
checkBangError(cnrtSetDevice(dev));
cnrtQueue_t queue;
checkBangError(cnrtQueueCreate(&queue));
checkCnnlError(cnnlCreate(&cnnl));
@ -27,12 +24,10 @@ class BangRuntimeObj : public RuntimeObj {
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
cursor = 0;
workspace = alloc(workspaceSize);
}
virtual ~BangRuntimeObj() {
dealloc(workspace);
checkBangError(cnrtQueueDestroy(queue));
checkCnnlError(cnnlDestroy(cnnl));
}
string toString() const override;
@ -50,15 +45,10 @@ class BangRuntimeObj : public RuntimeObj {
void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
cnnlHandle_t cnnlHandle() const { return cnnl; }
BangPtr getWorkspace(size_t size) const {
IT_ASSERT((cursor + size) <= workspaceSize);
cursor += size;
void *temp = workspace;
temp += (cursor - size);
return temp;
IT_ASSERT(size <= workspaceSize);
return workspace;
}
void resetWorkspace() const { cursor = 0; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
@ -76,9 +66,6 @@ class BangRuntimeObj : public RuntimeObj {
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
CNRT_MEM_TRANS_DIR_PEER2PEER));
}
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const override { return *comm; }
cnrtQueue_t getBangQueue() const { return queue; }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;

View File

@ -1,79 +0,0 @@
#pragma once
#include "bang_common.h"
#include "core/communicator.h"
#include <chrono>
#include <cncl.h>
#include <cnrt.h>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <thread>
namespace infini {
class CnclCommunicatorObj final : public CommunicatorObj {
private:
cnclComm_t *comms;
public:
CnclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_cncl_id.bin");
cnclCliqueId clique_id;
if (rank == 0) {
CNCL_CHECK(cnclGetCliqueId(&clique_id));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
}
int num_comms = 1;
int *dev_list = new int[num_comms];
int *rank_list = new int[num_comms];
comms = new cnclComm_t[num_comms];
uint32_t num_dev = 0;
checkBangError(cnrtGetDeviceCount(&num_dev));
for (int i = 0; i < num_comms; i++) {
rank_list[i] = rank;
dev_list[i] = rank_list[i] % num_dev;
}
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
worldSize, &clique_id));
if (rank == 0) {
std::filesystem::remove(filePath);
}
delete[] dev_list;
delete[] rank_list;
}
~CnclCommunicatorObj() {
CNCL_CHECK(cnclDestroyComms(comms, 1));
delete[] comms;
}
// Get the actual cnclComm_t
cnclComm_t getCnclComm() { return comms[0]; }
virtual string toString() const final {
std::ostringstream oss;
oss << "CNCL communicator";
return oss.str();
}
};
} // namespace infini

include/benchmark.h Normal file
View File

@ -0,0 +1,4 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include "utils/data_generator.h"

View File

@ -40,12 +40,12 @@ using HashType = uint64_t; // compatible with std::hash
// Assert: conditions should have no side effect
#define _IT_ASSERT_2(condition, info) \
static_cast<bool>(condition) \
? void(0) \
: throw ::infini::Exception( \
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
"] Assertion failed (" + #condition + "): " + info)
#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
(static_cast<bool>(condition) \
? void(0) \
: throw ::infini::Exception( \
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
"] Assertion failed (" + #condition + "): " + info))
#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "");
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
@ -61,35 +61,21 @@ template <typename T> auto enum_to_underlying(T e) {
}
template <typename T> std::string vecToString(const std::vector<T> &vec) {
std::stringstream ss;
ss << "[";
for (size_t i = 0; i < vec.size(); ++i) {
ss << vec.at(i);
if (i < vec.size() - 1) {
ss << ",";
}
std::string ret;
ret.append("[");
for (auto d : vec) {
ret.append(std::to_string(d));
ret.append(",");
}
ss << "]";
return ss.str();
}
template <typename T> std::string vecToString(const T *st, size_t length) {
std::stringstream ss;
ss << "[";
size_t i = 0;
for (i = 0; i < length; i++) {
ss << *(st + i);
if (i < length - 1) {
ss << ",";
}
}
ss << "]";
return ss.str();
if (!vec.empty())
ret.pop_back();
ret.append("]");
return ret;
}
double timeit(
const std::function<void()> &func,
const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
int timingRounds = 10);
const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
int timingRounds = 200);
} // namespace infini

View File

@ -1,22 +0,0 @@
#pragma once
#include "object.h"
#include "ref.h"
namespace infini {
// base class
class CommunicatorObj : public Object {
protected:
int worldSize;
int rank;
public:
CommunicatorObj(int worldSize, int rank)
: worldSize(worldSize), rank(rank) {}
virtual ~CommunicatorObj() = default;
virtual int getWorldSize() const { return worldSize; }
virtual int getRank() const { return rank; }
};
} // namespace infini

View File

@ -1,54 +1,22 @@
#pragma once
#include "core/common.h"
namespace infini {
class DataType {
public:
// <https://onnx.ai/onnx/intro/concepts.html#element-type>
static const DataType Undefine;
// legacy
static const DataType Float32;
static const DataType UInt8;
static const DataType Int8;
static const DataType UInt16;
static const DataType Int16;
static const DataType Int32;
static const DataType Int64;
static const DataType String;
static const DataType Bool;
static const DataType Float16;
static const DataType Double;
static const DataType UInt32;
static const DataType UInt64;
static const DataType BFloat16;
// "sizePerElement" show the DType to cpu_type
// DataType::Bool -> int8_t DataType::Float16 -> uint16_t
static constexpr size_t sizePerElement[]{0,
sizeof(float),
sizeof(uint8_t),
sizeof(int8_t),
sizeof(uint16_t),
sizeof(int16_t),
sizeof(int32_t),
sizeof(int64_t),
sizeof(std::string),
sizeof(int8_t),
sizeof(uint16_t),
sizeof(double),
sizeof(uint32_t),
sizeof(uint64_t),
0,
0,
sizeof(uint16_t)};
// These are just aligned with the type and index of onnx:
// <https://onnx.ai/onnx/intro/concepts.html#element-type>
static const DataType UInt8, Int8, UInt16, Int16, Int32, Int64;
static constexpr size_t sizePerElement[]{
sizeof(float), sizeof(uint32_t), sizeof(uint8_t), sizeof(int8_t),
sizeof(uint16_t), sizeof(int16_t), sizeof(int32_t), sizeof(int64_t)};
static constexpr std::string_view names[]{
"Undefine", "Float32", "UInt8", "Int8", "UInt16",
"Int16", "Int32", "Int64", "String", "Bool",
"Float16", "Double", "UInt32", "UInt64", "PlaceHolder",
"PlaceHolder", "BFloat16"};
static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5, 6, 7, -1,
3, 4, 9, 1, 8, -1, -1, 4};
static constexpr std::string_view names[]{"Float32", "UInt32", "UInt8",
"Int8", "UInt16", "Int16",
"Int32", "Int64"};
private:
int index;
@ -61,43 +29,37 @@ class DataType {
bool operator==(const DataType &rhs) const { return index == rhs.index; }
bool operator<(const DataType &rhs) const { return index < rhs.index; }
template <typename T> static int get() {
template <typename T> static DataType get() {
IT_TODO_HALT_MSG("Unsupported data type");
}
size_t getSize() const { return sizePerElement[index]; }
string toString() const { return string(names[index]); }
int cpuTypeInt() const { return cpuType[index]; }
int getIndex() const { return index; }
};
inline const DataType DataType::Float32(0);
inline const DataType DataType::UInt32(1);
inline const DataType DataType::UInt8(2), DataType::Int8(3),
DataType::UInt16(4), DataType::Int16(5), DataType::Int32(6),
DataType::Int64(7);
// Method definitions are out of the declaration due to GCC bug:
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
template <> inline int DataType::get<float>() { return 0; }
template <> inline int DataType::get<uint32_t>() { return 1; }
template <> inline int DataType::get<uint8_t>() { return 2; }
template <> inline int DataType::get<int8_t>() { return 3; }
template <> inline int DataType::get<uint16_t>() { return 4; }
template <> inline int DataType::get<int16_t>() { return 5; }
template <> inline int DataType::get<int32_t>() { return 6; }
template <> inline int DataType::get<int64_t>() { return 7; }
template <> inline int DataType::get<uint64_t>() { return 8; }
template <> inline int DataType::get<double>() { return 9; }
template <> inline DataType DataType::get<float>() { return Float32; }
template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
template <> inline DataType DataType::get<uint8_t>() { return UInt8; }
template <> inline DataType DataType::get<int8_t>() { return Int8; }
template <> inline DataType DataType::get<uint16_t>() { return UInt16; }
template <> inline DataType DataType::get<int16_t>() { return Int16; }
template <> inline DataType DataType::get<int32_t>() { return Int32; }
template <> inline DataType DataType::get<int64_t>() { return Int64; }
template <int index> struct DT {};
template <> struct DT<0> { using t = bool; };
template <> struct DT<1> { using t = float; };
template <> struct DT<0> { using t = float; };
template <> struct DT<1> { using t = uint32_t; };
template <> struct DT<2> { using t = uint8_t; };
template <> struct DT<3> { using t = int8_t; };
template <> struct DT<4> { using t = uint16_t; };
template <> struct DT<5> { using t = int16_t; };
template <> struct DT<6> { using t = int32_t; };
template <> struct DT<7> { using t = int64_t; };
template <> struct DT<8> { using t = char; };
template <> struct DT<9> { using t = int8_t; };
template <> struct DT<10> { using t = uint16_t; };
template <> struct DT<11> { using t = double; };
template <> struct DT<12> { using t = uint32_t; };
template <> struct DT<13> { using t = uint64_t; };
template <> struct DT<16> { using t = uint16_t; };
} // namespace infini

View File

@ -1,5 +1,4 @@
#pragma once
#include "core/lazy_allocator.h"
#include "core/operator.h"
#include "core/tensor.h"
@ -10,11 +9,9 @@ class GraphObj : public Object {
Runtime runtime;
TensorVec tensors;
OpVec ops;
LazyAllocator allocator;
public:
explicit GraphObj(Runtime runtime)
: runtime(runtime), allocator(runtime), sorted(false){};
explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
GraphObj(Runtime runtime, OpVec ops_in);
string toString() const override;
Runtime getRuntime() const { return runtime; }
@ -53,7 +50,6 @@ class GraphObj : public Object {
const TensorVec &getTensors() const { return tensors; }
const OpVec &getOperators() const { return ops; }
OpVec getComputeOps() const;
Tensor getTensor(int) const;
/**
* Sort the nodes in topological order.
@ -65,13 +61,7 @@ class GraphObj : public Object {
void optimize();
void shape_infer();
void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
Tensor cloneKV(Tensor &tensor);
void freeHeap();
void dataMalloc();
/**
* @brief Add an operator and create its outputs. Output tensor arguments
@ -127,11 +117,6 @@ class GraphObj : public Object {
* @brief If the nodes is sorted in topological order.
*/
bool sorted;
/**
* @brief If the weight tensors are allocated.
*/
bool weightAllocated = false;
};
} // namespace infini

View File

@ -5,12 +5,32 @@
#include <cstdint>
#include <iostream>
#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
namespace infini {
// Use the indices from onnx to reduce the overhead of passing data types around;
// they come from onnx but are not used only for onnx.
//
// see https://onnx.ai/onnx/intro/concepts.html#element-type
enum OnnxDType : int {
UNDEFINED = 0,
FLOAT,
UINT8,
INT8,
UINT16,
INT16,
INT32,
INT64,
STRING,
BOOL,
FLOAT16,
DOUBLE,
UINT32,
UINT64,
COMPLEX64,
COMPLEX128,
BFLOAT16,
};
class GraphHandlerObj {
Graph g;
@ -30,40 +50,27 @@ class GraphHandlerObj {
int pw, int sh, int sw, int dh, int dw, int oph,
int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act,
std::string matmul_compute_type = "default");
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
Tensor var, Tensor scale, Tensor bias,
float momentum, float eps, bool training);
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
Tensor bias, float eps, int axis, int stash_type);
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
Tensor bias, ActType act);
Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
Tensor scale, Tensor bias, float momentum, float eps,
bool training);
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw, int ceilMode);
int ph, int pw, int sh, int sw);
Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw, int ceilMode);
int ph, int pw, int sh, int sw);
Tensor add(Tensor a, Tensor b, Tensor c);
Tensor sub(Tensor a, Tensor b, Tensor c);
Tensor mul(Tensor a, Tensor b, Tensor c);
Tensor div(Tensor a, Tensor b, Tensor c);
Tensor pow(Tensor a, Tensor b, Tensor c);
Tensor min(Tensor a, Tensor b, Tensor c);
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor silu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor hardSigmoid(Tensor x, Tensor y);
Tensor hardSwish(Tensor x, Tensor y);
Tensor tanh(Tensor x, Tensor y);
Tensor erf(Tensor x, Tensor y);
Tensor softmax(Tensor x, Tensor y, int axis);
Tensor abs(Tensor x, Tensor y);
Tensor sqrt(Tensor x, Tensor y);
Tensor neg(Tensor x, Tensor y);
Tensor shape(Tensor x, Tensor y);
Tensor identity(Tensor x, Tensor y);
Tensor flatten(Tensor s, Tensor y, int axis);
@ -72,51 +79,17 @@ class GraphHandlerObj {
std::optional<float> max);
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
Tensor resize(Tensor input, Tensor output,
const std::optional<vector<int>> &axes, Tensor sizes,
Tensor scales, Tensor roi, vector<uint32_t> sizes_,
vector<float> scales_, vector<float> roi_, string mode,
string ratioPolicy, string nearestMode,
string coordTransMode);
Tensor squeeze(Tensor input, Tensor output, Shape axes);
Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
Tensor concat(TensorVec inputs, Tensor output, int dim);
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
Tensor input_q, Tensor input_k, Tensor input_v,
Tensor position_id, Tensor output_matmul);
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
std::variant<int, vector<int>> numOrRatio);
int num_outputs);
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
Tensor reduceMean(Tensor data, Tensor reduced,
const optional<vector<int>> &axes, bool keepdims);
Tensor reduceSum(Tensor data, Tensor reduced,
const optional<vector<int>> &axes, bool keepdims);
Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
const vector<int> &ends, const optional<vector<int>> &axes,
const optional<vector<int>> &steps);
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
Tensor cast(Tensor input, Tensor output, int to);
Tensor expand(Tensor input, Tensor output, Shape dims);
Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
std::vector<int> getDims(Tensor x) { return x->getDims(); }
Tensor allReduceSum(Tensor input, Tensor output);
Tensor allReduceProd(Tensor input, Tensor output);
Tensor allReduceMin(Tensor input, Tensor output);
Tensor allReduceMax(Tensor input, Tensor output);
Tensor allReduceAvg(Tensor input, Tensor output);
TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
Tensor broadcast(Tensor input, Tensor output, int root);
Tensor send(Tensor input, int source, int destination, Tensor output);
Tensor recv(Tensor output, int source, int destination, Shape dims,
int outputType, Tensor input);
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
std::string mode);
Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
int size);
//------ modifiers
@ -124,31 +97,15 @@ class GraphHandlerObj {
inline void optimize() { g->optimize(); }
inline void shape_infer() { g->shape_infer(); }
void change_shape(const vector<int> &shape, int tensorId);
//------ runtime
inline void data_malloc(bool useNaiveAllocator = false,
size_t memPoolSize = 0) {
g->dataMalloc(useNaiveAllocator, memPoolSize);
}
inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
inline void free_heap() { g->freeHeap(); }
inline void data_malloc() { g->dataMalloc(); }
inline void tune() { g->getRuntime()->run(g, true); }
inline void run() { g->getRuntime()->run(g); }
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
#ifdef USE_CUDA
inline void run_with_cudagraph() {
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
}
#endif
};
} // namespace infini

View File

@ -2,11 +2,10 @@
#include "core/common.h"
#include "core/operator.h"
#include "core/tensor.h"
#include "utils/operator_utils.h"
#include <functional>
#include <nlohmann/json.hpp>
namespace infini {
using json = nlohmann::json;
namespace infini {
class RuntimeObj; // Forward declaration for Kernel::compute
@ -30,6 +29,7 @@ class Kernel {
public:
Kernel() {}
virtual ~Kernel() {}
/**
* @param op The operator to be executed.
* @param record The parameters for kernel execution. If extra parameters
@ -102,9 +102,11 @@ class KernelRegistry {
}
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
auto it = kernels.find(kernelAttrs);
IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
get_kernel_attrs_str(kernelAttrs) +
"}");
IT_ASSERT(it != kernels.end(),
"Kernel not found for key {" +
to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
", " + std::get<2>(kernelAttrs).toString() + "}");
return std::get<0>(it->second);
}
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -129,16 +131,15 @@ class CpuKernelWithoutConfig : public Kernel {
} // namespace infini
#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
namespace infini { \
static const bool _CAT(_register_kernel_, cnt) = \
KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
opType}, \
new kernel(), name); \
KernelRegistry::getInstance().registerKernel( \
KernelAttrs{device, opType, dataType}, new kernel(), name); \
}
#define REGISTER_KERNEL(device, opType, kernel, name) \
_REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
_REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
#define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
namespace infini { \

View File

@ -1,122 +0,0 @@
#pragma once
#include "core/runtime.h"
#include "core/tensor.h"
#ifdef BUILD_TEST
#include "gtest/gtest.h"
#endif
#include <cstddef>
#include <map>
#include <unordered_set>
namespace infini {
class LazyAllocator {
private:
#ifdef BUILD_TEST
FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
#endif
Runtime runtime;
size_t used = 0;
size_t peak = 0;
size_t weightPeak = 0;
size_t heapPeak = 0;
size_t alignment;
bool hasMemPool = false;
size_t memPoolSize = 0;
// pointer to the memory actually allocated
void *ptr = nullptr;
// pointer to the weight memory space
void *weightPtr = nullptr;
// memory pool ptr
void *memPoolPtr = nullptr;
// // a cache designed for a batch size that has already occurred
// std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
// batchsizeToTensorOffset;
struct freeBlockInfo {
size_t addr;
size_t blockSize;
};
struct cmpFreeBlockInfo {
bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
: (a.addr < b.addr);
}
};
// free balanced tree, maintains all free memory blocks
std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
// key: head address offset of the free memory block
// value: blockSize of the block
std::unordered_map<size_t, size_t> headAddrToBlockSize;
// key: tail address offset of the free memory block
// value: blockSize of the block
std::unordered_map<size_t, size_t> tailAddrToBlockSize;
public:
LazyAllocator(Runtime runtime);
virtual ~LazyAllocator();
void init();
void setMemPool(size_t memPoolSize);
bool getMemPoolStatus();
// function: simulate memory allocation
// arguments
// size: size of memory block to be allocated
// return: head address offset of the allocated memory block
size_t alloc(size_t size);
size_t allocWeight(size_t size);
size_t heapAlloc(size_t size);
void freeHeap();
// function: simulate memory free
// arguments:
// addr: head address offset of memory block to be free
// size: size of memory block to be freed
void free(size_t addr, size_t size);
// function: perform actual memory allocation
// return: pointer to the head address of the allocated memory
void *getPtr();
// void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
// std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
void *getWeightPtr();
void *getHeapPtr();
void info();
private:
// function: memory alignment, rounded up
// return: size of the aligned memory block
size_t getAlignedSize(size_t size);
};
} // namespace infini

View File

@ -1,269 +0,0 @@
#pragma once
#ifndef OP_TYPE_H
#define OP_TYPE_H
#include <string>
#include <unordered_set>
namespace infini {
struct OpType {
using underlying_t = uint16_t;
// Clang-format is ambiguous when formatting comment alignment.
// In order to disambiguate, it is necessary to comment all enum
// elements.
enum : underlying_t {
Unknown,
Abs, // Unary
Acos, // Unary
Acosh, // Unary
Add, // Binary
And, // Binary
ArgMax, //
Asin, // Unary
Asinh, // Unary
Atan, // Unary
Atanh, // Unary
AttentionKVCache, // Fusion
AveragePool, // Pool
BatchNormalization, //
Bernoulli, //
BitShift, // Binary
BitwiseAnd, // Binary
BitwiseNot, // Binary
BitwiseOr, // Binary
BitwiseXor, // Binary
BlackmanWindow, //
Cast, // Unary
CastLike, //
Ceil, // Unary
Celu, //
CenterCropPad, //
Clip, // Unary
Col2Im,
Compress,
Concat,
ConcatFromSequence,
ConstantOfShape,
Conv, // ComputationIntensive
ConvInteger, // ComputationIntensive
ConvTranspose, // ComputationIntensive
Cos, // Unary
Cosh, // Unary
CumSum,
DFT,
DeformConv, // ComputationIntensive
DepthToSpace,
DequantizeLinear,
Det,
Div, // Binary
Dropout,
DynamicQuantizeLinear,
Einsum,
Elu,
Equal, // Compair
Erf, // Unary
Exp, // Unary
Expand,
EyeLike,
Flatten,
Floor, // Unary
GRU,
Gather,
GatherElements,
GatherND,
Gemm,
Gelu, // Unary
GlobalAveragePool, // GlobalPool
GlobalLpPool, // GlobalPool
GlobalMaxPool, // GlobalPool
Greater, // Compair
GreaterOrEqual, // Compair
GridSample,
GroupNormalization,
HammingWindow,
HannWindow,
HardSigmoid,
HardSwish,
Hardmax,
Identity,
If,
InstanceNormalization,
IsInf,
IsNaN,
LRN,
LSTM,
LayerNormalization,
LeakyRelu,
Less, // Compair
LessOrEqual, // Compair
Log, // Unary
LogSoftmax,
Loop,
LpNormalization,
LpPool,
MatMul, // ComputationIntensive
MatMulInteger, // ComputationIntensive
Max,
MaxPool,
MaxRoiPool,
MaxUnpool,
Mean,
MeanVarianceNormalization,
MelWeightMatrix,
Min,
Mish,
Mod, // Binary
Mul, // Binary
Multinomial, //
Neg, // Unary
NegativeLogLikelihoodLoss,
NonMaxSuppression,
NonZero,
Not, // Unary
OneHot,
Optional,
OptionalGetElement,
OptionalHasElement,
Or, // Binary
PRelu, //
Pad, //
Pow, // Binary
QLinearConv, // ComputationIntensive
QLinearMatMul, // ComputationIntensive
QuantizeLinear,
RNN,
RandomNormal,
RandomNormalLike,
RandomUniform,
RandomUniformLike,
Range,
Reciprocal,
ReduceL1, // Reduce
ReduceL2, // Reduce
ReduceLogSum, // Reduce
ReduceLogSumExp, // Reduce
ReduceMax, // Reduce
ReduceMean, // Reduce
ReduceMin, // Reduce
ReduceProd, // Reduce
ReduceSum, // Reduce
ReduceSumSquare, // Reduce
Relu, // Unary
Silu, // Unary
Reshape,
Resize,
ReverseSequence,
RoiAlign,
RoPE, // Fusion
Round, // Unary
RMSNorm, // Fusion
STFT,
Scan,
Scatter,
ScatterElements,
ScatterND,
Selu,
SequenceAt,
SequenceConstruct,
SequenceEmpty,
SequenceErase,
SequenceInsert,
SequenceLength,
SequenceMap,
Shape,
Shrink,
Sigmoid,
Sign,
Sin, // Unary
Sinh, // Unary
Size,
Slice,
Softmax,
SoftmaxCrossEntropyLoss,
Softplus,
Softsign,
SpaceToDepth,
Split,
SplitToSequence,
Sqrt,
Squeeze,
StringNormalizer,
Sub, // Binary
Sum, //
Tan, // Unary
Tanh, // unary
TfIdfVectorizer,
ThresholdedRelu,
Tile,
TopK,
Transpose,
Trilu,
Unique,
Unsqueeze,
Upsample,
Where,
Xor, // Binary
// CUSTOM DEFINED
G2BMM,
GBMM,
MemBound,
// TODO
ConvTransNHWC,
ConvBackwardFilter,
ReluBackward,
SigmoidBackward,
TanhBackward,
Fill,
Extend,
MSELoss,
Hardtanh,
L2Loss,
Rsqrt,
FloorDiv,
FloorMod,
Square,
SquaredDifference,
// Communication Ops
AllReduceSum,
AllReduceProd,
AllReduceMin,
AllReduceMax,
AllReduceAvg,
AllGather,
Broadcast,
Send,
Recv,
} type;
constexpr OpType(decltype(type) t) : type(t) {}
constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {}
constexpr underlying_t underlying() const { return type; }
bool operator==(OpType others) const { return type == others.type; }
bool operator!=(OpType others) const { return type != others.type; }
bool operator<(OpType others) const { return type < others.type; }
const char *toString() const;
bool isUnary() const;
bool isBinary() const;
bool isElementWise() const;
bool isCompair() const;
bool isPool() const;
bool isGlobalPool() const;
bool isMatMulOrConv() const;
};
enum class ActType {
None,
Relu,
Sigmoid,
Tanh,
};
} // namespace infini
#endif // OP_TYPE_H

View File

@ -1,14 +1,231 @@
#pragma once
#include "core/op_type.h"
#include "core/tensor.h"
namespace infini {
using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
enum class OpType {
Unknown = 0,
// linear
Conv = 100,
ConvBackwardFilter,
ConvBackwardData,
Matmul,
ConvTrans,
ConvTransNHWC,
G2BMM,
GBMM,
Pad,
Slice,
Concat,
Split,
Transpose,
Extend,
MaxPool,
AvgPool,
Add,
Sub,
Mul,
Div,
Pow,
Gather,
ReduceMean,
Reshape,
Flatten,
Identity,
// element wise
BatchNorm = 200,
Softmax,
Activation,
Relu,
ReluBackward,
PRelu,
Sigmoid,
SigmoidBackward,
Tanh,
TanhBackward,
Abs,
Sin,
Cos,
Tan,
ASin,
ACos,
ATan,
SinH,
CosH,
TanH,
ASinH,
ACosH,
ATanH,
Resize,
Arange,
Shape,
Copy,
Ceil,
Floor,
Clip,
Erf,
Exp,
Fill,
Log,
L2Loss,
Maximum,
Minimum,
MSELoss,
Neg,
Power,
Reciprocal,
Sqrt,
Rsqrt,
Cast,
FloorDiv,
FloorMod,
Det,
Round,
Square,
SquaredDifference,
Hardtanh,
Equal,
NotEqual,
GreaterThan,
GreaterEqual,
LessThan,
LessEqual,
And,
Or,
Xor,
Not,
BitAnd,
BitOr,
BitXor,
BitNot,
BitLeftShift,
BitRightShift,
Dropout,
//
MemBound = 300,
};
using KernelAttrs = std::tuple<Device, OpType, DataType>;
class OpRegistry {
public:
static std::string getOpName(OpType opType) {
#define FOP(op) \
case OpType::op: \
return #op
switch (opType) {
FOP(Unknown);
// linear
FOP(Conv);
FOP(ConvBackwardFilter);
FOP(ConvBackwardData);
FOP(Matmul);
FOP(ConvTrans);
FOP(G2BMM);
FOP(GBMM);
FOP(Pad);
FOP(Slice);
FOP(Concat);
FOP(Split);
FOP(Transpose);
FOP(Extend);
FOP(MaxPool);
FOP(AvgPool);
FOP(Add);
FOP(Sub);
FOP(Mul);
FOP(Div);
FOP(Pow);
FOP(Gather);
FOP(ReduceMean);
FOP(Reshape);
FOP(Identity);
FOP(Shape);
// element wise
FOP(BatchNorm);
FOP(Softmax);
FOP(Activation);
FOP(Relu);
FOP(ReluBackward);
FOP(PRelu);
FOP(Sigmoid);
FOP(SigmoidBackward);
FOP(Tanh);
FOP(TanhBackward);
FOP(Abs);
FOP(Sin);
FOP(Cos);
FOP(Tan);
FOP(ASin);
FOP(ACos);
FOP(ATan);
FOP(SinH);
FOP(CosH);
FOP(TanH);
FOP(ASinH);
FOP(ACosH);
FOP(ATanH);
FOP(Copy);
FOP(Ceil);
FOP(Floor);
FOP(Clip);
FOP(Erf);
FOP(Exp);
FOP(Fill);
FOP(Log);
FOP(L2Loss);
FOP(Maximum);
FOP(Minimum);
FOP(MSELoss);
FOP(Neg);
FOP(Power);
FOP(Reciprocal);
FOP(Sqrt);
FOP(Rsqrt);
FOP(Cast);
FOP(FloorDiv);
FOP(FloorMod);
FOP(Det);
FOP(Round);
FOP(Square);
FOP(SquaredDifference);
FOP(Hardtanh);
FOP(Equal);
FOP(NotEqual);
FOP(GreaterThan);
FOP(GreaterEqual);
FOP(LessThan);
FOP(LessEqual);
FOP(And);
FOP(Or);
FOP(Xor);
FOP(Not);
FOP(BitAnd);
FOP(BitOr);
FOP(BitXor);
FOP(BitNot);
FOP(BitLeftShift);
FOP(BitRightShift);
//
FOP(MemBound);
default:
IT_ASSERT(false);
break;
}
#undef FOP
}
};
enum class ActType {
None,
Relu,
Sigmoid,
Tanh,
};
struct OpPerfKey {
HashType hash;
OpType::underlying_t opType;
OpType opType;
vector<int> attrs;
public:
@ -16,7 +233,7 @@ struct OpPerfKey {
// https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
OpPerfKey() = default;
OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
: hash(hash), opType(opType.underlying()), attrs(attrs) {}
: hash(hash), opType(opType), attrs(attrs) {}
bool operator==(const OpPerfKey &rhs) const {
if (hash != rhs.hash)
return false;
@ -55,7 +272,8 @@ class OperatorObj : public Object {
public:
OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
virtual optional<vector<Shape>>
inferShape(const TensorVec &inputs) const = 0;
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
/**
* @brief Constructs outputs (if required) and checks whether the operator is
@ -72,7 +290,16 @@ class OperatorObj : public Object {
*/
HashType hash() const;
public:
public: // check Op type
bool isLinearOp() const;
bool isElementWiseOp() const;
bool isSplitOp() const;
bool isConcatOp() const;
bool isComputeOp() const;
bool isTransposeOp() const;
bool isReshapeOp() const;
bool isMemBoundOp() const;
public: // getter and setter
const TensorVec &getInputs() const { return inputs; }
const TensorVec &getOutputs() const { return outputs; }
@ -90,7 +317,6 @@ class OperatorObj : public Object {
OpType getOpType() const { return type; }
// HACK: set correct data type
DataType getDType() const { return getInputs(0)->getDType(); }
DataType getOutDType() const { return getOutput()->getDType(); }
virtual int numInputs() const = 0;
virtual int numOutputs() const = 0;
@ -105,7 +331,7 @@ class OperatorObj : public Object {
const TensorVec &newOutputs) const = 0;
protected:
optional<vector<Shape>> inferShape();
optional<vector<Shape>> inferShape() const;
vector<DataType> inferDataType() const;
private:

View File

@ -2,8 +2,8 @@
#include "core/graph.h"
#include "core/kernel.h"
#include <nlohmann/json_fwd.hpp>
namespace infini {
using json = nlohmann::json;
namespace infini {
class PerfEngine {
public:

View File

@ -1,7 +1,5 @@
#pragma once
#include "core/common.h"
#include "core/communicator.h"
#include "core/op_type.h"
#include "core/ref.h"
#include <memory>
@ -15,7 +13,6 @@ class GraphObj;
class GraphHandlerObj;
class RuntimeObj;
class BlobObj;
template <typename T> class WorkspaceObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
@ -24,7 +21,7 @@ using Graph = Ref<GraphObj>;
using GraphHandler = Ref<GraphHandlerObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
enum class OpType;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;
@ -32,17 +29,15 @@ using OpLists = list<Operator>;
using VType = uint32_t;
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
/***************** Forward declaration end *****************/
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
protected:
Device device;
int deviceId;
public:
explicit RuntimeObj(Device device, int deviceId = 0)
: device(device), deviceId(deviceId) {}
RuntimeObj(Device device) : device(device) {}
RuntimeObj(RuntimeObj &other) = delete;
RuntimeObj &operator=(RuntimeObj const &) = delete;
virtual ~RuntimeObj() {}
@ -74,7 +69,6 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
}
bool isCuda() const { return device == Device::CUDA; }
bool isBang() const { return device == Device::BANG; }
bool isKUNLUN() const { return device == Device::KUNLUN; }
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
// TODO: unify these copy APIs
virtual void copyBlobFromCPU(void *dst, const void *src,
@ -83,12 +77,6 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
size_t bytes) const = 0;
virtual string toString() const = 0;
int getDeviceId() const { return deviceId; }
virtual void initComm(const string &name, int worldSize, int rank) = 0;
virtual CommunicatorObj &getCommunicator() const = 0;
protected:
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,
@ -109,9 +97,6 @@ class CpuRuntimeObj : public RuntimeObj {
void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override;
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
};
class NativeCpuRuntimeObj : public CpuRuntimeObj {

View File

@ -1,17 +1,12 @@
#pragma once
#include "core/tensor_base.h"
#include "core/tensor_type.h"
#include "utils/data_convert.h"
#include <cmath>
#include <cstring>
#include <fstream>
#if USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
#if USE_BANG
#include "bang/bang_runtime.h"
#endif
namespace infini {
// TODO: how to deal with this
@ -23,7 +18,13 @@ class TensorObj : public TensorBaseObj {
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType = TensorType::others;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
void copyout(void *ptr, size_t size) const {
runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
@ -34,74 +35,30 @@ class TensorObj : public TensorBaseObj {
size_t getBytes() const { return _size * dtype.getSize(); }
Shape getDims() const { return shape; }
void setShape(Shape shape_);
size_t getRank() const { return shape.size(); }
Shape getStride() const;
vector<size_t> getStride() const;
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
bool isWeight() const { return tensorType == TensorType::weight; }
bool isInput() const { return tensorType == TensorType::input; }
bool isOutput() const { return tensorType == TensorType::output; }
bool isOthers() const { return tensorType == TensorType::others; }
void setWeight() { tensorType = TensorType::weight; }
void setInput() {
if (!this->isWeight()) {
tensorType = TensorType::input;
}
}
void setOutput() {
if (!this->isWeight()) {
tensorType = TensorType::output;
}
}
string tensorTypeToString() const {
switch (tensorType) {
case TensorType::weight:
return "weight";
break;
case TensorType::input:
return "input";
break;
case TensorType::output:
return "output";
break;
case TensorType::others:
return "others";
break;
default:
return "unknown tensor type";
break;
}
}
void load(std::string file_path);
void save(std::string file_path);
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
void copyout(void *ptr, size_t size) const {
runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
}
// Copy elements from `data`.
template <typename T> void copyin(const vector<T> &data) {
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
IT_ASSERT(data.size() == _size);
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(data.size() >= _size);
copyin(data.data(), getBytes());
}
// Copy all the elements to a vector.
template <typename T> auto copyout() const {
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
IT_ASSERT(DataType::get<T>() == dtype);
std::vector<T> ans(_size);
copyout(ans.data(), getBytes());
return ans;
}
// Copy the element at `pos`.
template <typename T> auto copyOne(const vector<int> &pos) const {
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
IT_ASSERT(DataType::get<T>() == dtype);
auto offset = getOffset(pos);
auto bytes = dtype.getSize();
T ans;
@ -113,16 +70,10 @@ class TensorObj : public TensorBaseObj {
void copyData(const TensorObj *src);
void copyData(const Tensor &src) { copyData(src.get()); }
// TODO: Rename this function later; the name is confusing because it suggests
// changing the field data, but it actually generates data and may copy it to
// the device.
// FIXME: std::function copies the generator instead of passing it by ref.
// Thus the internal state of generator cannot be updated.
void setData(
std::function<void(void *, size_t, DataType)> const &generator) const;
void setDataBlob(const Blob &blob);
Tensor clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
@ -144,16 +95,11 @@ class TensorObj : public TensorBaseObj {
}
void printData() const;
void dumpData(std::ofstream &ofs) const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(size() == dataVector.size());
if (dtype == DataType::Float16) {
return equalDataImpl_fp16(getRawDataPtr<uint16_t *>(),
(float *)dataVector.data(), size());
}
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
}
@ -193,41 +139,19 @@ class TensorObj : public TensorBaseObj {
}
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size,
double relativeError = 1e-6) const {
bool equalDataImpl(const T *a, const T *b, size_t size) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
fabs(a[i] - b[i]) > relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
fabs(a[i] - b[i]) /
std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else {
} else
static_assert(!sizeof(T), "Unsupported data type");
}
}
return true;
}
bool equalDataImpl_fp16(const uint16_t *a, const float *b,
size_t size) const {
for (size_t i = 0; i < size; ++i) {
auto a_fp32 = fp16_to_float(a[i]);
auto b_fp32 = b[i];
if (fabs(a_fp32 - b_fp32) / std::max(fabs(a_fp32), fabs(b_fp32)) >
1e-6) {
printf("Error on %lu: %f %f\n", i, a_fp32, b_fp32);
return false;
}
}
return true;
}
@ -248,8 +172,8 @@ class TensorObj : public TensorBaseObj {
// // std::cerr << "Init beginned " << std::endl;
// #pragma omp parallel for
// for (size_t i = 0; i < iEnd; ++i)
// data[i] = fastrand(random_seed[omp_get_thread_num() *
// 16]) % 10000;
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
// 10000;
// // std::cerr << "Init finished" << std::endl;
// computed = ComputedFull;
// return true;
@ -294,8 +218,8 @@ class TensorObj : public TensorBaseObj {
// auto nDim = dims.size();
// auto nBroadcastDim = ds.size() - nDim;
// for (size_t i = 0; i < nDim; ++i)
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
// i] >= dims[i])
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
// dims[i])
// return (size_t)-1;
// size_t idx = 0;
// for (size_t i = 0; i < nDim; ++i)
@ -354,14 +278,12 @@ class TensorObj : public TensorBaseObj {
// return (g_seed >> 16) & 0x7FFF;
// }
// std::vector<std::vector<int>> const *getSplittingPoints()
// const {
// std::vector<std::vector<int>> const *getSplittingPoints() const {
// assert(!splittingPoints.empty());
// return &splittingPoints;
// }
// bool setSplittingPoints(std::vector<std::vector<int>> value)
// {
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
// assert(!value.empty());
// splittingPoints = value;
// return true;
@ -383,7 +305,7 @@ class TensorObj : public TensorBaseObj {
// }
// void initSplittingPoints() {
// splittingPoints.resize(getRank()); }
// splittingPoints.resize(getDims().size()); }
// void printShape();
};

View File

@ -44,7 +44,6 @@ class TensorBaseObj : public Object {
}
DataType getDType() const { return dtype; }
int getDTypeIndex() const { return dtype.getIndex(); }
Runtime getRuntime() const { return runtime; }
// std::pair<Operator *, int> getOutputOfWithIndex();

View File

@ -1,7 +0,0 @@
#pragma once
namespace infini {
enum class TensorType { weight, input, output, others };
} // namespace infini

View File

@ -1,42 +0,0 @@
#pragma once
#include "core/runtime.h"
namespace infini {
template <class T> class WorkspaceObj {
private:
T workspace; // workspace pointer
size_t workspaceSize; // Size of workspace
size_t workspaceAlloc; // currently use workspace size
public:
WorkspaceObj(T workspace_, size_t workspaceSize_)
: workspace(workspace_), workspaceSize(workspaceSize_) {
workspaceAlloc = 0;
}
virtual ~WorkspaceObj() {
// Dealloc workspace in RuntimeObj
// Set workspace = nullptr here
workspace = nullptr;
}
size_t getWorkspaceSize() const { return workspaceSize; }
T getWorkspace(size_t size) {
// Get unused workspace
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
workspaceAlloc += size;
return ret;
}
T getWorkspace() {
// Override getWorkspace in order to dealloc in runtime
return workspace;
}
void resetWorkspace() {
// Reset workspaceAlloc every time end kernel
workspaceAlloc = 0;
}
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
};
} // namespace infini

View File

@ -1,17 +0,0 @@
#pragma once
#include "core/common.h"
#include <cstdio>
struct AttentionKVCacheMetadata {
int dimSize[4];
int stride[4];
};
namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k, float *input_v,
int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp);
} // namespace infini

View File

@ -5,13 +5,17 @@
#include <cuda_profiler_api.h>
#include <cudnn.h>
#include <curand.h>
#include <memory>
// TODO: replace with Exception (IT_ASSERT)
#define checkCudaError(call) \
if (auto err = call; err != cudaSuccess) \
throw ::infini::Exception(std::string("[") + __FILE__ + ":" + \
std::to_string(__LINE__) + "] CUDA error (" + \
#call + "): " + cudaGetErrorString(err))
{ \
auto err = call; \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#define checkCUresult(call) \
{ \
@ -35,10 +39,14 @@
}
#define checkCudnnError(call) \
if (auto err = call; err != CUDNN_STATUS_SUCCESS) \
throw ::infini::Exception(std::string("[") + __FILE__ + ":" + \
std::to_string(__LINE__) + "] cuDNN error (" + \
#call + "): " + cudnnGetErrorString(err))
{ \
auto err = call; \
if (CUDNN_STATUS_SUCCESS != err) { \
fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__, \
__LINE__, cudnnGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#define checkCurandError(call) \
{ \
@ -112,20 +120,4 @@ inline const char *curandGetErrorString(curandStatus_t error) {
using CudaPtr = void *;
class CUDAStream {
public:
CUDAStream(const CUDAStream &) = delete;
CUDAStream(CUDAStream &&) = delete;
void operator=(const CUDAStream &) = delete;
void operator=(CUDAStream &&) = delete;
static cudaStream_t getCurrentStream() { return _stream; }
static void Init() { CUDAStream::_stream = 0; };
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
private:
CUDAStream(){};
static cudaStream_t _stream;
};
} // namespace infini

View File

@ -1,20 +1,8 @@
#pragma once
namespace infini {
void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
}; // namespace infini

View File

@ -1,12 +0,0 @@
#pragma once
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape);
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len);
}; // namespace infini

View File

@ -1,17 +0,0 @@
#pragma once
#include "operators/unary.h"
namespace infini {
void LaynormKernel(const float *input, const float *scale, const float eps,
int size, int scaleSize, const int dimsize, const int stride,
float *output, const float *bias, int biasSize);
void LaynormKernel(const float *input, const float *scale, const float eps,
int size, int scaleSize, const int dimsize, const int stride,
float *output);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output, const half *bias, int biasSize);
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output);
}; // namespace infini

View File

@ -10,11 +10,10 @@ typedef struct {
int wholeNDim[MAX_DIM]; // dim size after padding or before slicing
int partNDim[MAX_DIM]; // dim size before padding or after slicing
int partStride[MAX_DIM]; // stride before padding or after slicing
int DType;
} TransMetaData;
namespace infini {
void pad_slice_kernel(void *partData, void *wholeData,
void pad_slice_kernel(float *partData, float *wholeData,
const TransMetaData &metadata, int nDims, int num,
bool isPad);
} // namespace infini

View File

@ -1,10 +0,0 @@
#pragma once
#include "operators/rms_norm.h"
namespace infini {
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
int num_tokens, int hidden_size);
}; // namespace infini

View File

@ -1,12 +0,0 @@
#pragma once
#include "operators/rope.h"
#include "utils/small_array.h"
namespace infini {
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
int dim_model, int dim_head, int hidden_stride,
int pos_stride);
}; // namespace infini

View File

@ -1,9 +1,6 @@
#pragma once
#include "core/runtime.h"
#include "cuda/cuda_common.h"
#ifdef INFINI_USE_NCCL
#include "cuda/nccl_communicator.h"
#endif
namespace infini {
@ -11,34 +8,21 @@ class CudaRuntimeObj : public RuntimeObj {
private:
cudnnHandle_t cudnn;
cublasHandle_t cublas;
std::unique_ptr<CommunicatorObj> comm;
CudaPtr workspace;
size_t workspaceSize;
bool isCudaGraphCreated;
cudaGraph_t cudaGraph;
cudaGraphExec_t cudaGraphInstance;
public:
explicit CudaRuntimeObj(int deviceId = 0)
: RuntimeObj(Device::CUDA, deviceId) {
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
checkCudaError(cudaSetDevice(deviceId));
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
isCudaGraphCreated = false;
CUDAStream::Init();
}
virtual ~CudaRuntimeObj() {
try {
if (isCudaGraphCreated) {
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
checkCudaError(cudaGraphDestroy(cudaGraph));
CUDAStream::destroyStream();
}
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
@ -85,13 +69,6 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
void runWithCudaGraph(const Graph &graph);
// init communicator
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const final { return *comm; }
private:
void tune(const Graph &graph, bool profiling) const;
};

View File

@ -1,8 +0,0 @@
#pragma once
#include "utils/small_array.h"
namespace infini {
void softmax_kernel(int num_blocks, float *input, float *output, int size,
int dimsize, int stride);
void softmax_kernel(int num_blocks, half *input, half *output, int size,
int dimsize, int stride);
} // namespace infini

View File

@ -3,13 +3,13 @@
#include <cstdio>
const int BATCH_SIZE = 32; // parallel tensor number.
const int DIM_MAX_SIZE = 8;
const int DIM_MAX_SIZE = 4;
// The concat operator composes element tensors into one big tensor, and the
// split operator decomposes one big tensor into element
// tensors.
template <typename T> struct ElementTensorMetadata {
T *data[BATCH_SIZE];
struct ElementTensorMetadata {
float *data[BATCH_SIZE];
int dimBgNo[BATCH_SIZE]; // the beginning index of the element tensor, along the
// concat/split dimension, in the composed tensor.
int dimSize[BATCH_SIZE]; // the dimension size of the element tensor.
@ -20,17 +20,16 @@ template <typename T> struct ElementTensorMetadata {
data[i], dimBgNo[i], dimSize[i], nElements[i]);
}
};
template <typename T> struct ComposedTensorMetadata {
struct ComposedTensorMetadata {
int dimSize[DIM_MAX_SIZE];
int stride[DIM_MAX_SIZE];
T *data;
float *data;
};
namespace infini {
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
const ComposedTensorMetadata<float> &compMeta, int dim,
int batchSize, int nDims, bool isSplit);
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
const ComposedTensorMetadata<half> &compMeta, int dim,
void split_concat_kernel(const ElementTensorMetadata &eleMeta,
const ComposedTensorMetadata &compMeta, int dim,
int batchSize, int nDims, bool isSplit);
} // namespace infini

View File

@ -1,11 +0,0 @@
#pragma once
#include "operators/transpose.h"
#include "utils/small_array.h"
namespace infini {
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
SmallArray strides, SmallArray outputShape);
}; // namespace infini

View File

@ -3,22 +3,31 @@
#include "operators/unary.h"
namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
template <typename T> void relu_kernel(T *input, T *output, size_t num);
template <typename T> void silu_kernel(T *input, T *output, size_t num);
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
template <typename T> void abs_kernel(T *input, T *output, size_t num);
template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
template <typename T> void neg_kernel(T *input, T *output, size_t num);
template <typename T> void gelu_kernel(T *input, T *output, size_t num);
template <typename T> void erf_kernel(T *input, T *output, size_t num);
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
void softmax_kernel(float *input, float *output, int num);
void relu_kernel(float *input, float *output, int num);
void sigmoid_kernel(float *input, float *output, int num);
void tanh_kernel(float *input, float *output, int num);
void abs_kernel(float *input, float *output, int num);
template <typename INPUT, typename OUTPUT>
void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
void unary_kernel(const Operator &_op) {
auto op = as<UnaryObj>(_op);
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
void unary_kernel(const Operator &_op);
auto dim = op->getInputs(0)->getDims();
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
if (op->getOpType() == OpType::Softmax)
softmax_kernel(inputData, outputData, n * c * h * w);
else if (op->getOpType() == OpType::Relu)
relu_kernel(inputData, outputData, n * c * h * w);
else if (op->getOpType() == OpType::Sigmoid)
sigmoid_kernel(inputData, outputData, n * c * h * w);
else if (op->getOpType() == OpType::Tanh)
tanh_kernel(inputData, outputData, n * c * h * w);
else if (op->getOpType() == OpType::Abs)
abs_kernel(inputData, outputData, n * c * h * w);
else
IT_TODO_HALT();
}
}; // namespace infini

View File

@ -1,29 +1,11 @@
#pragma once
#include "core/tensor.h"
#include "cuda/cuda_common.h"
namespace infini {
void cudaPrintFloat(float *x, int len);
void cudaPrintTensor(const Tensor &tensor);
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
cudaDataType cublasDataTypeConvert(DataType);
template <int index> struct DT_CUDA {};
template <> struct DT_CUDA<0> { using t = bool; };
template <> struct DT_CUDA<1> { using t = float; };
template <> struct DT_CUDA<2> { using t = unsigned char; };
template <> struct DT_CUDA<3> { using t = char; };
template <> struct DT_CUDA<4> { using t = unsigned short; };
template <> struct DT_CUDA<5> { using t = short; };
template <> struct DT_CUDA<6> { using t = int; };
template <> struct DT_CUDA<7> { using t = long long; };
template <> struct DT_CUDA<9> { using t = bool; };
template <> struct DT_CUDA<10> { using t = half; };
template <> struct DT_CUDA<11> { using t = double; };
template <> struct DT_CUDA<12> { using t = unsigned int; };
template <> struct DT_CUDA<13> { using t = unsigned long long; };
template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
} // namespace infini
} // namespace infini

View File

@ -1,17 +0,0 @@
#pragma once
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void whereKernel(const float *inputX, const float *inputY,
const uint8_t *condition, float *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize,
int ySize, int cSize);
void whereKernel(const half *inputX, const half *inputY,
const uint8_t *condition, half *output, int nDims,
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
SmallArray conditionShape, SmallArray outputShape, int xSize,
int ySize, int cSize);
}; // namespace infini

View File

@ -1,61 +1,17 @@
#pragma once
#include "core/data_type.h"
#include "core/operator.h"
#include "operators/gather.h"
typedef struct {
int *indexValue;
int axis;
int inNDim;
int outNDim;
int idxNDim;
int outDim[4];
int idxDim[4];
int idxStride[4];
int inStride[4];
} GatherMetaData;
namespace infini {
struct GatherMetaData {
// Pointer to indices
void *indexValue;
// Type of index values
DataType indexType;
// Type of input and output data
DataType dataType;
// Axis of the gather operation
int axis;
// Rank of input
int inNDim;
// Rank of output
int outNDim;
// Rank of indices
int idxNDim;
// Shape of output
int outDim[4];
// Shape of indices
int idxDim[4];
// Strides of indices
int idxStride[4];
// Strides of input
int inStride[4];
};
inline void initGatherMetaData(GatherMetaData &metaData,
const Ref<OperatorObj> &_op) {
memset(&metaData, 0, sizeof(metaData));
auto op = as<GatherBaseObj>(_op);
Ref<TensorObj> in = op->getInputs(0);
Ref<TensorObj> index = op->getInputs(1);
Ref<TensorObj> out = op->getOutput();
metaData.indexValue = index->getRawDataPtr<void *>();
metaData.indexType = index->getDType();
metaData.dataType = in->getDType();
metaData.axis = op->getAxis();
metaData.inNDim = in->getRank();
metaData.outNDim = out->getRank();
metaData.idxNDim = index->getRank();
for (int i = 0; i < metaData.outNDim; ++i)
metaData.outDim[i] = out->getDims()[i];
for (int i = 0; i < metaData.idxNDim; ++i) {
metaData.idxDim[i] = index->getDims()[i];
metaData.idxStride[i] = index->getStride()[i];
}
for (int i = 0; i < metaData.inNDim; ++i) {
metaData.inStride[i] = in->getStride()[i];
}
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
}
template <typename T>
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
size_t num);
} // namespace infini
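The replacement GatherMetaData packs shapes and strides for at most four dimensions, and initGatherMetaData fills it straight from the operator's tensors. The index arithmetic the device kernel has to reproduce can be pictured with a host-side reference; the function below is purely illustrative and assumes float data with int32 indices.

// Host-side reference of the gather index arithmetic (not the CUDA kernel).
// Output dims = input dims before `axis` + index dims + input dims after `axis`.
void gather_reference(const float *in, const int *index, float *out,
                      const GatherMetaData &m, size_t outSize) {
    for (size_t linear = 0; linear < outSize; ++linear) {
        // Decompose the linear output id into per-dimension coordinates.
        int coord[4] = {0, 0, 0, 0};
        size_t rest = linear;
        for (int d = m.outNDim - 1; d >= 0; --d) {
            coord[d] = rest % m.outDim[d];
            rest /= m.outDim[d];
        }
        // The coordinates that belong to the indices tensor pick the gathered row.
        size_t idxOffset = 0;
        for (int d = 0; d < m.idxNDim; ++d)
            idxOffset += coord[m.axis + d] * m.idxStride[d];
        int picked = index[idxOffset];
        // Rebuild the input offset: same coordinates, except the axis
        // coordinate is replaced by the picked index value.
        size_t inOffset = 0;
        for (int d = 0; d < m.axis; ++d)
            inOffset += coord[d] * m.inStride[d];
        inOffset += picked * m.inStride[m.axis];
        for (int d = m.axis + 1; d < m.inNDim; ++d)
            inOffset += coord[d + m.idxNDim - 1] * m.inStride[d];
        out[linear] = in[inOffset];
    }
}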

View File

@ -1,70 +0,0 @@
#pragma once
#include "core/communicator.h"
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <nccl.h>
#include <thread>
#define checkNcclError(call) \
{ \
auto err = call; \
if (ncclSuccess != err) { \
fprintf(stderr, "NCCL error in %s:%i : %s.\n", __FILE__, __LINE__, \
ncclGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
class NcclCommunicatorObj final : public CommunicatorObj {
private:
ncclComm_t comm;
public:
NcclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_nccl_id.bin");
ncclUniqueId commId;
if (rank == 0) {
checkNcclError(ncclGetUniqueId(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&commId, sizeof(ncclUniqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&commId, sizeof(ncclUniqueId));
}
checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));
if (rank == 0) {
std::filesystem::remove(filePath);
}
}
// Get the actual ncclComm_t
ncclComm_t getNcclComm() { return comm; }
void finalize() { checkNcclError(ncclCommFinalize(comm)); }
~NcclCommunicatorObj() final {
finalize();
checkNcclError(ncclCommDestroy(comm));
}
virtual string toString() const final {
std::ostringstream oss;
oss << "NCCL communicator";
return oss.str();
}
};
} // namespace infini
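The constructor above bootstraps NCCL without MPI: rank 0 serialises the ncclUniqueId to a file, every other rank polls for that file (with a 10-second limit) and reads the id back, and all ranks then meet in ncclCommInitRank. A hypothetical per-process usage, with worldSize and rank supplied by whatever launcher starts the processes:

// Illustrative only: each participating process runs this once.
void join_clique(int worldSize, int rank) {
    infini::NcclCommunicatorObj comm("demo", worldSize, rank);
    // Rank 0 has written ./demo_nccl_id.bin and removed it again after init.
    ncclComm_t raw = comm.getNcclComm();
    (void)raw; // hand the raw communicator to ncclAllReduce etc. from here
}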

6
include/cuda/softmax.h Normal file
View File

@ -0,0 +1,6 @@
#pragma once
namespace infini {
void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
float *y, int dim, int stride);
}
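A hedged usage sketch of this declaration, assuming dim is the length of the softmax axis and stride is the distance between neighbouring elements along it (1 for a contiguous row-major (batch_size, dim) tensor); the 1024 thread-block cap is also an assumption.

#include <algorithm>

// Softmax over the last dimension of a contiguous (batch_size, dim) tensor.
void softmax_last_dim(float *x, float *y, int batch_size, int dim) {
    int max_threadblock_size = std::min(dim, 1024);
    infini::softmax_kernel(max_threadblock_size, batch_size, x, y, dim,
                           /*stride=*/1);
}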

View File

@ -1,23 +0,0 @@
#include "core/op_type.h"
#include "kunlun/kunlun_common.h"
namespace infini {
using KunlunActType = xdnn::Activation_t;
KunlunActType parseActType(ActType act) {
switch (act) {
case ActType::None:
return KunlunActType::LINEAR;
case ActType::Tanh:
return KunlunActType::TANH;
case ActType::Sigmoid:
return KunlunActType::SIGMOID;
case ActType::Relu:
return KunlunActType::RELU6;
default:
fprintf(stderr, "Activation type not supported yet!\n");
break;
}
return KunlunActType::LINEAR;
}
}; // namespace infini

View File

@ -1,22 +0,0 @@
#pragma once
#include "core/common.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xdnn = baidu::xpu::api;
#define checkKUNLUNError(call) \
{ \
auto err = call; \
if (XPU_SUCCESS != err) { \
fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
__LINE__, xpu_strerror(err)); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
using KUNLUNPtr = void *;
} // namespace infini

View File

@ -1,24 +0,0 @@
#pragma once
#include "core/kernel.h"
#include "kunlun/kunlun_runtime.h"
namespace infini {
class KUNLUNKernelWithoutConfig : public Kernel {
public:
virtual void compute(const Operator &op, const PerfRecord &record,
const RuntimeObj *context) const {
compute(op, context);
}
virtual void compute(const Operator &op,
const RuntimeObj *context) const = 0;
// Premise: op is idempotent since it is called multiple times.
virtual PerfRecord tune(const Operator &op,
const RuntimeObj *_context) const {
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
[&]() { context->sync(); }));
}
};
} // namespace infini
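Concrete KUNLUN kernels derive from this class and implement only the pure-virtual compute overload; tune then times that same computation, synchronising the device around each run. A hypothetical subclass, with the UnaryObj accessors and the xdnn::relu call used purely for illustration:

// Illustrative kernel built on the base class above (not from the repository).
class ReluXdnnSketch : public KUNLUNKernelWithoutConfig {
    void compute(const Operator &_op, const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
        float *const in = op->getInputs(0)->getRawDataPtr<float *>();
        float *const out = op->getOutput()->getRawDataPtr<float *>();
        size_t len = op->getInputs(0)->size();
        // Assumed xdnn entry point; check the XDNN headers for the exact name.
        checkKUNLUNError(xdnn::relu<float>(context->KUNLUNHandle(), in, out, len));
    }
};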

View File

@ -1,81 +0,0 @@
#pragma once
#include "core/runtime.h"
#include "core/workspace.h"
#include "kunlun/kunlun_common.h"
#ifdef INFINI_USE_XCCL
#include "kunlun/xccl_communicator.h"
#endif
namespace infini {
class KUNLUNRuntimeObj : public RuntimeObj {
private:
xdnn::Context *ctx;
std::unique_ptr<CommunicatorObj> comm;
// KUNLUNPtr workspace;
// size_t workspaceSize;
Workspace<KUNLUNPtr> workspace;
public:
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
xpu_set_device(deviceId);
ctx = xdnn::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
size_t workspaceSize = 2llu << 30; // 2 GB
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
workspace =
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
dealloc(wkspacePtr);
xdnn::destroy_context(ctx);
}
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
xdnn::Context *KUNLUNHandle() const { return ctx; }
// Get $size workspace by bytes
KUNLUNPtr getWorkspace(size_t size) const {
auto ret = workspace->getWorkspace(size);
return ret;
}
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
void initComm(const string &name, int worldSize, int rank) final;
CommunicatorObj &getCommunicator() const final { return *comm; }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};
} // namespace infini
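A short host-side sketch of how the runtime above is typically exercised: allocate HBM, copy a blob in, copy it back, free it. make_ref is assumed to be the project's Ref factory; the buffer size is arbitrary.

#include <vector>

void copy_roundtrip() {
    auto runtime = infini::make_ref<infini::KUNLUNRuntimeObj>(/*deviceId=*/0);
    std::vector<float> host(1024, 1.0f), back(1024, 0.0f);
    size_t bytes = host.size() * sizeof(float);
    void *dev = runtime->alloc(bytes);                 // HBM allocation via xpu_malloc
    runtime->copyBlobFromCPU(dev, host.data(), bytes); // host -> device
    runtime->copyBlobToCPU(back.data(), dev, bytes);   // device -> host
    runtime->dealloc(dev);
}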

View File

@ -1,10 +0,0 @@
#pragma once
namespace infini {
namespace opTimer {
double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
} // namespace opTimer
} // namespace infini
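Hedged usage of the timing helpers declared above; the workload (a ResNet-style 3x3 convolution) and the assumption that the return value is a time measurement are illustrative only.

#include <cstdio>

void time_example_conv() {
    double t = infini::opTimer::getPerfConvXdnn(
        /*n=*/1, /*c=*/64, /*h=*/56, /*w=*/56, /*f=*/64, /*r=*/3, /*s=*/3,
        /*padh=*/1, /*padw=*/1, /*strideh=*/1, /*stridew=*/1,
        /*dilationh=*/1, /*dilationw=*/1, /*group=*/1, "conv3x3_example");
    std::printf("conv benchmark result: %f\n", t);
}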

View File

@ -1,60 +0,0 @@
#pragma once
#include "core/communicator.h"
#include "xpu/bkcl.h"
#include <chrono>
#include <filesystem>
#include <fstream>
#include <thread>
#define checkXcclError(call) \
{ \
auto err = call; \
if (BKCL_SUCCESS != err) { \
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
class XcclCommunicatorObj final : public CommunicatorObj {
private:
BKCLContext_t comm;
public:
XcclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_xccl_id.bin");
BKCLUniqueId commId;
if (rank == 0) {
checkXcclError(bkcl_get_unique_id(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(100),
"time limit (100s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
}
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
if (rank == 0) {
std::filesystem::remove(filePath);
}
}
BKCLContext_t getXcclComm() { return comm; }
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
virtual string toString() const final {
std::ostringstream oss;
oss << "XCCL communicator";
return oss.str();
}
};
} // namespace infini

View File

@ -24,7 +24,7 @@
// clang-format on
namespace nnet {
int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome);
bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end);
int matchExprResult(Derivator &derivator, string fn);
bool checkExprLogSame(string fnPrefix, int start, int end);
bool checkExprsEquvivalence(VecExpr exprs);
} // namespace nnet

View File

@ -35,7 +35,7 @@ class G2BMMObj : public OperatorObj {
OP_CLONE(G2BMMObj);
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }

View File

@ -33,7 +33,7 @@ class GBMMObj : public OperatorObj {
OP_CLONE(GBMMObj);
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }

View File

@ -7,7 +7,7 @@ class ActivationBackwardObj : public OperatorObj {
ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
Tensor x, Tensor diff_x);
OP_CLONE(ActivationBackwardObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 3; }

View File

@ -1,44 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief The AllGather operation gathers N values from k ranks into
* an output of size k*N, and distributes that result to all ranks.
* The output is ordered by rank index.
*
* For more details:
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allgather
*/
class AllGatherObj : public OperatorObj {
public:
/**
* @brief Construct a new AllGather object
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor from this rank.
* @param outputs A list of output tensors collected from all ranks.
* @param world_size Total number of ranks.
*/
AllGatherObj(GraphObj *graph, Tensor input, std::optional<TensorVec>,
int world_size);
OP_CLONE(AllGatherObj);
int numInputs() const override { return 1; }
int numOutputs() const override { return world_size; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int getWorldSize() const { return world_size; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
protected:
int world_size;
};
} // namespace infini
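The shape rule implied by the comment is that every rank contributes one tensor and receives world_size tensors of that same shape. A hedged free-function sketch of that rule (not the class's actual inferShape body):

// Each of the world_size outputs has the same shape as this rank's input.
optional<vector<Shape>> inferAllGatherShape(const TensorVec &inputs,
                                            int world_size) {
    Shape s = inputs[0]->getDims();
    return vector<Shape>(world_size, s);
}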

View File

@ -1,75 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief The AllReduce operation performs a reduction on data (sum, min,
* max, avg, or div) across devices and writes the result into the
* receive buffer of every rank. For example, in an allreduce operation between
* k ranks performing a sum, each rank provides an array Vk of N values,
* and receives an identical array S of N values, where S[i] =
* V0[i]+V1[i]+...+Vk-1[i].
*
* For more details:
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce
*/
class AllReduceBaseObj : public OperatorObj {
public:
/**
* @brief Construct a new AllReduce base object. Should be called by every
* child class constructor, but not directly.
*
* @param graph The computation graph that this operator belongs to.
* @param opType The operation type. This param is taken care of by child
* classes.
* @param input The input tensor from this rank.
* @param output The output tensor, same size as input.
*/
AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
Tensor output);
OP_CLONE(AllReduceBaseObj);
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
return {{inputs[0]->getDims()}};
};
std::string toString() const override;
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override {
return {inputs[0]->getDType()};
};
};
class AllReduceSumObj : public AllReduceBaseObj {
public:
AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);
};
class AllReduceProdObj : public AllReduceBaseObj {
public:
AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);
};
class AllReduceMinObj : public AllReduceBaseObj {
public:
AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);
};
class AllReduceMaxObj : public AllReduceBaseObj {
public:
AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);
};
class AllReduceAvgObj : public AllReduceBaseObj {
public:
AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);
};
} // namespace infini
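A small CPU reference of the allreduce-sum semantics described above, holding all ranks' buffers in one process; with k = 2, V0 = {1, 2, 3} and V1 = {10, 20, 30}, every rank ends up with S = {11, 22, 33}.

#include <vector>

// Every rank's buffer ends up holding the element-wise sum over all ranks.
void allreduce_sum_reference(std::vector<std::vector<float>> &rankBuffers) {
    size_t n = rankBuffers[0].size();
    std::vector<float> s(n, 0.0f);
    for (auto &v : rankBuffers)        // S[i] = V0[i] + V1[i] + ... + Vk-1[i]
        for (size_t i = 0; i < n; ++i)
            s[i] += v[i];
    for (auto &v : rankBuffers)        // every rank receives the same result
        v = s;
}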

View File

@ -1,43 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Fused Attention with KVCache input operator. All the input and output
* tensors should have the same rank except for the position_id.
*
*/
class AttentionKVCacheObj : public OperatorObj {
int dim;
public:
/**
* @brief Construct a new AttentionKVCache object.
*
* @param graph The computation graph that this operator belongs to.
* @param input_k_cache The k_cache input tensor.
* @param input_v_cache The v_cache input tensor.
* @param input_q The query input tensor.
* @param input_k The key input tensor.
* @param input_v The value input tensor.
* @param position_id The position id of the query.
* @param output_matmul The query output tensor.
*/
AttentionKVCacheObj(GraphObj *graph, Tensor input_k_cache,
Tensor input_v_cache, Tensor input_q, Tensor input_k,
Tensor input_v, Tensor position_id,
Tensor output_matmul);
OP_CLONE(AttentionKVCacheObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 6; }
int numOutputs() const override { return 1; }
int getDim() const { return dim; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -34,7 +34,7 @@ class BatchNormObj : public OperatorObj {
Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
float eps = 1e-5, bool trainingMode = false);
OP_CLONE(BatchNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
// output size will be 3 when training

View File

@ -1,49 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief The Broadcast operation copies an N-element buffer on the root rank to
* all ranks.
*
* For more details:
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#broadcast
*/
class BroadcastObj : public OperatorObj {
public:
/**
* @brief Construct a new Broadcast object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor. Only root needs to initialize it with
* data.
* @param output The output tensor, same size as input.
* @param root The root rank who performs the broadcast.
*/
BroadcastObj(GraphObj *graph, Tensor input, Tensor output, int root);
OP_CLONE(BroadcastObj);
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
return {{inputs[0]->getDims()}};
};
std::string toString() const override;
int getRoot() const { return root; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override {
return {inputs[0]->getDType()};
};
protected:
// The rank who broadcasts data among this communication group
int root;
};
} // namespace infini

View File

@ -22,7 +22,7 @@ class ConcatObj : public OperatorObj {
ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
OP_CLONE(ConcatObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return inputs.size(); }

View File

@ -142,7 +142,7 @@ class ConvObj : public ConvBaseObj {
ActType act = ActType::None);
OP_CLONE(ConvObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return c / getChannelPerGroup(); }
private:
@ -164,7 +164,7 @@ class ConvBackwardFilterObj : public ConvBaseObj {
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr, ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; }
int getNumGroups() const override { return c / getChannelPerGroup(); }
@ -191,7 +191,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None);
OP_CLONE(ConvTransposed2dObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; }
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
@ -218,7 +218,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None);
OP_CLONE(ConvTransposed2dNHWCObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; }
private:

View File

@ -7,7 +7,7 @@ class DetObj : public OperatorObj {
enum Mode { NormalDet = 0, LogDet };
DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
OP_CLONE(DetObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }

View File

@ -37,7 +37,7 @@ class DropoutObj : public OperatorObj {
DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
float ratio, bool training_mode);
OP_CLONE(DropoutObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }

View File

@ -21,7 +21,7 @@ class ElementWiseObj : public OperatorObj {
*/
ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
Tensor output);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 2; }
@ -38,7 +38,7 @@ class MSELossObj : public OperatorObj {
MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
Reduction reduction, Tensor output);
OP_CLONE(MSELossObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
Reduction getReduction() const { return reductionMode; }
std::string toString() const override;
@ -65,24 +65,26 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Max)
DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Min)
DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Pow)
DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::Greater)
DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterOrEqual)
DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::Less)
DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessOrEqual)
DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitwiseAnd)
DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitwiseOr)
DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitwiseXor)
DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitwiseNot)
DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitShift)
DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
}; // namespace infini

View File

@ -1,36 +0,0 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Broadcast the input tensor following the given shape and the
* broadcast rule.
*
*/
class ExpandObj : public OperatorObj {
Shape dims;
public:
/**
* @brief Construct a new Expand object.
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param dims The shape you want to expand to, following the broadcast
* rule.
*/
ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
OP_CLONE(ExpandObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape getShape() const { return dims; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
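The broadcast rule referred to above follows ONNX Expand: dimensions are aligned from the right and a dimension of 1 may be stretched to the target size. A hedged sketch of the resulting output-shape computation (expandShape is an illustrative helper, not the class's actual inferShape):

#include <algorithm>

Shape expandShape(const Shape &input, const Shape &dims) {
    Shape out(std::max(input.size(), dims.size()));
    // Read a dimension counting from the right, defaulting to 1 when absent.
    auto get = [](const Shape &s, size_t iFromRight) -> int {
        return iFromRight < s.size() ? s[s.size() - 1 - iFromRight] : 1;
    };
    for (size_t i = 0; i < out.size(); ++i) {
        int a = get(input, i), b = get(dims, i);
        IT_ASSERT(a == b || a == 1 || b == 1); // broadcast compatibility
        out[out.size() - 1 - i] = std::max(a, b);
    }
    return out;
}
// e.g. input (3,1) expanded with dims (2,1,4) yields output shape (2,3,4).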

View File

@ -23,7 +23,7 @@ class ExtendObj : public OperatorObj {
ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
int num = 1);
OP_CLONE(ExtendObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }

Some files were not shown because too many files have changed in this diff