Add: show conv2gemm derivation

Rename: Expr construction helpers
Add: enable mutator search in python
2023-11-10 22:49:07 +08:00 · 2023-06-25 20:31:08 +08:00 · 2023-06-25 20:18:18 +08:00 · 2023-05-07 13:22:39 +08:00 · 2023-05-05 15:16:07 +08:00 · 2023-04-30 23:44:10 +08:00
536 changed files with 14095 additions and 24956 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1,11 +1,12 @@
 name: Build and test cpu
 on:
  push:
+    branch: 'master'
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
-    paths:
+    paths-ignore:
      - '**.md'
      - 'LICENSE'

@ -14,10 +15,10 @@ env:
  protobuf-version: "3.21.12"
  python-version: "3.10"

-  resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
-  inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
-  densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
-  efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
+  resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
+  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
+  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
+  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx

 jobs:
  build:
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@ -1,11 +1,12 @@
 name: clang-format Check
 on:
  push:
+    branch: 'master'
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
-    paths:
+    paths-ignore:
      - '**.md'
      - 'LICENSE'

--- a/.gitignore
+++ b/.gitignore
@ -31,8 +31,7 @@
 *.out
 *.app

-build/
-build_debug/
+*build*/

 .vscode/

@ -42,5 +41,3 @@ build_debug/

 # onnx model
 *.onnx
-*.pb
-*.npy
--- a/.gitmodules
+++ b/.gitmodules
@ -11,8 +11,5 @@
 	path = 3rd-party/backward-cpp
 	url = git@github.com:bombela/backward-cpp.git
 [submodule "example"]
-	path = examples/NNmodel
+	path = example
 	url = git@github.com:wanghailu0717/NNmodel.git
-[submodule "examples/distributed/onnxsim_large_model"]
-	path = examples/distributed/onnxsim_large_model
-	url = git@github.com:luchangli03/onnxsim_large_model.git
--- a/3rd-party/backward-cpp
+++ b/3rd-party/backward-cpp
@ -1 +1 @@
-Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294
+Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283
--- a/3rd-party/googletest
+++ b/3rd-party/googletest
@ -1 +1 @@
-Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1
+Subproject commit e2239ee6043f73722e7aa812a459f54a28552929
--- a/3rd-party/nlohmann_json_cmake_fetchcontent
+++ b/3rd-party/nlohmann_json_cmake_fetchcontent
@ -1 +1 @@
-Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9
+Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756
--- a/3rd-party/pybind11
+++ b/3rd-party/pybind11
@ -1 +1 @@
-Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406
+Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,13 +0,0 @@
-# Changelog
-
-All notable changes to this project will be documented in this file.
-
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-
-## Unreleased
-
-### Added
-
-### Modified
-
-### Fixed
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,45 +1,20 @@
-# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
-option(USE_CUDA "Support CUDA GPU" OFF)
-option(USE_BANG "Support BANG MLU" OFF)
-option(USE_KUNLUN "Support KUNLUN XPU" OFF)
-option(USE_INTELCPU "Support INTELCPU" OFF)
-option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
-option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
-option(BUILD_NNET "Build nnet" OFF)
-option(BUILD_DIST "Build project for distributed running" OFF)
-option(BUILD_TEST "Build tests" OFF)
-
-if(USE_CUDA)
-    message("CMake 3.18 or higher is required for setting CUDAToolkit")
-    cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
-else()
-    cmake_minimum_required(VERSION 3.17)
-endif()
-
+cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)

+# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
+option(USE_CUDA "Support CUDA GPU" OFF)
+option(USE_BANG "Support BANG MLU" OFF)
+option(USE_INTELCPU "Support INTELCPU" OFF)
+option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
+option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
+option(BUILD_TEST "Build tests" OFF)
+
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
+cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)

 set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
-# Build Type
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    message("Configuring for Debug build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
-    add_compile_definitions(DEBUG_MODE)
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-    message("Configuring for Release build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
-    add_compile_definitions(NDEBUG)
-elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-    message("Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
-else()
-    message("Build type not specified. Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
-endif()
-

 if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
  message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
@ -53,13 +28,11 @@ endif()

 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
-add_compile_options(-Wno-error=unused-variable)

 find_package(
  Python
  COMPONENTS Interpreter Development
  REQUIRED)
-
 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@ -96,17 +69,16 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
 include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)

 # TVM backend
-if(BUILD_NNET AND BUILD_TEST)
+if(BUILD_TEST_EINNET)
+  if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
+    message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
+  endif()
  # TVM and DMLC for invoking TVM packed functions
  include_directories(${TVM_INCLUDE_DIR})
  include_directories(${DMLC_INCLUDE_DIR})
  include_directories(${DLPACK_INCLUDE_DIR})
-  if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND DLPACK_INCLUDE_DIR)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
-  else()
-    # message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON")
-  endif()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
 endif()

 if(BUILD_TEST)
@ -120,21 +92,13 @@ if(BUILD_TEST)
  include_directories(3rd-party/googletest/googletest/include)
 endif()

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion


 # Source files
-file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
-
-if(BUILD_NNET)
-  add_compile_definitions(BUILD_NNET=1)
-  file(GLOB_RECURSE SRC_NNET src/nnet/*.cc)
-  list (APPEND SRC ${SRC_NNET})
-  # For locating resource files
-  set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}")
-endif()
+file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)

 if(USE_CUDA)
  file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -146,11 +110,6 @@ if(USE_BANG)
  list (APPEND SRC ${SRC_BANG})
 endif()

-if(USE_KUNLUN)
-  file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
-  list (APPEND SRC ${SRC_KUNLUN})
-endif()
-
 if(USE_INTELCPU)
  file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
  list (APPEND SRC ${SRC_INTELCPU})
@ -165,12 +124,12 @@ endif()
 target_link_libraries(InfiniTensor pybind11::embed)

 # TVM backend
-if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR)
+if(BUILD_TEST_EINNET)
  target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
 endif()

 # Python bindings
-file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
+file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
 pybind11_add_module(backend MODULE ${FFIS})
 target_link_libraries(backend PRIVATE InfiniTensor)

@ -209,6 +168,7 @@ endif()

 if(USE_CUDA)
  add_compile_definitions(USE_CUDA=1)
+  add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
  # Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
  set(CMAKE_CUDA_HOST_COMPILER
      ${CMAKE_CXX_COMPILER}
@ -218,13 +178,6 @@ if(USE_CUDA)
  enable_language(CUDA)
  find_package(CUDAToolkit) # For nvrtc and cuda driver
  target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
-  if (BUILD_DIST)
-    message(STATUS "Add BUILD_DIST, use NCCL with CUDA")
-    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
-    find_package(NCCL REQUIRED)
-    add_compile_definitions(INFINI_USE_NCCL=1)
-    target_link_libraries(InfiniTensor nccl)
-  endif()
 endif()

 if(USE_BANG)
@ -263,50 +216,7 @@ if(USE_BANG)
  # BangC Kernels
  ################################################################################

-  if (BUILD_DIST)
-    find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
-    target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-    message(STATUS "Add BUILD_DIST, use CNCL with BANG")
-    add_compile_definitions(INFINI_USE_CNCL=1)
-  else()
-    target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  endif()
-endif()
-
-if(USE_KUNLUN)
-  add_compile_definitions(USE_KUNLUN=1)
-  if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME}))
-    message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
-  elseif (DEFINED KUNLUN_HOME)
-          set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
-  else()
-          set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
-  endif()
-  message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
-
-  include_directories("${KUNLUN_HOME}/include/")
-  find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
-  find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
-
-  if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
-    set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
-  elseif(DEFINED TARGET_CPU_ARCH)
-    set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
-  else()
-    set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
-  endif()
-  message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
-
-  if (BUILD_DIST)
-    message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
-    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
-    find_package(XCCL REQUIRED)
-    add_compile_definitions(INFINI_USE_XCCL=1)
-    target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
-  endif()
-  target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
+  target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
 endif()

 # # Python bindings
@ -325,7 +235,6 @@ function(build_test files)
 endfunction()

 if(BUILD_TEST)
-  add_compile_definitions(BUILD_TEST=1)
  enable_testing()
  if(USE_TRACE)
    build_test(test/trace/*.cc)
@ -333,18 +242,11 @@ if(BUILD_TEST)
  if(BUILD_TEST_CORE)
    build_test(test/core/*.cc)
    build_test(test/operators/*.cc)
-    build_test(test/kernels/nativecpu/*.cc)
    if (USE_CUDA)
      build_test(test/kernels/cuda/*.cc)
-      build_test(test/cuda/*.cc)
    endif()
    if (USE_BANG)
      build_test(test/kernels/bang/*.cc)
-      build_test(test/bang/*.cc)
-    endif()
-    if (USE_KUNLUN)
-      build_test(test/kernels/kunlun/*.cc)
-      build_test(test/kunlun/*.cc)
    endif()
    if (USE_INTELCPU)
      build_test(test/kernels/intelcpu/*.cc)
@ -353,7 +255,7 @@ if(BUILD_TEST)
  if(BUILD_TEST_PET)
    build_test(test/pet/*.cc)
  endif()
-  if(BUILD_NNET AND BUILD_TEST)
+  if(BUILD_TEST_EINNET)
    build_test(test/nnet/test_*.cc)

    # Build expression reader
--- a/48
+++ b/48
@ -1,38 +1,17 @@
-.PHONY : build clean format install-python test-cpp test-onnx
+.PHONY : build clean install-python test-cpp test-onnx

-TYPE ?= Release
+TYPE ?= release
 CUDA ?= OFF
 BANG ?= OFF
-KUNLUN ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
-DIST ?= OFF
-NNET ?= OFF
-DIST ?= OFF
-FORMAT_ORIGIN ?=
-# Docker build options
-DOCKER_NAME ?= infinitensor
-DOCKER_IMAGE_NAME ?= infinitensor
-DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
-DOCKER_RUN_OPTION ?=
-
-# CUDA option.
-ifeq ($(CUDA), ON)
-	DOCKER_IMAGE_NAME = infinitensor_cuda
-	DOCKER_NAME = infinitensor_cuda
-	DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
-	DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
-endif

 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 CMAKE_OPT += -DUSE_CUDA=$(CUDA)
 CMAKE_OPT += -DUSE_BANG=$(BANG)
-CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
 CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
 CMAKE_OPT += -DBUILD_TEST=$(TEST)
-CMAKE_OPT += -DBUILD_DIST=$(DIST)
-CMAKE_OPT += -DBUILD_NNET=$(NNET)

 ifeq ($(INTELCPU), ON)
 	CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@ -45,33 +24,14 @@ build:
 clean:
 	rm -rf build

-format:
-	@python3 scripts/format.py $(FORMAT_ORIGIN)
-
 install-python: build
 	cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
-	pip install -e pyinfinitensor/
+	pip install pyinfinitensor/

-test-cpp:
+test-cpp: build
 	@echo
 	cd build/$(TYPE) && make test

 test-onnx:
 	@echo
 	python3 pyinfinitensor/tests/test_onnx.py
-
-test-api:
-	@echo
-	python3 pyinfinitensor/tests/test_api.py
-
-docker-build:
-	docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .
-
-docker-run:
-	docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)
-
-docker-start:
-	docker start $(DOCKER_IMAGE_NAME)
-
-docker-exec:
-	docker exec -it $(DOCKER_IMAGE_NAME) bash
--- a/README.md
+++ b/README.md
@ -1,14 +1,19 @@
 # InfiniTensor

-[中文项目简介](/README_CN.md) | Documentation | [中文文档](/docs/INDEX.md)
-
-[![Build](https://github.com/InfiniTensor/InfiniTensor/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/InfiniTensor/InfiniTensor/actions)
-[![issue](https://img.shields.io/github/issues/InfiniTensor/InfiniTensor)](https://github.com/InfiniTensor/InfiniTensor/issues)
-![license](https://img.shields.io/github/license/InfiniTensor/InfiniTensor)
-
-InfiniTensor is a high-performance inference engine tailored for GPUs and AI accelerators. Its design focuses on effective deployment and swift academic validation.
-
-## Get started
+## Compilation on Lotus
+## Compilation for cuda
+``` bash
+# Enter the root of InfiniTensor
+source test/script/env_lotus.sh
+make CUDA=ON
+```
+## Compilation for intelcpu
+``` bash
+# Enter the root of InfiniTensor
+source test/script/env_lotus.sh intelcpu
+mkdir build && cd build
+cmake -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp .. && make -j 12
+```

 ### Make Commands

@ -25,23 +30,12 @@ InfiniTensor is a high-performance inference engine tailored for GPUs and AI acc

 ### CMake Options

-There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLists.txt#L5) file.
+There are several configurable CMake options, see the [CMakeLists.txt file](/CMakeLists.txt#L5).

 - If `USE_BACKTRACE` is `ON`, `libdw-dev` has to be installed. See the README of [backward-cpp](https://github.com/bombela/backward-cpp) for details.
 - If `USE_PROTOBUF` is `ON`, `protobuf` has to be installed. See the README of [protobuf](https://github.com/protocolbuffers/protobuf) for details.
 - If `USE_CUDA` is `ON`, `cuda` has to be installed.

-## Roadmap
-
- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
- Supported hardware
-  - ✔ NVIDIA GPU
-  - ✔ Cambricon MLU
-  - ✔ Kunlunxin XPU
-  - ⬜ Ascend NPU
-
 ## Contributor Guide

 InfiniTensor development is based on pull requests on GitHub. Before requesting a merge, a PR should satisfy the following requirements:
@ -52,24 +46,9 @@ InfiniTensor development is based on the pull request on Github. Before requesti
 2. Receive at least one approval from reviewers.
 3. PR title should be concise since it is going to be the commit message in the main branch after merging and squashing.

-## Reference
+## Dependencies

-Please cite EinNet or PET in your publications if it helps your research:
-
-```plaintext
-@article{zheng2023einnet,
-  title={EINNET: Optimizing Tensor Programs with Derivation-Based Transformations},
-  author={Zheng, Liyan and Wang, Haojie and Zhai, Jidong and Hu, Muyan and Ma, Zixuan and Wang, Tuowei and Huang, Shuhong and Miao, Xupeng and Tang, Shizhi and Huang, Kezhao and Jia, Zhihao},
-  booktitle={17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)},
-  pages={739--755},
-  year={2023}
-}
-
-@inproceedings{wang2021pet,
-  title={PET: Optimizing tensor programs with partially equivalent transformations and automated corrections},
-  author={Wang, Haojie and Zhai, Jidong and Gao, Mingyu and Ma, Zixuan and Tang, Shizhi and Zheng, Liyan and Li, Yuanzhi and Rong, Kaiyuan and Chen, Yuanyong and Jia, Zhihao},
-  booktitle={15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)},
-  pages={37--54},
-  year={2021}
-}
-```
+- [backward-cpp](https://github.com/bombela/backward-cpp): [v1.6](https://github.com/bombela/backward-cpp/releases/tag/v1.6)
+- [googletest](https://github.com/google/googletest): [v1.13.0](https://github.com/google/googletest/releases/tag/v1.13.0)
+- [nlohmann_json_cmake_fetchcontent](https://github.com/ArthurSonzogni/nlohmann_json_cmake_fetchcontent): [v3.10.5](https://github.com/ArthurSonzogni/nlohmann_json_cmake_fetchcontent/releases/tag/v3.10.5)
+- [pybind11](https://github.com/pybind/pybind11): [v2.10.3](https://github.com/pybind/pybind11/releases/tag/v2.10.3)
--- a/README_CN.md
+++ b/README_CN.md
@ -1,13 +1,221 @@
-# Infinitensor
+# 使用指南

-## 项目简介
+## 目录

-本项目是深度学习领域的一个编译器集合，本项目旨在缩小深度学习应用与后端硬件之间的鸿沟。本项目通过使用编译器超优化技术，对神经网络模型进行优化，从而获得更好的性能。同时，本项目与深度学习框架相互配合，为不同的硬件后端提供端倒端的编译，方便用户迁移部署。
+- [编译](#编译)
+- [使用](#使用)
+- [python-前端应用指南](#python-前端应用指南)
+  - [导入-onnx-模型](#导入-onnx-模型)
+  - [导出-onnx-模型](#导出-onnx-模型)
+  - [执行推理](#执行推理)
+- [测试](#测试)

-## 项目设计
+## 编译

-本项目的设计是前后端解耦合的，主要有三个模块，分别为：
+推荐使用 Ubuntu-22.04，本文以此环境为例。

- Runtime 模块：该模式负责对不同的加速卡后端进行包装与支持，支撑后端运行。另外提供统一的向上接口，方便上层建设。
- Compiler 模块：该模式负责对神经网络模型进行优化变换，获得更加高效的等价模型。
- Interface 模块：该模式负责给用户提供编程与交互的接口，方便用户使用本系统。
+1. 使用 apt 安装依赖
+
+   > 如果不使用 Ubuntu-22.04，部分软件版本可能不够高。
+
+   ```bash
+   sudo apt-get install make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev
+   ```
+
+2. 更新 pip 并换清华源
+
+   ```bash
+   python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+   pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+3. 编译并安装 python 库
+
+   > 第一次执行会同时安装 python 依赖库，比较慢
+
+   仅编译 CPU 部分：
+
+   ```bash
+   make install-python
+   ```
+
+   编译 GPU 部分：
+
+   ```bash
+   make install-python CUDA=ON
+   ```
+
+## 使用
+
+项目管理功能已写到 [Makefile](Makefile)，支持下列功能：
+
+- 编译项目：`make`/`make build`
+- 清理生成文件：`make clean`
+- 安装 python 库：`make install-python`
+- 测试 c++ 后端：`make test-cpp`
+- 测试 python 前端：`make test-onnx`
+
+并使用下列环境变量传递选项参数：
+
+- `TYPE`：编译模式（`debug`/`release`），默认值为 `release`
+- `CUDA`：是否编译 CUDA 后端，默认为 `OFF`，`ON` 打开
+- `BANG`：是否编译寒武纪后端，默认为 `OFF`，`ON` 打开
+- `BACKTRACE`：是否启用栈回溯，默认为 `ON`，`OFF` 关闭，建议调试时打开
+- `TEST`：是否编译 `googletest`，默认为 `ON`，`OFF` 关闭，只有 `test-cpp` 时必要
+
+## python 前端应用指南
+
+`make install-python` 会将项目的 python 前端以 `pyinfinitensor` 为名字安装到系统目录，可以直接 `import pyinfinitensor` 来使用。现阶段，项目的主要用法是从 onnx 导入模型进行优化，然后可以再导出优化后的模型到 onnx，也可以直接运行推理。
+
+### 导入 onnx 模型
+
+支持的模型：
+
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+
+```python
+import onnx
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+
+stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
+```
+
+[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) 是 onnx 提供的加载函数，将 onnx 文件读取为保存在内存中的 onnx 模型。
+
+`OnnxStub` 是 onnx 模型在项目中的表示，通过构造这个对象，将 onnx 模型导入到项目中。其构造器的第一个参数是已加载的 onnx 模型对象（即 `onnx.load` 的返回值）；第二个参数是模型运行的后端运行时，可以是 `backend.cpu_runtime()`、`backend.cuda_runtime()` 或 `backend.bang_runtime()`。
+
+构造出的 stub 对象可以用于操作项目中的模型和运行时。
+
+### 优化
+
+TODO
+
+### 导出 onnx 模型
+
+优化后的模型可以导出成 onnx 文件提供给其他运行时。
+
+```python
+with open("optimized.onnx", "wb") as f:
+    f.write(stub.to_onnx("optimized").SerializeToString())
+```
+
+`stub.to_onnx(<name>)` 将模型转换为 onnx 模型对象，`<name>` 将填写到 onnx 模型的 `name` 字段。序列化到文件的代码见[官方示例](https://onnx.ai/onnx/intro/python.html#model-serialization)。
+
+要可视化检查导出的模型文件，可以利用 [onnx 提供的功能](https://onnx.ai/onnx/api/shape_inference.html#infer-shapes)将所有的张量的形状推理出来再导出：
+
+```python
+from onnx.shape_inference import infer_shapes
+
+with open("optimized.onnx", "wb") as f:
+    f.write(infer_shapes(stub.to_onnx("optimized")).SerializeToString())
+```
+
+然后用 [Netron](https://netron.app/) 绘制计算图。
+
+### 执行推理
+
+也可以使用项目的运行时执行推理。
+
+第一步是将数据传入计算图。`OnnxStub.inputs` 是一个 `Dict[str, Tensor]`，保存着模型的所有输入的名字和对象。可以用 [`items()`](https://docs.python.org/zh-cn/3/library/stdtypes.html#dict.items) 来遍历。
+
+这个代码片段显示了如何打印出模型所有输入张量的名字、形状和对象指针：
+
+```python
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+```
+
+对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
+
+```plaintext
+data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
+```
+
+当然，地址是随机的。这个输出表明需要输入一个名为 “data”，形为 1×3×224×224 的数据。通常来说，这表示一张 224×224 的 rgb 图片。而这个模型是一个 1000 分类的图像分类模型。
+
+为了方便，这里我们向模型传入一个随机的数据。
+
+```python
+import numpy
+
+stub.init()
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+    input = numpy.random.random(tensor.shape()).astype(numpy.float32)
+    tensor.copyin_float(input.flatten().tolist())
+```
+
+`stub.init()` 为所有张量分配空间。空间是预分配的，所以不支持动态 size 的模型。
+
+`tensor.copyin_float(<data>)` 向张量传入数据。其参数必须是一个 `List[float]`，即压平的数据。类似的函数还有 `copyin_int32(<data>)` 和 `copyin_int64(<data>)`。
+
+然后，调用 `stub.run()` 执行推理：
+
+```python
+stub.run()
+```
+
+最后，将结果拷贝出来，方法与传入数据类似：
+
+```python
+stub.init()
+for name, tensor in stub.outputs.items():
+    print(name, tensor.shape(), tensor)
+    print(tensor.copyout_float())
+```
+
+## 测试
+
+除了单元测试 `make test-cpp` 和 `make test-onnx` 之外，还可以用其他方式来测试单个模型导入导出和优化的正确性。
+
+这个脚本利用 onnxruntime 来测试导出的模型是否与导入的模型等价：
+
+```python
+import onnx
+import numpy
+import sys
+from onnx import ModelProto, ValueInfoProto
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+from onnxruntime import InferenceSession
+
+
+def infer(model: ModelProto, input) -> dict:
+    collection = set()
+    for node in model.graph.node:
+        for output in node.output:
+            collection.add(output)
+    model.graph.output.extend([ValueInfoProto(name=x) for x in collection])
+    session = InferenceSession(model.SerializeToString())
+    i = session.get_inputs()[0].name
+    return dict(
+        zip(
+            [x.name for x in session.get_outputs()],
+            [x.flatten() for x in session.run(None, {i: input})],
+        )
+    )
+
+
+model0 = onnx.load(sys.argv[1])
+model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")
+
+input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
+input = numpy.random.random(input_shape).astype(numpy.float32)
+
+output0 = infer(model0, input)[model0.graph.output[0].name]
+output1 = infer(model1, input)[model1.graph.output[0].name]
+
+print("error =", sum((output1 - output0) ** 2) / len(output0))
+```
+
+要运行脚本，先安装 onnxruntime：
+
+```bash
+pip install onnxruntime
+```
+
+打印出的 `error = ...` 是两个模型输出张量的均方误差。对于不同的模型，这个误差最小为 0，最大不超过 1e-9。
--- a/cmake/FindCNCL.cmake
+++ b/cmake/FindCNCL.cmake
@ -1,76 +0,0 @@
-SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
-SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
-
-set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
-set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
-set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
-
-if ($ENV{CNCL_ROOT_DIR})
-  message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
-endif()
-list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
-# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
-list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
-
-find_path(CNCL_INCLUDE_DIRS
-  NAMES cncl.h
-  HINTS ${CNCL_INCLUDE_DIR})
-
-if (USE_STATIC_CNCL)
-  MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
-  SET(CNCL_LIBNAME "CNCL_static")
-  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-else()
-  SET(CNCL_LIBNAME "cncl")
-  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-endif()
-
-find_library(CNCL_LIBRARIES
-  NAMES ${CNCL_LIBNAME}
-  HINTS ${CNCL_LIB_DIR})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
-
-if(CNCL_FOUND)  # obtaining CNCL version and some sanity checks
-  set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
-  message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
-  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-  list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
-  include(CheckCXXSymbolExists)
-  check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
-
-  if (CNCL_VERSION_DEFINED)
-    set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
-    file(WRITE ${file} "
-      #include <iostream>
-      #include <cncl.h>
-      int main()
-      {
-        std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
-        int x;
-        CNCLGetVersion(&x);
-        return x == CNCL_VERSION_CODE;
-      }
-")
-    try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
-          RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
-          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
-          LINK_LIBRARIES ${CNCL_LIBRARIES})
-    if (NOT CNCL_VERSION_MATCHED)
-      message(FATAL_ERROR "Found CNCL header version and library version do not match! \
-(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
-    endif()
-    message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
-  else()
-    # message(STATUS "CNCL version < 2.3.5-5")
-  endif ()
-  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
-
-  message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
-  mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
-endif()
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@ -1,165 +0,0 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
-# 
-# From PyTorch:
-# 
-# Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-# Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-# Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-# Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
-# Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
-# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
-# 
-# From Caffe2:
-# 
-# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
-# 
-# All contributions by Facebook:
-# Copyright (c) 2016 Facebook Inc.
-# 
-# All contributions by Google:
-# Copyright (c) 2015 Google Inc.
-# All rights reserved.
-# 
-# All contributions by Yangqing Jia:
-# Copyright (c) 2015 Yangqing Jia
-# All rights reserved.
-# 
-# All contributions by Kakao Brain:
-# Copyright 2019-2020 Kakao Brain
-# 
-# All contributions from Caffe:
-# Copyright(c) 2013, 2014, 2015, the respective contributors
-# All rights reserved.
-# 
-# All other contributions:
-# Copyright(c) 2015, 2016 the respective contributors
-# All rights reserved.
-# 
-# Caffe2 uses a copyright model similar to Caffe: each contributor holds
-# copyright over their contributions to Caffe2. The project versioning records
-# all such contribution and copyright details. If a contributor wants to further
-# mark their specific copyright on a particular contribution, they should
-# indicate their copyright solely in the commit message of the change when it is
-# committed.
-# 
-# All rights reserved.
-# 
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# 
-# 1. Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-# 
-# 2. Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-# 
-# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
-#    and IDIAP Research Institute nor the names of its contributors may be
-#    used to endorse or promote products derived from this software without
-#    specific prior written permission.
-# 
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-# 
-# Find the nccl libraries
-#
-# The following variables are optionally searched for defaults
-#  NCCL_ROOT: Base directory where all NCCL components are found
-#  NCCL_INCLUDE_DIR: Directory where NCCL header is found
-#  NCCL_LIB_DIR: Directory where NCCL library is found
-#
-# The following are set after configuration is done:
-#  NCCL_FOUND
-#  NCCL_INCLUDE_DIRS
-#  NCCL_LIBRARIES
-#
-# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
-# install NCCL in the same location as the CUDA toolkit.
-# See https://github.com/caffe2/caffe2/issues/1601
-
-# Cache entries seeded from the same-named environment variables at configure time.
-set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
-set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
-set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
-
-# Compare the expanded value as a string. if($ENV{NCCL_ROOT_DIR}) hands the
-# path to if() as a variable name/constant, which evaluates to false, so the
-# deprecation warning was never printed for a real path.
-if (NOT "$ENV{NCCL_ROOT_DIR}" STREQUAL "")
-  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
-endif()
-# The deprecated variable and the CUDA toolkit directory are both added as search roots.
-list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
-# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
-list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
-
-# Locate the directory that contains nccl.h, using NCCL_INCLUDE_DIR as a hint.
-find_path(NCCL_INCLUDE_DIRS NAMES nccl.h HINTS ${NCCL_INCLUDE_DIR})
-
-# Choose the library name (static or shared). When NCCL_VERSION is set, the
-# versioned file suffix is placed ahead of the default suffixes.
-if(USE_STATIC_NCCL)
-  message(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
-  set(NCCL_LIBNAME "nccl_static")
-  if(NCCL_VERSION)
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-else()
-  set(NCCL_LIBNAME "nccl")
-  if(NCCL_VERSION)
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-endif()
-
-# Locate the library itself, using NCCL_LIB_DIR as a hint.
-find_library(NCCL_LIBRARIES NAMES ${NCCL_LIBNAME} HINTS ${NCCL_LIB_DIR})
-
-# Standard found/not-found handling based on the two results above.
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
-
-if(NCCL_FOUND)
-  # Version detection and a header/library consistency check.
-  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
-  message(STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
-
-  # Make the NCCL headers visible to the symbol check; restored further down.
-  set(OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-  list(APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
-  include(CheckCXXSymbolExists)
-  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
-
-  if(NCCL_VERSION_DEFINED)
-    # Probe program: prints the version macros from the header and exits with
-    # 1 when ncclGetVersion() reports the same version code as the header.
-    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
-    file(WRITE ${file} "
-      #include <iostream>
-      #include <nccl.h>
-      int main()
-      {
-        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
-        int x;
-        ncclGetVersion(&x);
-        return x == NCCL_VERSION_CODE;
-      }
-")
-    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
-      RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
-      CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
-      LINK_LIBRARIES ${NCCL_LIBRARIES})
-    if(NOT NCCL_VERSION_MATCHED)
-      message(FATAL_ERROR "Found NCCL header version and library version do not match! \
-(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
-    endif()
-    message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
-  else()
-    # Header has no NCCL_VERSION_CODE: nothing is reported for this case.
-    # message(STATUS "NCCL version < 2.3.5-5")
-  endif()
-  set(CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
-
-  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
-  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
-endif()
--- a/cmake/FindXCCL.cmake
+++ b/cmake/FindXCCL.cmake
@ -1,27 +0,0 @@
-# Find the xccl libraries
-#
-# Search locations are derived from the KUNLUN_HOME environment variable.
-# Sets XCCL_INCLUDE_DIRS and XCCL_LIBRARIES; XCCL_FOUND is set by
-# find_package_handle_standard_args.
-set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
-set(XCCL_LIB_DIR $ENV{KUNLUN_HOME}  CACHE PATH "Folder contains KUNLUN XCCL libraries")
-
-list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
-
-# HINTS must receive the variable's value; the bare name XCCL_INCLUDE_DIR was
-# passed as a literal (relative) path and never pointed at the cache entry.
-find_path(XCCL_INCLUDE_DIRS
-  NAMES xpu/bkcl.h
-  HINTS ${XCCL_INCLUDE_DIR})
-
-# Same fix for the library hint: expand XCCL_LIB_DIR instead of passing its name.
-find_library(XCCL_LIBRARIES
-  NAMES lib64/libbkcl.so
-  HINTS ${XCCL_LIB_DIR})
-
-message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
-message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
-
-if (XCCL_FOUND)
-  set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
-  # Only the header path is reported here; no version is actually extracted.
-  message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
-  list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
-  message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
-  mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
-endif()
--- a/docs/INDEX.md
+++ b/docs/INDEX.md
@ -1,5 +0,0 @@
-# 项目文档
-
- [安装部署指南](INSTALL_GUIDE_CN.md)
- [硬件支持](SUPPORT_MATRIX_CN.md)
- [使用指南](USER_GUIDE_CN.md)
--- a/docs/INSTALL_GUIDE_CN.md
+++ b/docs/INSTALL_GUIDE_CN.md
@ -1,172 +0,0 @@
-# 安装部署指南
-
-## 目录
-
- [环境准备](#环境准备)
- [编译本项目](#编译本项目)
- [技术支持](#技术支持)
-
-## 环境准备
-
-目前的软硬件环境支持矩阵
-
-| Host CPU | Device        | OS            |  Support   |
-| -------- | ------------  | -----------   | ---------- |
-| X86-64   | Nvidia GPU    |  Ubuntu-22.04 |  Yes       |
-| X86-64   | Cambricon MLU |  Ubuntu-22.04 |  Yes       |
-
-推荐使用 X86-64 机器以及 Ubuntu-22.04，本文以此环境为例。
-
-1. 确认 GCC 版本为 11.3 及以上的稳定版本，如若您的机器 GCC 版本不满足此条件，请自行编译安装，下述方式二选一：
-
-   - [GCC 官方文档](https://gcc.gnu.org/onlinedocs/gcc-11.3.0/gcc/)
-
-   - [网友安装分享](https://zhuanlan.zhihu.com/p/509695395)
-
-2. 确认 CMake 版本为 3.17 及以上的稳定版本， 如若您的机器 CMake 版本不满足此条件，请自行编译安装，下述方式二选一：
-
-   - [CMake 官方文档](https://cmake.org/install/)
-
-   - [网友安装分享](https://zhuanlan.zhihu.com/p/110793004)
-
-3. 第三方加速卡软件资源安装，目前本项目已经适配了如下的第三方加速卡：
-
-   - 如您的第三方加速卡为英伟达 GPU，请参考英伟达官方文档进行：
-
-     > [驱动安装](https://www.nvidia.cn/geforce/drivers/)，
-     > [CUDA Toolkit 安装](https://developer.nvidia.com/cuda-toolkit)，
-     > [Cudnn 安装](https://developer.nvidia.com/rdp/cudnn-download)，
-     > [Cublas 安装](https://developer.nvidia.com/cublas)，
-     > 安装完成后请进行相应的环境变量配置，将可执行文件目录与库目录添加到操作系统识别的路径中，例如
-     >
-     > ```bash
-     > # 将如下内容写入到你的 bashrc 文件并 source 该文件
-     > export CUDA_HOME="/PATH/TO/YOUR/CUDA_HOME"
-     > export CUDNN_HOME="/PATH/TO/YOUR/CUDNN_HOME"
-     > export PATH="${CUDA_HOME}/bin:${PATH}"
-     > export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
-     > # 如您不方便将上述环境变量配置到 bashrc 文件中进行长期使用，你也可以在我们提供的 env.sh 文件中进行正确配置并激活，作为临时使用
-     > source env.sh
-     > ```
-
-     我们强烈建议您规范安装，统一到一个目录下，以免不必要的麻烦。
-
-   - 如您的第三方加速卡为寒武纪 MLU，请参考寒武纪官方文档进行：
-     > [驱动安装](https://www.cambricon.com/docs/sdk_1.11.0/driver_5.10.6/user_guide_5.10.6/index.html)，
-     > [CNToolkit 安装](https://www.cambricon.com/docs/sdk_1.11.0/cntoolkit_3.4.1/cntoolkit_install_3.4.1/index.html)，
-     > [CNNL 安装](https://www.cambricon.com/docs/sdk_1.11.0/cambricon_cnnl_1.16.1/user_guide/index.html)，
-     > 安装完成后请进行相应的环境变量配置，将可执行文件目录与库目录添加到操作系统识别的路径中，例如
-     >
-     > ```bash
-     > # 将如下内容写入到你的 bashrc 文件并 source 该文件
-     > export NEUWARE_HOME="/usr/local/neuware"
-     > export PATH="${NEUWARE_HOME}/bin:${PATH}"
-     > export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
-     > # 如您不方便将上述环境变量配置到 bashrc 文件中进行长期使用，你也可以在我们提供的 env.sh 文件中进行正确配置并激活，作为临时使用
-     > source env.sh
-     > ```
-
-     我们强烈建议您规范安装，统一到一个目录下，以免不必要的麻烦。另外请注意，由于 MLU 上层软件建设适配程度有限，如您在其覆盖的机器，操作系统之外运行，需要在安装驱动之后使用上层软件的 Docker。
-
-4. 确认您安装了 make，build-essential， python-is-python3， python-dev-is-python3， python3-pip， libdw-dev，如您的机器没有上述基础依赖，请自行按需安装。
-
-   - 在使用 apt-get 工具情况下，您可以这样执行
-
-     ```bash
-     sudo apt-get install make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev
-     ```
-
-5. 更新pip并切换到清华源
-
-   ```bash
-   python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
-   pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
-   ```
-
-6. 安装一些非必需的辅助项目（可选）
-
-   - 如您需要运行本项目下的 example 代码，您需要安装一些辅助项目。请注意这些项目不是必要的，若您不需要运行样例代码，这些项目无需安装。
-
-     > [Pytorch](https://pytorch.org/get-started/locally/)：业界内流行的神经网络编程框架
-     > [ONNX](https://onnx.ai/get-started.html)：业界内流行的神经网络模型存储文件与转换器
-     > [onnxsim](https://pypi.org/project/onnxsim/)：一个简化onnx模型的小工具
-     > [onnx2torch](https://github.com/ENOT-AutoDL/onnx2torch)：一个将onnx模型转换pytorch模型的小工具
-     > [tqdm](https://pypi.org/project/tqdm/)：一个显示程序运行进度条的小工具
-
-   - 如您需要使用本项目下的 InfiniTest 测试工具，你还需要安装如下的项目：
-
-     > [protobuf](https://github.com/protocolbuffers/protobuf)： 一种序列化文件的格式及其编译、序列化、解析工具
-
-## 编译本项目
-
-推荐使用 X86-64 机器以及 Ubuntu-22.04，本文以此环境为例。
-
-1. 配置环境
-
-   打开 env.sh 文件进行环境变量配置，之后执行
-
-   ```bash
-   source env.sh
-   ```
-
-2. 编译本项目并打包成 Python 库进行安装
-
-   我们提供了一键编译参数，您可以在项目根目录下执行下面的命令。第一次执行会同时安装 python 依赖库，耗时略长，请耐心等待。
-
-   仅编译 CPU 部分，不编译第三方计算卡：
-
-   ```bash
-   make install-python
-   ```
-
-   编译 CPU 部分，同时编译英伟达 GPU 部分：
-
-   ```bash
-   export CUDA_HOME=/path/to/your/cuda_home
-   make install-python CUDA=ON
-   ```
-
-   编译 CPU 部分，同时编译寒武纪 MLU 部分：
-
-   ```bash
-   export NEUWARE_HOME=/path/to/your/neuware_home
-   make install-python BANG=ON
-   ```
-
-   编译 CPU 部分，同时编译昆仑 XPU 部分：
-
-   ```bash
-   export KUNLUN_HOME=/path/to/your/kunlun_home
-   make install-python KUNLUN=ON
-   ```
-
-3. 使用方法
-
-   安装成功后，您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
-
-## Docker
-
-本项目也提供了 Docker 的环境，您可以使用 `make docker-build` 或 `make docker-build CUDA=ON` 命令启动并编译 Dockerfile，您可以通过添加编译选项或者修改 Makefile 变量修改 docker image 名称或者所选的 Dockerfile 文件。
- 
-由于在拉取 github repo 时需要将 ssh key 加入到 github profile 中，因此暂时注释掉拉取 repo 并编译项目的过程，由用户在进入 docker 后自己维护 ssh key（将 host 中的 ssh key 复制到 docker 中可能会遇到环境不一致的问题）。
-
-```shell
-# Build docker container.
-make docker-build
-# Run docker image.
-make docker-run
-# Execute docker image.
-make docker-exec
-```
-
-如果需要编译 CUDA 版，请使用如下命令：
-```shell
-# Build docker container.
-make docker-build CUDA=ON
-# Run docker image.
-make docker-run CUDA=ON
-```
-
-## 技术支持
-
-如遇到问题，请联系我们技术支持团队
--- a/docs/SUPPORT_MATRIX_CN.md
+++ b/docs/SUPPORT_MATRIX_CN.md
@ -1,30 +0,0 @@
-# 支持矩阵
-
-## 目录
-
-
- [环境支持](#环境支持)
- [神经网络支持](#神经网络支持)
- [技术支持](#技术支持)
-
-## 环境支持
-
-目前的软硬件环境支持矩阵
-
-| Host CPU | Device        | OS            |  Support   |
-| -------- | ------------  | -----------   | ---------- |
-| X86-64   | Nvidia GPU    |  Ubuntu-22.04 |  Yes       |
-| X86-64   | Cambricon MLU |  Ubuntu-22.04 |  Yes       |
-
-## 神经网络支持
-
-目前已经验证过的神经网络模型有
-
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
-
-## 技术支持
-
-如若您遇到了本项目的问题，请联系我们的技术支持团队
--- a/docs/TODO.md
+++ b/docs/TODO.md
@ -1 +0,0 @@
-
--- a/docs/USER_GUIDE_CN.md
+++ b/docs/USER_GUIDE_CN.md
@ -1,203 +0,0 @@
-# 使用指南
-
-## 目录
-
- [使用方法](#使用方法)
- [python 前端应用指南](#python-前端应用指南)
-  - [导入 onnx 模型](#导入-onnx-模型)
-  - [优化](#优化)
-  - [导出 onnx 模型](#导出-onnx-模型)
-  - [执行推理](#执行推理)
-  - [样例代码](#样例代码)
- [技术支持](#技术支持)
- [测试](#测试)
-
-## 使用方法
-
-项目管理功能已写到 [Makefile](../Makefile)，支持下列功能：
-
- 编译项目：`make`/`make build`
- 清理生成文件：`make clean`
- 安装 python 库：`make install-python`
- 测试 c++ 后端：`make test-cpp`
- 测试 python 前端：`make test-onnx`
-
-并使用下列环境变量传递选项参数：
-
- `TYPE`：编译模式（`debug`/`release`），默认值为 `release`
- `CUDA`：是否编译 CUDA 后端，默认为 `OFF`，`ON` 打开
- `BANG`：是否编译寒武纪后端，默认为 `OFF`，`ON` 打开
- `KUNLUN`：是否编译昆仑后端，默认为 `OFF`，`ON` 打开
- `BACKTRACE`：是否启用栈回溯，默认为 `ON`，`OFF` 关闭，建议调试时打开
- `TEST`：是否编译 `googletest`，默认为 `ON`，`OFF` 关闭，只有 `test-cpp` 时必要
-
-## python 前端应用指南
-
-`make install-python` 会将项目的 python 前端以 `pyinfinitensor` 为名字安装到系统目录，可以直接 `import pyinfinitensor` 来使用。现阶段，项目的主要用法是从 onnx 导入模型进行优化，然后可以再导出优化后的模型到 onnx，也可以直接运行推理。
-
-### 导入 onnx 模型
-
-支持的模型：
-
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
-
-```python
-import onnx
-from pyinfinitensor.onnx import OnnxStub
-from pyinfinitensor import backend
-
-stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
-```
-
-[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) 是 onnx 提供的加载函数，将 onnx 文件读取为保存在内存中的 onnx 模型。
-
-`OnnxStub` 是 onnx 模型在项目中的表示，通过构造这个对象，将 onnx 模型导入到项目中。其构造器的第一个参数是 onnx 模型文件；第二个参数是模型运行的后端运行时，可以是 `backend.cpu_runtime()`、`backend.cuda_runtime()` 或 `backend.bang_runtime()`。
-
-构造出的 stub 对象可以用于操作项目中的模型和运行时。
-
-### 优化
-
-TODO
-
-### 导出 onnx 模型
-
-优化后的模型可以导出成 onnx 文件提供给其他运行时。
-
-```python
-with open("optimized.onnx", "wb") as f:
-    f.write(stub.to_onnx("optimized").SerializeToString())
-```
-
-`stub.to_onnx(<name>)` 将模型转换为 onnx 模型对象，`<name>` 将填写到 onnx 模型的 `name` 字段。序列化到文件的代码见[官方示例](https://onnx.ai/onnx/intro/python.html#model-serialization)。
-
-要可视化检查导出的模型文件，可以利用 [onnx 提供的功能](https://onnx.ai/onnx/api/shape_inference.html#infer-shapes)将所有的张量的形状推理出来再导出：
-
-```python
-from onnx.shape_inference import infer_shapes
-
-with open("optimized.onnx", "wb") as f:
-    f.write(infer_shapes(stub.to_onnx("optimized")).SerializeToString())
-```
-
-然后用 [Netron](https://netron.app/) 绘制计算图。
-
-### 执行推理
-
-也可以使用项目的运行时执行推理。
-
-第一步是将数据传入计算图。`OnnxStub.inputs` 是一个 `Dict[str, Tensor]`，保存着模型的所有输入的名字和对象。可以用 [`items()`](https://docs.python.org/zh-cn/3/library/stdtypes.html#dict.items) 来遍历。
-
-这个代码片段显示了如何打印出模型所有输入张量的名字、形状和对象指针：
-
-```python
-for name, tensor in stub.inputs.items():
-    print(name, tensor.shape(), tensor)
-```
-
-对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
-
-```plaintext
-data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
-```
-
-当然，地址是随机的。这个输出表明需要输入一个名为 “data”，形为 1×3×224×224 的数据。通常来说，这表示一张 224×224 的 rgb 图片。而这个模型是一个 1000 分类的图像分类模型。
-
-为了方便，这里我们向模型传入一个随机的数据。
-
-```python
-import numpy
-
-stub.init()
-for name, tensor in stub.inputs.items():
-    print(name, tensor.shape(), tensor)
-    input = numpy.random.random(tensor.shape()).astype(numpy.float32)
-    tensor.copyin_float(input.flatten().tolist())
-```
-
-`stub.init()` 为所有张量分配空间。空间是预分配的，所以不支持动态 size 的模型。
-
-`tensor.copyin_float(<data>)` 向张量传入数据。其参数必须是一个 `List[float]`，即压平的数据。类似的函数还有 `copyin_int32(<data>)` 和 `copyin_int64(<data>)`
-
-然后，调用 `stub.run()` 执行推理：
-
-```python
-stub.run()
-```
-
-最后，将结果拷贝出来，方式与传入数据类似：
-
-```python
-stub.init()
-for name, tensor in stub.outputs.items():
-    print(name, tensor.shape(), tensor)
-    print(tensor.copyout_float())
-```
-
-### 样例代码
-
-您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解，并尝试运行。在这个文件中，我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式：
-
-```python
-python resnet.py -h
-```
-
-在样例代码中，我们对定义的网络进行了序列化操作，并存储为模型文件。之后加载该模型文件，并转换为本项目的模型进行优化操作，再进行推理。您可以关注一下代码中 242 行之后的代码。请注意，您可以按照您的需求来进行操作，通常来说，您所需要撰写的代码就是加载模型，转换为本项目的模型进行优化，推理运行。
-
-## 技术支持
-
-如若您遇到了本项目的问题，请联系我们的技术支持团队
-
-## 测试
-
-除了单元测试 `make test-cpp` 和 `make test-onnx` 之外，还可以用其他方式来测试单个模型导入导出和优化的正确性。
-
-这个脚本利用 onnxruntime 来测试导出的模型是否与导入的模型等价：
-
-```python
-import onnx
-import numpy
-import sys
-from onnx import ModelProto, ValueInfoProto
-from pyinfinitensor.onnx import OnnxStub
-from pyinfinitensor import backend
-from onnxruntime import InferenceSession
-
-
-def infer(model: ModelProto, input) -> dict:
-    collection = set()
-    for node in model.graph.node:
-        for output in node.output:
-            collection.add(output)
-    model.graph.output.extend([ValueInfoProto(name=x) for x in collection])
-    session = InferenceSession(model.SerializeToString())
-    i = session.get_inputs()[0].name
-    return dict(
-        zip(
-            [x.name for x in session.get_outputs()],
-            [x.flatten() for x in session.run(None, {i: input})],
-        )
-    )
-
-
-model0 = onnx.load(sys.argv[1])
-model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
-
-input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
-input = numpy.random.random(input_shape).astype(numpy.float32)
-
-output0 = infer(model0, input)[model0.graph.output[0].name]
-output1 = infer(model1, input)[model1.graph.output[0].name]
-
-print("error =", sum((output1 - output0) ** 2) / len(output0))
-```
-
-要运行脚本，先安装 onnxruntime：
-
-```bash
-pip install onnxruntime
-```
-
-打印出的 `error = ...` 是两个模型输出张量的均方误差。对于不同的模型，这个误差最小为 0，最大不超过 1e-9。
--- a/env.sh
+++ b/env.sh
@ -1,38 +0,0 @@
-# Configure the NVIDIA CUDA home paths. Install the CUDA Toolkit and cuDNN first,
-# then point the environment variables below at their install locations.
-export CUDA_HOME=/PATH/TO/YOUR/CUDA/HOME
-export CUDNN_HOME=/PATH/TO/YOUR/CUDNN/HOME
-export PATH="${CUDA_HOME}/bin:${PATH}"
-export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
-
-# Configure the Cambricon BANG home path. /usr/local/neuware is the install path
-# recommended by the Cambricon software stack and is also its default.
-# If you installed it elsewhere, set the correct path yourself.
-# A possible layout of the neuware directory is shown below for reference.
-# .
-# ├── bin
-# ├── cmake
-# ├── data
-# ├── edge
-# ├── include
-# ├── lib
-# ├── lib64
-# ├── LICENSE
-# ├── mlvm
-# ├── README
-# ├── samples
-# ├── share
-# └── version.txt
-export NEUWARE_HOME=/usr/local/neuware
-export PATH="${NEUWARE_HOME}/bin:${PATH}"
-export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
-
-# Configure the Kunlunxin XPU home path. /usr/local/xpu is the package path
-# provided by the Kunlunxin software stack.
-# If you installed it elsewhere, set the correct path yourself.
-# A possible layout of the xpu directory is shown below for reference.
-# .
-# ├── bin
-# ├── include
-# ├── lib64
-# ├── tools
-# ├── version
-# └── XTDK
-export KUNLUN_HOME=/usr/local/xpu
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
--- a/examples/NNmodel
+++ b/examples/NNmodel
@ -1 +0,0 @@
-Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
--- a/examples/distributed/README.md
+++ b/examples/distributed/README.md
@ -1,39 +0,0 @@
-# 分布式脚本
-
-## 英伟达平台运行方式
-
-#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
-
-使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
-
-```bash
-python run_pytorch.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
-```
-
-会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
-
-#### 2. 运行InfiniTensor分布式脚本
-
-```bash
-python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
-```
-
-## 寒武纪平台运行方式
-
-**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配，具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
-
-#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
-
-使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
-
-```bash
-python run_pytorch_mlu.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
-```
-
-会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
-
-#### 2. 运行InfiniTensor分布式脚本
-
-```bash
-python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
-```
--- a/examples/distributed/init.py
+++ b/examples/distributed/init.py
--- a/examples/distributed/bang/bang_launch.py
+++ b/examples/distributed/bang/bang_launch.py
@ -1,187 +0,0 @@
-import sys
-sys.path.append('../')
-
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-
-def parse_args():
-    """Parse the launcher's command-line options.
-
-    Prints the parsed namespace and returns the tuple
-    (num_nodes, nproc_per_node, name, model, batch_size, length, gen_std, type).
-    """
-    cli = argparse.ArgumentParser(description="launch distributed infinitensor")
-    cli.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    cli.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
-    )
-    cli.add_argument("--name", type=str, default="test", help="name of this instance.")
-    cli.add_argument(
-        "--model", type=str, required=True, help="path to the ONNX model file."
-    )
-    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    cli.add_argument("--length", type=int, default=1, help="sequence length.")
-    cli.add_argument(
-        "--gen_std",
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    cli.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
-    opts = cli.parse_args()
-    print("arg setting: ", opts)
-    # Returned positionally; main() unpacks in exactly this order.
-    fields = (
-        "num_nodes",
-        "nproc_per_node",
-        "name",
-        "model",
-        "batch_size",
-        "length",
-        "gen_std",
-        "type",
-    )
-    return tuple(getattr(opts, field) for field in fields)
-
-
-def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
-    """Run the model once for its output, then benchmark it.
-
-    Builds an OnnxStub on `runtime`, loads the saved inputs, runs once and
-    copies out the first output. It then performs `n` untimed runs followed
-    by `2 * n` timed runs, prints the average time per timed run and returns
-    the output captured from the first run.
-    """
-    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
-    load_inputs(stub, world_size, rank)
-    # stub.tune()
-    stub.run()
-    first_output = next(iter(stub.outputs.values())).copyout_numpy()
-
-    # Untimed runs before measuring.
-    for _ in range(n):
-        stub.run()
-
-    timed_runs = n * 2
-    started = time.time()
-    for _ in range(timed_runs):
-        stub.run()
-    finished = time.time()
-    avg_time = (finished - started) / timed_runs
-    print(f"average time: {avg_time}")
-    return first_output
-
-def load_inputs(stub, world_size=1, rank=0):
-    """Copy the saved ./data/input_<i>.npy arrays into the stub's input tensors.
-
-    When the leading dimensions of a saved array do not match the tensor's
-    shape, the array is split column-wise into `world_size` parts and this
-    rank's part is used instead.
-    """
-    for idx, (_, tensor) in enumerate(stub.inputs.items()):
-        data = np.load(f"./data/input_{idx}.npy")
-        shapes_agree = all(a == b for a, b in zip(data.shape, tensor.shape()))
-        if not shapes_agree:
-            data = np.hsplit(data, world_size)[rank]
-        tensor.copyin_numpy(data)
-
-
-def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
-    """Run the model and print how far its output is from ./data/output.npy.
-
-    `name` is accepted for call-site symmetry but is not used here.
-    """
-    expected = np.load("./data/output.npy")
-    actual = run_model(model, runtime, world_size, rank, data_type=data_type)
-    print("outputs abs mean:", abs(actual).mean())
-    print("max abs diff:", abs(actual - expected).max())
-
-def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
-):
-    """Entry point of one distributed worker process.
-
-    Transforms the model for this rank with parallel_model, saves it with
-    external data (removing a stale external-data file first), creates a
-    BangRuntime on `local_rank`, initialises communication, and finally runs
-    the model and compares it against the saved reference output.
-    """
-    dist_name = f"{name}_dist"
-    rank_stem = f"./{dist_name}_rank{rank}"
-    onnx_path = f"{rank_stem}.onnx"
-    extern_path = f"{rank_stem}.pb"
-
-    model = parallel_model(model, world_size, rank)
-    if os.path.exists(extern_path):
-        os.remove(extern_path)
-    onnx.save_model(
-        model,
-        onnx_path,
-        save_as_external_data=True,
-        location=extern_path,
-    )
-    # infer_shapes_path(onnx_path)
-
-    runtime = backend.BangRuntime(local_rank)
-    runtime.init_comm(dist_name, world_size, rank)
-    run_and_compare(name, model, runtime, world_size, rank, data_type)
-
-
-def start_single(name, model, data_type):
-    """Run and check the model on a single device (BangRuntime 0)."""
-    run_and_compare(name, model, backend.BangRuntime(0), data_type=data_type)
-
-def generate_input_output(model):
-    """Generate reference inputs and output for `model` under ./data/.
-
-    For every model input a value is synthesised according to its dtype,
-    copied into the tensor and saved as ./data/input_<i>.npy:
-      * integer inputs: random 0/1 values,
-      * boolean inputs: random booleans,
-      * other inputs: all ones for the first input, uniform random otherwise.
-    The model is then run once on BangRuntime 0 and its first output is saved
-    as ./data/output.npy. A message is printed if the output contains NaN.
-    """
-    os.makedirs("./data", exist_ok=True)
-    runtime = backend.BangRuntime(0)
-    stub = OnnxStub(model, runtime)
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        # The current tensor contents are only used for their shape and dtype.
-        input = tensor.copyout_numpy()
-        if np.issubdtype(input.dtype, np.integer):
-            # The former size == 1 special case generated exactly the same
-            # data as the general case, so the two arms are merged.
-            input = np.random.randint(0, 2, size=input.shape, dtype=input.dtype)
-        elif input.dtype == np.bool_:
-            input = np.random.randint(0, 2, size=input.shape) > 0
-        elif i == 0:
-            # The unused position_id bookkeeping was dropped: it read
-            # input.shape[-1], which raises IndexError for a 0-d input.
-            input = np.ones(input.shape).astype(input.dtype)
-        else:
-            input = np.random.rand(*input.shape).astype(input.dtype)
-        tensor.copyin_numpy(input)
-        np.save(f"./data/input_{i}", input)
-    stub.run()
-    time.sleep(0.01)
-    output = next(iter(stub.outputs.values())).copyout_numpy()
-    if np.isnan(output).any():
-        print("Nan in output")
-    np.save("./data/output", output)
-
-
-def main():
-    """Dispatch to reference generation, a single-device run or a parallel run."""
-    # batch size and sequence length are parsed but not used by this launcher.
-    nnodes, nproc_per_node, name, model_path, _bs, _length, gen_std, data_type = parse_args()
-    if data_type == "fp32":
-        data_type = "default"
-
-    model = onnx.load(model_path)
-
-    # Generate the standard (reference) inputs and output, then stop.
-    if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        generate_input_output(model)
-        return
-
-    # Single process: run directly in this process rather than spawning one.
-    if nproc_per_node == 1:
-        print("run model by single MLU.")
-        start_single(name, model, data_type)
-        return
-
-    # Distributed run: one process per rank; the local rank is the rank
-    # modulo the number of processes per node.
-    world_size = nnodes * nproc_per_node
-    print(f"run model by {world_size} MLU in parallel.")
-    workers = []
-    for rank in range(world_size):
-        local_rank = rank % nproc_per_node
-        workers.append(
-            mp.Process(
-                target=start_worker,
-                args=(name, world_size, rank, local_rank, model, data_type),
-            )
-        )
-
-    for worker in workers:
-        worker.start()
-
-    for worker in workers:
-        worker.join()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/bang/run_pytorch_mlu.py
+++ b/examples/distributed/bang/run_pytorch_mlu.py
@ -1,249 +0,0 @@
-import argparse
-import torch
-import torch_mlu
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import AlbertModel, AlbertConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import sys
-import os
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.type
-    )
-
-
-def get_model(modelname):
-    match modelname:
-        case "albert":
-            model = AlbertModel.from_pretrained("albert/albert-base-v2")
-            voc_size = AlbertConfig().vocab_size
-        case "bert":
-            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-            voc_size = BertConfig().vocab_size
-        case "gpt2":
-            model = GPT2Model.from_pretrained("GPT2")
-            voc_size = GPT2Config().vocab_size
-        case "opt":
-            model = OPTModel.from_pretrained("facebook/opt-125m")
-            voc_size = OPTConfig().vocab_size
-        case "llama":
-            model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-            voc_size = LlamaConfig().vocab_size
-        case _:
-            raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
-    np.save("./data/input_0", data)
-    inputs = torch.from_numpy(data).to("mlu")
-    torch_model = torch_model.to("mlu")
-    if dtype == "fp16":
-        torch_model = torch_model.half()
-
-    n_iter = 20
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.mlu.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.mlu.synchronize()
-            outputs = torch_model(inputs)
-            torch.mlu.synchronize()
-    torch.mlu.synchronize()
-    end = time.time()
-    
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    # torch.mlu.memory.empty_cache()
-    np.save("./data/output", np.array(outputs))
-    print("Save input & output into ./data.")
-
-
-def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
-    data = data.to("mlu")
-    model = model.to("mlu")
-    if dtype == "fp16":
-        model = model.half()
-    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-    if modelname != "llama":
-        # use onnxsim to simplify
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
-        # onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            extern_path = extern_path.split("/")[-1]
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path,
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        # use third party tool to simplify llama
-        # reference: https://github.com/luchangli03/onnxsim_large_model/
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = path
-        if not out_model_path:
-            out_model_path = in_model_path[:-5] + ".sim.onnx"
-        if os.path.isdir(out_model_path):
-            out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print(f"compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                    tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    torch.backends.mlu.matmul.allow_tf32 = False
-    torch.backends.cnnl.allow_tf32 = False
-    modelname, batchsize, seqlen, export_path, dtype = parse_args()
-    if dtype == "tf32":
-        torch.backends.mlu.matmul.allow_tf32 = True
-    else:
-        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
-
-    model, voc_size = get_model(modelname)
-    if export_path is not None:
-        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
-        path = os.path.join(export_path, filename)
-        if not os.path.exists(path):
-            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-            export_onnx(modelname, model, param, path, True, dtype)
-        else:
-            print("Onnx path exists, skipping export.")
-
-    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/cuda/cuda_launch.py
+++ b/examples/distributed/cuda/cuda_launch.py
@ -1,161 +0,0 @@
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, default="test", help="name of this instance."
-    )
-    parser.add_argument(
-        "--model", type=str, required=True, help="path to the ONNX model file."
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--gen_std",
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model,
-        args.batch_size,
-        args.length,
-        args.gen_std,
-        args.type,
-    )
-
-
-def run_model(model, runtime, inputs, n=10, data_type = "default"):
-    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
-    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
-        tensor.copyin_numpy(input)
-    # stub.tune()
-    stub.run()
-    # get outputs
-    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
-
-    # bench
-    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
-        tensor.copyin_numpy(input)
-    begin = time.time()
-    for _ in range(n):
-        stub.run()
-    end = time.time()
-    avg_time = (end - begin) / n
-    print(f"average time: {avg_time}")
-    return outputs
-
-
-def run_and_compare(name, model, runtime, data_type):
-    input_ids = np.load(f"{name}_inputs.npy")
-    position_ids = np.arange(input_ids.shape[-1])
-    results = np.load(f"{name}_results.npy")
-    outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
-    print("outputs abs mean:", abs(outputs).mean())
-    print("max abs diff:", abs(outputs - results).max())
-
-
-def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
-):
-    dist_name = name + "_dist"
-    model = parallel_model(model, world_size, rank)
-    extern_path = f"./{dist_name}_rank{rank}.pb"
-    if os.path.exists(extern_path):
-        os.remove(extern_path)
-    onnx.save_model(
-        model,
-        f"./{dist_name}_rank{rank}.onnx",
-        save_as_external_data=True,
-        location=extern_path,
-    )
-    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
-    runtime = backend.CudaRuntime(local_rank)
-    # print("init comm")
-    runtime.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    run_and_compare(name, model, runtime, data_type)
-
-
-def start_single(name, model, data_type):
-    runtime = backend.CudaRuntime(0)
-    run_and_compare(name, model, runtime, data_type)
-
-
-def gen_standard(name, model, voc_size, bs, len):
-    # generate standard results
-    input_ids = np.random.randint(0, voc_size, (bs, len))
-    position_ids = np.arange(len)
-    np.save(f"{name}_inputs", input_ids)
-    runtime = backend.CudaRuntime(0)
-    outputs = run_model(model, runtime, (input_ids, position_ids), 1)
-    print("outputs abs mean:", abs(outputs).mean())
-    np.save(f"{name}_results", outputs)
-
-
-def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
-    data_type = "default" if data_type == "fp32" else data_type
-    if data_type != "tf32":
-        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
-    model = onnx.load(model_path)
-
-    # generate standart output
-    if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        voc_size = 1000
-        gen_standard(name, model, voc_size, bs, length)
-        return
-
-    # run single process.
-    # use standalone process to isolate cuda.
-    print("run model by single GPU.")
-    p = mp.Process(target=start_single, args=(name, model, data_type))
-    p.start()
-    p.join()
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    print(f"run model by {world_size} GPU in parallel.")
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/cuda/launch_kvcache.py
+++ b/examples/distributed/cuda/launch_kvcache.py
@ -1,245 +0,0 @@
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-import numpy as np
-from parallel_opt import parallel_model
-
-
-os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, default="test", help="name of this instance."
-    )
-    parser.add_argument(
-        "--model1", type=str, required=True, help="path to the ONNX model file."
-    )
-    parser.add_argument(
-        "--model2", type=str, required=True, help="path to the ONNX model file."
-    )    
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--gen_std",
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model1,
-        args.model2,
-        args.batch_size,
-        args.length,
-        args.gen_std,
-    )
-
-
-def run_model(model1, model2, runtime1, runtime2, inputs1: np.array, inputs2: np.array, n=20):
-    ####################################
-    # run the first graph without kvcache
-    ####################################
-    stub1 = OnnxStub(model1, runtime1)
-    stub1.inputs['onnx::Reshape_0'].copyin_int32(inputs1.reshape(-1).tolist())
-    stub1.tune()
-    stub1.run()
-    kvcache_it1 = []
-    count = 0
-    for output in stub1.outputs.items().__iter__():
-        if count == 0:
-            logits_it1 = np.array(output[1].copyout_float(), dtype=np.float32)
-        else:
-            kvcache_it1.append(np.array(output[1].copyout_float(), dtype=np.float32))
-        count = count + 1
-        
-    # bench for stub1
-    next(stub1.inputs.items().__iter__())[1].copyin_int32(inputs1.reshape(-1).tolist())
-    begin = time.time()
-    for _ in range(n):
-        stub1.run()
-    end = time.time()
-    avg_time = (end - begin) / n
-    print(f"stub1 average time: {avg_time}")        
-        
-    ####################################
-    # run the second graph with kvcache
-    ####################################
-    i = 0
-    batchsize = 1
-    stub2 = OnnxStub(model2, runtime2)
-    past_kvcache_length = (i+2)*np.ones((batchsize, 1), dtype=np.int32)
-    # copyin input
-    stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
-    stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
-    count = -1
-    for input in stub2.inputs.items().__iter__():
-        if count in range(24):
-            # print(count, input[0])
-            # print(np.dtype(kvcache_it1[count][0]), kvcache_it1[count].shape)
-            input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
-        count = count + 1
-    stub2.tune()
-    stub2.run()
-    
-    # copyout output
-    count = 0
-    kvcache_it2 = []
-    for output in stub2.outputs.items().__iter__():
-        if count == 0:
-            logits_it2 = np.array(output[1].copyout_float(), dtype=np.float32)
-        else:
-            kvcache_it2.append(np.array(output[1].copyout_float(), dtype=np.float32))
-        count = count + 1     
-
-    # bench for stub2
-    # copyin input
-    stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
-    stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
-    count = -1
-    for input in stub2.inputs.items().__iter__():
-        if count in range(24):
-            input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
-        count = count + 1
-    begin = time.time()
-    for _ in range(n):
-        stub2.run()
-    end = time.time()
-    avg_time = (end - begin) / n
-    print(f"stub2 average time: {avg_time}")
-    return logits_it2
-
-
-def run_and_compare(name, model1, model2, runtime1, runtime2):
-    data1 = np.load(f"{name}_inputs1.npy")
-    data2 = np.load(f"{name}_inputs2.npy")
-    results = np.load(f"{name}_results.npy")
-    outputs = run_model(model1, model2, runtime1, runtime2, data1, data2)
-    print("outputs sum:", outputs.sum())
-    print("max abs diff:", abs(outputs - results).max())
-    print("max rel diff:", abs((outputs - results) / results).max())
-    # assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)
-
-
-def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model1: onnx.ModelProto, model2: onnx.ModelProto
-):
-    dist_name = name + "_dist"
-    ####################################
-    # shard the first graph
-    ####################################
-    model1 = parallel_model(model1, world_size, rank)
-    extern_path = f"./{dist_name}_stub1_rank{rank}.pb"
-    if os.path.exists(extern_path):
-        os.remove(extern_path)
-    convert_model_to_external_data(
-        model1,
-        all_tensors_to_one_file=True,
-        location=extern_path,
-        size_threshold=1024,
-        convert_attribute=False,
-    )
-    onnx.save(model1, f"./{dist_name}_stub1_rank{rank}.onnx")
-    runtime1 = backend.CudaRuntime(local_rank)
-    runtime1.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    
-    ####################################
-    # shard the second graph
-    ####################################    
-    model2 = parallel_model(model2, world_size, rank)
-    extern_path = f"./{dist_name}_stub2_rank{rank}.pb"
-    if os.path.exists(extern_path):
-        os.remove(extern_path)
-    convert_model_to_external_data(
-        model2,
-        all_tensors_to_one_file=True,
-        location=extern_path,
-        size_threshold=1024,
-        convert_attribute=False,
-    )    
-    onnx.save(model2, f"./{dist_name}_stub2_rank{rank}.onnx")
-    runtime2 = backend.CudaRuntime(local_rank)
-    # print("init comm")
-    runtime2.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )    
-    
-    # run the two graphs
-    run_and_compare(name, model1, model2, runtime1, runtime2)
-
-
-def start_single(name, model1, model2):
-    runtime1 = backend.CudaRuntime(0)
-    runtime2 = backend.CudaRuntime(0)
-    run_and_compare(name, model1, model2, runtime1, runtime2)
-
-
-def gen_standard(name, model1, model2, voc_size, bs, len):
-    # generate standard results
-    data1 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
-    data2 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
-    np.save(f"{name}_inputs1", data1)
-    np.save(f"{name}_inputs2", data2)
-    runtime1 = backend.CudaRuntime(0)
-    runtime2 = backend.CudaRuntime(0)
-    outputs = run_model(model1, model2, runtime1, runtime2, data1, data2, 1)
-    np.save(f"{name}_results", outputs)
-
-
-def main():
-    nnodes, nproc_per_node, name, model1_path, model2_path, bs, length, gen_std = parse_args()
-
-    model1 = onnx.load(model1_path)
-    model2 = onnx.load(model2_path)
-
-    # generate standart output
-    if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        voc_size = 1000
-        gen_standard(name, model1, model2, voc_size, bs, length)
-        return
-
-    # run single process.
-    # use standalone process to isolate cuda.
-    p = mp.Process(target=start_single, args=(name, model1, model2))
-    p.start()
-    p.join()
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model1, model2),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/cuda/run_pytorch.py
+++ b/examples/distributed/cuda/run_pytorch.py
@ -1,188 +0,0 @@
-import argparse
-import torch
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-import time
-import numpy as np
-import onnx
-import os
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.type,
-    )
-
-
-def get_model(modelname):
-    match modelname:
-        case "bert":
-            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-            voc_size = BertConfig().vocab_size
-        case "gpt2":
-            model = GPT2Model.from_pretrained("gpt2")
-            voc_size = GPT2Config().vocab_size
-        case "opt":
-            model = model = OPTModel.from_pretrained("./opt-125m")
-            voc_size = OPTConfig().vocab_size
-        case _:
-            raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-def run_pytorch(torch_model, voc_size, batchsize, len):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    np.save("test_inputs", data)
-    inputs = torch.from_numpy(data).to("cuda")
-    torch_model = torch_model.to("cuda")
-
-    n_iter = 20
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.cuda.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.cuda.synchronize()
-            outputs = torch_model(inputs)
-            # 
-            torch.cuda.synchronize()
-    torch.cuda.synchronize()
-    end = time.time()
-    
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    torch.cuda.memory.empty_cache()
-    np.save("test_results", np.array(outputs, dtype=np.float32))
-    print("Save input & output as test_inputs.npy and test_results.npy")
-
-
-def export_onnx(model, data, path, extern=False):
-    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-    onnx_model = onnx.load(path)
-    onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
-    #onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-    assert check
-    add_value_info_for_constants(onnx_model)
-    onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-    if extern:
-        extern_path = path.replace('.onnx', '.pb')
-        if os.path.exists(extern_path):
-            os.remove(extern_path)
-        convert_model_to_external_data(
-            onnx_model,
-            all_tensors_to_one_file=True,
-            location=extern_path,
-            size_threshold=1024,
-            convert_attribute=False,
-        )
-    onnx.save(onnx_model, path)
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    torch.backends.cuda.matmul.allow_tf32 = False
-    torch.backends.cudnn.allow_tf32 = False
-    modelname, batchsize, seqlen, export_path, data_type = parse_args()
-    if data_type == "tf32":
-        torch.backends.cuda.matmul.allow_tf32 = True
-    else:
-        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
-
-    model, voc_size = get_model(modelname)
-    if export_path is not None:
-        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
-        path = os.path.join(export_path, filename)
-        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-        export_onnx(model, param, path, True)
-
-    if data_type == "fp16":
-        model = model.half()
-    run_pytorch(model, voc_size, batchsize, seqlen)
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/kunlun/export_onnx.sh
+++ b/examples/distributed/kunlun/export_onnx.sh
@ -1,14 +0,0 @@
- export HF_ENDPOINT=https://hf-mirror.com
-
-models=("bert" "gpt2" "llama")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only 
-        done
-    done
-done 
--- a/examples/distributed/kunlun/kunlun_launch.py
+++ b/examples/distributed/kunlun/kunlun_launch.py
@ -1,280 +0,0 @@
-import sys
-sys.path.append('../')
-
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-from functools import wraps
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=2, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
-    )
-    parser.add_argument(
-        "--model", type=str, default="", help="path to the ONNX model file."
-    )
-    parser.add_argument(
-        "--gen_std",
-        default=False,
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    parser.add_argument(
-        "--run_single",
-        default=False,
-        action="store_true",
-        help="whether run model with single process with standard inputs"
-    )
-    parser.add_argument(
-        "--input_dir",
-        default="./",
-        help="path to save model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        default="./",
-        help="path to save model standard output"
-    )
-    parser.add_argument(
-        "--internal_model_dir",
-        default="./",
-        help="path to save internal onnx model for parallel run"
-    )
-    args = parser.parse_args()
-
-    # check path, mkdir if not exist
-    check_exists(args.input_dir)
-    check_exists(args.result_dir)
-    check_exists(args.internal_model_dir)
-
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model,
-        args.gen_std,
-        args.run_single,
-        args.input_dir,
-        args.result_dir,
-        args.internal_model_dir
-    )
-
-
-"""
-utils function for this scripts
-"""
-def check_exists(path: str):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-def np_assert(base, test, rtol=1e-2, atol=1e-1):
-    # np.testing.assert_allclose(test, base, rtol, atol)
-    print("max abs diff:", abs(base - test).max())
-
-
-"""
-Perf wrapper, run function n times
-then average
-"""
-def perf_it(n):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            # warmup
-            for _ in range(n):
-                func(*args, **kwargs)
-
-            t_total = 0
-            for _ in range(n):
-                t0 = time.time()
-                func(*args, **kwargs)
-                t1 = time.time()
-                t_total += t1 - t0
-            avg_time = (t_total) / n
-            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
-            return avg_time
-        return wrapper
-    return decorator
-
-
-"""
-Run InfiniTensor model with Standard input
-check=True: check with standard output gen by pytorch
-perf=True: run n times to get avg time
-"""
-def run_model(task_name,
-              model,
-              runtime,
-              world_size=1,
-              rank=0,
-              n=10,
-              check=True,
-              perf=True):
-
-    stub = OnnxStub(model, runtime,
-                    use_naive_allocator=True \
-                    if task_name == "llama" else False)
-
-    # load in Onnx model inputs
-    def load_inputs(stub: OnnxStub):
-        # check exists
-        inputs = []
-        for i, (name, tensor) in enumerate(stub.inputs.items()):
-            input_path = os.path.join(input_dir, \
-                                f"{task_name}_input_{i}.npy")
-            print(input_path)
-            if os.path.exists(input_path):
-                input = np.load(input_path)
-            else :
-                raise KeyError(f"{i} th input of model not exists")
-            # check shape
-            if all(x == y for x,y in zip(input.shape, tensor.shape())):
-                tensor.copyin_numpy(input)
-            else:
-                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
-
-    load_inputs(stub)
-    # stub.tune()
-    stub.run()
-    time.sleep(0.01)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-
-    # check output results with standard output
-    if check:
-        st_output_path = os.path.join(result_dir, \
-                                f"{task_name}_output.npy")
-        assert os.path.exists(st_output_path) , \
-                    "standard output not exists"
-        st_output = np.load(st_output_path)
-        if np.isnan(output).any():
-            print("Nan in output")
-            exit()
-        np_assert(st_output, output)
-
-    # perf
-    if perf:
-        @perf_it(n)
-        def perf_infinitensor(stub: OnnxStub):
-            stub.run()
-        perf_infinitensor(stub)
-
-    return output
-
-
-"""
-Start a worker in Parallel
-"""
-def start_worker(name: str,
-           world_size: int,
-           rank: int,
-           local_rank: int,
-           model: onnx.ModelProto):
-
-    dist_name = name + "_dist"
-    # partial a onnx model to world_size part
-    model = parallel_model(model, world_size, rank)
-    onnx.save(model, os.path.join(internal_model_dir, \
-                                    f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
-    runtime = backend.KUNLUNRuntime(local_rank)
-    # print("init comm")
-    runtime.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    run_model(name, model, runtime, world_size, rank)
-
-
-"""
-generate standard input/output with
-sigle card run
-"""
-def gen_standard(task_name: str, model: onnx.ModelProto):
-    runtime = backend.KUNLUNRuntime(0)
-    stub = OnnxStub(model, runtime)
-    position_id = 0
-    # generate random input for model
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = tensor.copyout_numpy()
-        if np.issubdtype(input.dtype, np.integer):
-            if input.size == 1:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-            else:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-        elif input.dtype == np.bool_:
-            input = np.random.randint(0,2,size=input.shape) > 0
-        else:
-            if i == 0:
-                input = np.ones(input.shape).astype(input.dtype)
-                position_id = input.shape[-1] - 1
-            else:
-                input = np.random.rand(*input.shape).astype(input.dtype)
-        tensor.copyin_numpy(input)
-        np.save(os.path.join(input_dir, \
-                    f"{task_name}_input_{i}.npy"), input)
-    stub.run()
-    # print(stub.outputs)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-    if np.isnan(output).any():
-        print("Nan in output")
-        exit()
-    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
-
-
-def main():
-
-    global input_dir, result_dir, internal_model_dir
-
-    nnodes, nproc_per_node, task_name, \
-        model_path, gen_std, run_single, \
-            input_dir, result_dir, internal_model_dir = parse_args()
-
-    # load input onnx model
-    model = onnx.load(model_path)
-
-    # generate standart output
-    if gen_std:
-        print("Generate inputs and outputs.")
-        gen_standard(task_name, model)
-        return
-
-    if run_single:
-        print("Run model by one GPU card.")
-        runtime = backend.KUNLUNRuntime(0)
-        run_model(task_name, model, runtime)
-        return
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    print(f"Run model by {world_size} GPU in parallel.")
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(task_name, world_size, rank, rank % nproc_per_node, model),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/kunlun/launch.sh
+++ b/examples/distributed/kunlun/launch.sh
@ -1,36 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("bert" "gpt2")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            # run pytorch model
-            echo "Run pytorch $model with batch_size=$bs length=$len ."
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
-            for n in "${nproc[@]}"; do
-                # run infinitensor 
-                echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
-                python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini 
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*.pb" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
--- a/examples/distributed/kunlun/llama_launch.sh
+++ b/examples/distributed/kunlun/llama_launch.sh
@ -1,35 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("llama")
-batch_size=(1 )
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            echo "Run pytorch llama with batch_size="$bs" and length="$len""
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
-            for n in "${nproc[@]}"; do
-                    # run pytorch model
-                    echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
-                    python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
-                    # delete internal files
-                    find ./ -type f -name "*.onnx" -delete
-                    find ./ -type f -name "*0c" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
--- a/examples/distributed/kunlun/run_pytorch.py
+++ b/examples/distributed/kunlun/run_pytorch.py
@ -1,245 +0,0 @@
-import argparse
-import torch
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import os
-import sys
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model output data"
-    )
-    parser.add_argument(
-        "--export_only",
-        action="store_true"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.input_dir,
-        args.result_dir,
-        args.export_only
-    )
-
-
-def get_model(modelname):
-    if modelname == "bert":
-        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-        voc_size = BertConfig().vocab_size
-    elif modelname == "gpt2":
-        model = GPT2Model.from_pretrained("gpt2")
-        voc_size = GPT2Config().vocab_size
-    elif modelname == "opt":
-        model = OPTModel.from_pretrained("./opt-125m")
-        voc_size = OPTConfig().vocab_size
-    elif modelname == "llama":
-        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-        voc_size = LlamaConfig().vocab_size
-    else :
-        raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
-    inputs = torch.from_numpy(data).to("cuda")
-    torch_model = torch_model.to("cuda")
-
-    n_iter = 10
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.cuda.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.cuda.synchronize()
-            outputs = torch_model(inputs)
-            #
-            torch.cuda.synchronize()
-    torch.cuda.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    torch.cuda.memory.empty_cache()
-    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
-                                        np.array(outputs))
-    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
-
-
-def export_onnx(model_name, model, data, path, extern=False):
-    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-
-    if model_name != "llama":
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model,
-                                 skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-                                 # skipped_optimizers=['fuse_qkv'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path.split("/")[-1],
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = in_model_path[:-5] + ".sim.onnx"
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print("compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                    tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    global input_dir, result_dir
-
-    modelname, batchsize, seqlen, \
-        export_path, input_dir, result_dir, export_only = parse_args()
-
-    model, voc_size = get_model(modelname) # pytorch model
-
-    if export_path is not None:
-        os.makedirs(export_path, exist_ok=True)
-        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
-        path = os.path.join(export_path, filename)
-        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-        export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
-        if export_only:
-            return
-
-    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/onnxsim_large_model
+++ b/examples/distributed/onnxsim_large_model
@ -1 +0,0 @@
-Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
--- a/examples/distributed/parallel.py
+++ b/examples/distributed/parallel.py
@ -1,103 +0,0 @@
-import onnx
-from onnx import (
-    ModelProto,
-    TensorProto,
-    NodeProto,
-    AttributeProto,
-)
-from onnx import helper, numpy_helper
-from typing import Dict, Any
-
-
-def parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
-    for attr in node.attribute:
-        if attr.name in attrs:
-            if attr.type == AttributeProto.INT:
-                attrs[attr.name] = attr.i
-            elif attr.type == AttributeProto.INTS:
-                attrs[attr.name] = attr.ints
-            elif attr.type == AttributeProto.FLOAT:
-                attrs[attr.name] = attr.f
-            elif attr.type == AttributeProto.STRING:
-                attrs[attr.name] = attr.s
-            elif attr.type == AttributeProto.TENSOR:
-                attrs[attr.name] = attr.t
-            else:
-                assert False, "Unsupported Attribute Type: {}".format(attr.type)
-    return attrs
-
-
-def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
-    data = {init.name: init for init in model.graph.initializer}
-    nodes = list(model.graph.node)
-
-    def shard_tensor(tensor: TensorProto, dim: int):
-        array = numpy_helper.to_array(tensor)
-        if dim >= array.ndim:
-            dim = array.ndim - 1
-        assert array.shape[dim] % tp_world_size == 0
-        seg = array.shape[dim] // tp_world_size
-        array = array[tp_rank * seg : (tp_rank + 1) * seg]
-        return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")
-
-    def shard_gemm(node: NodeProto):
-        attrs = parse_attribute(
-            node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
-        )
-        trans = [attrs["transA"], attrs["transB"]]
-        dim = 0
-        for i, (input, t) in enumerate(zip(node.input, trans)):
-            if input in data:
-                dim = i
-                sharded = shard_tensor(data[input], dim ^ t)
-                node.input[i] = sharded.name
-                data[input] = sharded
-        if len(node.input) > 2:
-            input = node.input[2]
-            sharded = shard_tensor(data[input], dim)
-            node.input[2] = sharded.name
-            data[input] = sharded
-
-        node.output[0] += f":sharded({dim})"
-        return dim
-
-    for i, node in enumerate(nodes):
-        if node.op_type == "Gemm":
-            output = node.output[0]
-            dim = shard_gemm(node)
-            gathered = [node.output[0] + f".{i}" for i in range(tp_world_size)]
-            # all_gather
-            nodes.insert(
-                i + 1,
-                helper.make_node(
-                    op_type="AllGather",
-                    inputs=[node.output[0]],
-                    outputs=gathered,
-                    name=node.name + "/allgather",
-                    # domain="infini", # shape inference fails for custom domain
-                ),
-            )
-            # concat
-            nodes.insert(
-                i + 2,
-                helper.make_node(
-                    op_type="Concat",
-                    inputs=gathered,
-                    outputs=[output],
-                    name=node.name + "/concat",
-                    axis=dim,
-                ),
-            )
-    graph = helper.make_graph(
-        nodes,
-        model.graph.name + f"_{tp_rank}",
-        model.graph.input,
-        model.graph.output,
-        data.values(),
-        doc_string=model.graph.doc_string,
-        value_info=model.graph.value_info,
-    )
-    model = helper.make_model(graph)
-
-    onnx.shape_inference.infer_shapes(model)
-    return model
--- a/examples/distributed/parallel_opt.py
+++ b/examples/distributed/parallel_opt.py
@ -1,247 +0,0 @@
-import onnx
-from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto
-from onnx import helper, numpy_helper
-from typing import Dict, List
-from placement import Placement, Replicate, Shard, _Partial
-import numpy as np
-
-
-def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
-    data = {init.name: init for init in model.graph.initializer}
-    vinfo = {info.name: info for info in model.graph.value_info}
-    vinfo.update({info.name: info for info in model.graph.input})
-    vinfo.update({info.name: info for info in model.graph.output})
-    output = {info.name: info for info in model.graph.output}
-    place: Dict[str, Placement] = {}
-    nodes: List[NodeProto] = []
-
-    def is_sharded(name: str):
-        return place[name].is_shard()
-
-    def shard_tensor(tensor: TensorProto, plc: Shard, groups: int = 1):
-        # print(f"shard {tensor.name} at dim {dim}")
-        assert plc.is_shard(), plc
-        ndim = len(tensor.dims)
-        if plc.dim < 0:
-            plc.dim += ndim
-        if tensor.dims[plc.dim] == 1:  # broadcast dim, no need to shard.
-            return tensor
-        array = numpy_helper.to_array(tensor)
-        assert array.shape[plc.dim] % tp_world_size == 0, array.shape[plc.dim]
-        dims = list(tensor.dims)
-        dims.insert(plc.dim, groups)
-        dims[plc.dim + 1] //= groups
-        array = array.reshape(dims)
-        seg = array.shape[plc.dim + 1] // tp_world_size
-        array = array.take(
-            indices=range(tp_rank * seg, (tp_rank + 1) * seg), axis=plc.dim + 1
-        )
-        dims = list(tensor.dims)
-        dims[plc.dim] //= tp_world_size
-        array = array.reshape(dims)
-        tensor = numpy_helper.from_array(array, name=tensor.name)
-        place[tensor.name] = plc
-        return tensor
-
-    def shard_gemm(node: NodeProto, groups: int = 1):
-        # print("gemm", node.name)
-        in_plc = place[node.input[0]]
-        w_plc = Shard(-1) if in_plc.is_replicate() else Shard(0)
-        transB = next((attr.i for attr in node.attribute if attr.name == "transB"), 0)
-        if transB:
-            w_plc.dim = ~w_plc.dim
-        input = node.input[1]
-        data[input] = shard_tensor(data[input], w_plc, groups)
-
-        output = node.output[0]
-        ndim = len(vinfo[output].type.tensor_type.shape.dim)
-        out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
-        place[node.output[0]] = out_plc
-
-    def shard_concat(node: NodeProto):
-        # hack for kvcache
-        in_plc = place[node.input[1]]
-        if in_plc.is_shard():
-            seq_len_dim = vinfo[node.input[0]].type.tensor_type.shape.dim.pop(1)
-            seq_len_dim.dim_value //= tp_world_size
-            vinfo[node.input[0]].type.tensor_type.shape.dim.insert(1, seq_len_dim)
-            place[node.input[0]] = in_plc
-            place[node.output[0]] = in_plc
-
-    def shard_binary(node: NodeProto, groups: int = 1):
-        # print("binary", node.name, node.input[0], place[node.input[0]])
-        a = node.input[0]
-        b = node.input[1]
-        if a in data:
-            a, b = b, a
-        place[node.output[0]] = place[a]
-        if is_sharded(a) and b in data and len(data[b].dims) == 1:  # broadcast
-            data[b] = shard_tensor(data[b], Shard(0), groups)
-
-    def shard_reshape(node: NodeProto):
-        # print("reshape", node.name, node.input[0], place[node.input[0]])
-        if not is_sharded(node.input[0]):
-            return
-        in_plc = place[node.input[0]]
-        s_dim = -1
-        in_dims = [d.dim_value for d in vinfo[node.input[0]].type.tensor_type.shape.dim]
-        tensor = data[node.input[1]]
-        out_dims = numpy_helper.to_array(tensor).copy()
-        if len(in_dims) == 3 and len(out_dims) == 4:
-            if in_plc.dim == 0:
-                s_dim = 1
-            elif in_plc.dim == 2:
-                s_dim = 2
-        if len(in_dims) == 4 and len(out_dims) == 3:
-            if in_plc.dim == 1:
-                s_dim = 0
-            elif in_plc.dim == 2:
-                s_dim = 2
-        if len(in_dims) == 2 and len(out_dims) == 3:
-            if in_plc.dim == 1:
-                s_dim = 2
-        if len(in_dims) == 4 and len(out_dims) == 2:
-            if in_plc.dim == 1:
-                s_dim = 0
-            elif in_plc.dim == 2:
-                s_dim = 1
-        if len(in_dims) == 3 and len(out_dims) == 2:
-            if in_plc.dim == 1:
-                s_dim = 0
-            elif in_plc.dim == 2:
-                s_dim = 1
-        assert s_dim != -1
-        assert out_dims[s_dim] % tp_world_size == 0, out_dims
-        out_dims[s_dim] //= tp_world_size
-        # if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
-        node.input[1] = node.output[0] + "_shape"
-        data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
-        place[node.output[0]] = Shard(s_dim)
-
-    def shard_split(node: NodeProto):
-        if not is_sharded(node.input[0]):
-            return
-        in_plc = place[node.input[0]]
-        split_tensor = data[node.input[1]]
-        split = numpy_helper.to_array(split_tensor).copy()
-        split //= tp_world_size
-        data[node.input[1]] = numpy_helper.from_array(split, name=node.input[1])
-        for output in node.output:
-            place[output] = in_plc
-
-    def shard_transpose(node: NodeProto):
-        plc = place[node.input[0]]
-        if plc.is_shard():
-            perm = next(attr.ints for attr in node.attribute if attr.name == "perm")
-            place[node.output[0]] = Shard(list(perm).index(plc.dim))
-
-    def shard_node(node: NodeProto):
-        if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
-            place[node.output[0]] = place[node.input[0]]
-        elif node.op_type in ["Where"]:
-            place[node.output[0]] = place[node.input[1]]
-        if node.op_type in {"Add", "Mul", "Div", "Max"}:
-            shard_binary(node)
-        elif node.op_type == "Reshape":
-            shard_reshape(node)
-        elif node.op_type == "Transpose":
-            shard_transpose(node)
-        elif node.op_type == "Split":
-            shard_split(node)
-        elif node.op_type == "MatMul":
-            assert (
-                place[node.input[0]] == place[node.input[1]]
-            ), f"{place[node.input[0]]} != {place[node.input[1]]}"
-            place[node.output[0]] = place[node.input[0]]
-        elif node.op_type == "Concat":
-            shard_concat(node)
-
-    def find_successor(op_type: str, idx: int, search_limit: int = 1):
-        for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
-            if node.op_type == op_type:
-                return node
-        return None
-
-    # all tensors are initially replicated.
-    for v in vinfo:
-        place[v] = Replicate()
-
-    for t in data:
-        place[t] = Replicate()
-
-    for index, node in enumerate(model.graph.node):
-        nodes.append(node)
-        # linear
-        if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
-            input in data for input in node.input
-        ):
-            # FIXME(constroy): the last MatMul should not be sharded as TP.
-            if (
-                node.output[0] in output
-                or (
-                    index + 1 < len(model.graph.node)
-                    and model.graph.node[index + 1].output[0]
-                )
-                in output
-            ):
-                continue
-            groups = 1
-            # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
-            split_node = find_successor("Split", index, search_limit=2)
-            if split_node is not None:
-                groups = len(split_node.output)
-            shard_gemm(node, groups)
-            plc = place[node.output[0]]
-            if plc.is_partial():
-                new_name = node.output[0] + f":{plc}"
-                place[new_name] = place[node.output[0]]
-                # insert all_reduce
-                nodes.append(
-                    helper.make_node(
-                        op_type="ReduceSum",
-                        inputs=[new_name],
-                        outputs=[node.output[0]],
-                        name=node.name + "/all_reduce",
-                        noop_with_empty_axes=1,
-                        communicator=0,  # hack to treat ReduceSum as AllReduceSum
-                    )
-                )
-                place[node.output[0]] = Replicate()
-                node.output[0] = new_name
-            if len(node.input) > 2:  # split bias to add
-                prev = nodes[-1]
-                new_name = prev.output[0] + "_no_bias"
-                place[new_name] = place[node.output[0]]
-                bias = helper.make_node(
-                    op_type="Add",
-                    inputs=[new_name, node.input[2]],
-                    outputs=[prev.output[0]],
-                    name=node.name + "/bias",
-                )
-                node.input.pop()
-                prev.output[0] = new_name
-                shard_binary(bias, groups)
-                nodes.append(bias)
-            continue
-        shard_node(node)
-
-    new_input = []
-    for info in model.graph.input:
-        new_input.append(vinfo[info.name])
-
-    graph = helper.make_graph(
-        nodes,
-        model.graph.name + f"_{tp_rank}",
-        new_input,
-        model.graph.output,
-        data.values(),
-        doc_string=model.graph.doc_string,
-        # value_info=vinfo.values(),
-    )
-    for output in graph.output:
-        tt = output.type.tensor_type
-        if tt.HasField("shape"):
-            tt.ClearField("shape")
-    model = helper.make_model(graph)
-    #model = onnx.shape_inference.infer_shapes(model)
-    return model
--- a/examples/distributed/placement.py
+++ b/examples/distributed/placement.py
@ -1,64 +0,0 @@
-from typing import Optional
-
-
-class Placement:
-    """Base type for placement descriptors.
-
-    The helpers below are plain ``isinstance`` checks against the concrete
-    placement classes defined in this module.
-    """
-
-    def is_shard(self, dim: Optional[int] = None) -> bool:
-        """Return whether this placement is a ``Shard``.
-
-        When ``dim`` is given, the placement must additionally be sharded on
-        exactly that dimension.
-        """
-        if not isinstance(self, Shard):
-            return False
-        return dim is None or self.dim == dim
-
-    def is_replicate(self) -> bool:
-        """Return whether this placement is a ``Replicate``."""
-        return isinstance(self, Replicate)
-
-    def is_partial(self) -> bool:
-        """Return whether this placement is a ``_Partial``."""
-        return isinstance(self, _Partial)
-
-
-class Replicate(Placement):
-    """Placement marker carrying no state; all instances compare equal."""
-
-    def __eq__(self, other: object) -> bool:
-        return isinstance(other, Replicate)
-
-    def __hash__(self) -> int:
-        # Defining __eq__ alone sets __hash__ to None and makes instances
-        # unhashable. All Replicate instances are equal, so they must share
-        # one hash value.
-        return hash(Replicate)
-
-    def __repr__(self) -> str:
-        """
-        machine readable representation of the Replicate placement
-        """
-        return "Replicate()"
-
-
-class Shard(Placement):
-    """Placement that shards along a single dimension.
-
-    Attributes are documented inline; two ``Shard`` objects are equal when
-    their ``dim`` values are equal.
-    """
-
-    def __init__(self, dim):
-        # dimension index this placement shards on
-        self.dim = dim
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, Shard):
-            return False
-        return self.dim == other.dim
-
-    def __hash__(self) -> int:
-        # Defining __eq__ alone sets __hash__ to None and makes instances
-        # unhashable. Hash on the same field that __eq__ compares so that
-        # equal objects hash equally.
-        return hash((Shard, self.dim))
-
-    def __repr__(self) -> str:
-        """
-        machine readable representation of the Shard placement
-        """
-        return f"Shard(dim={self.dim})"
-
-
-class _Partial(Placement):
-    """Placement tagged with the name of a reduction operation.
-
-    Two ``_Partial`` objects are equal when their ``reduce_op`` strings match.
-    """
-
-    def __init__(self, reduce_op: str = "sum"):
-        # name of the reduction; defaults to "sum"
-        self.reduce_op: str = reduce_op
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, _Partial):
-            return False
-        return self.reduce_op == other.reduce_op
-
-    def __hash__(self) -> int:
-        # Defining __eq__ alone sets __hash__ to None and makes instances
-        # unhashable. Hash on the same field that __eq__ compares so that
-        # equal objects hash equally.
-        return hash((_Partial, self.reduce_op))
-
-    def __repr__(self) -> str:
-        """
-        machine readable representation of the Partial placement
-        """
-        return f"_Partial(reduce_op={self.reduce_op})"
--- a/examples/python/llama_kvcache_inference.py
+++ b/examples/python/llama_kvcache_inference.py
@ -1,145 +0,0 @@
-import os
-from pyinfinitensor.onnx import OnnxStub, backend
-import numpy as np
-import onnx
-import torch
-from transformers import LlamaModel, LlamaForCausalLM
-from tqdm import tqdm
-import onnx_graphsurgeon as gs
-from onnxsim import simplify
-import argparse
-
-# Command-line options. The parsed `args` object is read as a module-level
-# global by the functions below.
-parser = argparse.ArgumentParser(description='')
-parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
-# Passed as num_hidden_layers when loading the pretrained model.
-parser.add_argument('--layer', dest='n_layers', type=int, default=2)
-parser.add_argument('--iter', dest='n_iter', type=int, default=1)
-# Number of single-token decoding steps run by the validation loop.
-parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
-# NOTE(review): both path defaults below are machine-specific absolute paths;
-# override them on other hosts.
-parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
-                    default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
-parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
-                    default="/data1/shared/llama")
-args = parser.parse_args()
-
-# The ONNX graph is stored under --onnx_model_path; the external weight file
-# name is given relative to the current directory.
-ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
-ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
-
-def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
-    """Export `model` to ONNX in single-token (KV-cache) form and simplify it.
-
-    A 1024-token all-zero prompt is run once to obtain `past_key_values`;
-    the model is then exported with a one-token input plus that cache, the
-    result is simplified with onnxsim, and saved to `ONNX_MODEL_PATH` with
-    weights stored as external data at `ONNX_WEIGHT_PATH`.
-
-    Args:
-        model: the torch model to export.
-        ONNX_MODEL_PATH: destination path of the ONNX file.
-
-    Raises:
-        AssertionError: if onnxsim reports that the simplified model failed
-            its check.
-    """
-    # The prefill pass only exists to produce example cache tensors, so run
-    # it without autograd to avoid building a graph over the whole prompt.
-    with torch.no_grad():
-        param = torch.zeros(
-            (args.batchsize, 1024), dtype=torch.long)
-        logits = model(param, past_key_values=None)
-    param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
-
-    torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
-                                              "position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
-                      do_constant_folding=True,)
-    onnx_model = onnx.load(ONNX_MODEL_PATH)
-    print("simplifing onnx model")
-    onnx_model, check = simplify(onnx_model, skipped_optimizers=[
-                                 'eliminate_duplicate_initializer'])
-    assert check
-
-    onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
-    print("simlifing finished.")
-
-
-@gs.Graph.register()
-def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
-    """Splice an ``AttentionKVCache`` layer between `inputs` and `outputs`.
-
-    The consumer lists of `inputs` and the producer lists of `outputs` and
-    `outputs_removed` are cleared, `inputs_added` is appended to `inputs`
-    (the caller's list is modified in place), and the new layer is created
-    from the resulting tensors.
-    """
-    for tensor in inputs:
-        tensor.outputs.clear()
-    for tensor in outputs:
-        tensor.inputs.clear()
-    # Extra inputs are appended after the clearing above, so their existing
-    # consumers are left untouched.
-    inputs.extend(inputs_added)
-    for tensor in outputs_removed:
-        tensor.inputs.clear()
-    return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
-
-
-def replace_onnx_with_attention_op():
-    """Rewrite the exported graph so each layer uses ``AttentionKVCache``.
-
-    Loads the model at ``ONNX_MODEL_PATH``, replaces the attention subgraph
-    of every layer (looked up by tensor name), keeps only the first graph
-    output, cleans up and topologically sorts the graph, and saves it back
-    to the same path.
-    """
-    graph = gs.import_onnx(onnx.load(ONNX_MODEL_PATH))
-    tmap = graph.tensors()
-    for layer_idx in range(args.n_layers):
-        attn_prefix = f"/model/layers.{layer_idx}/self_attn"
-        concat_base = (layer_idx + 1) * 2
-        inputs = [
-            tmap[f"onnx::Concat_{concat_base}"],
-            tmap[f"onnx::Concat_{concat_base + 1}"],
-            tmap[f"{attn_prefix}/Add_output_0"],
-            tmap[f"{attn_prefix}/Add_1_output_0"],
-            tmap[f"{attn_prefix}/Transpose_2_output_0"],
-        ]
-        outputs = [tmap[f"{attn_prefix}/MatMul_1_output_0"]]
-
-        # The second graph input is fed to every attention op as an extra
-        # input; no outputs are removed.
-        graph.replace_with_attention(inputs, outputs, [graph.inputs[1]], [])
-
-    first_output_name = graph.outputs[0].name
-    graph.outputs = [tmap[first_output_name]]
-    graph.cleanup(True).toposort()
-    onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
-
-
-if __name__ == "__main__":
-    # Compare InfiniTensor against PyTorch token by token: both sides are fed
-    # the same random token at each step and the last-position logits are
-    # compared.
-    kvcache_torch = None
-    torch_model = LlamaForCausalLM.from_pretrained(
-        args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
-
-    if not os.path.exists(ONNX_MODEL_PATH):
-        print("exporting onnx graph")
-        export_onnx(torch_model, ONNX_MODEL_PATH)
-        replace_onnx_with_attention_op()
-    else:
-        print("will use exsiting onnx graph")
-
-    onnx_model = onnx.load(ONNX_MODEL_PATH)
-    stub = OnnxStub(onnx_model, backend.cuda_runtime())
-
-    count_wrong = 0
-    for i in tqdm(range(0, args.n_max_length)):
-        query = np.random.randint(
-            torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
-        position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
-
-        ####################################
-        # pytorch
-        ####################################
-        # Inference only: without no_grad the returned KV cache would keep
-        # the autograd graph of every previous step alive.
-        with torch.no_grad():
-            outputs_torch = torch_model(
-                torch.tensor(query), past_key_values=kvcache_torch)
-        logit_torch = outputs_torch['logits']
-        kvcache_torch = outputs_torch['past_key_values']
-        logits_ref = logit_torch[:, -1, :].detach().cpu().numpy().flatten()
-
-        ####################################
-        # infinitensor
-        ####################################
-        # copyin input: first stub input is the token, second the position
-        stub_inputs = list(stub.inputs.values())
-        stub_inputs[0].copyin_int64(query.reshape(-1).tolist())
-        stub_inputs[1].copyin_int64(position_id.reshape(-1).tolist())
-
-        stub.run()
-
-        ####################################
-        # validation
-        ####################################
-        # copyout output
-        logits_it = np.array(next(iter(stub.outputs.values())).copyout_float())
-
-        try:
-            np.testing.assert_allclose(
-                logits_ref, logits_it, rtol=1e-3, atol=1e-3)
-        except Exception:
-            # Logits differ beyond tolerance; the step still passes if both
-            # sides pick the same token. Token ids are compared exactly: a
-            # relative tolerance on integer ids would accept nearby but
-            # different tokens.
-            try:
-                same_token = np.argmax(logits_ref) == np.argmax(logits_it)
-            except Exception:
-                same_token = False
-            if not same_token:
-                count_wrong = count_wrong + 1
-
-    result = "{}/{} failed.".format(count_wrong, args.n_max_length)
-    print(result)
-    del stub
--- a/examples/python/onnx_inference.py
+++ b/examples/python/onnx_inference.py
@ -1,29 +0,0 @@
-import sys
-import onnx
-import torch
-import numpy as np
-from pyinfinitensor.onnx import OnnxStub, backend
-
-if __name__ == '__main__':
-    # Run one inference of the given ONNX model on random input and print
-    # the output shape.
-    if len(sys.argv) != 2:
-        print("Usage: python onnx_inference.py model_name.onnx")
-        # Signal the usage error with a non-zero status; sys.exit is used
-        # because the bare exit() helper is only installed by the site module.
-        sys.exit(1)
-    model_path = sys.argv[1]
-
-    onnx_model = onnx.load(model_path)
-    # Assume that there is only one input tensor
-    first_input = onnx_model.graph.input[0]
-    input_shape = [d.dim_value
-                   for d in first_input.type.tensor_type.shape.dim]
-    input_data = np.random.random(input_shape).astype(np.float32)
-
-    model = OnnxStub(onnx_model, backend.cuda_runtime())
-    next(iter(model.inputs.values())).copyin_numpy(input_data)
-    model.run()
-    outputs = next(iter(model.outputs.values())).copyout_numpy()
-    outputs = torch.tensor(outputs)
-    print(outputs.shape)
--- a/examples/python/paddle_densenet.py
+++ b/examples/python/paddle_densenet.py
@ -1,80 +0,0 @@
-
-import paddle
-import paddle.vision.transforms as T
-from paddle.vision.datasets import Cifar10
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-import itertools
-
-def run_cifar_train_and_infer():
-    """Train DenseNet on Cifar10 with Paddle, export it to ONNX and evaluate it with InfiniTensor.
-
-    Steps: select the "gpu" device, train for 5 epochs and evaluate with
-    Paddle, export to 'onnx.save/densenet.onnx' (opset 11), load the ONNX
-    model into an OnnxStub on the CUDA runtime, run up to 10000 test images
-    one at a time, and print the mean accuracy.
-    """
-
-    paddle.device.set_device("gpu")
-
-    transform = T.Compose(
-        [
-            T.Resize(224),
-            T.ToTensor(),
-            T.Normalize(
-                mean=[0.5, 0.5, 0.5],
-                std=[0.5, 0.5, 0.5],
-                to_rgb=True,
-            ),
-        ]
-    )
-
-    # Download the dataset and initialize the DataSet
-    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
-    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
-
-    # Build the model and initialize the network
-    densenet = paddle.vision.models.DenseNet(num_classes=10)
-    model = paddle.Model(densenet)
-
-    # Training configuration: loss function, optimizer and evaluation metric
-    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
-                paddle.nn.CrossEntropyLoss(),
-                paddle.metric.Accuracy())
-
-    # Train the model
-    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
-    # Evaluate the model
-    model.evaluate(test_dataset, batch_size=64, verbose=1)
-
-    # export to ONNX
-    save_path = 'onnx.save/densenet' # path prefix to save to
-    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # Input shape and dtype for the model; a Tensor or an InputSpec is accepted, and InputSpec supports dynamic shapes.
-    paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
-
-    # Load the ONNX model into InfiniTensor
-    model_path = save_path + ".onnx"
-    onnx_model = onnx.load(model_path)
-    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
-    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
-    model = gofusion_model
-    model.init()
-
-    # Run inference
-    cifar10_test = Cifar10(
-        mode="test",
-        transform=transform,  # apply transform to every image
-        backend="cv2",  # use OpenCV as image transform backend
-    )
-    batch_size = 1
-    total_size = 0
-    total_acc = 0.0
-    for data in itertools.islice(iter(cifar10_test), 10000):
-        images, labels = data
-        # Feed one flattened 3x224x224 image into the first model input.
-        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
-        model.run()
-        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
-        outputs = paddle.to_tensor(outputs)
-        outputs = paddle.reshape(outputs, (1, 10))
-        labels = paddle.to_tensor(labels)
-        labels = paddle.reshape(labels, (1,1))
-        acc = paddle.metric.accuracy(outputs, labels)
-        total_acc += acc
-        total_size += batch_size
-    print("test acc: {}".format(total_acc.numpy() / total_size))
-
-
-if __name__ == "__main__":
-    run_cifar_train_and_infer()
--- a/examples/python/paddle_inception.py
+++ b/examples/python/paddle_inception.py
@ -1,80 +0,0 @@
-import paddle
-import paddle.vision.transforms as T
-from paddle.vision.datasets import Cifar10
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-import itertools
-
-def run_cifar_train_and_infer():
-    """Train InceptionV3 on Cifar10 with Paddle, export it to ONNX and evaluate it with InfiniTensor.
-
-    Steps: select the "gpu" device, train for 5 epochs and evaluate with
-    Paddle, export to 'onnx.save/inception.onnx' (opset 11), load the ONNX
-    model into an OnnxStub on the CUDA runtime, run up to 10000 test images
-    one at a time, and print the mean accuracy.
-    """
-
-    paddle.device.set_device("gpu")
-
-    transform = T.Compose(
-        [
-            T.Resize(224),
-            T.ToTensor(),
-            T.Normalize(
-                mean=[0.5, 0.5, 0.5],
-                std=[0.5, 0.5, 0.5],
-                to_rgb=True,
-            ),
-        ]
-    )
-
-    # Download the dataset and initialize the DataSet
-    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
-    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
-
-    # Build the model and initialize the network
-    inception = paddle.vision.models.InceptionV3(num_classes=10)
-    model = paddle.Model(inception)
-
-    # Training configuration: loss function, optimizer and evaluation metric
-    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
-                paddle.nn.CrossEntropyLoss(),
-                paddle.metric.Accuracy())
-
-    # Train the model
-    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
-    # Evaluate the model
-    model.evaluate(test_dataset, batch_size=64, verbose=1)
-
-    # export to ONNX
-    save_path = 'onnx.save/inception' # path prefix to save to
-    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # Input shape and dtype for the model; a Tensor or an InputSpec is accepted, and InputSpec supports dynamic shapes.
-    paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
-
-    # Load the ONNX model into InfiniTensor
-    model_path = save_path + ".onnx"
-    onnx_model = onnx.load(model_path)
-    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
-    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
-    model = gofusion_model
-    model.init()
-
-    # Run inference
-    cifar10_test = Cifar10(
-        mode="test",
-        transform=transform,  # apply transform to every image
-        backend="cv2",  # use OpenCV as image transform backend
-    )
-    batch_size = 1
-    total_size = 0
-    total_acc = 0.0
-    for data in itertools.islice(iter(cifar10_test), 10000):
-        images, labels = data
-        # Feed one flattened 3x224x224 image into the first model input.
-        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
-        model.run()
-        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
-        outputs = paddle.to_tensor(outputs)
-        outputs = paddle.reshape(outputs, (1, 10))
-        labels = paddle.to_tensor(labels)
-        labels = paddle.reshape(labels, (1,1))
-        acc = paddle.metric.accuracy(outputs, labels)
-        total_acc += acc
-        total_size += batch_size
-    print("test acc: {}".format(total_acc.numpy() / total_size))
-
-
-
-if __name__ == "__main__":
-    run_cifar_train_and_infer() 
--- a/examples/python/paddle_model_dev.md
+++ b/examples/python/paddle_model_dev.md
@ -1,31 +0,0 @@
-## Description
-
-This document explains how to run the `paddle*.py` scripts on your machine. If your model runs on hardware other than Nvidia GPUs, you may need to make some changes.
-
-## What do we do in paddle*.py files?
-
-1. Train and evaluate the model on the Cifar10 dataset
-
-2. Export paddle model to onnx model
-
-3. Load onnx model, infer with InfiniTensor and calculate the inference accuracy
-
-## Command
-
-1. Go to `/examples/python` folder 
-
-2. Run the following command
-   
-   1. ```
-      python paddle_resnet.py
-      python paddle_densenet.py
-      python paddle_inception.py
-      ```
-
-## What should I do if I use another device (MLU, XPU, NPU)?
-
-You need to change this code:
-
-```
-paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
-```
--- a/examples/python/paddle_resnet.py
+++ b/examples/python/paddle_resnet.py
@ -1,81 +0,0 @@
-
-import paddle
-import paddle.vision.transforms as T
-from paddle.vision.datasets import Cifar10
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-import itertools
-from paddle.vision.models.resnet import BasicBlock
-
-def run_cifar_train_and_infer():
-    """Train ResNet-18 on Cifar10 with Paddle, export it to ONNX and evaluate it with InfiniTensor.
-
-    Steps: select the "gpu" device, train for 5 epochs and evaluate with
-    Paddle, export to 'onnx.save/resnet.onnx' (opset 11), load the ONNX
-    model into an OnnxStub on the CUDA runtime, run up to 10000 test images
-    one at a time, and print the mean accuracy.
-    """
-
-    paddle.device.set_device("gpu")
-
-    transform = T.Compose(
-        [
-            T.Resize(224),
-            T.ToTensor(),
-            T.Normalize(
-                mean=[0.5, 0.5, 0.5],
-                std=[0.5, 0.5, 0.5],
-                to_rgb=True,
-            ),
-        ]
-    )
-
-    # Download the dataset and initialize the DataSet
-    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
-    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
-
-    # Build the model and initialize the network
-    resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
-    model = paddle.Model(resnet)
-
-    # Training configuration: loss function, optimizer and evaluation metric
-    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
-                paddle.nn.CrossEntropyLoss(),
-                paddle.metric.Accuracy())
-
-    # Train the model
-    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
-    # Evaluate the model
-    model.evaluate(test_dataset, batch_size=64, verbose=1)
-
-    # export to ONNX
-    save_path = 'onnx.save/resnet' # path prefix to save to
-    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # Input shape and dtype for the model; a Tensor or an InputSpec is accepted, and InputSpec supports dynamic shapes.
-    paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
-
-    # Load the ONNX model into InfiniTensor
-    model_path = save_path + ".onnx"
-    onnx_model = onnx.load(model_path)
-    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
-    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
-    model = gofusion_model
-    model.init()
-
-    # Run inference
-    cifar10_test = Cifar10(
-        mode="test",
-        transform=transform,  # apply transform to every image
-        backend="cv2",  # use OpenCV as image transform backend
-    )
-    batch_size = 1
-    total_size = 0
-    total_acc = 0.0
-    for data in itertools.islice(iter(cifar10_test), 10000):
-        images, labels = data
-        # Feed one flattened 3x224x224 image into the first model input.
-        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
-        model.run()
-        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
-        outputs = paddle.to_tensor(outputs)
-        outputs = paddle.reshape(outputs, (1, 10))
-        labels = paddle.to_tensor(labels)
-        labels = paddle.reshape(labels, (1,1))
-        acc = paddle.metric.accuracy(outputs, labels)
-        total_acc += acc
-        total_size += batch_size
-    print("test acc: {}".format(total_acc.numpy() / total_size))
-
-
-if __name__ == "__main__":
-    run_cifar_train_and_infer()
--- a/examples/python/resnet_inference.py
+++ b/examples/python/resnet_inference.py
@ -1,24 +0,0 @@
-import sys
-import onnx
-import torch
-import numpy as np
-from pyinfinitensor.onnx import OnnxStub, backend
-import torchvision.models as models
-
-if __name__ == '__main__':
-    # Export an untrained torchvision model to ONNX, run it once through
-    # InfiniTensor on random input and print the predicted class index.
-    # NOTE(review): the file is named resnet18.onnx but the exported model
-    # is resnet50 -- confirm which one is intended.
-    model_path = './resnet18.onnx'
-    tv_model = models.resnet50(weights=None)
-    input_shape = (1, 3, 224, 224)
-    # Random example input used for the ONNX export.
-    param = torch.rand(input_shape)
-    torch.onnx.export(tv_model, param, model_path, verbose=False)
-
-    onnx_model = onnx.load(model_path)
-    model = OnnxStub(onnx_model, backend.cuda_runtime())
-    images = np.random.random(input_shape).astype(np.float32)
-    # Copy the random image into the first (only expected) model input.
-    next(iter(model.inputs.values())).copyin_numpy(images)
-    model.run()
-    outputs = next(iter(model.outputs.values())).copyout_numpy()
-    outputs = torch.tensor(outputs)
-    outputs = torch.reshape(outputs, (1, 1000))
-    # Index of the highest-scoring class.
-    _, predicted = torch.max(outputs, 1)
-    print(predicted)
--- a/include/bang/bang_common.h
+++ b/include/bang/bang_common.h
@ -2,10 +2,6 @@
 #include "cnnl.h"
 #include "cnrt.h"
 #include "core/common.h"
-#include "core/data_type.h"
-#ifdef INFINI_USE_CNCL
-#include "cncl.h"
-#endif

 #define checkBangError(call)                                                   \
    {                                                                          \
@ -31,70 +27,4 @@ namespace infini {

 using BangPtr = void *;

-// Map an infini DataType to the matching cnnlDataType_t.
-// Types without a mapping below halt through IT_TODO_HALT_MSG with a
-// message naming the unsupported type.
-inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
-    if (dataType == DataType::Float32) {
-        return CNNL_DTYPE_FLOAT;
-    }
-    if (dataType == DataType::Float16) {
-        return CNNL_DTYPE_HALF;
-    }
-    if (dataType == DataType::Double) {
-        return CNNL_DTYPE_DOUBLE;
-    }
-    if (dataType == DataType::Int8) {
-        return CNNL_DTYPE_INT8;
-    }
-    if (dataType == DataType::Int32) {
-        return CNNL_DTYPE_INT32;
-    }
-    if (dataType == DataType::UInt8) {
-        return CNNL_DTYPE_UINT8;
-    }
-    if (dataType == DataType::BFloat16) {
-        return CNNL_DTYPE_BFLOAT16;
-    }
-    if (dataType == DataType::Int64) {
-        return CNNL_DTYPE_INT64;
-    }
-    if (dataType == DataType::Bool) {
-        return CNNL_DTYPE_BOOL;
-    }
-    // No CNNL equivalent is listed for the remaining types.
-    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
-                     " not supported in CNNL.");
-}
-
-#ifdef INFINI_USE_CNCL
-// Map an infini DataType to the matching cnclDataType_t.
-// Types without a mapping below halt through IT_TODO_HALT_MSG with a
-// message naming the unsupported type.
-inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
-    if (dataType == DataType::Float32) {
-        return cnclFloat32;
-    }
-    if (dataType == DataType::Float16) {
-        return cnclHalf;
-    }
-    if (dataType == DataType::Int8) {
-        return cnclInt8;
-    }
-    if (dataType == DataType::Int16) {
-        return cnclInt16;
-    }
-    if (dataType == DataType::Int32) {
-        return cnclInt32;
-    }
-    if (dataType == DataType::UInt8) {
-        return cnclUint8;
-    }
-    if (dataType == DataType::UInt16) {
-        return cnclUint16;
-    }
-    if (dataType == DataType::UInt32) {
-        return cnclUint32;
-    }
-    if (dataType == DataType::BFloat16) {
-        return cnclBfloat16;
-    }
-    // No CNCL equivalent is listed for the remaining types.
-    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
-                     " not supported in CNCL.");
-}
-#endif
-
 } // namespace infini
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -7,19 +7,16 @@ namespace infini {
 class BangRuntimeObj : public RuntimeObj {
  private:
    cnnlHandle_t cnnl;
-    cnrtQueue_t queue;
-    std::unique_ptr<CommunicatorObj> comm;
    BangPtr workspace;
    size_t workspaceSize;
-    mutable size_t cursor;

  public:
-    explicit BangRuntimeObj(int deviceId = 0)
-        : RuntimeObj(Device::BANG, deviceId) {
+    BangRuntimeObj() : RuntimeObj(Device::BANG) {
        cnInit(0);
        CNdev dev;
-        cnDeviceGet(&dev, deviceId);
+        cnDeviceGet(&dev, 0);
        checkBangError(cnrtSetDevice(dev));
+        cnrtQueue_t queue;
        checkBangError(cnrtQueueCreate(&queue));

        checkCnnlError(cnnlCreate(&cnnl));
@ -27,12 +24,10 @@ class BangRuntimeObj : public RuntimeObj {
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
-        cursor = 0;
        workspace = alloc(workspaceSize);
    }
    virtual ~BangRuntimeObj() {
        dealloc(workspace);
-        checkBangError(cnrtQueueDestroy(queue));
        checkCnnlError(cnnlDestroy(cnnl));
    }
    string toString() const override;
@ -41,7 +36,7 @@ class BangRuntimeObj : public RuntimeObj {
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
-    void sync() const;
+    void sync() const override;
    BangPtr alloc(size_t size) override {
        void *ptr;
        checkBangError(cnrtMalloc(&ptr, size));
@ -50,15 +45,10 @@ class BangRuntimeObj : public RuntimeObj {
    void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
    cnnlHandle_t cnnlHandle() const { return cnnl; }
    BangPtr getWorkspace(size_t size) const {
-        IT_ASSERT((cursor + size) <= workspaceSize);
-        cursor += size;
-        void *temp = workspace;
-        temp += (cursor - size);
-        return temp;
+        IT_ASSERT(size <= workspaceSize);
+        return workspace;
    }

-    void resetWorkspace() const { cursor = 0; }
-
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
@ -76,9 +66,6 @@ class BangRuntimeObj : public RuntimeObj {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
    }
-    void initComm(const string &name, int worldSize, int rank) final;
-    CommunicatorObj &getCommunicator() const override { return *comm; }
-    cnrtQueue_t getBangQueue() const { return queue; }

  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
--- a/include/bang/cncl_communicator.h
+++ b/include/bang/cncl_communicator.h
@ -1,79 +0,0 @@
-#pragma once
-#include "bang_common.h"
-#include "core/communicator.h"
-#include <chrono>
-#include <cncl.h>
-#include <cnrt.h>
-#include <cstdlib>
-#include <filesystem>
-#include <fstream>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-namespace infini {
-
-class CnclCommunicatorObj final : public CommunicatorObj {
-  private:
-    // Heap array holding the single CNCL communicator owned by this object.
-    cnclComm_t *comms;
-
-  public:
-    /**
-     * Create the CNCL communicator for one rank.
-     *
-     * Rank 0 generates the clique id and publishes it through the file
-     * "./<name>_cncl_id.bin"; every other rank polls for that file for up
-     * to 10 seconds and reads the id from it. Rank 0 removes the file once
-     * its own communicator is initialised.
-     *
-     * @param name      prefix of the clique-id exchange file.
-     * @param worldSize total number of ranks.
-     * @param rank      rank of this process; the device used is
-     *                  rank modulo the number of visible devices.
-     */
-    CnclCommunicatorObj(const string &name, int worldSize, int rank)
-        : CommunicatorObj(worldSize, rank) {
-        const std::string filePath("./" + name + "_cncl_id.bin");
-        cnclCliqueId clique_id;
-        if (rank == 0) {
-            CNCL_CHECK(cnclGetCliqueId(&clique_id));
-            std::ofstream ofs(filePath, std::ios::binary);
-            ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
-
-        } else {
-            auto begin = std::chrono::steady_clock::now();
-            while (!std::filesystem::exists(filePath)) {
-                auto now = std::chrono::steady_clock::now();
-                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
-                             "time limit (10s) exceeded.");
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-            }
-            std::ifstream ifs(filePath, std::ios::binary);
-            ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
-            // A short or failed read would otherwise leave clique_id
-            // partially uninitialised and be passed on silently.
-            _IT_ASSERT_2(!ifs.fail(), "failed to read CNCL clique id.");
-        }
-
-        uint32_t num_dev = 0;
-        checkBangError(cnrtGetDeviceCount(&num_dev));
-        // Guard the modulo below against a zero device count.
-        _IT_ASSERT_2(num_dev > 0, "no BANG device available.");
-
-        const int num_comms = 1;
-        // Vectors instead of new[]/delete[] so nothing leaks if
-        // initialisation below fails.
-        std::vector<int> rank_list(num_comms, rank);
-        std::vector<int> dev_list(num_comms,
-                                  static_cast<int>(rank % num_dev));
-        comms = new cnclComm_t[num_comms];
-
-        CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list.data(),
-                                 rank_list.data(), worldSize, &clique_id));
-
-        if (rank == 0) {
-            std::filesystem::remove(filePath);
-        }
-    }
-
-    // Destroy the communicator and release the array that held it.
-    ~CnclCommunicatorObj() {
-        CNCL_CHECK(cnclDestroyComms(comms, 1));
-        delete[] comms;
-    }
-
-    // Get the actual cnclComm_t
-    cnclComm_t getCnclComm() { return comms[0]; }
-
-    virtual string toString() const final {
-        std::ostringstream oss;
-        oss << "CNCL communicator";
-        return oss.str();
-    }
-};
-
-} // namespace infini
--- a/include/core/common.h
+++ b/include/core/common.h
@ -40,12 +40,12 @@ using HashType = uint64_t; // compatible with std::hash

 // Assert: conditions should have no side effect
 #define _IT_ASSERT_2(condition, info)                                          \
-    static_cast<bool>(condition)                                               \
-        ? void(0)                                                              \
-        : throw ::infini::Exception(                                           \
-              std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +   \
-              "] Assertion failed (" + #condition + "): " + info)
-#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
+    (static_cast<bool>(condition)                                              \
+         ? void(0)                                                             \
+         : throw ::infini::Exception(                                          \
+               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
+               "] Assertion failed (" + #condition + "): " + info))
+#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "");
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)

 #define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
@ -61,35 +61,22 @@ template <typename T> auto enum_to_underlying(T e) {
 }

 template <typename T> std::string vecToString(const std::vector<T> &vec) {
-    std::stringstream ss;
-    ss << "[";
-    for (size_t i = 0; i < vec.size(); ++i) {
-        ss << vec.at(i);
-        if (i < vec.size() - 1) {
-            ss << ",";
-        }
+    std::string ret;
+    ret.append("[");
+    for (auto d : vec) {
+        ret.append(std::to_string(d));
+        ret.append(",");
    }
-    ss << "]";
-    return ss.str();
-}
-
-template <typename T> std::string vecToString(const T *st, size_t length) {
-    std::stringstream ss;
-    ss << "[";
-    size_t i = 0;
-    for (i = 0; i < length; i++) {
-        ss << *(st + i);
-        if (i < length - 1) {
-            ss << ",";
-        }
-    }
-    ss << "]";
-    return ss.str();
+    if (!vec.empty())
+        ret.pop_back();
+    ret.append("]");
+    return ret;
 }

 double timeit(
    const std::function<void()> &func,
+    // HACK: set timeit rounds to 10 for fast debug
    const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
-    int timingRounds = 10);
+    int timingRounds = 100);

 } // namespace infini
--- a/include/core/communicator.h
+++ b/include/core/communicator.h
@ -1,22 +0,0 @@
-#pragma once
-#include "object.h"
-#include "ref.h"
-
-namespace infini {
-
-// Common base for communicator implementations: stores the world size and
-// the rank of the current process and exposes them through virtual getters.
-class CommunicatorObj : public Object {
-  public:
-    CommunicatorObj(int worldSize, int rank)
-        : worldSize{worldSize}, rank{rank} {}
-    virtual ~CommunicatorObj() = default;
-
-    // Total number of ranks.
-    virtual int getWorldSize() const { return worldSize; }
-    // Rank of this process.
-    virtual int getRank() const { return rank; }
-
-  protected:
-    int worldSize;
-    int rank;
-};
-
-} // namespace infini
--- a/include/core/data_type.h
+++ b/include/core/data_type.h
@ -1,54 +1,22 @@
-#pragma once
 #include "core/common.h"

 namespace infini {

 class DataType {
  public:
-    // <https://onnx.ai/onnx/intro/concepts.html#element-type>
-    static const DataType Undefine;
+    // legacy
    static const DataType Float32;
-    static const DataType UInt8;
-    static const DataType Int8;
-    static const DataType UInt16;
-    static const DataType Int16;
-    static const DataType Int32;
-    static const DataType Int64;
-    static const DataType String;
-    static const DataType Bool;
-    static const DataType Float16;
-    static const DataType Double;
    static const DataType UInt32;
-    static const DataType UInt64;
-    static const DataType BFloat16;
-    // "sizePerElement" show the DType to cpu_type
-    // DataType::Bool -> int8_t   DataType::Float16 -> uint16_t
-    static constexpr size_t sizePerElement[]{0,
-                                             sizeof(float),
-                                             sizeof(uint8_t),
-                                             sizeof(int8_t),
-                                             sizeof(uint16_t),
-                                             sizeof(int16_t),
-                                             sizeof(int32_t),
-                                             sizeof(int64_t),
-                                             sizeof(std::string),
-                                             sizeof(int8_t),
-                                             sizeof(uint16_t),
-                                             sizeof(double),
-                                             sizeof(uint32_t),
-                                             sizeof(uint64_t),
-                                             0,
-                                             0,
-                                             sizeof(uint16_t)};
+    // These are just aligned with the type and index of onnx:
+    // <https://onnx.ai/onnx/intro/concepts.html#element-type>
+    static const DataType UInt8, Int8, UInt16, Int16, Int32, Int64;
+    static constexpr size_t sizePerElement[]{
+        sizeof(float),    sizeof(uint32_t), sizeof(uint8_t), sizeof(int8_t),
+        sizeof(uint16_t), sizeof(int16_t),  sizeof(int32_t), sizeof(int64_t)};

-    static constexpr std::string_view names[]{
-        "Undefine",    "Float32", "UInt8",  "Int8",   "UInt16",
-        "Int16",       "Int32",   "Int64",  "String", "Bool",
-        "Float16",     "Double",  "UInt32", "UInt64", "PlaceHolder",
-        "PlaceHolder", "BFloat16"};
-
-    static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5,  6,  7, -1,
-                                   3,  4, 9, 1, 8, -1, -1, 4};
+    static constexpr std::string_view names[]{"Float32", "UInt32", "UInt8",
+                                              "Int8",    "UInt16", "Int16",
+                                              "Int32",   "Int64"};

  private:
    int index;
@ -61,43 +29,37 @@ class DataType {
    bool operator==(const DataType &rhs) const { return index == rhs.index; }
    bool operator<(const DataType &rhs) const { return index < rhs.index; }

-    template <typename T> static int get() {
+    template <typename T> static DataType get() {
        IT_TODO_HALT_MSG("Unsupported data type");
    }
    size_t getSize() const { return sizePerElement[index]; }
    string toString() const { return string(names[index]); }
-    int cpuTypeInt() const { return cpuType[index]; }
-    int getIndex() const { return index; }
 };

+inline const DataType DataType::Float32(0);
+inline const DataType DataType::UInt32(1);
+inline const DataType DataType::UInt8(2), DataType::Int8(3),
+    DataType::UInt16(4), DataType::Int16(5), DataType::Int32(6),
+    DataType::Int64(7);
 // Method definitions are out of the declaration due to GCC bug:
 // https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
-template <> inline int DataType::get<float>() { return 0; }
-template <> inline int DataType::get<uint32_t>() { return 1; }
-template <> inline int DataType::get<uint8_t>() { return 2; }
-template <> inline int DataType::get<int8_t>() { return 3; }
-template <> inline int DataType::get<uint16_t>() { return 4; }
-template <> inline int DataType::get<int16_t>() { return 5; }
-template <> inline int DataType::get<int32_t>() { return 6; }
-template <> inline int DataType::get<int64_t>() { return 7; }
-template <> inline int DataType::get<uint64_t>() { return 8; }
-template <> inline int DataType::get<double>() { return 9; }
+template <> inline DataType DataType::get<float>() { return Float32; }
+template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
+template <> inline DataType DataType::get<uint8_t>() { return UInt8; }
+template <> inline DataType DataType::get<int8_t>() { return Int8; }
+template <> inline DataType DataType::get<uint16_t>() { return UInt16; }
+template <> inline DataType DataType::get<int16_t>() { return Int16; }
+template <> inline DataType DataType::get<int32_t>() { return Int32; }
+template <> inline DataType DataType::get<int64_t>() { return Int64; }

 template <int index> struct DT {};
-template <> struct DT<0> { using t = bool; };
-template <> struct DT<1> { using t = float; };
+template <> struct DT<0> { using t = float; };
+template <> struct DT<1> { using t = uint32_t; };
 template <> struct DT<2> { using t = uint8_t; };
 template <> struct DT<3> { using t = int8_t; };
 template <> struct DT<4> { using t = uint16_t; };
 template <> struct DT<5> { using t = int16_t; };
 template <> struct DT<6> { using t = int32_t; };
 template <> struct DT<7> { using t = int64_t; };
-template <> struct DT<8> { using t = char; };
-template <> struct DT<9> { using t = int8_t; };
-template <> struct DT<10> { using t = uint16_t; };
-template <> struct DT<11> { using t = double; };
-template <> struct DT<12> { using t = uint32_t; };
-template <> struct DT<13> { using t = uint64_t; };
-template <> struct DT<16> { using t = uint16_t; };

 } // namespace infini
--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -1,5 +1,4 @@
 #pragma once
-#include "core/lazy_allocator.h"
 #include "core/operator.h"
 #include "core/tensor.h"

@ -10,16 +9,15 @@ class GraphObj : public Object {
    Runtime runtime;
    TensorVec tensors;
    OpVec ops;
-    LazyAllocator allocator;

  public:
-    explicit GraphObj(Runtime runtime)
-        : runtime(runtime), allocator(runtime), sorted(false){};
+    explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
    GraphObj(Runtime runtime, OpVec ops_in);
    string toString() const override;
    Runtime getRuntime() const { return runtime; }

-    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
+                     TensorType tensorType = TensorType::Other);
    Tensor addTensor(const Tensor &tensor);
    TensorVec addTensor(const TensorVec &tensors);
    /**
@ -50,10 +48,25 @@ class GraphObj : public Object {
        return opClone;
    }

+    /// @brief Clone `op` onto `inputs`, creating one fresh output tensor in
+    /// this graph for every shape reported by `op->inferShape(inputs)`.
+    /// @param op The operator to clone.
+    /// @param inputs The tensors used as inputs of the cloned operator.
+    /// @return The cloned operator, as returned by `cloneOperator`.
+    /// @note Outputs are created with `addTensor`'s default arguments.
+    Operator cloneOpAndCreateOutputs(Operator op, TensorVec inputs) {
+        // inferShape returns an optional; dereferencing an empty one is
+        // undefined behaviour, so fail loudly instead.
+        auto shapes = op->inferShape(inputs);
+        IT_ASSERT(shapes.has_value(),
+                  string("Shape inference failed when cloning operator"));
+        vector<Tensor> outputs;
+        outputs.reserve(shapes->size());
+        for (const auto &shape : *shapes)
+            outputs.emplace_back(addTensor(shape));
+        return cloneOperator(op, inputs, outputs);
+    }
+
+    Operator cloneOpAndCreateInputsOutputs(Operator op) {
+        vector<Tensor> inputs;
+        for (auto t : op->getInputs()) {
+            inputs.emplace_back(cloneTensor(t));
+        }
+        return cloneOpAndCreateOutputs(op, inputs);
+    }
+
    const TensorVec &getTensors() const { return tensors; }
    const OpVec &getOperators() const { return ops; }
    OpVec getComputeOps() const;
-    Tensor getTensor(int) const;

    /**
     * Sort the nodes in topological order.
@ -65,13 +78,8 @@ class GraphObj : public Object {

    void optimize();

-    void shape_infer();
-
-    void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
-
-    Tensor cloneKV(Tensor &tensor);
-
-    void freeHeap();
+    void dataMalloc();
+    void dataFree();

    /**
     * @brief Add an operator and create its outputs. Output tensor arguments
@ -117,6 +125,11 @@ class GraphObj : public Object {

    bool checkValid() const;

+    /// @brief If a tensor has no source and target, it is independent and
+    /// removed from the graph.
+    /// @return The number of removed tensors.
+    int removeIndependentTensors();
+
  private:
    /**
     * @brief Add reverse connections and Op relationship in ctor.
@ -127,11 +140,6 @@ class GraphObj : public Object {
     * @brief If the nodes is sorted in topological order.
     */
    bool sorted;
-
-    /**
-     * @brief If the weight tensors are allocated.
-     */
-    bool weightAllocated = false;
 };

 } // namespace infini
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -5,65 +5,85 @@
 #include <cstdint>
 #include <iostream>

-#ifdef USE_CUDA
-#include "cuda/cuda_runtime.h"
-#endif
-
 namespace infini {

+// Reuse the element-type indices defined by ONNX to avoid conversion overhead.
+// The numbering originates from ONNX but is not used only for ONNX.
+//
+// see https://onnx.ai/onnx/intro/concepts.html#element-type
+enum OnnxDType : int {
+    UNDEFINED = 0,
+    FLOAT,
+    UINT8,
+    INT8,
+    UINT16,
+    INT16,
+    INT32,
+    INT64,
+    STRING,
+    BOOL,
+    FLOAT16,
+    DOUBLE,
+    UINT32,
+    UINT64,
+    COMPLEX64,
+    COMPLEX128,
+    BFLOAT16,
+};
+
 class GraphHandlerObj {
    Graph g;

  public:
-    GraphHandlerObj(Runtime runtime)
+    explicit GraphHandlerObj(Runtime runtime)
        : g(make_ref<GraphObj>(std::move(runtime))) {}

-    Tensor tensor(Shape dims, int dtype);
+    explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}
+
+    //------ tensors
+
+    vector<Tensor> inputs() { return g->getInputs(); }
+
+    vector<Tensor> outputs() { return g->getOutputs(); }
+
+    Tensor tensor(Shape dims, int dtype, TensorType ttype);

    //------ operators

-    inline OpVec operators() { return g->getOperators(); }
+    OpVec operators() { return g->getOperators(); }

    Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                int sh, int sw, int dh, int dw);
    Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
                            int pw, int sh, int sw, int dh, int dw, int oph,
                            int opw);
+    Tensor convNHWC(Tensor input, Tensor weight, Tensor output, int ph, int pw,
+                    int sh, int sw, int dh, int dw);
+    Tensor convTransposed2dNHWC(Tensor input, Tensor weight, Tensor output,
+                                int ph, int pw, int sh, int sw, int dh, int dw,
+                                int oph, int opw);
    Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
-                  Tensor bias, ActType act,
-                  std::string matmul_compute_type = "default");
-    Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
-                              Tensor var, Tensor scale, Tensor bias,
-                              float momentum, float eps, bool training);
-    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
-                              Tensor bias, float eps, int axis, int stash_type);
-    Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
+                  Tensor bias, ActType act);
+    Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
+                     Tensor scale, Tensor bias, float momentum, float eps,
+                     bool training);

    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw, int ceilMode);
+                   int ph, int pw, int sh, int sw);
    Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw, int ceilMode);
+                   int ph, int pw, int sh, int sw);

    Tensor add(Tensor a, Tensor b, Tensor c);
    Tensor sub(Tensor a, Tensor b, Tensor c);
    Tensor mul(Tensor a, Tensor b, Tensor c);
    Tensor div(Tensor a, Tensor b, Tensor c);
    Tensor pow(Tensor a, Tensor b, Tensor c);
-    Tensor min(Tensor a, Tensor b, Tensor c);
-    Tensor max(Tensor a, Tensor b, Tensor c);

    Tensor relu(Tensor x, Tensor y);
-    Tensor silu(Tensor x, Tensor y);
-    Tensor gelu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
-    Tensor hardSigmoid(Tensor x, Tensor y);
-    Tensor hardSwish(Tensor x, Tensor y);
    Tensor tanh(Tensor x, Tensor y);
-    Tensor erf(Tensor x, Tensor y);
    Tensor softmax(Tensor x, Tensor y, int axis);
    Tensor abs(Tensor x, Tensor y);
-    Tensor sqrt(Tensor x, Tensor y);
-    Tensor neg(Tensor x, Tensor y);
    Tensor shape(Tensor x, Tensor y);
    Tensor identity(Tensor x, Tensor y);
    Tensor flatten(Tensor s, Tensor y, int axis);
@ -72,83 +92,34 @@ class GraphHandlerObj {
                std::optional<float> max);
    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
-    Tensor resize(Tensor input, Tensor output,
-                  const std::optional<vector<int>> &axes, Tensor sizes,
-                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
-                  vector<float> scales_, vector<float> roi_, string mode,
-                  string ratioPolicy, string nearestMode,
-                  string coordTransMode);
-    Tensor squeeze(Tensor input, Tensor output, Shape axes);
-    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
    Tensor concat(TensorVec inputs, Tensor output, int dim);
-    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
-                            Tensor input_q, Tensor input_k, Tensor input_v,
-                            Tensor position_id, Tensor output_matmul);
-    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
-                    std::variant<int, vector<int>> numOrRatio);
+                    int num_outputs);
    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
-    Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor reduceMean(Tensor data, Tensor reduced,
                      const optional<vector<int>> &axes, bool keepdims);
-    Tensor reduceSum(Tensor data, Tensor reduced,
-                     const optional<vector<int>> &axes, bool keepdims);
    Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
                 const vector<int> &ends, const optional<vector<int>> &axes,
                 const optional<vector<int>> &steps);
    Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
               const optional<vector<int>> &axes);
-    Tensor cast(Tensor input, Tensor output, int to);
-    Tensor expand(Tensor input, Tensor output, Shape dims);
-    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
-    std::vector<int> getDims(Tensor x) { return x->getDims(); }
-
-    Tensor allReduceSum(Tensor input, Tensor output);
-    Tensor allReduceProd(Tensor input, Tensor output);
-    Tensor allReduceMin(Tensor input, Tensor output);
-    Tensor allReduceMax(Tensor input, Tensor output);
-    Tensor allReduceAvg(Tensor input, Tensor output);
-    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
-    Tensor broadcast(Tensor input, Tensor output, int root);
-    Tensor send(Tensor input, int source, int destination, Tensor output);
-    Tensor recv(Tensor output, int source, int destination, Shape dims,
-                int outputType, Tensor input);
-    Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
-                        std::string mode);
-    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
-               int size);
+    /// @brief Import memBound operator from a json
+    TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
+                       const string &jsonString);

    //------ modifiers

-    inline bool topo_sort() { return g->topo_sort(); }
+    bool topo_sort() { return g->topo_sort(); }

-    inline void optimize() { g->optimize(); }
+    void optimize() { g->optimize(); }

-    inline void shape_infer() { g->shape_infer(); }
-
-    void change_shape(const vector<int> &shape, int tensorId);
    //------ runtime

-    inline void data_malloc(bool useNaiveAllocator = false,
-                            size_t memPoolSize = 0) {
-        g->dataMalloc(useNaiveAllocator, memPoolSize);
-    }
+    void data_malloc() { g->dataMalloc(); }

-    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
+    void run() { g->getRuntime()->run(g); }

-    inline void free_heap() { g->freeHeap(); }
-
-    inline void tune() { g->getRuntime()->run(g, true); }
-
-    inline void run() { g->getRuntime()->run(g); }
-
-    inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
-
-#ifdef USE_CUDA
-    inline void run_with_cudagraph() {
-        (as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
-    }
-#endif
+    Graph getGraph() const;
 };

 } // namespace infini
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@ -2,11 +2,10 @@
 #include "core/common.h"
 #include "core/operator.h"
 #include "core/tensor.h"
-#include "utils/operator_utils.h"
 #include <functional>
 #include <nlohmann/json.hpp>
-namespace infini {
 using json = nlohmann::json;
+namespace infini {

 class RuntimeObj; // Forward declaration for Kernel::compute

@ -30,6 +29,7 @@ class Kernel {
  public:
    Kernel() {}
    virtual ~Kernel() {}
+
    /**
     * @param op The operator to be executed.
     * @param record The parameters for kernel execution. If extra parameters
@ -102,9 +102,11 @@ class KernelRegistry {
    }
    Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
        auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
-                                           get_kernel_attrs_str(kernelAttrs) +
-                                           "}");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
+                      ", " + std::get<2>(kernelAttrs).toString() + "}");
        return std::get<0>(it->second);
    }
    const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -129,16 +131,15 @@ class CpuKernelWithoutConfig : public Kernel {

 } // namespace infini

-#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                  \
+#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt)        \
    namespace infini {                                                         \
    static const bool _CAT(_register_kernel_, cnt) =                           \
-        KernelRegistry::getInstance().registerKernel(KernelAttrs{device,       \
-                                                                 opType},      \
-                                                     new kernel(), name);      \
+        KernelRegistry::getInstance().registerKernel(                          \
+            KernelAttrs{device, opType, dataType}, new kernel(), name);        \
    }

-#define REGISTER_KERNEL(device, opType, kernel, name)                          \
-    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, dataType, kernel, name)                \
+    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)

 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt)                        \
    namespace infini {                                                         \
--- a/include/core/lazy_allocator.h
+++ b/include/core/lazy_allocator.h
@ -1,122 +0,0 @@
-#pragma once
-#include "core/runtime.h"
-#include "core/tensor.h"
-#ifdef BUILD_TEST
-#include "gtest/gtest.h"
-#endif
-#include <cstddef>
-#include <map>
-#include <unordered_set>
-
-namespace infini {
-
-class LazyAllocator {
-  private:
-#ifdef BUILD_TEST
-    FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
-
-    FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
-#endif
-
-    Runtime runtime;
-
-    size_t used = 0;
-
-    size_t peak = 0;
-
-    size_t weightPeak = 0;
-
-    size_t heapPeak = 0;
-
-    size_t alignment;
-
-    bool hasMemPool = false;
-
-    size_t memPoolSize = 0;
-
-    // pointer to the memory actually allocated
-    void *ptr = nullptr;
-
-    // pointer to the weight memory space
-    void *weightPtr = nullptr;
-
-    // memory pool ptr
-    void *memPoolPtr = nullptr;
-
-    // // a cache designed for a batch size that has already occurred
-    // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
-    // batchsizeToTensorOffset;
-
-    struct freeBlockInfo {
-        size_t addr;
-        size_t blockSize;
-    };
-
-    struct cmpFreeBlockInfo {
-        bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
-            return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
-                                                : (a.addr < b.addr);
-        }
-    };
-
-    // free balanced tree, maintains all free memory blocks
-    std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
-
-    // key: head address offset of the free memory block
-    // value: blockSize of the block
-    std::unordered_map<size_t, size_t> headAddrToBlockSize;
-
-    // key: tail address offset of the free memory block
-    // value: blockSize of the block
-    std::unordered_map<size_t, size_t> tailAddrToBlockSize;
-
-  public:
-    LazyAllocator(Runtime runtime);
-
-    virtual ~LazyAllocator();
-
-    void init();
-
-    void setMemPool(size_t memPoolSize);
-
-    bool getMemPoolStatus();
-
-    // function: simulate memory allocation
-    // arguments：
-    //     size: size of memory block to be allocated
-    // return: head address offset of the allocated memory block
-    size_t alloc(size_t size);
-
-    size_t allocWeight(size_t size);
-
-    size_t heapAlloc(size_t size);
-
-    void freeHeap();
-
-    // function: simulate memory free
-    // arguments:
-    //     addr: head address offset of memory block to be free
-    //     size: size of memory block to be freed
-    void free(size_t addr, size_t size);
-
-    // function: perform actual memory allocation
-    // return: pointer to the head address of the allocated memory
-    void *getPtr();
-
-    // void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
-
-    // std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
-
-    void *getWeightPtr();
-
-    void *getHeapPtr();
-
-    void info();
-
-  private:
-    // function: memory alignment, rouned up
-    // return: size of the aligned memory block
-    size_t getAlignedSize(size_t size);
-};
-
-} // namespace infini
--- a/include/core/mutator.h
+++ b/include/core/mutator.h
@ -16,6 +16,7 @@ class Mutator {
            Runtime runtime = NativeCpuRuntimeObj::getInstance())
        : candidatesLimit(candidatesLimit), runtime(runtime){};
    virtual ~Mutator(){};
+    bool hasTunedKernel = false;

    virtual vector<Graph> run(const Graph &in_graph) = 0;
    /**
@ -30,6 +31,14 @@ class Mutator {
    virtual bool isMultiBranchMergable(const Graph &in_graph) {
        IT_TODO_HALT();
    }
+
+    /// @brief Fuse memory bound operators.
+    /// @return The graph after fusion. Return `nullptr` if fails.
+    virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
+
+    /// @brief Eliminate transpose and reshape.
+    /// @return The graph after elimination. Return `nullptr` if fails.
+    virtual Graph eliminateVertically(const Graph &in_graph) { IT_TODO_HALT(); }
 };

 } // namespace infini
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@ -1,269 +0,0 @@
-#pragma once
-#ifndef OP_TYPE_H
-#define OP_TYPE_H
-
-#include <string>
-#include <unordered_set>
-
-namespace infini {
-
-struct OpType {
-    using underlying_t = uint16_t;
-
-    // Clang-format is ambiguous in formating of comment alignment.
-    // In order to disambiguate, it is necessary to comment all enum
-    // elements.
-    enum : underlying_t {
-        Unknown,
-        Abs,                // Unary
-        Acos,               // Unary
-        Acosh,              // Unary
-        Add,                // Binary
-        And,                // Binary
-        ArgMax,             //
-        Asin,               // Unary
-        Asinh,              // Unary
-        Atan,               // Unary
-        Atanh,              // Unary
-        AttentionKVCache,   // Fusion
-        AveragePool,        // Pool
-        BatchNormalization, //
-        Bernoulli,          //
-        BitShift,           // Binary
-        BitwiseAnd,         // Binary
-        BitwiseNot,         // Binary
-        BitwiseOr,          // Binary
-        BitwiseXor,         // Binary
-        BlackmanWindow,     //
-        Cast,               // Unary
-        CastLike,           //
-        Ceil,               // Unary
-        Celu,               //
-        CenterCropPad,      //
-        Clip,               // Unary
-        Col2lm,
-        Compress,
-        Concat,
-        ConcatFromSequence,
-        ConstantOfShape,
-        Conv,          // ComputationIntensive
-        ConvInteger,   // ComputationIntensive
-        ConvTranspose, // ComputationIntensive
-        Cos,           // Unary
-        Cosh,          // Unary
-        CumSum,
-        DFT,
-        DeformConv, // ComputationIntensive
-        DepthToSpace,
-        DequantizeLinear,
-        Det,
-        Div, // Binary
-        Dropout,
-        DynamicQuantizeLinear,
-        Einsum,
-        Elu,
-        Equal, // Compair
-        Erf,   // Unary
-        Exp,   // Unary
-        Expand,
-        EyeLike,
-        Flatten,
-        Floor, // Unary
-        GRU,
-        Gather,
-        GatherElements,
-        GatherND,
-        Gemm,
-        Gelu,              // Unary
-        GlobalAveragePool, // GlobalPool
-        GlobalLpPool,      // GlobalPool
-        GlobalMaxPool,     // GlobalPool
-        Greater,           // Compair
-        GreaterOrEqual,    // Compair
-        GridSample,
-        GroupNormalization,
-        HammingWindow,
-        HannWindow,
-        HardSigmoid,
-        HardSwish,
-        Hardmax,
-        Identity,
-        If,
-        InstanceNormalization,
-        IsInf,
-        IsNaN,
-        LRN,
-        LSTM,
-        LayerNormalization,
-        LeakyRelu,
-        Less,        // Compair
-        LessOrEqual, // Compair
-        Log,         // Unary
-        LogSoftmax,
-        Loop,
-        LpNormalization,
-        LpPool,
-        MatMul,        // ComputationIntensive
-        MatMulInteger, // ComputationIntensive
-        Max,
-        MaxPool,
-        MaxRoiPool,
-        MaxUnpool,
-        Mean,
-        MeanVarianceNormalization,
-        MelWeightMatrix,
-        Min,
-        Mish,
-        Mod,         // Binary
-        Mul,         // Binary
-        Multinomial, //
-        Neg,         // Unary
-        NegativeLogLikelihoodLoss,
-        NonMaxSuppression,
-        NonZero,
-        Not, // Unary
-        OneHot,
-        Optional,
-        OptionalGetElement,
-        OptionalHasElement,
-        Or,            // Binary
-        PRelu,         //
-        Pad,           //
-        Pow,           // Binary
-        QLinearConv,   // ComputationIntensive
-        QLinearMatMul, // ComputationIntensive
-        QuantizeLinear,
-        RNN,
-        RandomNormal,
-        RandomNormalLike,
-        RandomUniform,
-        RandomUniformLike,
-        Range,
-        Reciprocal,
-        ReduceL1,        // Reduce
-        ReduceL2,        // Reduce
-        ReduceLogSum,    // Reduce
-        ReduceLogSumExp, // Reduce
-        ReduceMax,       // Reduce
-        ReduceMean,      // Reduce
-        ReduceMin,       // Reduce
-        ReduceProd,      // Reduce
-        ReduceSum,       // Reduce
-        ReduceSumSquare, // Reduce
-        Relu,            // Unary
-        Silu,            // Unary
-        Reshape,
-        Resize,
-        ReverseSequence,
-        RoiAlign,
-        RoPE,    // Fusion
-        Round,   // Unary
-        RMSNorm, // Fusion
-        STFT,
-        Scan,
-        Scatter,
-        ScatterElements,
-        ScatterND,
-        Selu,
-        SequenceAt,
-        SequenceConstruct,
-        SequenceEmpty,
-        SequenceErase,
-        SequenceInsert,
-        SequenceLength,
-        SequenceMap,
-        Shape,
-        Shrink,
-        Sigmoid,
-        Sign,
-        Sin,  // Unary
-        Sinh, // Unary
-        Size,
-        Slice,
-        Softmax,
-        SoftmaxCrossEntropyLoss,
-        Softplus,
-        Softsign,
-        SpaceToDepth,
-        Split,
-        SplitToSequence,
-        Sqrt,
-        Squeeze,
-        StringNormalizer,
-        Sub,  // Binary
-        Sum,  //
-        Tan,  // Unary
-        Tanh, // unary
-        TfIdfVectorizer,
-        ThresholdedRelu,
-        Tile,
-        TopK,
-        Transpose,
-        Trilu,
-        Unique,
-        Unsqueeze,
-        Upsample,
-        Where,
-        Xor, // Binary
-        // CUSTOM DEFINED
-        G2BMM,
-        GBMM,
-        MemBound,
-        // TODO
-        ConvTransNHWC,
-        ConvBackwardFilter,
-        ReluBackward,
-        SigmoidBackward,
-        TanhBackward,
-
-        Fill,
-        Extend,
-        MSELoss,
-        Hardtanh,
-        L2Loss,
-        Rsqrt,
-        FloorDiv,
-        FloorMod,
-        Square,
-        SquaredDifference,
-
-        // Communication Ops
-        AllReduceSum,
-        AllReduceProd,
-        AllReduceMin,
-        AllReduceMax,
-        AllReduceAvg,
-        AllGather,
-        Broadcast,
-        Send,
-        Recv,
-    } type;
-
-    constexpr OpType(decltype(type) t) : type(t) {}
-    constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {}
-    constexpr underlying_t underlying() const { return type; }
-
-    bool operator==(OpType others) const { return type == others.type; }
-    bool operator!=(OpType others) const { return type != others.type; }
-    bool operator<(OpType others) const { return type < others.type; }
-
-    const char *toString() const;
-    bool isUnary() const;
-    bool isBinary() const;
-    bool isElementWise() const;
-    bool isCompair() const;
-    bool isPool() const;
-    bool isGlobalPool() const;
-    bool isMatMulOrConv() const;
-};
-
-enum class ActType {
-    None,
-    Relu,
-    Sigmoid,
-    Tanh,
-};
-
-} // namespace infini
-
-#endif // OP_TYPE_H
--- a/include/core/operator.h
+++ b/include/core/operator.h
@ -1,14 +1,244 @@
 #pragma once
-
-#include "core/op_type.h"
 #include "core/tensor.h"
-
 namespace infini {
-using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
+
+enum class OpType {
+    Unknown = 0,
+    // linear
+    Conv = 100,
+    ConvBackwardFilter,
+    ConvBackwardData,
+    Matmul,
+    ConvTrans,
+    ConvTransNHWC,
+    ConvNHWC,
+    G2BMM,
+    GBMM,
+    Pad,
+    Slice,
+    Concat,
+    Split,
+    Transpose,
+    Extend,
+    MaxPool,
+    AvgPool,
+    Add,
+    Sub,
+    Mul,
+    Div,
+    Pow,
+    Gather,
+    ReduceMean,
+    Reshape,
+    Flatten,
+    Identity,
+    // element wise
+    BatchNorm = 200,
+    Softmax,
+    Activation,
+    Relu,
+    ReluBackward,
+    PRelu,
+    Sigmoid,
+    SigmoidBackward,
+    Tanh,
+    TanhBackward,
+    Abs,
+    Sin,
+    Cos,
+    Tan,
+    ASin,
+    ACos,
+    ATan,
+    SinH,
+    CosH,
+    TanH,
+    ASinH,
+    ACosH,
+    ATanH,
+    Resize,
+    Arange,
+    Shape,
+    Copy,
+    Ceil,
+    Floor,
+    Clip,
+    Erf,
+    Exp,
+    Fill,
+    Log,
+    L2Loss,
+    Maximum,
+    Minimum,
+    MSELoss,
+    Neg,
+    Power,
+    Reciprocal,
+    Sqrt,
+    Rsqrt,
+    Cast,
+    FloorDiv,
+    FloorMod,
+    Det,
+    Round,
+    Square,
+    SquaredDifference,
+    Hardtanh,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    GreaterEqual,
+    LessThan,
+    LessEqual,
+    And,
+    Or,
+    Xor,
+    Not,
+    BitAnd,
+    BitOr,
+    BitXor,
+    BitNot,
+    BitLeftShift,
+    BitRightShift,
+    Dropout,
+    //
+    MemBound = 300,
+    //
+    Conv2dReduce = 400,
+    Conv2dReduceTranspose,
+    Any
+};
+
+using KernelAttrs = std::tuple<Device, OpType, DataType>;
+
+/// Maps OpType enumerators to their textual names.
+class OpRegistry {
+  public:
+    /// @brief Get the name of an operator type.
+    /// @param opType The operator type to name.
+    /// @return The enumerator's identifier as a string, e.g. "Conv" for
+    /// `OpType::Conv`.
+    /// @note Every OpType enumerator needs a matching FOP entry below; a value
+    /// without one triggers the "Unknown OpType" assertion.
+    static std::string getOpName(OpType opType) {
+#define FOP(op)                                                                \
+    case OpType::op:                                                           \
+        return #op
+
+        switch (opType) {
+            FOP(Unknown);
+            // linear
+            FOP(Conv);
+            FOP(ConvBackwardFilter);
+            FOP(ConvBackwardData);
+            FOP(Matmul);
+            FOP(ConvTrans);
+            FOP(ConvTransNHWC);
+            FOP(ConvNHWC);
+            FOP(G2BMM);
+            FOP(GBMM);
+            FOP(Pad);
+            FOP(Slice);
+            FOP(Concat);
+            FOP(Split);
+            FOP(Transpose);
+            FOP(Extend);
+            FOP(MaxPool);
+            FOP(AvgPool);
+            FOP(Add);
+            FOP(Sub);
+            FOP(Mul);
+            FOP(Div);
+            FOP(Pow);
+            FOP(Gather);
+            FOP(ReduceMean);
+            FOP(Reshape);
+            FOP(Identity);
+            FOP(Shape);
+            FOP(Flatten);
+            // element wise
+            FOP(BatchNorm);
+            FOP(Softmax);
+            FOP(Activation);
+            FOP(Relu);
+            FOP(ReluBackward);
+            FOP(PRelu);
+            FOP(Sigmoid);
+            FOP(SigmoidBackward);
+            FOP(Tanh);
+            FOP(TanhBackward);
+            FOP(Abs);
+            FOP(Sin);
+            FOP(Cos);
+            FOP(Tan);
+            FOP(ASin);
+            FOP(ACos);
+            FOP(ATan);
+            FOP(SinH);
+            FOP(CosH);
+            FOP(TanH);
+            FOP(ASinH);
+            FOP(ACosH);
+            FOP(ATanH);
+            // Resize and Arange are declared in OpType and need names too.
+            FOP(Resize);
+            FOP(Arange);
+            FOP(Copy);
+            FOP(Ceil);
+            FOP(Floor);
+            FOP(Clip);
+            FOP(Erf);
+            FOP(Exp);
+            FOP(Fill);
+            FOP(Log);
+            FOP(L2Loss);
+            FOP(Maximum);
+            FOP(Minimum);
+            FOP(MSELoss);
+            FOP(Neg);
+            FOP(Power);
+            FOP(Reciprocal);
+            FOP(Sqrt);
+            FOP(Rsqrt);
+            FOP(Cast);
+            FOP(FloorDiv);
+            FOP(FloorMod);
+            FOP(Det);
+            FOP(Round);
+            FOP(Square);
+            FOP(SquaredDifference);
+            FOP(Hardtanh);
+            FOP(Equal);
+            FOP(NotEqual);
+            FOP(GreaterThan);
+            FOP(GreaterEqual);
+            FOP(LessThan);
+            FOP(LessEqual);
+            FOP(And);
+            FOP(Or);
+            FOP(Xor);
+            FOP(Not);
+            FOP(BitAnd);
+            FOP(BitOr);
+            FOP(BitXor);
+            FOP(BitNot);
+            FOP(BitLeftShift);
+            FOP(BitRightShift);
+            // Dropout is declared in OpType and needs a name too.
+            FOP(Dropout);
+            //
+            FOP(MemBound);
+            //
+            FOP(Conv2dReduce);
+            FOP(Conv2dReduceTranspose);
+            FOP(Any);
+        default:
+            // Reached only for a value with no FOP entry above.
+            IT_ASSERT(false, "Unknown OpType " +
+                                 std::to_string(enum_to_underlying(opType)));
+            break;
+        }
+#undef FOP
+    }
+};
+
+enum class ActType {
+    None,
+    Relu,
+    Sigmoid,
+    Tanh,
+};

 struct OpPerfKey {
    HashType hash;
-    OpType::underlying_t opType;
+    OpType opType;
    vector<int> attrs;

  public:
@ -16,7 +246,7 @@ struct OpPerfKey {
    // https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
    OpPerfKey() = default;
    OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
-        : hash(hash), opType(opType.underlying()), attrs(attrs) {}
+        : hash(hash), opType(opType), attrs(attrs) {}
    bool operator==(const OpPerfKey &rhs) const {
        if (hash != rhs.hash)
            return false;
@ -55,7 +285,8 @@ class OperatorObj : public Object {

  public:
    OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
-    virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
+    virtual optional<vector<Shape>>
+    inferShape(const TensorVec &inputs) const = 0;
    virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
    /**
     * @brief Constructs outputs (if required) and check whether the operator is
@ -72,7 +303,16 @@ class OperatorObj : public Object {
     */
    HashType hash() const;

-  public:
+  public: // check Op type
+    bool isLinearOp() const;
+    bool isElementWiseOp() const;
+    bool isSplitOp() const;
+    bool isConcatOp() const;
+    bool isComputeOp() const;
+    bool isTransposeOp() const;
+    bool isReshapeOp() const;
+    bool isMemBoundOp() const;
+
  public: // getter and setter
    const TensorVec &getInputs() const { return inputs; }
    const TensorVec &getOutputs() const { return outputs; }
@ -90,7 +330,6 @@ class OperatorObj : public Object {
    OpType getOpType() const { return type; }
    // HACK: set correct data type
    DataType getDType() const { return getInputs(0)->getDType(); }
-    DataType getOutDType() const { return getOutput()->getDType(); }
    virtual int numInputs() const = 0;
    virtual int numOutputs() const = 0;

@ -105,7 +344,7 @@ class OperatorObj : public Object {
                           const TensorVec &newOutputs) const = 0;

  protected:
-    optional<vector<Shape>> inferShape();
+    optional<vector<Shape>> inferShape() const;
    vector<DataType> inferDataType() const;

  private:
--- a/include/core/perf_engine.h
+++ b/include/core/perf_engine.h
@ -2,8 +2,8 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include <nlohmann/json_fwd.hpp>
-namespace infini {
 using json = nlohmann::json;
+namespace infini {

 class PerfEngine {
  public:
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -1,7 +1,6 @@
 #pragma once
 #include "core/common.h"
-#include "core/communicator.h"
-#include "core/op_type.h"
+#include "core/object.h"
 #include "core/ref.h"
 #include <memory>

@ -15,7 +14,6 @@ class GraphObj;
 class GraphHandlerObj;
 class RuntimeObj;
 class BlobObj;
-template <typename T> class WorkspaceObj;

 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
@ -24,7 +22,7 @@ using Graph = Ref<GraphObj>;
 using GraphHandler = Ref<GraphHandlerObj>;
 using Runtime = Ref<RuntimeObj>;
 using Blob = Ref<BlobObj>;
-template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
+enum class OpType;

 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@ -32,17 +30,15 @@ using OpLists = list<Operator>;

 using VType = uint32_t;

-enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
+enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
 /***************** Forward declaration end *****************/

 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
  protected:
    Device device;
-    int deviceId;

  public:
-    explicit RuntimeObj(Device device, int deviceId = 0)
-        : device(device), deviceId(deviceId) {}
+    RuntimeObj(Device device) : device(device) {}
    RuntimeObj(RuntimeObj &other) = delete;
    RuntimeObj &operator=(RuntimeObj const &) = delete;
    virtual ~RuntimeObj() {}
@ -64,17 +60,18 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
     * execution happens.
     *
     * @param graph
-     * @param profiling Whether to print breakdown of time
+     * @param printProfiling Whether to print breakdown of time
     * @return double Return the sum of perf time for each operator
     */
-    double getPerfTime(const Graph &graph, bool profiling = false) const;
+    double getPerfTime(const Graph &graph, bool printProfiling = false,
+                       bool allowEstimation = false,
+                       bool ignoreMemboundOp = false) const;
    Blob allocBlob(size_t size);
    bool isCpu() const {
        return device == Device::CPU || device == Device::INTELCPU;
    }
    bool isCuda() const { return device == Device::CUDA; }
    bool isBang() const { return device == Device::BANG; }
-    bool isKUNLUN() const { return device == Device::KUNLUN; }
    void copyBlob(const TensorObj *dst, const TensorObj *src) const;
    // TODO: unify these copy APIs
    virtual void copyBlobFromCPU(void *dst, const void *src,
@ -82,17 +79,19 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
    virtual void copyBlobToCPU(void *dst, const void *src,
                               size_t bytes) const = 0;
    virtual string toString() const = 0;
+    virtual void sync() const {}

-    int getDeviceId() const { return deviceId; }
+    map<UidBaseType, bool>
+    getCompileTimeComputableAttribute(const Graph &graph) const;

-    virtual void initComm(const string &name, int worldSize, int rank) = 0;
-
-    virtual CommunicatorObj &getCommunicator() const = 0;
+    double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
+                               int repeat = 1000) const;

  protected:
-    void printProfilingData(double totTime,
+    void printProfilingData(double totalTime,
                            const std::map<OpType, double> &opTime,
-                            const std::map<OpType, int> &opCnt) const;
+                            const std::map<OpType, int> &opCnt,
+                            const std::map<OpType, int> &opNonCtcCnt) const;
    virtual void copyBlobInsideRuntime(void *dst, const void *src,
                                       size_t bytes) const = 0;
 };
@ -109,9 +108,6 @@ class CpuRuntimeObj : public RuntimeObj {
    void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override;
-    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
-
-    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
 };

 class NativeCpuRuntimeObj : public CpuRuntimeObj {
--- a/include/core/search_engine.h
+++ b/include/core/search_engine.h
@ -4,44 +4,35 @@
 #include "graph.h"
 #include "mutator.h"

-#include <unordered_map>
-
 namespace infini {
 class SearchEngine {
  private:
    Runtime runtimeExec;
    Ref<Mutator> mutator;
+    std::function<bool(const Graph &, const Graph &)> graphTimeComparer;

  public:
-    SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
-        runtimeExec = _runtime;
-        mutator = _mutator;
-    }
+    SearchEngine(Runtime runtime, Ref<Mutator> mutator);
    ~SearchEngine() {}
+    int searchFilter = 0;
+    bool chooseBestMutation = true;

  private: // Configurations
    size_t partitionThreshold =
        3;                  // cut nodes whose #in + #out >= partitionThreshold
    size_t GRAPH_SIZE = 16; // num of best graphs.

-  private: // Composed objects
-    std::shared_ptr<Mutator> mutationEngine;
-
  public:
-    std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
    struct GroupEdge {
        int v, next;
        GroupEdge() = delete;
    };

-    struct Candidate { // a graph with perf
-        std::shared_ptr<Graph> graph;
-        double perf = INFINITY;
-    };
-    class MetaGraph { // a graph of subgraphs, for searching.
-      public:
-        MetaGraph() {}
-        ~MetaGraph() {}
+    // struct Candidate { // a graph with perf
+    //     Graph graph;
+    //     double perf = INFINITY;
+    // };
+    struct MetaGraphObj { // a graph of subgraphs, for searching.
        struct Node {
            Graph graph;
            std::vector<int> suc;
@ -50,31 +41,33 @@ class SearchEngine {
        };
        std::vector<Node> nodes;
    };
+    using MetaGraph = Ref<MetaGraphObj>;

-    Graph run(const Graph graph);                  // entrance of search engine.
+    Graph run(const Graph graph);                  // entrance to search engine.
    std::vector<Graph> search(const Graph &graph); // search for a partition.

  private:
    std::vector<Graph> partitionGraph(const Graph graph);
-    std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
-    std::shared_ptr<MetaGraph>
-    buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
-                           const std::vector<int> &plan);
+    MetaGraph buildMetaGraphWithGraph(const Graph graph);
+    MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
+                                     const std::vector<int> &plan);
    // search horizontal merges
-    std::vector<std::shared_ptr<MetaGraph>>
-    searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
-    void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
-                        std::vector<int> &plan, std::vector<int> &frontier,
+    std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
+    void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
+                        std::vector<int> &frontier,
                        std::vector<std::vector<int>> &plans,
                        std::unordered_set<uint64_t> &planSet);
-    std::vector<Graph>
-    searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
+    std::vector<Graph> searchMutation(const MetaGraph &metaGraph);

-    void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
+    void printMetaGraph(MetaGraph metaGraph);
    /**
     * @brief Check whether a multi-branch graph can be merged into a single
     * branch.
     */
    bool isMultiBranchMergable(const Graph graph);
+    Graph fuseVertically(const Graph &graph);
+
+    double getEstimatedGraphPerf(Graph graph);
 };
+
 } // namespace infini
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -1,84 +1,25 @@
 #pragma once
 #include "core/tensor_base.h"
-#include "core/tensor_type.h"
-#include "utils/data_convert.h"
 #include <cmath>
 #include <cstring>
-#include <fstream>

 #if USE_CUDA
 #include "cuda/cuda_runtime.h"
 #endif
-#if USE_BANG
-#include "bang/bang_runtime.h"
-#endif
+
 namespace infini {

 // TODO: how to deal with this
 using ShapeElem = int;
 using Shape = vector<ShapeElem>;
+enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
 class TensorObj : public TensorBaseObj {
  private:
    Shape shape;
    size_t _size; // Cache of Π(shape).
    Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                  // scratch have a new id.
-    TensorType tensorType = TensorType::others;
-
-  public:
-    TensorObj(Shape shape, DataType dtype, Runtime runtime);
-    virtual ~TensorObj() {}
-    string toString() const override;
-
-    size_t size() const { return _size; }
-    size_t getBytes() const { return _size * dtype.getSize(); }
-
-    Shape getDims() const { return shape; }
-    void setShape(Shape shape_);
-    size_t getRank() const { return shape.size(); }
-    Shape getStride() const;
-    size_t getOffset(const vector<int> &ds) const;
-    void dataMalloc();
-    UidBaseType getFuid() const { return fuid; }
-    bool isWeight() const { return tensorType == TensorType::weight; }
-    bool isInput() const { return tensorType == TensorType::input; }
-    bool isOutput() const { return tensorType == TensorType::output; }
-    bool isOthers() const { return tensorType == TensorType::others; }
-    void setWeight() { tensorType = TensorType::weight; }
-    void setInput() {
-        if (!this->isWeight()) {
-            tensorType = TensorType::input;
-        }
-    }
-    void setOutput() {
-        if (!this->isWeight()) {
-            tensorType = TensorType::output;
-        }
-    }
-    string tensorTypeToString() const {
-        switch (tensorType) {
-        case TensorType::weight:
-            return "weight";
-            break;
-        case TensorType::input:
-            return "input";
-            break;
-        case TensorType::output:
-            return "output";
-            break;
-        case TensorType::others:
-            return "others";
-            break;
-
-        default:
-            return "unknown tensor type";
-            break;
-        }
-    }
-
-    void load(std::string file_path);
-    void save(std::string file_path);
-
+    TensorType tensorType;
    void copyin(const void *ptr, size_t size) {
        runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
    }
@ -86,22 +27,41 @@ class TensorObj : public TensorBaseObj {
        runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
    }

+  public:
+    TensorObj(Shape shape, DataType dtype, Runtime runtime,
+              TensorType tensorType = TensorType::Other);
+    virtual ~TensorObj() {}
+    string toString() const override;
+
+    size_t size() const { return _size; }
+    size_t getBytes() const { return _size * dtype.getSize(); }
+
+    Shape getDims() const { return shape; }
+    vector<size_t> getStride() const;
+    size_t getOffset(const vector<int> &ds) const;
+    void dataMalloc();
+    UidBaseType getFuid() const { return fuid; }
+    TensorType getTensorType() const { return tensorType; }
+
+    void load(std::string file_path);
+    void save(std::string file_path);
+
    // Copy elements from `data`.
    template <typename T> void copyin(const vector<T> &data) {
-        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
-        IT_ASSERT(data.size() == _size);
+        IT_ASSERT(DataType::get<T>() == dtype);
+        IT_ASSERT(data.size() >= _size);
        copyin(data.data(), getBytes());
    }
    // Copy all the elements to a vector.
    template <typename T> auto copyout() const {
-        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
+        IT_ASSERT(DataType::get<T>() == dtype);
        std::vector<T> ans(_size);
        copyout(ans.data(), getBytes());
        return ans;
    }
    // Copy the element at `pos`.
    template <typename T> auto copyOne(const vector<int> &pos) const {
-        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
+        IT_ASSERT(DataType::get<T>() == dtype);
        auto offset = getOffset(pos);
        auto bytes = dtype.getSize();
        T ans;
@ -113,60 +73,33 @@ class TensorObj : public TensorBaseObj {
    void copyData(const TensorObj *src);
    void copyData(const Tensor &src) { copyData(src.get()); }

-    // TODO: Rename this function later, because it is confused that it will
-    // change the field data, but actually it generates data and maybe copy to
-    // device.
    // FIXME: std::function copies the generator instead of passing it by ref.
    // Thus the internal state of generator cannot be updated.
    void setData(
        std::function<void(void *, size_t, DataType)> const &generator) const;
-
-    void setDataBlob(const Blob &blob);
-
-    Tensor clone() const {
-        auto obj = make_ref<TensorObj>(*this);
-        obj->freeData();
-        obj->targets.clear();
-        obj->source.reset();
-        return obj;
-    }
-    Tensor clone(Runtime runtime) const {
-        auto obj = make_ref<TensorObj>(*this);
-        obj->runtime = runtime;
-        obj->freeData();
-        obj->targets.clear();
-        obj->source.reset();
-        if (hasData()) {
-            obj->dataMalloc();
-            obj->copyData(this);
-        }
-        return obj;
-    }
+    void setData(const Blob &_blob) { data = _blob; }
+    Tensor clone() const;
+    Tensor clone(Runtime runtime) const;

    void printData() const;
-    void dumpData(std::ofstream &ofs) const;
    bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;

    template <typename T> bool equalData(const vector<T> &dataVector) {
+        IT_ASSERT(DataType::get<T>() == dtype);
        IT_ASSERT(size() == dataVector.size());
-        if (dtype == DataType::Float16) {
-            return equalDataImpl_fp16(getRawDataPtr<uint16_t *>(),
-                                      (float *)dataVector.data(), size());
-        }
-        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
    }

    size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;

  private:
-    template <class T> string dataToString() const {
+    template <class T> string dataToString(void *rawPtr) const {
        std::stringstream builder;
        builder << "Tensor: " << guid << std::endl;

        auto numDims = shape.size();
        auto dimSzVec = vector<int>(numDims, 1);
-        auto ptr = data->getPtr<T *>();
+        T *ptr = (T *)rawPtr;
        dimSzVec[numDims - 1] = shape[numDims - 1];

        for (int i = numDims - 1; i != 0; --i)
@ -177,6 +110,12 @@ class TensorObj : public TensorBaseObj {
                if (i % dimSzVec[j] == 0)
                    builder << "[";

+            if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
+                builder << "... , "; // elide into the string, not stdout
+                i = iEnd - 20;
+                continue;
+            }
+
            builder << ptr[i];
            for (size_t j = 0; j < numDims; ++j)
                if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
@ -193,41 +132,19 @@ class TensorObj : public TensorBaseObj {
    }

    template <typename T>
-    bool equalDataImpl(const T *a, const T *b, size_t size,
-                       double relativeError = 1e-6) const {
+    bool equalDataImpl(const T *a, const T *b, size_t size) const {
        for (size_t i = 0; i < size; ++i) {
            if constexpr (std::is_integral_v<T>) {
                if (a[i] != b[i])
                    return false;
            } else if constexpr (std::is_floating_point_v<T>) {
-                if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
-                    fabs(a[i] - b[i]) > relativeError) {
-                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
-                    return false;
-                } else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
-                           fabs(a[i] - b[i]) /
-                                   std::max(fabs(a[i]), fabs(b[i])) >
-                               relativeError) {
+                if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
+                    1e-6) {
                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                    return false;
                }
-            } else {
+            } else
                static_assert(!sizeof(T), "Unsupported data type");
-            }
-        }
-        return true;
-    }
-
-    bool equalDataImpl_fp16(const uint16_t *a, const float *b,
-                            size_t size) const {
-        for (size_t i = 0; i < size; ++i) {
-            auto a_fp32 = fp16_to_float(a[i]);
-            auto b_fp32 = b[i];
-            if (fabs(a_fp32 - b_fp32) / std::max(fabs(a_fp32), fabs(b_fp32)) >
-                1e-6) {
-                printf("Error on %lu: %f %f\n", i, a_fp32, b_fp32);
-                return false;
-            }
        }
        return true;
    }
@ -248,8 +165,8 @@ class TensorObj : public TensorBaseObj {
    //         // std::cerr << "Init beginned " << std::endl;
    // #pragma omp parallel for
    //         for (size_t i = 0; i < iEnd; ++i)
-    //             data[i] = fastrand(random_seed[omp_get_thread_num() *
-    //             16]) % 10000;
+    //             data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
+    //             10000;
    //         // std::cerr << "Init finished" << std::endl;
    //         computed = ComputedFull;
    //         return true;
@ -294,8 +211,8 @@ class TensorObj : public TensorBaseObj {
    //         auto nDim = dims.size();
    //         auto nBroadcastDim = ds.size() - nDim;
    //         for (size_t i = 0; i < nDim; ++i)
-    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
-    //             i] >= dims[i])
+    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
+    //             dims[i])
    //                 return (size_t)-1;
    //         size_t idx = 0;
    //         for (size_t i = 0; i < nDim; ++i)
@ -354,14 +271,12 @@ class TensorObj : public TensorBaseObj {
    //         return (g_seed >> 16) & 0x7FFF;
    //     }

-    //     std::vector<std::vector<int>> const *getSplittingPoints()
-    //     const {
+    //     std::vector<std::vector<int>> const *getSplittingPoints() const {
    //         assert(!splittingPoints.empty());
    //         return &splittingPoints;
    //     }

-    //     bool setSplittingPoints(std::vector<std::vector<int>> value)
-    //     {
+    //     bool setSplittingPoints(std::vector<std::vector<int>> value) {
    //         assert(!value.empty());
    //         splittingPoints = value;
    //         return true;
@ -383,7 +298,7 @@ class TensorObj : public TensorBaseObj {
    //     }

    //     void initSplittingPoints() {
-    //     splittingPoints.resize(getRank()); }
+    //     splittingPoints.resize(getDims().size()); }

    //     void printShape();
 };
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@ -44,7 +44,6 @@ class TensorBaseObj : public Object {
    }

    DataType getDType() const { return dtype; }
-    int getDTypeIndex() const { return dtype.getIndex(); }
    Runtime getRuntime() const { return runtime; }

    //     std::pair<Operator *, int> getOutputOfWithIndex();
--- a/include/core/tensor_type.h
+++ b/include/core/tensor_type.h
@ -1,7 +0,0 @@
-#pragma once
-
-namespace infini {
-
-enum class TensorType { weight, input, output, others };
-
-} // namespace infini
--- a/include/core/workspace.h
+++ b/include/core/workspace.h
@ -1,42 +0,0 @@
-#pragma once
-#include "core/runtime.h"
-
-namespace infini {
-
-template <class T> class WorkspaceObj {
-  private:
-    T workspace;           // workspace pointer
-    size_t workspaceSize;  // Size of workspace
-    size_t workspaceAlloc; // currently use workspace size
-
-  public:
-    WorkspaceObj(T workspace_, size_t workspaceSize_)
-        : workspace(workspace_), workspaceSize(workspaceSize_) {
-        workspaceAlloc = 0;
-    }
-    virtual ~WorkspaceObj() {
-        // Dealloc workspace in RuntimeObj
-        // Set workspace = nullptr here
-        workspace = nullptr;
-    }
-    size_t getWorkspaceSize() const { return workspaceSize; }
-
-    T getWorkspace(size_t size) {
-        // Get unused workspace
-        IT_ASSERT(size + workspaceAlloc <= workspaceSize);
-        auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
-        workspaceAlloc += size;
-        return ret;
-    }
-    T getWorkspace() {
-        // Override getWorkspace in order to dealloc in runtime
-        return workspace;
-    }
-    void resetWorkspace() {
-        // Reset workspaceAlloc every time end kernel
-        workspaceAlloc = 0;
-    }
-    size_t getWorkspaceAlloc() const { return workspaceAlloc; }
-};
-
-} // namespace infini
--- a/include/cuda/cuda_any.h
+++ b/include/cuda/cuda_any.h
@ -0,0 +1,10 @@
+#pragma once
+
+#include "operators/any.h"
+
+namespace infini {
+// NOTE(review): presumably selects a kernel by kernel_name -- confirm.
+void any_kernel_mapping(vector<float *> input, vector<float *> output,
+                        const string &kernel_name, const vector<int> &attr);
+
+} // namespace infini
--- a/include/cuda/cuda_attention_kvcache.h
+++ b/include/cuda/cuda_attention_kvcache.h
@ -1,17 +0,0 @@
-#pragma once
-#include "core/common.h"
-#include <cstdio>
-
-struct AttentionKVCacheMetadata {
-    int dimSize[4];
-    int stride[4];
-};
-
-namespace infini {
-void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
-                              float *input_q, float *input_k, float *input_v,
-                              int *position_id, float *output_matmul,
-                              const AttentionKVCacheMetadata &compMeta,
-                              float *output_O_temp, float *output_sum_temp);
-
-} // namespace infini
--- a/include/cuda/cuda_common.h
+++ b/include/cuda/cuda_common.h
@ -5,13 +5,18 @@
 #include <cuda_profiler_api.h>
 #include <cudnn.h>
 #include <curand.h>
-#include <memory>

+// TODO: replace with Exception (IT_ASSERT)
 #define checkCudaError(call)                                                   \
-    if (auto err = call; err != cudaSuccess)                                   \
-    throw ::infini::Exception(std::string("[") + __FILE__ + ":" +              \
-                              std::to_string(__LINE__) + "] CUDA error (" +    \
-                              #call + "): " + cudaGetErrorString(err))
+    {                                                                          \
+        auto err = call;                                                       \
+        if (cudaSuccess != err) {                                              \
+            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err));                                  \
+            IT_ASSERT(false);                                                  \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }

 #define checkCUresult(call)                                                    \
    {                                                                          \
@ -35,10 +40,14 @@
    }

 #define checkCudnnError(call)                                                  \
-    if (auto err = call; err != CUDNN_STATUS_SUCCESS)                          \
-    throw ::infini::Exception(std::string("[") + __FILE__ + ":" +              \
-                              std::to_string(__LINE__) + "] cuDNN error (" +   \
-                              #call + "): " + cudnnGetErrorString(err))
+    {                                                                          \
+        auto err = call;                                                       \
+        if (CUDNN_STATUS_SUCCESS != err) {                                     \
+            fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__,          \
+                    __LINE__, cudnnGetErrorString(err));                       \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }

 #define checkCurandError(call)                                                 \
    {                                                                          \
@ -112,20 +121,4 @@ inline const char *curandGetErrorString(curandStatus_t error) {

 using CudaPtr = void *;

-class CUDAStream {
-  public:
-    CUDAStream(const CUDAStream &) = delete;
-    CUDAStream(CUDAStream &&) = delete;
-    void operator=(const CUDAStream &) = delete;
-    void operator=(CUDAStream &&) = delete;
-    static cudaStream_t getCurrentStream() { return _stream; }
-    static void Init() { CUDAStream::_stream = 0; };
-    static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
-    static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
-
-  private:
-    CUDAStream(){};
-    static cudaStream_t _stream;
-};
-
 } // namespace infini
--- a/include/cuda/cuda_conv2dreduce.h
+++ b/include/cuda/cuda_conv2dreduce.h
@ -0,0 +1,31 @@
+#pragma once
+
+namespace infini {
+
+// Declarations of the float-only conv2d "reduce" kernels (defined elsewhere).
+// NOTE(review): n/h/w/f/r/s, oh/ow, ph/pw, sh/sw, dh/dw presumably follow the
+// usual conv naming (batch, input, filters, kernel, output, pad, stride,
+// dilation) -- confirm. conv2dreduce_kernel selects its activation with a
+// bool (PReLU) while the sibling kernels take an int `act`.
+void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
+                         int n, int h, int w, int f, int r, int s, int oh,
+                         int ow, int ph, int pw, int sh, int sw, int dh,
+                         int dw);
+
+void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
+                                  int act, int n, int h, int w, int f, int r,
+                                  int s, int oh, int ow, int ph, int pw, int sh,
+                                  int sw, int dh, int dw);
+
+void reduceConvRxSToNCHW(float *input, float *bias, float *output, int act,
+                         int n, int h, int w, int f, int r, int s, int oh,
+                         int ow, int ph, int pw, int sh, int sw, int dh,
+                         int dw);
+
+void conv5x5ToConv3x3Reduce(int n, int f, int h, int w, float *input,
+                            float *output, float *bias);
+
+void conv3x3ToReduce(int n, int h, int w, int f, float *input, float *output,
+                     float *bias);
+
+} // namespace infini
--- a/include/cuda/cuda_element_wise.h
+++ b/include/cuda/cuda_element_wise.h
@ -1,20 +1,8 @@
 #pragma once

 namespace infini {
-void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                int c2, int c3);
-void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-                 int c2, int c3);
-
-void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
-
-void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
+void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
 }; // namespace infini
--- a/include/cuda/cuda_expand.h
+++ b/include/cuda/cuda_expand.h
@ -1,12 +0,0 @@
-#pragma once
-
-#include "operators/unary.h"
-#include "utils/small_array.h"
-namespace infini {
-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape);
-
-void expandRowKernel(int dType, void *input, void *output, int n_rows,
-                     int row_len);
-}; // namespace infini
--- a/include/cuda/cuda_layernorm.h
+++ b/include/cuda/cuda_layernorm.h
@ -1,17 +0,0 @@
-#pragma once
-#include "operators/unary.h"
-
-namespace infini {
-void LaynormKernel(const float *input, const float *scale, const float eps,
-                   int size, int scaleSize, const int dimsize, const int stride,
-                   float *output, const float *bias, int biasSize);
-void LaynormKernel(const float *input, const float *scale, const float eps,
-                   int size, int scaleSize, const int dimsize, const int stride,
-                   float *output);
-void LaynormKernel(const half *input, const half *scale, const half eps,
-                   int size, int scaleSize, const int dimsize, const int stride,
-                   half *output, const half *bias, int biasSize);
-void LaynormKernel(const half *input, const half *scale, const half eps,
-                   int size, int scaleSize, const int dimsize, const int stride,
-                   half *output);
-}; // namespace infini
--- a/include/cuda/cuda_pad_slice.h
+++ b/include/cuda/cuda_pad_slice.h
@ -10,11 +10,10 @@ typedef struct {
    int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
    int partNDim[MAX_DIM];   // dim size before padding or after slicing
    int partStride[MAX_DIM]; // stride before padding or after slicing
-    int DType;
 } TransMetaData;

 namespace infini {
-void pad_slice_kernel(void *partData, void *wholeData,
+void pad_slice_kernel(float *partData, float *wholeData,
                      const TransMetaData &metadata, int nDims, int num,
                      bool isPad);
 } // namespace infini
--- a/include/cuda/cuda_rmsnorm.h
+++ b/include/cuda/cuda_rmsnorm.h
@ -1,10 +0,0 @@
-#pragma once
-
-#include "operators/rms_norm.h"
-
-namespace infini {
-
-void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
-                    int num_tokens, int hidden_size);
-
-}; // namespace infini
--- a/include/cuda/cuda_rope.h
+++ b/include/cuda/cuda_rope.h
@ -1,12 +0,0 @@
-#pragma once
-
-#include "operators/rope.h"
-#include "utils/small_array.h"
-
-namespace infini {
-
-void rope_kernel(int dType, int *pos, void *input, void *output, int size,
-                 int dim_model, int dim_head, int hidden_stride,
-                 int pos_stride);
-
-}; // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -1,65 +1,57 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
-#ifdef INFINI_USE_NCCL
-#include "cuda/nccl_communicator.h"
-#endif

 namespace infini {

 class CudaRuntimeObj : public RuntimeObj {
  private:
+    cudaStream_t stream;
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
-    std::unique_ptr<CommunicatorObj> comm;
    CudaPtr workspace;
    size_t workspaceSize;
-    bool isCudaGraphCreated;
-    cudaGraph_t cudaGraph;
-    cudaGraphExec_t cudaGraphInstance;
+
+    // Memory information
+    size_t allocatedGPUMemorySize = 0;
+    map<void *, size_t> allocationMap;
+
+    bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
+
+    // CUDA device properties
+    cudaDeviceProp deviceProperties;
+
+    bool enableTF32 = false;

  public:
-    explicit CudaRuntimeObj(int deviceId = 0)
-        : RuntimeObj(Device::CUDA, deviceId) {
-
-        checkCudaError(cudaSetDevice(deviceId));
-        checkCudnnError(cudnnCreate(&cudnn));
-        checkCublasError(cublasCreate(&cublas));
-        // 10GB for Longformer
-        // size_t longformerNum = 3lu * (1 << 30);
-        workspaceSize = 7ll << 30; // 7 GB
-        workspace = alloc(workspaceSize);
-        isCudaGraphCreated = false;
-        CUDAStream::Init();
-    }
-    virtual ~CudaRuntimeObj() {
-        try {
-            if (isCudaGraphCreated) {
-                checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
-                checkCudaError(cudaGraphDestroy(cudaGraph));
-                CUDAStream::destroyStream();
-            }
-            dealloc(workspace);
-            checkCudnnError(cudnnDestroy(cudnn));
-            checkCublasError(cublasDestroy(cublas));
-        } catch (const std::exception &e) {
-            std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
-        }
-    }
+    CudaRuntimeObj();
+    virtual ~CudaRuntimeObj();
    string toString() const override;

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
-    void sync() const;
+    void sync() const override;
    CudaPtr alloc(size_t size) override {
        void *ptr;
+        // printf("Try to cudaMalloc: %lu bytes\n", size);
        checkCudaError(cudaMalloc(&ptr, size));
-        // printf("cuda malloc: %p %lu bytes\n", ptr, size);
+        allocatedGPUMemorySize += size;
+        allocationMap[ptr] = size;
+        // printf("cuda malloc: %p %lu bytes, total %lu bytes (%.2lf GB)\n",
+        // ptr,
+        //        size, allocatedGPUMemorySize,
+        //        double(allocatedGPUMemorySize) / 1024 / 1024 / 1024);
        return ptr;
    }
-    void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
+    /**
+     * @brief Free device memory and update the allocation bookkeeping.
+     *
+     * @param ptr Device pointer to release. A pointer that is not present in
+     * allocationMap (including nullptr, which cudaFree accepts as a no-op) is
+     * still passed to cudaFree but leaves the accounting untouched, instead
+     * of raising std::out_of_range after the memory was already released.
+     */
+    void dealloc(void *ptr) override {
+        checkCudaError(cudaFree(ptr));
+        // Use find() rather than at(): a missing key must not throw here.
+        auto it = allocationMap.find(ptr);
+        if (it != allocationMap.end()) {
+            allocatedGPUMemorySize -= it->second;
+            allocationMap.erase(it);
+        }
+    }
    cudnnHandle_t cudnnHandle() const { return cudnn; }
    cublasHandle_t cublasHandle() const { return cublas; }
    size_t getWorkspaceSize() const { return workspaceSize; }
@ -67,6 +59,10 @@ class CudaRuntimeObj : public RuntimeObj {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }
+    // Returns the {major, minor} pair read from deviceProperties.
+    // NOTE(review): "Capacitiy" looks like a misspelling (presumably of
+    // "Capability"); the name is kept because code outside this header may
+    // already call it.
+    pair<int, int> getComputeCapacitiy() const {
+        return {deviceProperties.major, deviceProperties.minor};
+    }
+    // Returns deviceProperties.multiProcessorCount.
+    int getNumSMs() const { return deviceProperties.multiProcessorCount; }

    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
@ -85,14 +81,19 @@ class CudaRuntimeObj : public RuntimeObj {

    void runWithoutSync(const Graph &graph) const;

-    void runWithCudaGraph(const Graph &graph);
+    bool isInCudaGraph() const { return cudaGraphStatus; }
+    cudaStream_t getStream() const { return stream; }

-    // init communicator
-    void initComm(const string &name, int worldSize, int rank) final;
-
-    CommunicatorObj &getCommunicator() const final { return *comm; }
+    double timeWithCudaGraph(Graph graph, int rounds = 50);
+    double timeWithCudaGraph(vector<std::function<void(void)>> funcs,
+                             int rounds = 50);
+    void setEnableTF32(bool state);
+    bool getEnableTF32() const { return enableTF32; }

  private:
    void tune(const Graph &graph, bool profiling) const;
+
+    void beginCudaGraphStreamCapture();
+    tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
 };
 } // namespace infini
--- a/include/cuda/cuda_softmax.h
+++ b/include/cuda/cuda_softmax.h
@ -1,8 +0,0 @@
-#pragma once
-#include "utils/small_array.h"
-namespace infini {
-void softmax_kernel(int num_blocks, float *input, float *output, int size,
-                    int dimsize, int stride);
-void softmax_kernel(int num_blocks, half *input, half *output, int size,
-                    int dimsize, int stride);
-} // namespace infini
--- a/include/cuda/cuda_split_concat.h
+++ b/include/cuda/cuda_split_concat.h
@ -3,13 +3,13 @@
 #include <cstdio>

 const int BATCH_SIZE = 32; // parallel tensor number.
-const int DIM_MAX_SIZE = 8;
+const int DIM_MAX_SIZE = 4;

 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-template <typename T> struct ElementTensorMetadata {
-    T *data[BATCH_SIZE];
+struct ElementTensorMetadata {
+    float *data[BATCH_SIZE];
    int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                             // the composed tensor.
    int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@ -20,17 +20,16 @@ template <typename T> struct ElementTensorMetadata {
                   data[i], dimBgNo[i], dimSize[i], nElements[i]);
    }
 };
-template <typename T> struct ComposedTensorMetadata {
+
+struct ComposedTensorMetadata {
    int dimSize[DIM_MAX_SIZE];
    int stride[DIM_MAX_SIZE];
-    T *data;
+    float *data;
 };

 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
-                         const ComposedTensorMetadata<float> &compMeta, int dim,
-                         int batchSize, int nDims, bool isSplit);
-void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
-                         const ComposedTensorMetadata<half> &compMeta, int dim,
+void split_concat_kernel(const ElementTensorMetadata &eleMeta,
+                         const ComposedTensorMetadata &compMeta, int dim,
                         int batchSize, int nDims, bool isSplit);
+
 } // namespace infini
--- a/include/cuda/cuda_transpose.h
+++ b/include/cuda/cuda_transpose.h
@ -5,7 +5,12 @@

 namespace infini {

-void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
-                      SmallArray strides, SmallArray outputShape);
+void transpose_kernel(float *input, float *output, int nDims, int size,
+                      SmallArray strides, SmallArray outputShape,
+                      vector<int> _dims_in, vector<int> _dims_out,
+                      vector<int> _perms);

-}; // namespace infini
+void invoke_transpose_last_two_dim(float *ptrA, float *ptrB, int dim0, int dim1,
+                                   int dim2, int numSMs);
+
+} // namespace infini
--- a/include/cuda/cuda_unary.h
+++ b/include/cuda/cuda_unary.h
@ -3,22 +3,31 @@
 #include "operators/unary.h"

 namespace infini {
-template <typename T> void softmax_kernel(T *input, T *output, size_t num);
-template <typename T> void relu_kernel(T *input, T *output, size_t num);
-template <typename T> void silu_kernel(T *input, T *output, size_t num);
-template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
-template <typename T> void tanh_kernel(T *input, T *output, size_t num);
-template <typename T> void abs_kernel(T *input, T *output, size_t num);
-template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
-template <typename T> void neg_kernel(T *input, T *output, size_t num);
-template <typename T> void gelu_kernel(T *input, T *output, size_t num);
-template <typename T> void erf_kernel(T *input, T *output, size_t num);
-template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
-template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
+// Declarations of the float kernel functions dispatched by unary_kernel()
+// below; their definitions are not in this header. `num` is presumably the
+// number of elements to process (unary_kernel() passes the product of the
+// input tensor's dimensions) -- confirm against the implementations.
+void softmax_kernel(float *input, float *output, int num);
+void relu_kernel(float *input, float *output, int num);
+void sigmoid_kernel(float *input, float *output, int num);
+void tanh_kernel(float *input, float *output, int num);
+void abs_kernel(float *input, float *output, int num);

-template <typename INPUT, typename OUTPUT>
-void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
+/**
+ * @brief Dispatch a unary operator to the matching float kernel.
+ *
+ * Defined `inline` because the definition lives in a header: without it,
+ * every translation unit including this file emits its own external
+ * definition and the link fails with multiple-definition errors.
+ *
+ * @param _op Operator that is cast to UnaryObj. Input 0 and the output are
+ * accessed as raw float pointers.
+ *
+ * Handles Softmax, Relu, Sigmoid, Tanh and Abs; any other op type reaches
+ * IT_TODO_HALT().
+ */
+inline void unary_kernel(const Operator &_op) {
+    auto op = as<UnaryObj>(_op);
+    float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
+    float *const outputData = (op->getOutput()->getRawDataPtr<float *>());

-void unary_kernel(const Operator &_op);
+    // Multiply every dimension instead of indexing dim[0..3], so inputs whose
+    // rank is not 4 are handled without reading past the end of the shape.
+    auto dim = op->getInputs(0)->getDims();
+    int num = 1;
+    for (auto d : dim)
+        num *= d;
+    if (op->getOpType() == OpType::Softmax)
+        softmax_kernel(inputData, outputData, num);
+    else if (op->getOpType() == OpType::Relu)
+        relu_kernel(inputData, outputData, num);
+    else if (op->getOpType() == OpType::Sigmoid)
+        sigmoid_kernel(inputData, outputData, num);
+    else if (op->getOpType() == OpType::Tanh)
+        tanh_kernel(inputData, outputData, num);
+    else if (op->getOpType() == OpType::Abs)
+        abs_kernel(inputData, outputData, num);
+    else
+        IT_TODO_HALT();
+}

 }; // namespace infini
--- a/include/cuda/cuda_utility.h
+++ b/include/cuda/cuda_utility.h
@ -1,29 +1,11 @@
-#pragma once
+#pragma once
 #include "core/tensor.h"
-#include "cuda/cuda_common.h"

 namespace infini {

 void cudaPrintFloat(float *x, int len);

-void cudaPrintTensor(const Tensor &tensor);
+// Prints a tensor by forwarding its raw float pointer and size() to
+// cudaPrintFloat(). Marked inline because it is defined in a header; the
+// include guard above is kept so that including this file twice in one
+// translation unit does not redefine it.
+inline void cudaPrintTensor(const Tensor &tensor) {
+    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
+}

-cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
-cudaDataType cublasDataTypeConvert(DataType);
-
-template <int index> struct DT_CUDA {};
-template <> struct DT_CUDA<0> { using t = bool; };
-template <> struct DT_CUDA<1> { using t = float; };
-template <> struct DT_CUDA<2> { using t = unsigned char; };
-template <> struct DT_CUDA<3> { using t = char; };
-template <> struct DT_CUDA<4> { using t = unsigned short; };
-template <> struct DT_CUDA<5> { using t = short; };
-template <> struct DT_CUDA<6> { using t = int; };
-template <> struct DT_CUDA<7> { using t = long long; };
-template <> struct DT_CUDA<9> { using t = bool; };
-template <> struct DT_CUDA<10> { using t = half; };
-template <> struct DT_CUDA<11> { using t = double; };
-template <> struct DT_CUDA<12> { using t = unsigned int; };
-template <> struct DT_CUDA<13> { using t = unsigned long long; };
-template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
-} // namespace infini
+} // namespace infini
--- a/include/cuda/cuda_where.h
+++ b/include/cuda/cuda_where.h
@ -1,17 +0,0 @@
-#pragma once
-#include "operators/unary.h"
-#include "utils/small_array.h"
-
-namespace infini {
-
-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize);
-void whereKernel(const half *inputX, const half *inputY,
-                 const uint8_t *condition, half *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize);
-}; // namespace infini
--- a/include/cuda/gather.h
+++ b/include/cuda/gather.h
@ -1,61 +1,17 @@
 #pragma once
-#include "core/data_type.h"
-#include "core/operator.h"
-#include "operators/gather.h"
+
+// Plain-data description of a gather operation, passed by value to
+// gather_kernel(). Field meanings follow the comments of the struct
+// definition this one replaces.
+typedef struct {
+    int *indexValue;  // pointer to indices
+    int axis;         // axis of the gather operation
+    int inNDim;       // rank of input
+    int outNDim;      // rank of output
+    int idxNDim;      // rank of indices
+    int outDim[4];    // shape of output
+    int idxDim[4];    // shape of indices
+    int idxStride[4]; // strides of indices
+    int inStride[4];  // strides of input
+} GatherMetaData;

 namespace infini {
-struct GatherMetaData {
-    // Pointer to indices
-    void *indexValue;
-    // Type of index values
-    DataType indexType;
-    // Type of input and output data
-    DataType dataType;
-    // Axis of the gather operation
-    int axis;
-    // Rank of input
-    int inNDim;
-    // Rank of output
-    int outNDim;
-    // Rank of indices
-    int idxNDim;
-    // Shape of output
-    int outDim[4];
-    // Shape of indices
-    int idxDim[4];
-    // Strides of indices
-    int idxStride[4];
-    // Strides of input
-    int inStride[4];
-};
-
-inline void initGatherMetaData(GatherMetaData &metaData,
-                               const Ref<OperatorObj> &_op) {
-    memset(&metaData, 0, sizeof(metaData));
-    auto op = as<GatherBaseObj>(_op);
-    Ref<TensorObj> in = op->getInputs(0);
-    Ref<TensorObj> index = op->getInputs(1);
-    Ref<TensorObj> out = op->getOutput();
-    metaData.indexValue = index->getRawDataPtr<void *>();
-    metaData.indexType = index->getDType();
-    metaData.dataType = in->getDType();
-    metaData.axis = op->getAxis();
-    metaData.inNDim = in->getRank();
-    metaData.outNDim = out->getRank();
-    metaData.idxNDim = index->getRank();
-    for (int i = 0; i < metaData.outNDim; ++i)
-        metaData.outDim[i] = out->getDims()[i];
-    for (int i = 0; i < metaData.idxNDim; ++i) {
-        metaData.idxDim[i] = index->getDims()[i];
-        metaData.idxStride[i] = index->getStride()[i];
-    }
-    for (int i = 0; i < metaData.inNDim; ++i) {
-        metaData.inStride[i] = in->getStride()[i];
-    }
+void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
 }
-template <typename T>
-void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
-
-void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
-                            size_t num);
-} // namespace infini
--- a/include/cuda/gbmm_g2bmm.cuh
+++ b/include/cuda/gbmm_g2bmm.cuh
--- a/include/cuda/nccl_communicator.h
+++ b/include/cuda/nccl_communicator.h
@ -1,70 +0,0 @@
-#pragma once
-#include "core/communicator.h"
-#include <chrono>
-#include <cstdlib>
-#include <filesystem>
-#include <fstream>
-#include <nccl.h>
-#include <thread>
-
-#define checkNcclError(call)                                                   \
-    {                                                                          \
-        auto err = call;                                                       \
-        if (ncclSuccess != err) {                                              \
-            fprintf(stderr, "NCCL error in %s:%i : %s.\n", __FILE__, __LINE__, \
-                    ncclGetErrorString(err));                                  \
-            exit(EXIT_FAILURE);                                                \
-        }                                                                      \
-    }
-
-namespace infini {
-
-class NcclCommunicatorObj final : public CommunicatorObj {
-  private:
-    ncclComm_t comm;
-
-  public:
-    NcclCommunicatorObj(const string &name, int worldSize, int rank)
-        : CommunicatorObj(worldSize, rank) {
-        const std::string filePath("./" + name + "_nccl_id.bin");
-        ncclUniqueId commId;
-        if (rank == 0) {
-            checkNcclError(ncclGetUniqueId(&commId));
-            std::ofstream ofs(filePath, std::ios::binary);
-            ofs.write((char *)&commId, sizeof(ncclUniqueId));
-
-        } else {
-            auto begin = std::chrono::steady_clock::now();
-            while (!std::filesystem::exists(filePath)) {
-                auto now = std::chrono::steady_clock::now();
-                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
-                             "time limit (10s) exceeded.");
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-            }
-            std::ifstream ifs(filePath, std::ios::binary);
-            ifs.read((char *)&commId, sizeof(ncclUniqueId));
-        }
-        checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));
-        if (rank == 0) {
-            std::filesystem::remove(filePath);
-        }
-    }
-
-    // Get the actual ncclComm_t
-    ncclComm_t getNcclComm() { return comm; }
-
-    void finalize() { checkNcclError(ncclCommFinalize(comm)); }
-
-    ~NcclCommunicatorObj() final {
-        finalize();
-        checkNcclError(ncclCommDestroy(comm));
-    }
-
-    virtual string toString() const final {
-        std::ostringstream oss;
-        oss << "NCCL communicator";
-        return oss.str();
-    }
-};
-
-} // namespace infini
--- a/include/cuda/softmax.h
+++ b/include/cuda/softmax.h
@ -0,0 +1,6 @@
+#pragma once
+
+namespace infini {
+// Float softmax kernel entry point; the definition is not in this header.
+// NOTE(review): parameter meanings are not documented here -- presumably x is
+// the input, y the output, and dim/stride describe the axis being normalized;
+// confirm against the implementation.
+void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
+                    float *y, int dim, int stride);
+} // namespace infini
--- a/include/ffi/ffi_callback.h
+++ b/include/ffi/ffi_callback.h
@ -0,0 +1,9 @@
+#pragma once
+#include "core/graph_handler.h"
+#include "core/mutator.h"
+#include "core/search_engine.h"
+
+namespace infini {
+namespace callback {
+// Exports `graph` to ONNX at `path` (inferred from the name; the definition
+// is not in this header -- confirm against the implementation).
+void exportONNX(const Graph &graph, const string &path);
+} // namespace callback
+} // namespace infini
--- a/include/intelcpu/mkl_runtime.h
+++ b/include/intelcpu/mkl_runtime.h
@ -29,7 +29,7 @@ class MklRuntimeObj : public CpuRuntimeObj {
    string toString() const override { return "INTELCPU Runtime"; };
    dnnl::engine getEngine() const { return dnnl::engine(engine, true); }
    dnnl::stream getStream() const { return dnnl::stream(stream, true); }
-    void sync() const;
+    void sync() const override;
 };

 } // namespace infini
--- a/include/kunlun/kunlun_act_type.h
+++ b/include/kunlun/kunlun_act_type.h
@ -1,23 +0,0 @@
-#include "core/op_type.h"
-#include "kunlun/kunlun_common.h"
-
-namespace infini {
-using KunlunActType = xdnn::Activation_t;
-KunlunActType parseActType(ActType act) {
-    switch (act) {
-    case ActType::None:
-        return KunlunActType::LINEAR;
-    case ActType::Tanh:
-        return KunlunActType::TANH;
-    case ActType::Sigmoid:
-        return KunlunActType::SIGMOID;
-    case ActType::Relu:
-        return KunlunActType::RELU6;
-    default:
-        fprintf(stderr, "Activation Type not support yet!\n");
-        break;
-    }
-    return KunlunActType::LINEAR;
-}
-
-}; // namespace infini
--- a/include/kunlun/kunlun_common.h
+++ b/include/kunlun/kunlun_common.h
@ -1,22 +0,0 @@
-#pragma once
-#include "core/common.h"
-#include "xpu/runtime_ex.h"
-#include "xpu/xdnn.h"
-
-namespace xdnn = baidu::xpu::api;
-
-#define checkKUNLUNError(call)                                                 \
-    {                                                                          \
-        auto err = call;                                                       \
-        if (XPU_SUCCESS != err) {                                              \
-            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__,         \
-                    __LINE__, xpu_strerror(err));                              \
-            exit(EXIT_FAILURE);                                                \
-        }                                                                      \
-    }
-
-namespace infini {
-
-using KUNLUNPtr = void *;
-
-} // namespace infini
--- a/include/kunlun/kunlun_kernel_without_config.h
+++ b/include/kunlun/kunlun_kernel_without_config.h
@ -1,24 +0,0 @@
-#pragma once
-#include "core/kernel.h"
-#include "kunlun/kunlun_runtime.h"
-
-namespace infini {
-
-class KUNLUNKernelWithoutConfig : public Kernel {
-  public:
-    virtual void compute(const Operator &op, const PerfRecord &record,
-                         const RuntimeObj *context) const {
-        compute(op, context);
-    }
-    virtual void compute(const Operator &op,
-                         const RuntimeObj *context) const = 0;
-    // Premise: op is idempotent since it is called multiple times.
-    virtual PerfRecord tune(const Operator &op,
-                            const RuntimeObj *_context) const {
-        auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
-        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
-                                              [&]() { context->sync(); }));
-    }
-};
-
-} // namespace infini
--- a/include/kunlun/kunlun_runtime.h
+++ b/include/kunlun/kunlun_runtime.h
@ -1,81 +0,0 @@
-#pragma once
-#include "core/runtime.h"
-#include "core/workspace.h"
-#include "kunlun/kunlun_common.h"
-#ifdef INFINI_USE_XCCL
-#include "kunlun/xccl_communicator.h"
-#endif
-namespace infini {
-
-class KUNLUNRuntimeObj : public RuntimeObj {
-  private:
-    xdnn::Context *ctx;
-    std::unique_ptr<CommunicatorObj> comm;
-    // KUNLUNPtr workspace;
-    // size_t workspaceSize;
-    Workspace<KUNLUNPtr> workspace;
-
-  public:
-    KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
-        xpu_set_device(deviceId);
-        ctx = xdnn::create_context();
-        // 10GB for Longformer
-        // size_t longformerNum = 3lu * (1 << 30);
-        size_t workspaceSize = 2llu << 30; // 2 GB
-        KUNLUNPtr wkspacePtr = alloc(workspaceSize);
-        workspace =
-            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
-    }
-    virtual ~KUNLUNRuntimeObj() {
-        KUNLUNPtr wkspacePtr = workspace->getWorkspace();
-        dealloc(wkspacePtr);
-        xdnn::destroy_context(ctx);
-    }
-    string toString() const override;
-
-    void run(const Graph &graph, bool tune = false,
-             bool profiling = false) const;
-    // double runEvaluation(const Graph &graph, int nWarmups,
-    //                      int nEvaluations) const;
-    void sync() const;
-
-    KUNLUNPtr alloc(size_t size) override {
-        void *ptr;
-        checkKUNLUNError(
-            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
-        return ptr;
-    }
-    void dealloc(void *ptr) override { xpu_free(ptr); }
-
-    xdnn::Context *KUNLUNHandle() const { return ctx; }
-    // Get $size workspace by bytes
-    KUNLUNPtr getWorkspace(size_t size) const {
-        auto ret = workspace->getWorkspace(size);
-        return ret;
-    }
-    Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
-
-    void copyBlobFromCPU(void *dst, const void *src,
-                         size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes,
-                   XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-    }
-    void copyBlobToCPU(void *dst, const void *src,
-                       size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes,
-                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
-    }
-    void copyBlobInsideRuntime(void *dst, const void *src,
-                               size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes,
-                   XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
-    }
-    void initComm(const string &name, int worldSize, int rank) final;
-
-    CommunicatorObj &getCommunicator() const final { return *comm; }
-
-  private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
-};
-
-} // namespace infini
--- a/include/kunlun/operator_timer.h
+++ b/include/kunlun/operator_timer.h
@ -1,10 +0,0 @@
-#pragma once
-namespace infini {
-namespace opTimer {
-double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
-                       int padh, int padw, int strideh, int stridew,
-                       int dilationh, int dilationw, int group,
-                       const char *name);
-double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
-} // namespace opTimer
-} // namespace infini
--- a/include/kunlun/xccl_communicator.h
+++ b/include/kunlun/xccl_communicator.h
@ -1,60 +0,0 @@
-#pragma once
-#include "core/communicator.h"
-#include "xpu/bkcl.h"
-#include <chrono>
-#include <filesystem>
-#include <fstream>
-#include <thread>
-
-#define checkXcclError(call)                                                   \
-    {                                                                          \
-        auto err = call;                                                       \
-        if (BKCL_SUCCESS != err) {                                             \
-            fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__);     \
-            exit(EXIT_FAILURE);                                                \
-        }                                                                      \
-    }
-
-namespace infini {
-
-class XcclCommunicatorObj final : public CommunicatorObj {
-  private:
-    BKCLContext_t comm;
-
-  public:
-    XcclCommunicatorObj(const string &name, int worldSize, int rank)
-        : CommunicatorObj(worldSize, rank) {
-        const std::string filePath("./" + name + "_xccl_id.bin");
-        BKCLUniqueId commId;
-        if (rank == 0) {
-            checkXcclError(bkcl_get_unique_id(&commId));
-            std::ofstream ofs(filePath, std::ios::binary);
-            ofs.write((char *)&commId, sizeof(BKCLUniqueId));
-        } else {
-            auto begin = std::chrono::steady_clock::now();
-            while (!std::filesystem::exists(filePath)) {
-                auto now = std::chrono::steady_clock::now();
-                _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
-                             "time limit (100s) exceeded.");
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-            }
-            std::ifstream ifs(filePath, std::ios::binary);
-            ifs.read((char *)&commId, sizeof(BKCLUniqueId));
-        }
-        checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
-        if (rank == 0) {
-            std::filesystem::remove(filePath);
-        }
-    }
-
-    BKCLContext_t getXcclComm() { return comm; }
-
-    ~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
-    virtual string toString() const final {
-        std::ostringstream oss;
-        oss << "XCCL communicator";
-        return oss.str();
-    }
-};
-
-} // namespace infini
--- a/include/nnet/Visitor/Serializer.h
+++ b/include/nnet/Visitor/Serializer.h
@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
    string visit_(const Subscript &c) override;
    string visit_(const Var &c) override;
    string visit_(const Tensor &c) override;
+    string visit_(const Func &c) override;
    string dispatchRoutine(const Routine &c);

    Expr buildExprTree(string key);
@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
    Serializer(int _verobse = 0);
    virtual ~Serializer();

+    /**
+     * @brief Serialize the given expression to string
+     *
+     * @param expr The expression to be serialized
+     * @param msg Message of derivation
+     * @param inputs membound operator attributes
+     * @param exec_time membound operator attributes
+     * @param hint membound operator attributes
+     * @return The serialized text; presumably std::nullopt when the
+     * serialization fails (confirm against the implementation)
+     */
+    std::optional<std::string> toString(Expr const &expr,
+                                        const string &msg = "",
+                                        vector<Tensor> inputs = {},
+                                        double exec_time = -1e9,
+                                        string hint = "");
+
    /**
     * @brief Serialize the given expression to json file
     *
     * @param expr The expression to be serialized
     * @param filePath The path of json file to be output
     * @param msg Message of derivation
+     * @param inputs membound operator attributes
+     * @param exec_time membound operator attributes
+     * @param hint membound operator attributes
     * @return bool Whether the serialization succeed
     */
-    bool serialize(const Expr &expr, const string &filePath,
-                   const string &msg = "");
+    bool toFile(const Expr &expr, const string &filePath,
+                const string &msg = "", vector<Tensor> inputs = {},
+                double exec_time = -1e9, string hint = "");
+
+    /**
+     * @brief Deserialize the given text to expression
+     *
+     * @param text The text of the expr to be deserialized
+     * @return Expression deserialized from the given text
+     */
+    Expr fromString(const string &text);

    /**
     * @brief Deserialize the given json file to expression
@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
     * @param filePath The path to file to be deserialized
     * @return Expression deserialized from the given json file
     */
-    Expr deserialize(const string &filePath);
+    Expr fromFile(const string &filePath);
+
+    /**
+     * @brief Deserialize the given file into the parts of a membound
+     * operator.
+     *
+     * NOTE(review): "Memobund" looks like a misspelling of "Membound"; the
+     * name is kept because it is part of the public interface.
+     *
+     * @param filePath The path to file to be deserialized
+     * @return Presumably (expr, inputs, exec_time, hint), mirroring the
+     * parameters of toFile -- confirm against the implementation
+     */
+    tuple<Expr, vector<Tensor>, double, string>
+    deserializeAsMemobundOp(const string &filePath);
+
+    // Presumably the in-memory counterpart of deserializeAsMemobundOp: takes
+    // the serialized text in `data` instead of a file path (confirm against
+    // the implementation). "membund" looks like a misspelling of "membound".
+    // FIXME: the order of elements in tuple is not consistent with memboundObj
+    // constructor
+    tuple<Expr, vector<Tensor>, double, string>
+    membundOpFromString(const string &data);
 };

-} // namespace nnet
+} // namespace nnet
--- a/include/nnet/common.h
+++ b/include/nnet/common.h
@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
    { IT_TODO_HALT(); }

 #define nnet_unimplemented_continue()                                          \
-    { dbg("Unimplemented"); }
+    {}
+// { dbg("Unimplemented"); }

 #define nnet_assert(expr, msg) assert(((void)(msg), (expr)))

--- a/include/nnet/derivator.h
+++ b/include/nnet/derivator.h
@ -67,11 +67,13 @@ class Derivator {
    vector<string> ruleStates, ruleMsgs;
    int cntStates = 0;   // the number of intermediate states
    int searchState = 0; // search state in guided search
+    bool printAndExit;
+    void printDerivationRules();

  public:
    Derivator(int maxDepth = 8, bool enableHashPruning = true,
              LogMode mode = LogMode::NoLog,
-              PassMode passMode = PassMode::Debug);
+              PassMode passMode = PassMode::Debug, bool printAndExit = false);
    void search(Formula &origin, int depth);
    void ruleBasedDFS(Formula &origin, int depth, vector<int> _rules,
                      map<int, vector<Var>> _substituteRules = {},
--- a/include/nnet/expr.h
+++ b/include/nnet/expr.h
@ -104,7 +104,7 @@ enum class NodeType {
    FuncNodeType
 };

-enum class FuncType { Relu, Tanh, PRelu };
+enum class FuncType { Relu = 1000, Tanh, PRelu };

 #define DEFINE_GETTYPE(CLASS, isScalar_v)                                      \
    NodeType getType() const override { return NodeType::CLASS##Type; }        \
@ -206,7 +206,8 @@ struct IterationType {
    enum { Loop, Sum };
    constexpr static int NumIterationType = 2;
 };
-class RangeOpNode : public OperatorNode {
+class RangeOpNode : public OperatorNode,
+                    public std::enable_shared_from_this<RangeOpNode> {
  public:
    enum { Summand, END_POS };
    constexpr static int Loop = IterationType::Loop;
@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
        return 0;
    };
    string toReadable() const override;
+    string getFullExpression();
    const Expr &getSummand() const { return subExprs[Summand]; }
    const vector<VarRangePair> &getVarRanges(int _index) const {
        return vars[_index];
@ -384,13 +386,16 @@ class FuncNode : public ExprNode {
 };

 // Wrappers for type deduction
-Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts);
-RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
-                          const vector<VarRangePair> &_sumIters, Expr _summand,
-                          const vector<int> &paddings = {});
-Tensor makeTensor(const string &name, const vector<int> &shape,
-                  const vector<int> &paddings = {},
-                  const Routine &source = nullptr);
+
+// make subscript
+Subscript mSub(const Expr &tensor, const VecExpr &subscripts);
+// make range operator
+RangeOp mL(const vector<VarRangePair> &_loopIters,
+           const vector<VarRangePair> &_sumIters, Expr _summand,
+           const vector<int> &paddings = {});
+// make tensor
+Tensor mT(const string &name, const vector<int> &shape,
+          const vector<int> &paddings = {}, const Routine &source = nullptr);

 // Pretty output for dbg with shared_ptr
 template <typename T, typename std::enable_if_t<std::is_base_of_v<ExprNode, T>>
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Liyan Zheng	295450e5f4	Add: show conv2gemm derivation	2023-11-10 22:49:07 +08:00
Liyan Zheng	29071ddcac	Rename: Expr construction helpers	2023-06-25 20:31:08 +08:00
Liyan Zheng	c6c445991a	Add: enable mutator search in python	2023-06-25 20:18:18 +08:00
Liyan Zheng	d25b606e12	Add: TF32 supports and accurate timing for conv	2023-05-07 13:22:39 +08:00
Liyan Zheng	abcfa76fb5	Add: efficient CUDA transpose for last two dims	2023-05-05 15:16:07 +08:00
Liyan Zheng	6a70555892	Add: TensorRT backend	2023-04-30 23:44:10 +08:00
Liyan Zheng	f47a411095	Add: export with random weight	2023-04-30 22:25:07 +08:00
Liyan Zheng	df2534d209	Fix: fuse Relu to GEMM/Conv	2023-04-30 16:16:16 +08:00
Liyan Zheng	a1f02593d3	Add: export Flatten operator to ONNX	2023-04-30 16:15:24 +08:00
Liyan Zheng	65b4b42fa0	Merge remote-tracking branch 'origin/NNET_OpSearch' into NNET_e2e Fix: update of derivator.h is missing in NNET_OpSearch	2023-04-30 15:48:48 +08:00
Liyan Zheng	b068442bfb	Add: print time in op evalution	2023-04-30 00:47:57 +08:00
Liyan Zheng	c6e7748786	Add: rule of Any+Relu->Any	2023-04-28 21:46:54 +08:00
Liyan Zheng	d0ae48d21d	Add: CUDA Matmul selection	2023-04-28 19:13:19 +08:00
Liyan Zheng	c875f3cbb8	Add: Matmul Transpose plans	2023-04-28 19:13:01 +08:00
Liyan Zheng	95a8b90fa7	Fix: add virutal method sync in Runtime	2023-04-28 00:44:54 +08:00
Liyan Zheng	c58b67f743	Chore: suppress output	2023-04-26 14:11:53 +08:00
Liyan Zheng	75c9226164	Merge branch 'NNET_e2e' into NNET_op_test	2023-04-25 04:32:18 +08:00
Liyan Zheng	f877eca517	Add: IT_ASSERT in checkCudaError	2023-04-25 04:25:10 +08:00
Liyan Zheng	b13b799fbe	All mdoels E2E	2023-04-25 04:24:43 +08:00
Liyan Zheng	350fc01d39	Add: Search Depth	2023-04-25 01:07:21 +08:00
whjthu	71f4f6e9d9	add op test for einnet	2023-04-24 21:48:29 +08:00
Liyan Zheng	1408d308cc	Add: FCRS log	2023-04-24 21:20:29 +08:00
Liyan Zheng	11229a2baa	Add: Figure 17	2023-04-24 21:07:30 +08:00
Liyan Zheng	2b85ac41ef	Fix: CUDA Relu for 2D/1D tensor	2023-04-24 16:14:26 +08:00
Liyan Zheng	1e46750159	Add conv2bgemm and fix mutator::runtime	2023-04-24 13:12:40 +08:00
Liyan Zheng	079985bc8c	Add: efficient transpose	2023-04-24 13:08:29 +08:00
Liyan Zheng	c1275cddb6	Fix: conv2dreduce_kernel_ offset	2023-04-24 02:30:41 +08:00
Liyan Zheng	51cc042f56	Add: nchw to nhwc conversion Fix: conv parameter error in to_onnx	2023-04-24 02:29:53 +08:00
Liyan Zheng	18d6ba4022	Merge branch 'NNET_e2e' into NNET_gcn	2023-04-23 23:20:46 +08:00
Liyan Zheng	4211fd1f32	Fix: matmul transpose in convNHWC2gemm rule	2023-04-23 22:54:50 +08:00
xxcclong	8409c1f9d4	tested fsrcnn	2023-04-23 22:19:51 +08:00
xxcclong	830b28913c	better transposed convreduce	2023-04-23 21:36:25 +08:00
Liyan Zheng	1ba78d7f89	Add: reduce in Any	2023-04-23 21:36:12 +08:00
xxcclong	777aebafc9	fsrcnn	2023-04-23 20:56:19 +08:00
whjthu	131a679340	gcn optimization	2023-04-23 13:43:41 +08:00
Liyan Zheng	5df2524ff9	Merge branch 'NNET_eliminateOP' into NNET_e2e	2023-04-23 13:35:29 +08:00
Liyan Zheng	f204866d93	Fix: reduce workspace size	2023-04-23 13:34:07 +08:00
Liyan Zheng	b9819e65c1	Fix: allow eliminate and fusion failure in search	2023-04-23 13:15:34 +08:00
Liyan Zheng	7277356744	Add: Reshape/Transpose elimination	2023-04-23 02:10:05 +08:00
whjthu	f820117acd	fix unused code	2023-04-23 00:18:26 +08:00
whjthu	1ab2118716	add AnyOp and cuda kernel	2023-04-23 00:16:03 +08:00
huangshuhong	ff97c879fb	add ConvNHWC and FSRCNN graph	2023-04-23 00:02:22 +08:00
Liyan Zheng	acc64fd32c	Merge branch 'NNET_transpose' into NNET_e2e Fix: gridSize and blockSize in Reshape kernel	2023-04-22 21:32:31 +08:00
Liyan Zheng	33ab5dcd3e	Fix: gbmm kernel	2023-04-22 21:14:52 +08:00
Liyan Zheng	e2f18272c9	Add: no malloc for reshape outputs	2023-04-22 21:13:57 +08:00
Liyan Zheng	40e6db6608	Add: tensor FUID in exported ONNX	2023-04-22 20:28:17 +08:00
Liyan Zheng	c451918224	Fix: tensor size overflow	2023-04-22 20:28:00 +08:00
whjthu	34ed298725	fix format	2023-04-22 17:00:52 +08:00
whjthu	664f0dbe02	support cuda transpose	2023-04-22 16:57:27 +08:00
Liyan Zheng	a732b6f176	Fix: ignore transpose in CudaGraph since no kernel	2023-04-22 16:08:40 +08:00
Liyan Zheng	0865f8d823	Chore: move TensorObj::clone to .cc	2023-04-22 16:03:16 +08:00
Liyan Zheng	84f9d6731a	Add: Longformer models	2023-04-22 16:00:29 +08:00
Liyan Zheng	4f02eeb08c	Add: G2BMM kernels generated by tvm 0.10	2023-04-22 15:40:59 +08:00
whjthu	225a42f22d	add rule for dilated conv	2023-04-21 23:40:45 +08:00
Liyan Zheng	4e9ece76f4	Chore: remove out-of-date code	2023-04-21 23:22:40 +08:00
Liyan Zheng	16a8c5dce5	Add: Conv1x1 rule	2023-04-21 23:21:04 +08:00
Liyan Zheng	d051460c23	Chore: suppress output	2023-04-21 22:58:18 +08:00
Liyan Zheng	d8a133684e	Add: remove independent tensors in graph	2023-04-21 22:57:23 +08:00
Liyan Zheng	9ce21200c4	Add: NMutator mode in python	2023-04-21 21:31:22 +08:00
Liyan Zheng	b943658713	Finish: GAN	2023-04-21 21:25:43 +08:00
Liyan Zheng	2cd75bd79b	Merge branch 'NNET_e2e_fix' into NNET_e2e Support CUDA Graph for TVM kernels	2023-04-21 13:18:44 +08:00
Liyan Zheng	f0fcbe825f	Add: python verification	2023-04-21 13:18:24 +08:00
huangshuhong	8c91faa948	remove expect	2023-04-21 00:17:04 +08:00
huangshuhong	c0ae03a2d7	fix tvm stream	2023-04-21 00:09:47 +08:00
Liyan Zheng	0cb8729bc1	Add: different ONNX names for inputs and weights	2023-04-20 21:51:47 +08:00
YdrMaster	8bc2d3e48d	fix: test graph handler Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 21:51:47 +08:00
YdrMaster	28b123753e	feat: 导入 Tensor 类型 Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 21:51:47 +08:00
Liyan Zheng	94730d93b5	Add: hash match for membound kernels	2023-04-20 17:16:01 +08:00
Liyan Zheng	6d17c4caa2	Add: getPerfTime in run_models_nnet	2023-04-20 10:54:49 +08:00
Liyan Zheng	15d0eb79cd	Add: import ONNX with membound Op	2023-04-20 10:45:28 +08:00
Liyan Zheng	2a343e240e	Add: shape of intermediate tensor in exported ONNX	2023-04-20 10:45:28 +08:00
Liyan Zheng	34ca6bf149	Fix: skip check when Graph is exported to ONNX	2023-04-20 10:45:28 +08:00
YdrMaster	a6019e79e3	feat(py): 支持从 Graph 直接创建 OnnxStub Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 10:45:28 +08:00
YdrMaster	4e1cc8d3e4	refactor(py): 使用工厂方法创建 OnnxStub Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 10:44:39 +08:00
YdrMaster	725f9260cf	feat: 支持导出 membound Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 10:44:39 +08:00
YdrMaster	0edd138919	feat: 正反序列化分离为到 string 的和到 file 的 fix: 正确设置 `USE_CUDA` cfg todo: test_search 不过 Signed-off-by: YdrMaster <ydrml@hotmail.com>	2023-04-20 10:44:39 +08:00
Liyan Zheng	0b23a065ca	Add: debug hacks for InfoGAN	2023-04-20 10:42:56 +08:00
Liyan Zheng	e86e993ed4	Add: CUDA graph stream capture (MemboundOp fails)	2023-04-19 16:32:16 +08:00
Liyan Zheng	e4c20a9ae2	Add: warmup and repeat args in timeNonCtcOperators	2023-04-19 16:22:59 +08:00
Liyan Zheng	537b3b4ea4	Add: Membound operator serialization	2023-04-18 21:53:48 +08:00
Liyan Zheng	2812900ea2	Fix: OpType and print device tensors	2023-04-18 20:28:08 +08:00
Liyan Zheng	01fc19795d	Add: time non-compile-cime-computable operators	2023-04-18 17:21:16 +08:00
Liyan Zheng	afc4123328	Chore: remove deprecated function	2023-04-18 17:21:16 +08:00
Liyan Zheng	b981951a47	Add: NMutator::memboundToJson to export memboundOp	2023-04-18 17:21:16 +08:00
Liyan Zheng	99b5c95455	Add: nnet::Serializer supports FuncNode	2023-04-18 17:21:16 +08:00
Liyan Zheng	9d50b30af8	Chore: disable nnet_unimplemented_continue output	2023-04-18 17:21:16 +08:00
Liyan Zheng	bc31219bde	Add: exclude compile-time computable operator time	2023-04-18 17:21:16 +08:00
Liyan Zheng	edf4e33353	Add: C++ callback to export ONNX	2023-04-18 17:19:05 +08:00
Liyan Zheng	872f3504a9	Add: RangeOpNode::getFullExpression()	2023-04-18 17:19:05 +08:00
Liyan Zheng	da49e91ab0	Add: fuse membound operators	2023-04-18 17:19:05 +08:00
Liyan Zheng	a6b8f344d4	Chore: simplify type names	2023-04-18 17:19:05 +08:00
Liyan Zheng	09293730ea	Add: export to ONNX with custom operators	2023-04-18 17:19:05 +08:00
Liyan Zheng	307614d95d	Add: infogan python interface	2023-04-18 17:16:25 +08:00
Liyan Zheng	f14edcd52f	Fix: avoid reload library	2023-04-18 17:16:25 +08:00
Liyan Zheng	d2d49c5d4f	Add: invoke TVM through pipe	2023-04-18 17:16:25 +08:00
Liyan Zheng	e72fe79168	Add: search engine uses estimated time	2023-04-18 17:16:25 +08:00
				`@ -0,0 +1 @@`
				`Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9`
				`@ -1 +0,0 @@`
				`Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77`
				`@ -1 +0,0 @@`
				`Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65`