From 26f0d13c26bf5b264b8d9ca65e3034bdb34aa552 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Tue, 18 Apr 2023 15:10:33 +0800
Subject: [PATCH] Dev for 202303ddl (#66)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
---
 .github/workflows/build.yml                   |  56 +-
 .github/workflows/clang-format-check.yml      |   3 +
 .gitmodules                                   |   3 +
 CMakeLists.txt                                |  20 +-
 Makefile                                      |  15 +-
 README.md                                     |   4 +-
 README_CN.md                                  | 221 +++++++
 example                                       |   1 +
 include/bang/bang_element_wise.h              |  22 -
 include/core/constants.h                      |   2 +-
 include/core/data_type.h                      |  10 +
 include/core/graph.h                          |   2 +
 include/core/graph_handler.h                  |  12 +
 include/core/hash.h                           |   2 +-
 include/core/object.h                         |   2 +-
 include/core/operator.h                       | 123 +++-
 include/core/perf_engine.h                    |   2 +-
 include/core/ref.h                            |   2 +-
 include/core/tensor.h                         |  43 +-
 include/cuda/cuda_clip.h                      |   9 +
 include/cuda/cuda_element_wise.h              |   6 +-
 include/nnet/Pass/MatchComputationKernel.h    |   2 +-
 include/nnet/Pass/MatchMemBoundKernel.h       |   2 +-
 include/nnet/Pass/Pass.h                      |   2 +-
 include/nnet/Pass/Rule1VariableSplit.h        |   2 +-
 include/nnet/Pass/Rule2VariableMerging.h      |   2 +-
 include/nnet/Pass/Rule3StageSplit.h           |   2 +-
 include/nnet/Pass/Rule4StageMerging.h         |   2 +-
 include/nnet/Pass/Rule5RangeRelaxation.h      |   2 +-
 include/nnet/Pass/Rule6KenerlMatching.h       |   2 +-
 include/nnet/Pass/Rule7DLT.h                  |   2 +-
 include/nnet/Pass/Rule8GuidedDLT.h            |   2 +-
 include/nnet/Pass/Rule90TwoStageElementWise.h |   2 +-
 include/nnet/Pass/Rule91MergeStagesWithSum.h  |   2 +-
 include/nnet/Pass/Rule9RangeMagnify.h         |   2 +-
 include/nnet/ReplaceKit.h                     |   2 +-
 include/nnet/dbg.h                            |   4 +-
 include/nnet/dlt.h                            |   2 +-
 include/nnet/nmutator.h                       |   2 +-
 include/nnet/permutation.h                    |   2 +-
 include/nnet/ref.h                            |   2 +-
 include/nnet/test.h                           |   2 +-
 include/operators/activation_backward.h       |  32 +
 include/operators/batch_norm.h                |   8 +-
 include/operators/conv.h                      |  24 +
 include/operators/det.h                       |  22 +
 include/operators/dropout.h                   |  52 ++
 include/operators/element_wise.h              |  41 ++
 include/operators/matmul.h                    |   4 +-
 include/operators/slice.h                     |  26 +-
 include/operators/transpose.h                 |  22 +
 include/operators/unary.h                     | 262 ++++++++
 pyinfinitensor/pyproject.toml                 |   2 +-
 pyinfinitensor/src/pyinfinitensor/onnx.py     | 468 ++++++++++---
 pyinfinitensor/tests/test_onnx.py             |  66 +-
 src/core/graph.cc                             |   9 +
 src/core/graph_handler.cc                     |  64 ++
 src/core/tensor.cc                            | 121 ++--
 src/cuda/cuda_runtime.cc                      |   8 +-
 src/ffi/ffi_infinitensor.cc                   | 119 +++-
 src/kernels/bang/activation.cc                | 208 ++++++
 src/kernels/bang/activation_backward.cc       |  94 +++
 src/kernels/bang/cast.cc                      | 185 ++++++
 src/kernels/bang/ceil.cc                      |  46 ++
 src/kernels/bang/clip.cc                      |  42 ++
 src/kernels/bang/concat.cc                    |  68 ++
 src/kernels/bang/conv_trans.cc                |  88 +++
 src/kernels/bang/convbpfilter.cc              | 159 +++++
 src/kernels/bang/copy.cc                      |  46 ++
 src/kernels/bang/det.cc                       |  53 ++
 src/kernels/bang/element_wise.cc              | 616 +++++++++++++++++-
 src/kernels/bang/erf.cc                       |  47 ++
 src/kernels/bang/exp.cc                       |  47 ++
 src/kernels/bang/fill.cc                      |  40 ++
 src/kernels/bang/floor.cc                     |  46 ++
 src/kernels/bang/hardtanh.cc                  |  42 ++
 src/kernels/bang/l2loss.cc                    |  40 ++
 src/kernels/bang/log.cc                       |  62 ++
 src/kernels/bang/matmul.cc                    |  26 +-
 src/kernels/bang/negtensor.cc                 |  46 ++
 src/kernels/bang/pad.cc                       |  65 ++
 src/kernels/bang/pooling.cc                   |  73 +++
 src/kernels/bang/reciprocal.cc                |  46 ++
 src/kernels/bang/reshape.cc                   |  42 ++
 src/kernels/bang/rsqrt.cc                     |  47 ++
 src/kernels/bang/split.cc                     |  69 ++
 src/kernels/bang/sqrt.cc                      |  47 ++
 src/kernels/bang/transpose.cc                 |  60 ++
 src/kernels/bang/trigon.cc                    | 184 ++++++
 src/kernels/cpu/element_wise.cc               |  42 +-
 src/kernels/cpu/unary.cc                      |  21 +
 src/kernels/cuda/batch_norm.cc                |   9 +-
 src/kernels/cuda/clip.cc                      |  27 +
 src/kernels/cuda/clip.cu                      |  32 +
 src/kernels/cuda/element_wise.cc              |  53 +-
 src/kernels/cuda/element_wise.cu              |  59 +-
 src/kernels/cuda/pad_slice.cc                 |   2 +-
 src/kernels/cuda/unary.cc                     |  46 ++
 src/kernels/intelcpu/slice.cc                 |   2 +-
 src/kernels/mlu/CMakeLists.txt                |  46 --
 src/kernels/mlu/include/bang_div.h            |   7 -
 src/kernels/mlu/include/div.h                 |   7 -
 src/kernels/mlu/src/div.mlu                   |  24 -
 src/kernels/mlu/src/div_device.mlu            |  50 --
 src/operators/activation_backward.cc          |  37 ++
 src/operators/batch_norm.cc                   |   6 +-
 src/operators/conv.cc                         |  75 +++
 src/operators/det.cc                          |  43 ++
 src/operators/dropout.cc                      |  40 ++
 src/operators/element_wise.cc                 |  78 ++-
 src/operators/matmul.cc                       |   5 +-
 src/operators/pad.cc                          |   3 +-
 src/operators/reduce_mean.cc                  |   7 +-
 src/operators/slice.cc                        | 124 ++--
 src/operators/transpose.cc                    |  50 ++
 src/operators/unary.cc                        | 284 ++++++++
 test/core/test_graph_replace.cc               |   6 +-
 test/core/test_tensor_save.cc                 |   2 +
 .../bang/test_bang_activation_backward.cc     |  56 ++
 test/kernels/bang/test_bang_bitcompute.cc     |  51 ++
 test/kernels/bang/test_bang_cast.cc           |  40 ++
 test/kernels/bang/test_bang_ceil.cc           |  40 ++
 test/kernels/bang/test_bang_clip.cc           |  42 ++
 test/kernels/bang/test_bang_concat.cc         |  52 ++
 test/kernels/bang/test_bang_copy.cc           |  40 ++
 test/kernels/bang/test_bang_det.cc            |  41 ++
 test/kernels/bang/test_bang_erf.cc            |  40 ++
 test/kernels/bang/test_bang_exp.cc            |  40 ++
 test/kernels/bang/test_bang_fill.cc           |  40 ++
 test/kernels/bang/test_bang_floor.cc          |  40 ++
 test/kernels/bang/test_bang_floordiv.cc       |  49 ++
 test/kernels/bang/test_bang_floormod.cc       |  49 ++
 test/kernels/bang/test_bang_hardtanh.cc       |  43 ++
 test/kernels/bang/test_bang_l2loss.cc         |  40 ++
 test/kernels/bang/test_bang_log.cc            |  42 ++
 test/kernels/bang/test_bang_logic.cc          |  56 ++
 test/kernels/bang/test_bang_maximum.cc        |  46 ++
 test/kernels/bang/test_bang_minimum.cc        |  46 ++
 test/kernels/bang/test_bang_mseloss.cc        |  57 ++
 test/kernels/bang/test_bang_neg.cc            |  40 ++
 test/kernels/bang/test_bang_net.cc            |  47 ++
 test/kernels/bang/test_bang_optensor.cc       |   3 +
 test/kernels/bang/test_bang_pad.cc            |  44 ++
 test/kernels/bang/test_bang_pooling.cc        |  41 ++
 test/kernels/bang/test_bang_pow.cc            |  46 ++
 test/kernels/bang/test_bang_prelu.cc          |  46 ++
 test/kernels/bang/test_bang_reciprocal.cc     |  41 ++
 test/kernels/bang/test_bang_round.cc          |  40 ++
 test/kernels/bang/test_bang_rsqrt.cc          |  40 ++
 test/kernels/bang/test_bang_split.cc          |  48 ++
 test/kernels/bang/test_bang_sqrt.cc           |  40 ++
 test/kernels/bang/test_bang_square.cc         |  40 ++
 .../bang/test_bang_squaredDifference.cc       |  48 ++
 test/kernels/bang/test_bang_transpose.cc      |  43 ++
 test/kernels/bang/test_bang_trigon.cc         |  52 ++
 test/kernels/bang/test_bang_unary.cc          |  47 ++
 test/kernels/cuda/test_cuda_clip.cc           |  48 ++
 test/kernels/cuda/test_cuda_slice.cc          |   2 +-
 test/kernels/intelcpu/test_mkl_slice.cc       |   2 +-
 test/operators/test_clip.cc                   |  38 ++
 test/operators/test_slice.cc                  |   6 +-
 161 files changed, 6913 insertions(+), 614 deletions(-)
 create mode 100644 README_CN.md
 create mode 160000 example
 delete mode 100644 include/bang/bang_element_wise.h
 create mode 100644 include/cuda/cuda_clip.h
 create mode 100644 include/operators/activation_backward.h
 create mode 100644 include/operators/det.h
 create mode 100644 include/operators/dropout.h
 create mode 100644 include/operators/transpose.h
 create mode 100644 src/kernels/bang/activation.cc
 create mode 100644 src/kernels/bang/activation_backward.cc
 create mode 100644 src/kernels/bang/cast.cc
 create mode 100644 src/kernels/bang/ceil.cc
 create mode 100644 src/kernels/bang/clip.cc
 create mode 100644 src/kernels/bang/concat.cc
 create mode 100644 src/kernels/bang/conv_trans.cc
 create mode 100644 src/kernels/bang/convbpfilter.cc
 create mode 100644 src/kernels/bang/copy.cc
 create mode 100644 src/kernels/bang/det.cc
 create mode 100644 src/kernels/bang/erf.cc
 create mode 100644 src/kernels/bang/exp.cc
 create mode 100644 src/kernels/bang/fill.cc
 create mode 100644 src/kernels/bang/floor.cc
 create mode 100644 src/kernels/bang/hardtanh.cc
 create mode 100644 src/kernels/bang/l2loss.cc
 create mode 100644 src/kernels/bang/log.cc
 create mode 100644 src/kernels/bang/negtensor.cc
 create mode 100644 src/kernels/bang/pad.cc
 create mode 100644 src/kernels/bang/pooling.cc
 create mode 100644 src/kernels/bang/reciprocal.cc
 create mode 100644 src/kernels/bang/reshape.cc
 create mode 100644 src/kernels/bang/rsqrt.cc
 create mode 100644 src/kernels/bang/split.cc
 create mode 100644 src/kernels/bang/sqrt.cc
 create mode 100644 src/kernels/bang/transpose.cc
 create mode 100644 src/kernels/bang/trigon.cc
 create mode 100644 src/kernels/cuda/clip.cc
 create mode 100644 src/kernels/cuda/clip.cu
 delete mode 100644 src/kernels/mlu/CMakeLists.txt
 delete mode 100644 src/kernels/mlu/include/bang_div.h
 delete mode 100644 src/kernels/mlu/include/div.h
 delete mode 100644 src/kernels/mlu/src/div.mlu
 delete mode 100644 src/kernels/mlu/src/div_device.mlu
 create mode 100644 src/operators/activation_backward.cc
 create mode 100644 src/operators/det.cc
 create mode 100644 src/operators/dropout.cc
 create mode 100644 src/operators/transpose.cc
 create mode 100644 test/kernels/bang/test_bang_activation_backward.cc
 create mode 100644 test/kernels/bang/test_bang_bitcompute.cc
 create mode 100644 test/kernels/bang/test_bang_cast.cc
 create mode 100644 test/kernels/bang/test_bang_ceil.cc
 create mode 100644 test/kernels/bang/test_bang_clip.cc
 create mode 100644 test/kernels/bang/test_bang_concat.cc
 create mode 100644 test/kernels/bang/test_bang_copy.cc
 create mode 100644 test/kernels/bang/test_bang_det.cc
 create mode 100644 test/kernels/bang/test_bang_erf.cc
 create mode 100644 test/kernels/bang/test_bang_exp.cc
 create mode 100644 test/kernels/bang/test_bang_fill.cc
 create mode 100644 test/kernels/bang/test_bang_floor.cc
 create mode 100644 test/kernels/bang/test_bang_floordiv.cc
 create mode 100644 test/kernels/bang/test_bang_floormod.cc
 create mode 100644 test/kernels/bang/test_bang_hardtanh.cc
 create mode 100644 test/kernels/bang/test_bang_l2loss.cc
 create mode 100644 test/kernels/bang/test_bang_log.cc
 create mode 100644 test/kernels/bang/test_bang_logic.cc
 create mode 100644 test/kernels/bang/test_bang_maximum.cc
 create mode 100644 test/kernels/bang/test_bang_minimum.cc
 create mode 100644 test/kernels/bang/test_bang_mseloss.cc
 create mode 100644 test/kernels/bang/test_bang_neg.cc
 create mode 100644 test/kernels/bang/test_bang_net.cc
 create mode 100644 test/kernels/bang/test_bang_pad.cc
 create mode 100644 test/kernels/bang/test_bang_pooling.cc
 create mode 100644 test/kernels/bang/test_bang_pow.cc
 create mode 100644 test/kernels/bang/test_bang_prelu.cc
 create mode 100644 test/kernels/bang/test_bang_reciprocal.cc
 create mode 100644 test/kernels/bang/test_bang_round.cc
 create mode 100644 test/kernels/bang/test_bang_rsqrt.cc
 create mode 100644 test/kernels/bang/test_bang_split.cc
 create mode 100644 test/kernels/bang/test_bang_sqrt.cc
 create mode 100644 test/kernels/bang/test_bang_square.cc
 create mode 100644 test/kernels/bang/test_bang_squaredDifference.cc
 create mode 100644 test/kernels/bang/test_bang_transpose.cc
 create mode 100644 test/kernels/bang/test_bang_trigon.cc
 create mode 100644 test/kernels/bang/test_bang_unary.cc
 create mode 100644 test/kernels/cuda/test_cuda_clip.cc
 create mode 100644 test/operators/test_clip.cc

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 142bf78b..c595a3b6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,6 +2,9 @@ name: Build and test cpu
 on:
   push:
     branch: 'master'
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
   pull_request:
     paths-ignore:
       - '**.md'
@@ -11,8 +14,11 @@ env:
   protobuf-download: https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protobuf-cpp-3.21.12.tar.gz
   protobuf-version: "3.21.12"
   python-version: "3.10"
+
   resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
-  resnet-file: resnet18-v2-7.onnx
+  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
+  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
+  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
 
 jobs:
   build:
@@ -31,28 +37,28 @@ jobs:
     - name: Install libdw
       run: sudo apt-get update && sudo apt-get install libdw-dev
 
-    - name: Cache protobuf
-      id: cache-protobuf
-      uses: actions/cache@v3
-      with:
-        path: protobuf-${{ env.protobuf-version }}
-        key: protobuf-${{ env.protobuf-version }}
+    # - name: Cache protobuf
+    #   id: cache-protobuf
+    #   uses: actions/cache@v3
+    #   with:
+    #     path: protobuf-${{ env.protobuf-version }}
+    #     key: protobuf-${{ env.protobuf-version }}
 
-    - name: Download and compile protobuf
-      if: steps.cache-protobuf.outputs.cache-hit != 'true'
-      run: |
-        wget ${{ env.protobuf-download }}
-        tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz
-        cd protobuf-${{ env.protobuf-version }}
-        ./autogen.sh
-        ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC"
-        make -j8
+    # - name: Download and compile protobuf
+    #   if: steps.cache-protobuf.outputs.cache-hit != 'true'
+    #   run: |
+    #     wget ${{ env.protobuf-download }}
+    #     tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz
+    #     cd protobuf-${{ env.protobuf-version }}
+    #     ./autogen.sh
+    #     ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC"
+    #     make -j8
 
-    - name: Install protobuf
-      run: |
-        cd protobuf-${{ env.protobuf-version }}
-        sudo make install
-        sudo ldconfig
+    # - name: Install protobuf
+    #   run: |
+    #     cd protobuf-${{ env.protobuf-version }}
+    #     sudo make install
+    #     sudo ldconfig
 
     - name: Build
       run: make
@@ -65,8 +71,12 @@ jobs:
         python -m pip install --upgrade pip
         make install-python
 
-    - name: Download test model
-      run: wget ${{ env.resnet-download }}
+    - name: Download test models
+      run: |
+        wget ${{ env.resnet-download }}
+        wget ${{ env.inception-download }}
+        wget ${{ env.densenet-download }}
+        wget ${{ env.efficientnet-download }}
 
     - name: Test onnx frontend
       run: make test-onnx
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 02f6cd1c..dfd15b07 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -2,6 +2,9 @@ name: clang-format Check
 on:
   push:
     branch: 'master'
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
   pull_request:
     paths-ignore:
       - '**.md'
diff --git a/.gitmodules b/.gitmodules
index a40171ca..02a80785 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "3rd-party/backward-cpp"]
 	path = 3rd-party/backward-cpp
 	url = git@github.com:bombela/backward-cpp.git
+[submodule "example"]
+	path = example
+	url = git@github.com:wanghailu0717/NNmodel.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3c06283..f5fd43ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,8 +7,8 @@ option(USE_CUDA "Support CUDA GPU" OFF)
 option(USE_BANG "Support BANG MLU" OFF)
 option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
-option(USE_PROTOBUF "Serialize and deserialize tensors" ON)
-option(BUILD_TEST "Build tests" ON)
+option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
+option(BUILD_TEST "Build tests" OFF)
 
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@@ -78,7 +78,7 @@ if(BUILD_TEST_EINNET)
   include_directories(${DMLC_INCLUDE_DIR})
   include_directories(${DLPACK_INCLUDE_DIR})
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels 
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
 endif()
 
 if(BUILD_TEST)
@@ -142,7 +142,7 @@ if(USE_BACKTRACE)
 endif()
 
 if(USE_INTELCPU)
-  add_compile_definitions(USE_INTELCPU=1) 
+  add_compile_definitions(USE_INTELCPU=1)
   find_package(MKL CONFIG REQUIRED)
 
   # Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html
@@ -150,10 +150,11 @@ if(USE_INTELCPU)
 
   set(DNNL_CONFIGURATION "cpu_gomp")
   find_package(dnnl CONFIG REQUIRED)
-  if(dnnl_FOUND)          
+  if(dnnl_FOUND)
+      add_compile_definitions(USE_MKL=1)
       include_directories(BEFORE ${dnnl_DIR}/../../../cpu_gomp/include/)
-      link_directories(${dnnl_DIR}/../../../cpu_gomp/lib)   
-      target_link_libraries(InfiniTensor  dnnl)       
+      link_directories(${dnnl_DIR}/../../../cpu_gomp/lib)
+      target_link_libraries(InfiniTensor   dnnl)
   else()
       message(FATAL_ERROR "dnnl library not found")
   endif()
@@ -161,7 +162,7 @@ if(USE_INTELCPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMKL_ILP64 -qmkl=parallel -Werror  ${WNO_ERRORS}")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion
-  
+
   find_package(IntelDPCPP REQUIRED)
 endif()
 
@@ -179,6 +180,7 @@ if(USE_CUDA)
 endif()
 
 if(USE_BANG)
+  add_compile_definitions(USE_BANG=1)
   include_directories(src/kernels/mlu/include)
   ################################################################################
   # Neuware Evironment
@@ -212,10 +214,8 @@ if(USE_BANG)
   ################################################################################
   # BangC Kernels
   ################################################################################
-  add_subdirectory(src/kernels/mlu)
 
   target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
-  target_link_libraries(InfiniTensor bangops)
 endif()
 
 # # Python bindings
diff --git a/Makefile b/Makefile
index 6b5fa090..0edf6c20 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,17 @@
 ﻿.PHONY : build clean install-python test-cpp test-onnx
 
 TYPE ?= release
-CUDA ?= off
+CUDA ?= OFF
+BANG ?= OFF
 INTELCPU ?= off
+BACKTRACE ?= ON
+TEST ?= ON
 
-CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) 
-
-ifeq ($(CUDA), ON)
-	CMAKE_OPT += -DUSE_CUDA=ON
-endif
+CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
+CMAKE_OPT += -DUSE_CUDA=$(CUDA)
+CMAKE_OPT += -DUSE_BANG=$(BANG)
+CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
+CMAKE_OPT += -DBUILD_TEST=$(TEST)
 
 ifeq ($(INTELCPU), ON)
 	CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
diff --git a/README.md b/README.md
index ea404705..231831a3 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,9 @@ cmake -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp .. && make -j 12
 
 ---
 
-> Sets env: `CUDA=ON` to enable cuda.
+> - Sets env: `TEST=OFF` to accelerate compiling.
+> - Sets env: `CUDA=ON` to enable cuda.
+> - Sets env: `BANG=ON` to enable bang.
 
 ### CMake Options
 
diff --git a/README_CN.md b/README_CN.md
new file mode 100644
index 00000000..5480987b
--- /dev/null
+++ b/README_CN.md
@@ -0,0 +1,221 @@
+﻿# 使用指南
+
+## 目录
+
+- [编译](#编译)
+- [使用](#使用)
+- [python-前端应用指南](#python-前端应用指南)
+  - [导入-onnx-模型](#导入-onnx-模型)
+  - [导出-onnx-模型](#导出-onnx-模型)
+  - [执行推理](#执行推理)
+- [测试](#测试)
+
+## 编译
+
+推荐使用 Ubuntu-22.04，本文以此环境为例。
+
+1. 使用 apt 安装依赖
+
+   > 如果不使用 Ubuntu-22.04，部分软件版本可能不够高。
+
+   ```bash
+   sudo apt-get install make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev
+   ```
+
+2. 更新 pip 并换清华源
+
+   ```bash
+   python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+   pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+3. 编译并安装 python 库
+
+   > 第一次执行会同时安装 python 依赖库，比较慢
+
+   仅编译 CPU 部分：
+
+   ```bash
+   make install-python
+   ```
+
+   编译 GPU 部分：
+
+   ```bash
+   make install-python CUDA=ON
+   ```
+
+## 使用
+
+项目管理功能已写到 [Makefile](Makefile)，支持下列功能：
+
+- 编译项目：`make`/`make build`
+- 清理生成文件：`make clean`
+- 安装 python 库：`make install-python`
+- 测试 c++ 后端：`make test-cpp`
+- 测试 python 前端：`make test-onnx`
+
+并使用下列环境变量传递选项参数：
+
+- `TYPE`：编译模式（`debug`/`release`），默认值为 `release`
+- `CUDA`：是否编译 CUDA 后端，默认为 `OFF`，`ON` 打开
+- `BANG`：是否编译寒武纪后端，默认为 `OFF`，`ON` 打开
+- `BACKTRACE`：是否启用栈回溯，默认为 `ON`，`OFF` 关闭，建议调试时打开
+- `TEST`：是否编译 `googletest`，默认为 `ON`，`OFF` 关闭，只有 `test-cpp` 时必要
+
+## python 前端应用指南
+
+`make install-python` 会将项目的 python 前端以 `pyinfinitensor` 为名字安装到系统目录，可以直接 `import pyinfinitensor` 来使用。现阶段，项目的主要用法是从 onnx 导入模型进行优化，然后可以再导出优化后的模型到 onnx，也可以直接运行推理。
+
+### 导入 onnx 模型
+
+支持的模型：
+
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+
+```python
+import onnx
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+
+stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
+```
+
+[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) 是 onnx 提供的加载函数，将 onnx 文件读取为保存在内存中的 onnx 模型。
+
+`OnnxStub` 是 onnx 模型在项目中的表示，通过构造这个对象，将 onnx 模型导入到项目中。其构造器的第一个参数是 onnx 模型文件；第二个参数是模型运行的后端运行时，可以是 `backend.cpu_runtime()`、`backend.cuda_runtime()` 或 `backend.bang_runtime()`。
+
+构造出的 stub 对象可以用于操作项目中的模型和运行时。
+
+### 优化
+
+TODO
+
+### 导出 onnx 模型
+
+优化后的模型可以导出成 onnx 文件提供给其他运行时。
+
+```python
+with open("optimized.onnx", "wb") as f:
+    f.write(stub.to_onnx("optimized").SerializeToString())
+```
+
+`stub.to_onnx(<name>)` 将模型转换为 onnx 模型对象，`<name>` 将填写到 onnx 模型的 `name` 字段。序列化到文件的代码见[官方示例](https://onnx.ai/onnx/intro/python.html#model-serialization)。
+
+要可视化检查导出的模型文件，可以利用 [onnx 提供的功能](https://onnx.ai/onnx/api/shape_inference.html#infer-shapes)将所有的张量的形状推理出来再导出：
+
+```python
+from onnx.shape_inference import infer_shapes
+
+with open("optimized.onnx", "wb") as f:
+    f.write(infer_shapes(stub.to_onnx("optimized")).SerializeToString())
+```
+
+然后用 [Netron](https://netron.app/) 绘制计算图。
+
+### 执行推理
+
+也可以使用项目的运行时执行推理。
+
+第一步是将数据传入计算图。`OnnxStub.inputs` 是一个 `Dict[str, Tensor]`，保存着模型的所有输入的名字和对象。可以用 [`items()`](https://docs.python.org/zh-cn/3/library/stdtypes.html#dict.items) 来遍历。
+
+这个代码片段显示了如何打印出模型所有输入张量的名字、形状和对象指针：
+
+```python
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+```
+
+对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
+
+```plaintext
+data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
+```
+
+当然，地址是随机的。这个输出表明需要输入一个名为 “data”，形为 1×3×224×224 的数据。通常来说，这表示一张 224×224 的 rgb 图片。而这个模型是一个 1000 分类的图像分类模型。
+
+为了方便，这里我们向模型传入一个随机的数据。
+
+```python
+import numpy
+
+stub.init()
+for name, tensor in stub.inputs.items():
+    print(name, tensor.shape(), tensor)
+    input = numpy.random.random(tensor.shape()).astype(numpy.float32)
+    tensor.copyin_float(input.flatten().tolist())
+```
+
+`stub.init()` 为所有张量分配空间。空间是预分配的，所以不支持动态 size 的模型。
+
+`tensor.copyin_float(<data>)` 向张量传入数据。其参数必须是一个 `List[float]`，即压平的数据。类似的函数还有 `copyin_int32(<data>)` 和 `copyin_int64(<data>)`
+
+然后，调用 `stub.run()` 执行推理：
+
+```python
+stub.run()
+```
+
+最后，将结果拷贝出来，传入类似：
+
+```python
+stub.init()
+for name, tensor in stub.outputs.items():
+    print(name, tensor.shape(), tensor)
+    print(tensor.copyout_float())
+```
+
+## 测试
+
+除了单元测试 `make test-cpp` 和 `make test-onnx` 之外，还可以用其他方式来测试单个模型导入导出和优化的正确性。
+
+这个脚本利用 onnxruntime 来测试导出的模型是否与导入的模型等价：
+
+```python
+import onnx
+import numpy
+import sys
+from onnx import ModelProto, ValueInfoProto
+from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor import backend
+from onnxruntime import InferenceSession
+
+
+def infer(model: ModelProto, input) -> dict:
+    collection = set()
+    for node in model.graph.node:
+        for output in node.output:
+            collection.add(output)
+    model.graph.output.extend([ValueInfoProto(name=x) for x in collection])
+    session = InferenceSession(model.SerializeToString())
+    i = session.get_inputs()[0].name
+    return dict(
+        zip(
+            [x.name for x in session.get_outputs()],
+            [x.flatten() for x in session.run(None, {i: input})],
+        )
+    )
+
+
+model0 = onnx.load(sys.argv[1])
+model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
+
+input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
+input = numpy.random.random(input_shape).astype(numpy.float32)
+
+output0 = infer(model0, input)[model0.graph.output[0].name]
+output1 = infer(model1, input)[model1.graph.output[0].name]
+
+print("error =", sum((output1 - output0) ** 2) / len(output0))
+```
+
+要运行脚本，先安装 onnxruntime：
+
+```bash
+pip install onnxruntime
+```
+
+打印出的 `error = ...` 是两个模型输出张量的均方误差。对于不同的模型，这个误差最小为 0，最大不超过 1e-9。
diff --git a/example b/example
new file mode 160000
index 00000000..d6ac8c8c
--- /dev/null
+++ b/example
@@ -0,0 +1 @@
+Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
diff --git a/include/bang/bang_element_wise.h b/include/bang/bang_element_wise.h
deleted file mode 100644
index a0bf03c4..00000000
--- a/include/bang/bang_element_wise.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-#include "bang/bang_runtime.h"
-#include "bang_div.h"
-#include "operators/element_wise.h"
-namespace infini {
-
-void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
-    auto op = as<ElementWiseObj>(_op);
-    float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
-    float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
-    float *const cData = (op->getOutput()->getRawDataPtr<float *>());
-
-    auto dim = op->getInputs(0)->getDims();
-    auto context = dynamic_cast<const BangRuntimeObj *>(obj);
-    int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    if (op->getOpType() == OpType::Div)
-        div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
-    else
-        IT_TODO_HALT();
-}
-
-}; // namespace infini
diff --git a/include/core/constants.h b/include/core/constants.h
index 655c8989..bf74f47d 100644
--- a/include/core/constants.h
+++ b/include/core/constants.h
@@ -2,4 +2,4 @@
 
 namespace infini {
 constexpr double E_CONSTANT = 2.718281828459;
-}
\ No newline at end of file
+}
diff --git a/include/core/data_type.h b/include/core/data_type.h
index 72699e22..878f4bdb 100644
--- a/include/core/data_type.h
+++ b/include/core/data_type.h
@@ -52,4 +52,14 @@ template <> inline DataType DataType::get<int16_t>() { return Int16; }
 template <> inline DataType DataType::get<int32_t>() { return Int32; }
 template <> inline DataType DataType::get<int64_t>() { return Int64; }
 
+template <int index> struct DT {};
+template <> struct DT<0> { using t = float; };
+template <> struct DT<1> { using t = uint32_t; };
+template <> struct DT<2> { using t = uint8_t; };
+template <> struct DT<3> { using t = int8_t; };
+template <> struct DT<4> { using t = uint16_t; };
+template <> struct DT<5> { using t = int16_t; };
+template <> struct DT<6> { using t = int32_t; };
+template <> struct DT<7> { using t = int64_t; };
+
 } // namespace infini
diff --git a/include/core/graph.h b/include/core/graph.h
index 3d62be30..dab31d79 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -59,6 +59,8 @@ class GraphObj : public Object {
      */
     bool topo_sort();
 
+    void optimize();
+
     void dataMalloc();
 
     /**
diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h
index a76197de..7af4bbe1 100644
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@@ -46,6 +46,9 @@ class GraphHandlerObj {
 
     Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                 int sh, int sw, int dh, int dw);
+    Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
+                            int pw, int sh, int sw, int dh, int dw, int oph,
+                            int opw);
     Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
                   Tensor bias, ActType act);
     Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
@@ -68,10 +71,17 @@ class GraphHandlerObj {
     Tensor tanh(Tensor x, Tensor y);
     Tensor softmax(Tensor x, Tensor y, int axis);
     Tensor abs(Tensor x, Tensor y);
+    Tensor shape(Tensor x, Tensor y);
     Tensor identity(Tensor x, Tensor y);
     Tensor flatten(Tensor s, Tensor y, int axis);
+    Tensor pRelu(Tensor x, Tensor slope, Tensor y);
+    Tensor clip(Tensor x, Tensor y, std::optional<float> min,
+                std::optional<float> max);
+    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
     Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
     Tensor concat(TensorVec inputs, Tensor output, int dim);
+    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
+                    int num_outputs);
     Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
     Tensor reduceMean(Tensor data, Tensor reduced,
                       const optional<vector<int>> &axes, bool keepdims);
@@ -85,6 +95,8 @@ class GraphHandlerObj {
 
     inline bool topo_sort() { return g->topo_sort(); }
 
+    inline void optimize() { g->optimize(); }
+
     //------ runtime
 
     inline void data_malloc() { g->dataMalloc(); }
diff --git a/include/core/hash.h b/include/core/hash.h
index 3963af91..79e64163 100644
--- a/include/core/hash.h
+++ b/include/core/hash.h
@@ -15,4 +15,4 @@ template <typename T> inline HashType hashVector(const vector<T> &vec) {
     return ret;
 }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/object.h b/include/core/object.h
index ee5430e1..2db50ad7 100644
--- a/include/core/object.h
+++ b/include/core/object.h
@@ -68,4 +68,4 @@ inline std::ostream &operator<<(std::ostream &os, const Ref<T> &obj) {
     return os;
 }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/operator.h b/include/core/operator.h
index 87069096..cca67297 100644
--- a/include/core/operator.h
+++ b/include/core/operator.h
@@ -6,6 +6,8 @@ enum class OpType {
     Unknown = 0,
     // linear
     Conv = 100,
+    ConvBackwardFilter,
+    ConvBackwardData,
     Matmul,
     ConvTrans,
     ConvTransNHWC,
@@ -34,10 +36,70 @@ enum class OpType {
     Softmax,
     Activation,
     Relu,
+    ReluBackward,
+    PRelu,
     Sigmoid,
+    SigmoidBackward,
     Tanh,
+    TanhBackward,
     Abs,
+    Sin,
+    Cos,
+    Tan,
+    ASin,
+    ACos,
+    ATan,
+    SinH,
+    CosH,
+    TanH,
+    ASinH,
+    ACosH,
+    ATanH,
     Resize,
+    Arange,
+    Shape,
+    Copy,
+    Ceil,
+    Floor,
+    Clip,
+    Erf,
+    Exp,
+    Fill,
+    Log,
+    L2Loss,
+    Maximum,
+    Minimum,
+    MSELoss,
+    Neg,
+    Power,
+    Reciprocal,
+    Sqrt,
+    Rsqrt,
+    Cast,
+    FloorDiv,
+    FloorMod,
+    Det,
+    Round,
+    Square,
+    SquaredDifference,
+    Hardtanh,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    GreaterEqual,
+    LessThan,
+    LessEqual,
+    And,
+    Or,
+    Xor,
+    Not,
+    BitAnd,
+    BitOr,
+    BitXor,
+    BitNot,
+    BitLeftShift,
+    BitRightShift,
+    Dropout,
     //
     MemBound = 300,
 };
@@ -55,6 +117,8 @@ class OpRegistry {
             FOP(Unknown);
             // linear
             FOP(Conv);
+            FOP(ConvBackwardFilter);
+            FOP(ConvBackwardData);
             FOP(Matmul);
             FOP(ConvTrans);
             FOP(G2BMM);
@@ -76,15 +140,72 @@ class OpRegistry {
             FOP(ReduceMean);
             FOP(Reshape);
             FOP(Identity);
+            FOP(Shape);
             // element wise
             FOP(BatchNorm);
             FOP(Softmax);
             FOP(Activation);
             FOP(Relu);
+            FOP(ReluBackward);
+            FOP(PRelu);
             FOP(Sigmoid);
+            FOP(SigmoidBackward);
             FOP(Tanh);
+            FOP(TanhBackward);
             FOP(Abs);
-            FOP(ConvTransNHWC);
+            FOP(Sin);
+            FOP(Cos);
+            FOP(Tan);
+            FOP(ASin);
+            FOP(ACos);
+            FOP(ATan);
+            FOP(SinH);
+            FOP(CosH);
+            FOP(TanH);
+            FOP(ASinH);
+            FOP(ACosH);
+            FOP(ATanH);
+            FOP(Copy);
+            FOP(Ceil);
+            FOP(Floor);
+            FOP(Clip);
+            FOP(Erf);
+            FOP(Exp);
+            FOP(Fill);
+            FOP(Log);
+            FOP(L2Loss);
+            FOP(Maximum);
+            FOP(Minimum);
+            FOP(MSELoss);
+            FOP(Neg);
+            FOP(Power);
+            FOP(Reciprocal);
+            FOP(Sqrt);
+            FOP(Rsqrt);
+            FOP(Cast);
+            FOP(FloorDiv);
+            FOP(FloorMod);
+            FOP(Det);
+            FOP(Round);
+            FOP(Square);
+            FOP(SquaredDifference);
+            FOP(Hardtanh);
+            FOP(Equal);
+            FOP(NotEqual);
+            FOP(GreaterThan);
+            FOP(GreaterEqual);
+            FOP(LessThan);
+            FOP(LessEqual);
+            FOP(And);
+            FOP(Or);
+            FOP(Xor);
+            FOP(Not);
+            FOP(BitAnd);
+            FOP(BitOr);
+            FOP(BitXor);
+            FOP(BitNot);
+            FOP(BitLeftShift);
+            FOP(BitRightShift);
             //
             FOP(MemBound);
         default:
diff --git a/include/core/perf_engine.h b/include/core/perf_engine.h
index 58659134..fb65da34 100644
--- a/include/core/perf_engine.h
+++ b/include/core/perf_engine.h
@@ -49,4 +49,4 @@ class PerfEngine {
 void to_json(json &j, const PerfEngine &p);
 void from_json(const json &j, PerfEngine &p);
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/ref.h b/include/core/ref.h
index b88bca99..825b0f2c 100644
--- a/include/core/ref.h
+++ b/include/core/ref.h
@@ -40,4 +40,4 @@ std::vector<Ref<T>> wrefs_to_refs(const std::vector<WRef<T>> &wrefs) {
     return refs;
 }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/tensor.h b/include/core/tensor.h
index 8f8fa356..a1081e15 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -73,7 +73,7 @@ class TensorObj : public TensorBaseObj {
     // FIXME: std::fucntion copies the generator instead of passing it by ref.
     // Thus the internal state of generator cannot be updated.
     void setData(
-        const std::function<void(void *, size_t, DataType)> &generator) const;
+        std::function<void(void *, size_t, DataType)> const &generator) const;
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
@@ -100,26 +100,53 @@ class TensorObj : public TensorBaseObj {
     template <typename T> bool equalData(const vector<T> &dataVector) {
         IT_ASSERT(DataType::get<T>() == dtype);
         IT_ASSERT(size() == dataVector.size());
-        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size(),
-                             1e-6);
+        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
     }
 
     size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
 
   private:
-    void printDataFloat(float *ptr) const;
-    void printDataUint32_t(uint32_t *ptr) const;
+    template <class T> string dataToString() const {
+        std::stringstream builder;
+        builder << "Tensor: " << guid << std::endl;
+
+        auto numDims = shape.size();
+        auto dimSzVec = vector<int>(numDims, 1);
+        auto ptr = data->getPtr<T *>();
+        dimSzVec[numDims - 1] = shape[numDims - 1];
+
+        for (int i = numDims - 1; i != 0; --i)
+            dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+
+        for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+            for (size_t j = 0; j < numDims; ++j)
+                if (i % dimSzVec[j] == 0)
+                    builder << "[";
+
+            builder << ptr[i];
+            for (size_t j = 0; j < numDims; ++j)
+                if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
+                    builder << "]";
+
+            if (i != size() - 1)
+                builder << ", ";
+
+            auto column = (size_t)dimSzVec[numDims - 1];
+            if (i % column == column - 1)
+                builder << std::endl;
+        }
+        return builder.str();
+    }
 
     template <typename T>
-    bool equalDataImpl(const T *a, const T *b, size_t size,
-                       double relativeError) const {
+    bool equalDataImpl(const T *a, const T *b, size_t size) const {
         for (size_t i = 0; i < size; ++i) {
             if constexpr (std::is_integral_v<T>) {
                 if (a[i] != b[i])
                     return false;
             } else if constexpr (std::is_floating_point_v<T>) {
                 if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
-                    relativeError) {
+                    1e-6) {
                     printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                     return false;
                 }
diff --git a/include/cuda/cuda_clip.h b/include/cuda/cuda_clip.h
new file mode 100644
index 00000000..bb602803
--- /dev/null
+++ b/include/cuda/cuda_clip.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "operators/unary.h"
+
+namespace infini {
+void clip_kernel(float *input, float *output, int num, float minValue,
+                 float maxValue);
+
+}; // namespace infini
diff --git a/include/cuda/cuda_element_wise.h b/include/cuda/cuda_element_wise.h
index 39de0ea3..eb3b99a2 100644
--- a/include/cuda/cuda_element_wise.h
+++ b/include/cuda/cuda_element_wise.h
@@ -1,6 +1,8 @@
 #pragma once
 
 namespace infini {
-void div_kernel(float *a, float *b, float *c, int num);
-void pow_kernel(float *a, float *b, float *c, int num);
+void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
 }; // namespace infini
diff --git a/include/nnet/Pass/MatchComputationKernel.h b/include/nnet/Pass/MatchComputationKernel.h
index 4b182a89..270b6e7f 100644
--- a/include/nnet/Pass/MatchComputationKernel.h
+++ b/include/nnet/Pass/MatchComputationKernel.h
@@ -12,4 +12,4 @@ class MatchComputationKernel : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/MatchMemBoundKernel.h b/include/nnet/Pass/MatchMemBoundKernel.h
index 6b0a4bec..2f2bc2ce 100644
--- a/include/nnet/Pass/MatchMemBoundKernel.h
+++ b/include/nnet/Pass/MatchMemBoundKernel.h
@@ -12,4 +12,4 @@ class MatchMemBoundKernel : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Pass.h b/include/nnet/Pass/Pass.h
index a8d517fa..9bf077be 100644
--- a/include/nnet/Pass/Pass.h
+++ b/include/nnet/Pass/Pass.h
@@ -38,4 +38,4 @@ class Pass {
     const VecExpr &getTransformations();
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule1VariableSplit.h b/include/nnet/Pass/Rule1VariableSplit.h
index 06f2bb4e..397cc514 100644
--- a/include/nnet/Pass/Rule1VariableSplit.h
+++ b/include/nnet/Pass/Rule1VariableSplit.h
@@ -15,4 +15,4 @@ class Rule1VariableSplit : public Pass {
     Expr replaceIters(Expr cur, const Replace &replace);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule2VariableMerging.h b/include/nnet/Pass/Rule2VariableMerging.h
index 1f277f46..5acbb951 100644
--- a/include/nnet/Pass/Rule2VariableMerging.h
+++ b/include/nnet/Pass/Rule2VariableMerging.h
@@ -26,4 +26,4 @@ class Rule2VariableMerging : public Pass {
                                                     pair<Iterator, int> pb);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule3StageSplit.h b/include/nnet/Pass/Rule3StageSplit.h
index 99e172cf..3cc06eb3 100644
--- a/include/nnet/Pass/Rule3StageSplit.h
+++ b/include/nnet/Pass/Rule3StageSplit.h
@@ -16,4 +16,4 @@ class Rule3StageSplit : public Pass {
     vector<vector<Var>> getSplitSummationIters(RangeOp rangeOp);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule4StageMerging.h b/include/nnet/Pass/Rule4StageMerging.h
index 13f11074..a69e5754 100644
--- a/include/nnet/Pass/Rule4StageMerging.h
+++ b/include/nnet/Pass/Rule4StageMerging.h
@@ -19,4 +19,4 @@ class Rule4StageMerging : public Pass {
     virtual void transform(Formula &origin, int depth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule5RangeRelaxation.h b/include/nnet/Pass/Rule5RangeRelaxation.h
index c1719dfc..364c9e9a 100644
--- a/include/nnet/Pass/Rule5RangeRelaxation.h
+++ b/include/nnet/Pass/Rule5RangeRelaxation.h
@@ -13,4 +13,4 @@ class Rule5RangeRelaxation : public Pass {
     virtual void transform(Formula &origin, int depth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule6KenerlMatching.h b/include/nnet/Pass/Rule6KenerlMatching.h
index 19648eaf..53867b42 100644
--- a/include/nnet/Pass/Rule6KenerlMatching.h
+++ b/include/nnet/Pass/Rule6KenerlMatching.h
@@ -14,4 +14,4 @@ class Rule6KenerlMatching : public Pass {
     VecExpr matchElementWise(const RangeOp &rangeOp);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule7DLT.h b/include/nnet/Pass/Rule7DLT.h
index d2ce0ecc..d05f7014 100644
--- a/include/nnet/Pass/Rule7DLT.h
+++ b/include/nnet/Pass/Rule7DLT.h
@@ -13,4 +13,4 @@ class Rule7DLT : public Pass {
     vector<int> getFactors();
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule8GuidedDLT.h b/include/nnet/Pass/Rule8GuidedDLT.h
index e6536df4..3167b337 100644
--- a/include/nnet/Pass/Rule8GuidedDLT.h
+++ b/include/nnet/Pass/Rule8GuidedDLT.h
@@ -45,4 +45,4 @@ class Rule8GuidedDLT : public Pass {
                               vector<Var> tensorDimAxes, vector<int> newShape);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule90TwoStageElementWise.h b/include/nnet/Pass/Rule90TwoStageElementWise.h
index ab37cf99..fb049041 100644
--- a/include/nnet/Pass/Rule90TwoStageElementWise.h
+++ b/include/nnet/Pass/Rule90TwoStageElementWise.h
@@ -13,4 +13,4 @@ class Rule90TwoStageElementWise : public Pass {
     VecExpr matchTwoStageElementWise(const RangeOp &rangeOp);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule91MergeStagesWithSum.h b/include/nnet/Pass/Rule91MergeStagesWithSum.h
index 221e1772..a9e861e0 100644
--- a/include/nnet/Pass/Rule91MergeStagesWithSum.h
+++ b/include/nnet/Pass/Rule91MergeStagesWithSum.h
@@ -12,4 +12,4 @@ class Rule91MergeStagesWithSum : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/Pass/Rule9RangeMagnify.h b/include/nnet/Pass/Rule9RangeMagnify.h
index 613de158..890d2dbb 100644
--- a/include/nnet/Pass/Rule9RangeMagnify.h
+++ b/include/nnet/Pass/Rule9RangeMagnify.h
@@ -12,4 +12,4 @@ class Rule9RangeMagnify : public Pass {
     virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/ReplaceKit.h b/include/nnet/ReplaceKit.h
index 19f2ec0e..bd28cf1e 100644
--- a/include/nnet/ReplaceKit.h
+++ b/include/nnet/ReplaceKit.h
@@ -43,4 +43,4 @@ class ReplaceKit {
                             const Expr &replacement);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/dbg.h b/include/nnet/dbg.h
index f5894e4b..2727c928 100644
--- a/include/nnet/dbg.h
+++ b/include/nnet/dbg.h
@@ -231,7 +231,7 @@ template <typename... T> std::string type_list_to_string() {
         result.pop_back();
     }
     return result;
-}
+} // namespace dbg
 
 template <typename... T> std::string get_type_name(type_tag<std::tuple<T...>>) {
     return "std::tuple<" + type_list_to_string<T...>() + ">";
@@ -855,4 +855,4 @@ auto identity(T &&, U &&...u) -> last_t<U...> {
 #define dbg(...) dbg::identity(__VA_ARGS__)
 #endif // DBG_MACRO_DISABLE
 
-#endif // DBG_MACRO_DBG_H
\ No newline at end of file
+#endif // DBG_MACRO_DBG_H
diff --git a/include/nnet/dlt.h b/include/nnet/dlt.h
index 4e5e56ce..ecc9c6b2 100644
--- a/include/nnet/dlt.h
+++ b/include/nnet/dlt.h
@@ -50,4 +50,4 @@ class DLT {
                                           RangeOp rangeOp);
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/nmutator.h b/include/nnet/nmutator.h
index c3009b10..74834f7e 100644
--- a/include/nnet/nmutator.h
+++ b/include/nnet/nmutator.h
@@ -55,4 +55,4 @@ class NMutator : public Mutator {
     // Graph transformConv1xk(Operator op);
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/nnet/permutation.h b/include/nnet/permutation.h
index 300a5d6b..4b1917b9 100644
--- a/include/nnet/permutation.h
+++ b/include/nnet/permutation.h
@@ -35,4 +35,4 @@ template <typename T> class SubsetGenerator {
     }
 };
 
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/nnet/ref.h b/include/nnet/ref.h
index d4c6ef63..b12f6ffb 100644
--- a/include/nnet/ref.h
+++ b/include/nnet/ref.h
@@ -115,4 +115,4 @@ constexpr Ref<_Tp> make_ref_from_tuple(_Tuple &&__t) {
 //     }
 // };
 
-// } // namespace nnet
\ No newline at end of file
+// } // namespace nnet
diff --git a/include/nnet/test.h b/include/nnet/test.h
index dbc32c34..6cf75873 100644
--- a/include/nnet/test.h
+++ b/include/nnet/test.h
@@ -27,4 +27,4 @@ namespace nnet {
 int matchExprResult(Derivator &derivator, string fn);
 bool checkExprLogSame(string fnPrefix, int start, int end);
 bool checkExprsEquvivalence(VecExpr exprs);
-} // namespace nnet
\ No newline at end of file
+} // namespace nnet
diff --git a/include/operators/activation_backward.h b/include/operators/activation_backward.h
new file mode 100644
index 00000000..5f55d8cc
--- /dev/null
+++ b/include/operators/activation_backward.h
@@ -0,0 +1,32 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class ActivationBackwardObj : public OperatorObj {
+  public:
+    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
+                          Tensor x, Tensor diff_x);
+    OP_CLONE(ActivationBackwardObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+#define DEFINE_ACTIVATION_BACKWARD_OBJ(prefix, type)                           \
+    class prefix##Obj : public ActivationBackwardObj {                         \
+      public:                                                                  \
+        prefix##Obj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,        \
+                    Tensor diff_x)                                             \
+            : ActivationBackwardObj(type, graph, y, diff_y, x, diff_x) {}      \
+    };
+
+DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward)
+DEFINE_ACTIVATION_BACKWARD_OBJ(SigmoidBackward, OpType::SigmoidBackward)
+DEFINE_ACTIVATION_BACKWARD_OBJ(TanhBackward, OpType::TanhBackward)
+}; // namespace infini
diff --git a/include/operators/batch_norm.h b/include/operators/batch_norm.h
index 76f1eff4..cfacf2ca 100644
--- a/include/operators/batch_norm.h
+++ b/include/operators/batch_norm.h
@@ -9,7 +9,7 @@ namespace infini {
  */
 class BatchNormObj : public OperatorObj {
     float momentum, eps;
-    bool training;
+    bool trainingMode;
 
   public:
     /**
@@ -28,11 +28,11 @@ class BatchNormObj : public OperatorObj {
      * Default is 0.9.
      * @param eps The epsilon value to use to avoid division by zero. Default is
      * 1e-5.
-     * @param training Set to true when used for training.
+     * @param trainingMode Set to true when used for training.
      */
     BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean,
                  Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
-                 float eps = 1e-5, bool training = false);
+                 float eps = 1e-5, bool trainingMode = false);
     OP_CLONE(BatchNormObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;
@@ -42,7 +42,7 @@ class BatchNormObj : public OperatorObj {
     int numOutputs() const override { return outputs.size(); }
     float getMomentum() const { return momentum; }
     float getEps() const { return eps; }
-    bool getTraining() const { return training; }
+    bool getTrainingMode() const { return trainingMode; }
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/include/operators/conv.h b/include/operators/conv.h
index 793afd48..449f4334 100644
--- a/include/operators/conv.h
+++ b/include/operators/conv.h
@@ -149,6 +149,29 @@ class ConvObj : public ConvBaseObj {
     void setAuxilaryAttributes(PaddingMode mode) override;
 };
 
+class ConvBackwardFilterObj : public ConvBaseObj {
+  private:
+    ActType act;
+
+  public:
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, int ph, int pw, int sh = 1, int sw = 1,
+                          int dh = 1, int dw = 1, Tensor bias = nullptr,
+                          ActType act = ActType::None);
+    // Constructors for setting padding mode
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, PaddingMode mode = PaddingMode::Same,
+                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
+                          Tensor bias = nullptr, ActType act = ActType::None);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    ActType getAct() const { return act; }
+    int getNumGroups() const override { return c / getChannelPerGroup(); }
+
+  private:
+    void setAuxilaryAttributes(PaddingMode mode) override;
+};
+
 class ConvTransposed2dObj : public ConvBaseObj {
   private:
     int oph, opw;
@@ -170,6 +193,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
 
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     int getNumGroups() const override { return group; }
+    std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
 
   private:
     void setAuxilaryAttributes(PaddingMode mode) override;
diff --git a/include/operators/det.h b/include/operators/det.h
new file mode 100644
index 00000000..d5e887c1
--- /dev/null
+++ b/include/operators/det.h
@@ -0,0 +1,22 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class DetObj : public OperatorObj {
+  public:
+    enum Mode { NormalDet = 0, LogDet };
+    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
+    OP_CLONE(DetObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    Mode getMode() const { return modeValue; }
+
+  private:
+    Mode modeValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+}; // namespace infini
diff --git a/include/operators/dropout.h b/include/operators/dropout.h
new file mode 100644
index 00000000..8c4c7300
--- /dev/null
+++ b/include/operators/dropout.h
@@ -0,0 +1,52 @@
+﻿#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Copy a tensor along a centain dimension for multiple times.
+ */
+class DropoutObj : public OperatorObj {
+    float ratio;
+    // bool training_mode; // TODO must be false.
+
+  public:
+    /**
+     * @brief Dropout takes an input floating-point tensor, an input ratio
+     * (floating-point scalar) and an input training_mode (boolean scalar). It
+     * produces two tensor outputs, output (floating-point tensor) and mask
+     * (bool tensor). If training_mode is true then the output Y will be a
+     * random dropout; Note that this Dropout scales the masked input data by
+     * the following equation, so to convert the trained model into inference
+     * mode, the user can simply not pass training_mode input or set it to
+     * false.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param data The input tensor.
+     * @param output The output tensor.
+     * @param mask The mask tensor.
+     * @param ratio The ratio of random dropout, with value in [0, 1). If this
+     * input was not set, or if it was set to 0, the output would be a simple
+     * copy of the input. If it’s non-zero, output will be a random dropout of
+     * the scaled input, which is typically the case during training.
+     * @param training_mode  If set to true then it indicates dropout is being
+     * used for training. It is an optional value hence unless specified
+     * explicitly, it is false. If it is false, ratio is ignored and the
+     * operation mimics inference mode where nothing will be dropped from the
+     * input data and if mask is requested as output it will contain all ones.
+     */
+    DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
+               float ratio, bool training_mode);
+    OP_CLONE(DropoutObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 2; }
+    float getRatio() const { return ratio; }
+    bool getTrainingMode() const { return false; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
diff --git a/include/operators/element_wise.h b/include/operators/element_wise.h
index c3ffe9f1..8bda60de 100644
--- a/include/operators/element_wise.h
+++ b/include/operators/element_wise.h
@@ -32,6 +32,25 @@ class ElementWiseObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };
 
+class MSELossObj : public OperatorObj {
+  public:
+    enum Reduction { None = 0, Sum, Mean };
+    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
+               Reduction reduction, Tensor output);
+    OP_CLONE(MSELossObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    Reduction getReduction() const { return reductionMode; }
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    Reduction reductionMode;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                                  \
     class prefix##Obj : public ElementWiseObj {                                \
       public:                                                                  \
@@ -46,4 +65,26 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
 DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
 DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
 DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
+DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
+DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
+DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
+DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
+DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
+DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
+DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
+DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
+DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
+DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
+DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
+DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
+DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
+DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
+DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
+DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
+DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
+DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
+DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
+DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
+DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
+DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
 }; // namespace infini
diff --git a/include/operators/matmul.h b/include/operators/matmul.h
index a1c57cfe..91a0a57c 100644
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@@ -47,10 +47,10 @@ class MatmulObj : public OperatorObj {
     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
 
-    int numInputs() const override { return 2; }
+    int numInputs() const override { return inputs.size(); }
     int numOutputs() const override { return 1; }
 
-    Tensor getBias() const { return inputs[2]; }
+    Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
     ActType getAct() const { return act; }
     auto getBMNKTransAB() const { return tuple(b, m, n, k, transA, transB); }
     bool getTransA() const { return transA; }
diff --git a/include/operators/slice.h b/include/operators/slice.h
index 7aeb0941..55acf505 100644
--- a/include/operators/slice.h
+++ b/include/operators/slice.h
@@ -7,7 +7,8 @@ namespace infini {
  *
  */
 class SliceObj : public OperatorObj {
-    vector<int> starts, ends; // the start no. and end no. for all dims.
+    template <class T> struct range_t { T start, end, step; };
+    vector<range_t<int>> axes;
 
   public:
     /**
@@ -33,9 +34,26 @@ class SliceObj : public OperatorObj {
 
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-    Shape getStart() const { return starts; }
+    inline int numInputs() const override { return 1; }
+    inline int numOutputs() const override { return 1; }
+    inline Shape getStarts() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.start; });
+        return ans;
+    }
+    inline Shape getEnds() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.end; });
+        return ans;
+    }
+    inline Shape getSteps() const {
+        Shape ans(axes.size());
+        std::transform(axes.begin(), axes.end(), ans.begin(),
+                       [](auto x) { return x.step; });
+        return ans;
+    }
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/include/operators/transpose.h b/include/operators/transpose.h
new file mode 100644
index 00000000..b26ed49a
--- /dev/null
+++ b/include/operators/transpose.h
@@ -0,0 +1,22 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class TransposeObj : public OperatorObj {
+  public:
+    TransposeObj(GraphObj *graph, Tensor input, Tensor output,
+                 vector<int> permute);
+    OP_CLONE(TransposeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    std::vector<int> getPermute() const { return transposePermute; }
+
+  private:
+    vector<int> transposePermute = {1, 1, 1, 1};
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+}; // namespace infini
diff --git a/include/operators/unary.h b/include/operators/unary.h
index e75025c5..1df2b4a7 100644
--- a/include/operators/unary.h
+++ b/include/operators/unary.h
@@ -28,6 +28,244 @@ class UnaryObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };
 
+class ClipObj : public OperatorObj {
+  public:
+    ClipObj(GraphObj *graph, Tensor input, Tensor output,
+            std::optional<float> min, std::optional<float> max);
+    OP_CLONE(ClipObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    std::optional<float> getMin() const { return minValue; };
+    std::optional<float> getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    std::optional<float> minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class HardtanhObj : public OperatorObj {
+  public:
+    HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min,
+                float max);
+    OP_CLONE(HardtanhObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FlipObj : public OperatorObj {
+  public:
+    FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis);
+    OP_CLONE(FlipObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    vector<int> getAxis() const { return axisValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> axisValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FillObj : public OperatorObj {
+  public:
+    FillObj(GraphObj *graph, Tensor input, Tensor output, float value);
+    OP_CLONE(FillObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getValue() const { return setValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float setValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class L2LossObj : public OperatorObj {
+  public:
+    L2LossObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(L2LossObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class TransformObj : public OperatorObj {
+  public:
+    TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
+                 float beta);
+    OP_CLONE(TransformObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float alphaValue, betaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CastObj : public OperatorObj {
+  public:
+    enum CastType {
+        Float2Half = 0,
+        Float2Int64,
+        Float2Int32,
+        Float2Int16,
+        Float2Int8,
+        Int322Float,
+        Int322Int8,
+        Int322Int16,
+        Int162Float,
+        Int162Int32,
+        Int82Float,
+        Int82Int16,
+        Int82Int32,
+        Uint82Float,
+        Uint82Int32,
+        Uint82Int64,
+        Int322Int64,
+        Int642Int32,
+        Int642Uint32,
+        Int642Float,
+        Uint322Int64,
+    };
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    OP_CLONE(CastObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    DataType getOutputDataType() const;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CumsumObj : public OperatorObj {
+  public:
+    CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
+              bool exclusive, bool reverse);
+    OP_CLONE(CumsumObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int getAxis() const { return axisValue; }
+    float getExclusive() const { return exclusiveValue; }
+    float getReverse() const { return reverseValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int axisValue;
+    bool exclusiveValue, reverseValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ArangeObj : public OperatorObj {
+  public:
+    ArangeObj(GraphObj *graph, float start, float step, int length,
+              Tensor output);
+    OP_CLONE(ArangeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 0; }
+    int numOutputs() const override { return 1; }
+    float getStartValue() { return startValue; }
+    float getStepValue() { return stepValue; }
+    int getLength() { return lengthValue; }
+
+  private:
+    float startValue, stepValue;
+    int lengthValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ShapeObj : public OperatorObj {
+  public:
+    ShapeObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(ShapeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+};
+
+class PReluObj : public OperatorObj {
+  public:
+    PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output);
+    OP_CLONE(PReluObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class LogObj : public OperatorObj {
+  public:
+    enum LogType {
+        LogE = 0,
+        Log2,
+        Log10,
+    };
+    LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type);
+    OP_CLONE(LogObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    LogType getType() const { return logType; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    LogType logType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_UNARY_OBJ(prefix, type)                                         \
     class prefix##Obj : public UnaryObj {                                      \
       public:                                                                  \
@@ -42,4 +280,28 @@ DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
 // DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
 DEFINE_UNARY_OBJ(Abs, OpType::Abs)
 
+DEFINE_UNARY_OBJ(Sin, OpType::Sin)
+DEFINE_UNARY_OBJ(Cos, OpType::Cos)
+DEFINE_UNARY_OBJ(Tan, OpType::Tan)
+DEFINE_UNARY_OBJ(ASin, OpType::ASin)
+DEFINE_UNARY_OBJ(ACos, OpType::ACos)
+DEFINE_UNARY_OBJ(ATan, OpType::ATan)
+DEFINE_UNARY_OBJ(SinH, OpType::SinH)
+DEFINE_UNARY_OBJ(CosH, OpType::CosH)
+DEFINE_UNARY_OBJ(TanH, OpType::TanH)
+DEFINE_UNARY_OBJ(ASinH, OpType::ASinH)
+DEFINE_UNARY_OBJ(ACosH, OpType::ACosH)
+DEFINE_UNARY_OBJ(ATanH, OpType::ATanH)
+
+DEFINE_UNARY_OBJ(Copy, OpType::Copy)
+DEFINE_UNARY_OBJ(Ceil, OpType::Ceil)
+DEFINE_UNARY_OBJ(Floor, OpType::Floor)
+DEFINE_UNARY_OBJ(Erf, OpType::Erf)
+DEFINE_UNARY_OBJ(Exp, OpType::Exp)
+DEFINE_UNARY_OBJ(Neg, OpType::Neg)
+DEFINE_UNARY_OBJ(Reciprocal, OpType::Reciprocal)
+DEFINE_UNARY_OBJ(Sqrt, OpType::Sqrt)
+DEFINE_UNARY_OBJ(Rsqrt, OpType::Rsqrt)
+DEFINE_UNARY_OBJ(Round, OpType::Round)
+DEFINE_UNARY_OBJ(Square, OpType::Square)
 }; // namespace infini
diff --git a/pyinfinitensor/pyproject.toml b/pyinfinitensor/pyproject.toml
index aca83ee3..8c438812 100644
--- a/pyinfinitensor/pyproject.toml
+++ b/pyinfinitensor/pyproject.toml
@@ -8,7 +8,7 @@ version = "0.0.0"
 authors = [{ name = "YdrMaster", email = "ydrml@hotmail.com" }]
 description = "Python frontend of InfiniTensor"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.7"
 keywords = ["optimizer"]
 license = { text = "Apache" }
 classifiers = ["Programming Language :: Python :: 3"]
diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index 360f5aaa..a450b281 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -22,12 +22,17 @@ from onnx.checker import (
     check_tensor,
 )
 from onnx.shape_inference import infer_shapes
+from onnx.numpy_helper import to_array
 from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
 from functools import reduce
 
-runtime = backend.runtime()
 
 class OnnxStub:
+    """
+    The Onnx model imported into infinitensor.
+    It can be generated from an Onnx model object.
+    """
+
     inputs: Dict[str, backend.Tensor] = {}
     outputs: Dict[str, backend.Tensor] = {}
     initializer: Dict[int, TensorProto] = {}
@@ -53,6 +58,8 @@ class OnnxStub:
             )
 
         for initializer in model.graph.initializer:
+            dims = [d for d in initializer.dims]
+            tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
             data[initializer.name] = initializer
 
         for node in model.graph.node:
@@ -61,14 +68,81 @@ class OnnxStub:
                     node,
                     {
                         "dilations": [1, 1],
-                        "pads": [0, 0],
+                        "pads": [0, 0, 0, 0],
                         "strides": [1, 1],
                     },
                 )
                 (d, p, s) = (
                     attributes[name] for name in ["dilations", "pads", "strides"]
                 )
-                tensors[node.output[0]] = self.handler.conv(
+                if p[0] != p[2] or p[1] != p[3]:
+                    adapt = "{}-adapt".format(node.output[0])
+                    tensors[adapt] = self.handler.pad(
+                        tensors[node.input[0]], None, p, [-2, -1]
+                    )
+                    p = [0, 0, 0, 0]
+                else:
+                    adapt = node.input[0]
+
+                if len(node.input) > 2:
+                    bias = "{}-bias".format(node.output[0])
+                    reshape = "{}-reshape".format(node.output[0])
+                    tensors[bias] = self.handler.conv(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        None,
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                    )
+                    tensors[reshape] = self.handler.reshape(
+                        tensors[node.input[2]],
+                        None,
+                        [
+                            1,
+                            reduce(
+                                lambda acc, x: acc * x,
+                                _search_shape(model, node.input[2]),
+                            ),
+                            1,
+                            1,
+                        ],
+                    )
+                    tensors[node.output[0]] = self.handler.add(
+                        tensors[bias],
+                        tensors[reshape],
+                        tensors.get(node.output[0]),
+                    )
+                else:
+                    tensors[node.output[0]] = self.handler.conv(
+                        tensors[adapt],
+                        tensors[node.input[1]],
+                        tensors.get(node.output[0]),
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                        d[0],
+                        d[1],
+                    )
+            elif node.op_type == "ConvTranspose":
+                attributes = _parse_attribute(
+                    node,
+                    {
+                        "dilations": [1, 1],
+                        "pads": [0, 0],
+                        "strides": [1, 1],
+                        "output_padding": [0, 0],
+                    },
+                )
+                (d, p, s, op) = (
+                    attributes[name]
+                    for name in ["dilations", "pads", "strides", "output_padding"]
+                )
+                tensors[node.output[0]] = self.handler.convTransposed2d(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
@@ -78,6 +152,8 @@ class OnnxStub:
                     s[1],
                     d[0],
                     d[1],
+                    op[0],
+                    op[1],
                 )
             elif node.op_type == "MatMul":
                 tensors[node.output[0]] = self.handler.matmul(
@@ -129,7 +205,7 @@ class OnnxStub:
                     {
                         "kernel_shape": None,
                         "dilations": [1, 1],
-                        "pads": [0, 0],
+                        "pads": [0, 0, 0, 0],
                         "strides": [1, 1],
                     },
                 )
@@ -137,56 +213,80 @@ class OnnxStub:
                     attributes[name]
                     for name in ["kernel_shape", "dilations", "pads", "strides"]
                 )
-                tensors[node.output[0]] = self.handler.maxPool(
-                    tensors[node.input[0]],
-                    tensors.get(node.output[0]),
-                    k[0],
-                    k[1],
-                    d[0],
-                    d[1],
-                    p[0],
-                    p[1],
-                    s[0],
-                    s[1],
-                )
+                if p[0] != p[2] or p[1] != p[3]:
+                    adapt = "{}-adapt".format(node.output[0])
+                    tensors[adapt] = self.handler.pad(
+                        tensors.get(node.input[0]), None, p, [-2, -1]
+                    )
+                    tensors[node.output[0]] = self.handler.maxPool(
+                        tensors[adapt],
+                        tensors.get(node.output[0]),
+                        k[0],
+                        k[1],
+                        d[0],
+                        d[1],
+                        0,
+                        0,
+                        s[0],
+                        s[1],
+                    )
+                else:
+                    tensors[node.output[0]] = self.handler.maxPool(
+                        tensors[node.input[0]],
+                        tensors.get(node.output[0]),
+                        k[0],
+                        k[1],
+                        d[0],
+                        d[1],
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                    )
             elif node.op_type == "AveragePool":
                 attributes = _parse_attribute(
                     node,
                     {
                         "kernel_shape": None,
-                        "pads": [0, 0],
+                        "pads": [0, 0, 0, 0],
                         "strides": [1, 1],
                     },
                 )
                 (k, p, s) = (
                     attributes[name] for name in ["kernel_shape", "pads", "strides"]
                 )
-                tensors[node.output[0]] = self.handler.avgPool(
-                    tensors[node.input[0]],
-                    tensors.get(node.output[0]),
-                    k[0],
-                    k[1],
-                    1,
-                    1,
-                    p[0],
-                    p[1],
-                    s[0],
-                    s[1],
-                )
+                if p[0] != p[2] or p[1] != p[3]:
+                    adapt = "{}-adapt".format(node.output[0])
+                    tensors[adapt] = self.handler.pad(
+                        tensors.get(node.input[0]), None, p, [-2, -1]
+                    )
+                    tensors[node.output[0]] = self.handler.avgPool(
+                        tensors[adapt],
+                        tensors.get(node.output[0]),
+                        k[0],
+                        k[1],
+                        1,
+                        1,
+                        0,
+                        0,
+                        s[0],
+                        s[1],
+                    )
+                else:
+                    tensors[node.output[0]] = self.handler.avgPool(
+                        tensors[node.input[0]],
+                        tensors.get(node.output[0]),
+                        k[0],
+                        k[1],
+                        1,
+                        1,
+                        p[0],
+                        p[1],
+                        s[0],
+                        s[1],
+                    )
             elif node.op_type == "GlobalAveragePool":
-                shape = next(
-                    (
-                        value.type.tensor_type.shape
-                        for value in model.graph.value_info
-                        if value.name == node.input[0]
-                    ),
-                    None,
-                ) or next(
-                    input.type.tensor_type.shape
-                    for input in model.graph.input
-                    if input.name == node.input[0]
-                )
-                [_, _, h, w] = _take_shape_dim(shape)
+                [_, _, h, w] = _search_shape(model, node.input[0])
                 tensors[node.output[0]] = self.handler.avgPool(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
@@ -248,58 +348,123 @@ class OnnxStub:
                 tensors[node.output[0]] = self.handler.softmax(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
-                    next((attr.i for attr in node.attribute if attr.name == "axis")),
+                    next(
+                        (attr.i for attr in node.attribute if attr.name == "axis"), -1
+                    ),
                 )
             elif node.op_type == "Abs":
                 tensors[node.output[0]] = self.handler.abs(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
+            elif node.op_type == "Shape":
+                tensors[node.output[0]] = self.handler.shape(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
             elif node.op_type == "Identity":
                 tensors[node.output[0]] = self.handler.identity(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Flatten":
-                
                 tensors[node.output[0]] = self.handler.flatten(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "axis")),
                 )
-            elif node.op_type == "Reshape":
-                input_shape = next(
-                    (
-                        value.type.tensor_type.shape
-                        for value in model.graph.value_info
-                        if value.name == node.input[0]
-                    ),
-                    None,
-                ) or next(
-                    input.type.tensor_type.shape
-                    for input in model.graph.input
-                    if input.name == node.input[0]
+            elif node.op_type == "PRelu":
+                tensors[node.output[0]] = self.handler.pRelu(
+                    tensors[node.input[0]],
+                    tensors[node.input[1]],
+                    tensors.get(node.output[0]),
                 )
-                dims = _take_shape_dim(input_shape)
+            elif node.op_type == "Clip":
+                tensors[node.output[0]] = self.handler.clip(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    next(_parse_data(data[node.input[1]]).__iter__(), None)
+                    if len(node.input) > 1
+                    else None,
+                    next(_parse_data(data[node.input[2]]).__iter__(), None)
+                    if len(node.input) > 2
+                    else None,
+                )
+            elif node.op_type == "Transpose":
+                perm = next(
+                    (attr.ints for attr in node.attribute if attr.name == "perm"), None
+                )
+                tensors[node.output[0]] = self.handler.transpose(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    perm,
+                )
+            elif node.op_type == "Reshape":
+                dims = _search_shape(model, node.input[0])
                 size = reduce(lambda acc, x: acc * x, dims)
-                output_shape = [int(i) for i in data[node.input[1]].int64_data]
-                for i, x in enumerate(output_shape):
+                input_shape = _parse_data(data[node.input[1]])
+                for i, x in enumerate(input_shape):
                     if x == 0:
-                        output_shape[i] = dims[i]
-                temp = reduce(lambda acc, x: acc * x, output_shape)
+                        input_shape[i] = dims[i]
+                temp = reduce(lambda acc, x: acc * x, input_shape, 1)
                 if temp < 0:
-                    output_shape[output_shape.index(-1)] = size // -temp
+                    input_shape[input_shape.index(-1)] = size // -temp
+                tensors[node.output[0]] = self.handler.reshape(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    input_shape,
+                )
+            elif node.op_type == "Squeeze":
+                input_shape = _search_shape(model, node.input[0])
+                axes = set(
+                    [int(i) for i in data[node.input[1]].int64_data]
+                    if len(node.input) > 1
+                    else _parse_attribute(node, {"axes": None})["axes"]
+                )
+                assert all(input_shape[d] == 1 for d in axes)
+                output_shape = []
+                for i, x in enumerate(input_shape):
+                    if i not in axes:
+                        output_shape.append(x)
                 tensors[node.output[0]] = self.handler.reshape(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     output_shape,
                 )
+            elif node.op_type == "Unsqueeze":
+                input_shape = _search_shape(model, node.input[0])
+                axes = (
+                    [int(i) for i in data[node.input[1]].int64_data]
+                    if len(node.input) > 1
+                    else _parse_attribute(node, {"axes": None})["axes"]
+                )
+                for i in axes:
+                    input_shape.insert(i, 1)
+                tensors[node.output[0]] = self.handler.reshape(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    input_shape,
+                )
             elif node.op_type == "Concat":
                 tensors[node.output[0]] = self.handler.concat(
                     [tensors[name] for name in node.input],
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "axis")),
                 )
+            elif node.op_type == "Split":
+                for name, tensor in zip(
+                    node.output,
+                    self.handler.split(
+                        tensors[node.input[0]],
+                        None,
+                        next(
+                            (attr.i for attr in node.attribute if attr.name == "axis"),
+                            0,
+                        ),
+                        len(node.output),
+                    ),
+                ):
+                    tensors[name] = tensor
             elif node.op_type == "Gather":
                 tensors[node.output[0]] = self.handler.gather(
                     tensors[node.input[0]],
@@ -331,6 +496,22 @@ class OnnxStub:
                     _parse_data(data[node.input[1]]),
                     _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
                 )
+            elif node.op_type == "Dropout":
+                for name, tensor in zip(
+                    node.output,
+                    self.handler.dropout(
+                        tensors[node.input[0]],
+                        tensors.get(node.output[0]),
+                        tensors.get(node.output[1]) if len(node.output) > 1 else None,
+                        _parse_data(data[node.input[1]])[0]
+                        if len(node.input) > 1
+                        else 0.5,
+                        _parse_data(data[node.input[2]])[0]
+                        if len(node.input) > 2
+                        else False,
+                    ),
+                ):
+                    tensors[name] = tensor
             else:
                 raise Exception('Unsupported operator "{}"'.format(node.op_type))
 
@@ -344,11 +525,11 @@ class OnnxStub:
             else:
                 self.initializer[obj.fuid()] = tensor
                 if tensor.data_type == TensorProto.INT32:
-                    obj.copyin_int32([int(i) for i in tensor.int32_data])
+                    obj.copyin_int32(_parse_data(tensor))
                 elif tensor.data_type == TensorProto.INT64:
-                    obj.copyin_int64([int(i) for i in tensor.int64_data])
+                    obj.copyin_int64(_parse_data(tensor))
                 elif tensor.data_type == TensorProto.FLOAT:
-                    obj.copyin_float([int(i) for i in tensor.float_data])
+                    obj.copyin_float(_parse_data(tensor))
                 else:
                     assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
 
@@ -398,14 +579,15 @@ class OnnxStub:
                     self.count_in += 1
                     name = "input{}".format(self.count_in)
                     self.names[tensor] = name
-                    shape = tensor.shape()
-                    dtype = backend.tensor_dtype(tensor)
-                    value_info = make_tensor_value_info(name, dtype, shape)
-                    check_value_info(value_info)
-                    self.inputs.append(value_info)
                     if init != None:
                         init.name = name
                         self.initializers.append(init)
+                    else:
+                        shape = tensor.shape()
+                        dtype = backend.tensor_dtype(tensor)
+                        value_info = make_tensor_value_info(name, dtype, shape)
+                        check_value_info(value_info)
+                        self.inputs.append(value_info)
                 return name
 
             def push_data_input(
@@ -417,11 +599,8 @@ class OnnxStub:
                 vals: Any,
             ) -> str:
                 name = "{}_{}".format(node_name, attr_name)
-                value_info = make_tensor_value_info(name, elem_type, shape)
                 tensor = make_tensor(name, elem_type, shape, vals)
-                check_value_info(value_info)
                 check_tensor(tensor)
-                self.inputs.append(value_info)
                 self.initializers.append(tensor)
                 return name
 
@@ -459,20 +638,40 @@ class OnnxStub:
                 for (i, it) in enumerate(op.outputs())
             ]
             if ty == backend.OpType.Conv:
-                ph, pw, sh, sw, dh, dw = backend.conv_attrs_of(op)
+                ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
                 ctx.push_node(
                     make_node(
                         ty.name,
                         inputs,
                         outputs,
                         name,
+                        pads=[ph, pw, ph, pw],
+                        strides=[sh, sw],
+                        dilations=[dh, dw],
+                        group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1],
+                    )
+                )
+            elif ty == backend.OpType.ConvTrans:
+                ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        "ConvTranspose",
+                        inputs,
+                        outputs,
+                        name,
                         pads=[ph, pw],
                         strides=[sh, sw],
                         dilations=[dh, dw],
+                        output_padding=[oph, opw],
                     )
                 )
             elif ty == backend.OpType.Matmul:
-                ctx.push_node(make_node("MatMul", inputs, outputs, name))
+                transA, transB = backend.matmul_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        "Gemm", inputs, outputs, name, transA=transA, transB=transB
+                    )
+                )
             elif ty == backend.OpType.BatchNorm:
                 inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
                 momentum, eps, training = backend.batch_norm_attrs_of(op)
@@ -496,7 +695,7 @@ class OnnxStub:
                         outputs,
                         name,
                         kernel_shape=[kh, kw],
-                        pads=[ph, pw],
+                        pads=[ph, pw, ph, pw],
                         dilations=[dh, dw],
                         strides=[sh, sw],
                     )
@@ -510,7 +709,7 @@ class OnnxStub:
                         outputs,
                         name,
                         kernel_shape=[kh, kw],
-                        pads=[ph, pw],
+                        pads=[ph, pw, ph, pw],
                         strides=[sh, sw],
                     )
                 )
@@ -526,17 +725,21 @@ class OnnxStub:
                 backend.OpType.Softmax,
                 backend.OpType.Abs,
                 backend.OpType.Identity,
+                backend.OpType.PRelu,
             ]:
                 ctx.push_node(make_node(ty.name, inputs, outputs, name))
             elif ty == backend.OpType.Flatten:
                 raise Exception("TODO")
+            elif ty == backend.OpType.Transpose:
+                perm = backend.transpose_permute_of(op)
+                ctx.push_node(make_node(ty.name, inputs, outputs, name, perm=perm))
             elif ty == backend.OpType.Reshape:
                 shape = backend.reshape_shape_of(op)
                 inputs.append(
                     ctx.push_data_input(
                         name,
                         "shape",
-                        TensorProto.INT32,
+                        TensorProto.INT64,
                         [len(shape)],
                         shape,
                     )
@@ -545,29 +748,81 @@ class OnnxStub:
             elif ty == backend.OpType.Concat:
                 axis = backend.concat_axis_of(op)
                 ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
+            elif ty == backend.OpType.Split:
+                axis = backend.split_axis_of(op)
+                num_outputs = len(outputs)
+                split = op.inputs()[0].shape()[axis] // num_outputs
+                inputs.append(
+                    ctx.push_data_input(
+                        name,
+                        "split",
+                        TensorProto.INT64,
+                        [len(outputs)],
+                        [split for _ in range(0, num_outputs)],
+                    )
+                )
+                ctx.push_node(
+                    make_node(
+                        ty.name,
+                        inputs,
+                        outputs,
+                        name,
+                        axis=axis,
+                    )
+                )
             elif ty == backend.OpType.Gather:
                 axis = backend.gather_axis_of(op)
                 ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
             elif ty == backend.OpType.ReduceMean:
-                axes = backend.reduce_mean_axes_of(op)
+                axes, keepdims = backend.reduce_mean_attrs_of(op)
                 inputs.append(
                     ctx.push_data_input(
-                        name, "axes", TensorProto.INT32, [len(axes)], axes
+                        name, "axes", TensorProto.INT64, [len(axes)], axes
                     )
                 )
-                ctx.push_node(make_node(ty.name, inputs, outputs, name, keepdims=1))
+                ctx.push_node(
+                    make_node(ty.name, inputs, outputs, name, keepdims=keepdims)
+                )
             elif ty == backend.OpType.Slice:
                 raise Exception("TODO")
             elif ty == backend.OpType.Pad:
-                raise Exception("TODO")
+                pads = backend.pad_pads_of(op)
+                inputs.append(
+                    ctx.push_data_input(
+                        name, "pads", TensorProto.INT64, [len(pads)], pads
+                    )
+                )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
+            elif ty == backend.OpType.Clip:
+                min, max = backend.clip_attrs_of(op)
+                if min != None:
+                    inputs.append(
+                        ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
+                    )
+                else:
+                    inputs.append(
+                        ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
+                    )
+                if max != None:
+                    inputs.append(
+                        ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
+                    )
+                else:
+                    inputs.append(
+                        ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
+                    )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
             else:
-                raise Exception("Unsupported OpType {}".format(ty.name))
+                raise Exception("Unsupported OpType", ty)
 
         return ctx.build(name)
 
     def init(self) -> None:
         self.handler.data_malloc()
 
+    def optimize(self) -> None:
+        self.handler.optimize()
+
     def run(self) -> None:
         self.handler.run()
 
@@ -576,9 +831,39 @@ def from_onnx(model: ModelProto, runtime):
     stub = OnnxStub(model, runtime)
     return stub.inputs, stub.outputs, stub.handler
 
-def run_onnx(model: ModelProto, runtime):
-    stub = OnnxStub(model, runtime)
-    stub.run()
+
+def _search_shape(model: ModelProto, name: str) -> List[int]:
+    ans = (
+        next(
+            (
+                [
+                    (d.dim_value if d.dim_value > 0 else 1)
+                    for d in tensor.type.tensor_type.shape.dim
+                ]
+                for tensor in model.graph.value_info
+                if tensor.name == name
+            ),
+            None,
+        )
+        or next(
+            (
+                [
+                    (d.dim_value if d.dim_value > 0 else 1)
+                    for d in tensor.type.tensor_type.shape.dim
+                ]
+                for tensor in model.graph.input
+                if tensor.name == name
+            ),
+            None,
+        )
+        or next(
+            [int(d) for d in tensor.dims]
+            for tensor in model.graph.initializer
+            if tensor.name == name
+        )
+    )
+    return ans
+
 
 def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
     for attr in node.attribute:
@@ -598,15 +883,8 @@ def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[st
     return attrs
 
 
-def _parse_data(tensor: TensorProto) -> List[Union[int, float]]:
-    if tensor.data_type == TensorProto.INT32:
-        return [int(i) for i in tensor.int32_data]
-    elif tensor.data_type == TensorProto.INT64:
-        return [int(i) for i in tensor.int64_data]
-    elif tensor.data_type == TensorProto.FLOAT:
-        return [float(i) for i in tensor.float_data]
-    else:
-        assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
+def _parse_data(tensor: TensorProto) -> List[Any]:
+    return to_array(tensor).flatten().tolist()
 
 
 def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py
index 99ad3b9d..fd589eeb 100644
--- a/pyinfinitensor/tests/test_onnx.py
+++ b/pyinfinitensor/tests/test_onnx.py
@@ -7,18 +7,20 @@ from onnx.helper import (
     make_graph,
     make_tensor_value_info,
 )
-from onnx.checker import check_model
-from pyinfinitensor.onnx import from_onnx, backend, runtime, run_onnx
+from onnx.checker import check_model, check_graph
+from onnx.shape_inference import infer_shapes
+from pyinfinitensor.onnx import from_onnx, OnnxStub, backend
 
 
 def make_and_import_model(graph: onnx.GraphProto):
+    check_graph(graph)
     model = make_model(graph)
     check_model(model)
-    from_onnx(model, runtime)
+    from_onnx(model, backend.cpu_runtime())
 
 
 class TestStringMethods(unittest.TestCase):
-    #def test_run(self):
+    # def test_run(self):
     #    model_file = next(
     #        (name for name in os.listdir() if name.endswith(".onnx")), None
     #    )
@@ -31,16 +33,17 @@ class TestStringMethods(unittest.TestCase):
     #        run_onnx(onnx.load(model_file), runtime)
 
     def test_load(self):
-        model_file = next(
-            (name for name in os.listdir() if name.endswith(".onnx")), None
-        )
-        if model_file != None:
-            print(
-                "model: {file}({size:.2f} MiB)".format(
-                    file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
+        for model_file in os.listdir():
+            if model_file.endswith(".onnx"):
+                print(
+                    "model: {file}({size:.2f} MiB)".format(
+                        file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
+                    )
                 )
-            )
-            from_onnx(onnx.load(model_file), runtime)
+                model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
+                    "new"
+                )
+                model = infer_shapes(model)
 
     def test_tensor(self):
         x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3])
@@ -55,7 +58,7 @@ class TestStringMethods(unittest.TestCase):
             ["i", "w"],
             ["o"],
             "conv",
-            pads=[1, 1],
+            pads=[1, 1, 1, 1],
             strides=[2, 1],
             dilations=[1, 2],
         )
@@ -102,7 +105,7 @@ class TestStringMethods(unittest.TestCase):
             ["y"],
             kernel_shape=[3, 3],
             dilations=[1, 1],
-            pads=[0, 0],
+            pads=[0, 0, 0, 0],
             strides=[2, 2],
             name="maxPool",
         )
@@ -116,7 +119,7 @@ class TestStringMethods(unittest.TestCase):
             ["x"],
             ["y"],
             kernel_shape=[3, 3],
-            pads=[0, 0],
+            pads=[0, 0, 0, 0],
             strides=[2, 2],
             name="avgPool",
         )
@@ -206,7 +209,7 @@ class TestStringMethods(unittest.TestCase):
 
     def test_flatten(self):
         x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
-        y = make_tensor_value_info("y", TensorProto.FLOAT, [1*3,  5 * 7])
+        y = make_tensor_value_info("y", TensorProto.FLOAT, [1 * 3, 5 * 7])
         flatten = make_node("Flatten", ["x"], ["y"], axis=2, name="flatten")
         # make_and_import_model(
         make_graph([flatten], "flatten", [x], [y])
@@ -254,22 +257,19 @@ class TestStringMethods(unittest.TestCase):
 
     def test_slice(self):
         data = make_tensor_value_info("data", TensorProto.UINT32, [10, 64, 162, 162])
-        output = make_tensor_value_info("output", TensorProto.UINT32, [1, 0, 99, 95])
-        starts = make_tensor_value_info("starts", TensorProto.INT64, [4])
-        starts_data = make_tensor("starts", TensorProto.INT64, [4], [2, 10, 1, 5])
-        ends = make_tensor_value_info("ends", TensorProto.INT64, [4])
-        ends_data = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100])
+        output = make_tensor_value_info("output", TensorProto.UINT32, [1, 1, 99, 95])
+        starts = make_tensor("starts", TensorProto.INT64, [4], [2, 9, 1, 5])
+        ends = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100])
         slice = make_node("Slice", ["data", "starts", "ends"], ["output"], name="slice")
-        # FIXME 后端的实现是 axis:[start,end]，onnx 的实现是 axis:[start,end)
-        # make_and_import_model(
-        make_graph(
-            [slice],
-            "slice",
-            [data, starts, ends],
-            [output],
-            [starts_data, ends_data],
+        make_and_import_model(
+            make_graph(
+                [slice],
+                "slice",
+                [data],
+                [output],
+                [starts, ends],
+            )
         )
-        # )
 
     def test_pad(self):
         data = make_tensor_value_info("data", TensorProto.UINT32, [1, 64, 162, 162])
@@ -300,10 +300,10 @@ class TestStringMethods(unittest.TestCase):
         graph = make_graph([matmul, add], "lr", [x, a, b], [y])
         model = make_model(graph)
         check_model(model)
-        from_onnx(model, runtime)
+        from_onnx(model, backend.cpu_runtime())
 
     def test_frontend(self):
-        handler = backend.GraphHandler(runtime)
+        handler = backend.GraphHandler(backend.cpu_runtime())
         a = handler.tensor([1, 2, 3], 12)
         b = handler.tensor([1, 2, 3], 12)
         c = handler.tensor([1, 2, 3], 12)
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 90147320..f52f8af7 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -114,6 +114,15 @@ bool GraphObj::topo_sort() {
     return this->sorted = true;
 }
 
+void GraphObj::optimize() {
+    for (auto &op : ops) {
+        switch (op->getOpType()) {
+        default:
+            break;
+        }
+    }
+}
+
 void GraphObj::dataMalloc() {
     for (auto &tensor : tensors) {
         tensor->dataMalloc();
diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc
index 424ca276..61db36ac 100644
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@@ -11,6 +11,8 @@
 #include "operators/reshape.h"
 #include "operators/slice.h"
 #include "operators/softmax.h"
+#include "operators/split.h"
+#include "operators/transpose.h"
 #include "operators/unary.h"
 
 namespace infini {
@@ -35,6 +37,24 @@ Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,
     }
 }
 
+Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
+                                         Tensor output, int ph, int pw, int sh,
+                                         int sw, int dh, int dw, int oph,
+                                         int opw) {
+    if (output) {
+        g->addOpWithOutputs<ConvTransposed2dObj>(std::move(input),
+                                                 std::move(weight), output, ph,
+                                                 pw, sh, sw, dh, dw, oph, opw);
+        return output;
+    } else {
+        return g
+            ->addOp<ConvTransposed2dObj>(std::move(input), std::move(weight),
+                                         output, ph, pw, sh, sw, dh, dw, oph,
+                                         opw)
+            ->getOutput();
+    }
+}
+
 Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
                                bool transB, Tensor bias, ActType act) {
     if (y) {
@@ -128,9 +148,31 @@ DEFINE_UNARY_METHOD(relu, Relu)
 DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
 DEFINE_UNARY_METHOD(tanh, Tanh)
 DEFINE_UNARY_METHOD(abs, Abs)
+DEFINE_UNARY_METHOD(shape, Shape)
+
 // see operators/reshape.h
 DEFINE_UNARY_METHOD(identity, Identity)
 
+Tensor GraphHandlerObj::pRelu(Tensor x, Tensor slope, Tensor y) {
+    if (y) {
+        g->addOpWithOutputs<PReluObj>(std::move(x), std::move(slope), y);
+        return y;
+    } else {
+        return g->addOp<PReluObj>(std::move(x), std::move(slope), y)
+            ->getOutput();
+    }
+}
+
+Tensor GraphHandlerObj::clip(Tensor x, Tensor y, std::optional<float> min,
+                             std::optional<float> max) {
+    if (y) {
+        g->addOpWithOutputs<ClipObj>(std::move(x), y, min, max);
+        return y;
+    } else {
+        return g->addOp<ClipObj>(std::move(x), y, min, max)->getOutput();
+    }
+}
+
 Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
     if (output) {
         g->addOpWithOutputs<SoftmaxObj>(std::move(input), output, axis);
@@ -151,6 +193,16 @@ Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
     }
 }
 
+Tensor GraphHandlerObj::transpose(Tensor data, Tensor transposed, Shape perm) {
+    if (transposed) {
+        g->addOpWithOutputs<TransposeObj>(std::move(data), transposed, perm);
+        return transposed;
+    } else {
+        return g->addOp<TransposeObj>(std::move(data), transposed, perm)
+            ->getOutput();
+    }
+}
+
 Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
     if (reshaped) {
         g->addOpWithOutputs<ReshapeObj>(std::move(data), reshaped,
@@ -171,6 +223,18 @@ Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
     }
 }
 
+TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
+                                 int axis, int num_outputs) {
+    if (outputs) {
+        g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
+                                      num_outputs);
+        return *outputs;
+    } else {
+        return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
+            ->getOutputs();
+    }
+}
+
 Tensor GraphHandlerObj::gather(Tensor data, Tensor indices, Tensor output,
                                int axis) {
     if (output) {
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index fd5ddde4..e63039d5 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -64,79 +64,24 @@ vector<size_t> TensorObj::getStride() const {
 
 void TensorObj::printData() const {
     IT_ASSERT(data != nullptr);
-    void *ptr = nullptr;
-    Blob buffer;
-    if (!runtime->isCpu()) {
-        buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
-        runtime->copyBlobToCPU(buffer->getPtr<void *>(),
-                               getRawDataPtr<void *>(), getBytes());
-        ptr = buffer->getPtr<void *>();
-    } else
-        ptr = data->getPtr<float *>();
-    if (dtype == DataType::Float32)
-        printDataFloat(static_cast<float *>(ptr));
-    else if (dtype == DataType::UInt32)
-        printDataUint32_t(static_cast<uint32_t *>(ptr));
-    else
+    if (!runtime->isCpu())
         IT_TODO_HALT();
-}
 
-void TensorObj::printDataFloat(float *ptr) const {
-    std::cout << "Tensor: " << guid << std::endl;
-    auto numDims = shape.size();
-    auto dimSzVec = std::vector<int>(numDims, 1);
-    dimSzVec[numDims - 1] = shape[numDims - 1];
-    for (int i = numDims - 1; i != 0; --i)
-        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
-    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
-        if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
-            printf("... , ");
-            i = iEnd - 20;
-            continue;
-        }
-        for (size_t j = 0; j < numDims; ++j) {
-            if (i % dimSzVec[j] == 0) {
-                std::cout << "[";
-            }
-        }
-        printf("%.1f", ptr[i]);
-        for (size_t j = 0; j < numDims; ++j) {
-            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
-                std::cout << "]";
-            }
-        }
-        if (i != size() - 1)
-            std::cout << ", ";
-        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
-            std::cout << std::endl;
-    }
-}
+#define TRY_PRINT(N)                                                           \
+    if (dtype == DataType(N))                                                  \
+        std::cout << dataToString<DT<N>::t>() << std::endl;
 
-void TensorObj::printDataUint32_t(uint32_t *ptr) const {
-    IT_ASSERT(data != nullptr);
-    std::cout << "Tensor: " << guid << std::endl;
-    auto numDims = shape.size();
-    auto dimSzVec = std::vector<int>(numDims, 1);
-    dimSzVec[numDims - 1] = shape[numDims - 1];
-    for (int i = numDims - 1; i != 0; --i)
-        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
-    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
-        for (size_t j = 0; j < numDims; ++j) {
-            if (i % dimSzVec[j] == 0) {
-                std::cout << "[";
-            }
-        }
-        std::cout << ptr[i];
-        for (size_t j = 0; j < numDims; ++j) {
-            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
-                std::cout << "]";
-            }
-        }
-        if (i != size() - 1)
-            std::cout << ", ";
-        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
-            std::cout << std::endl;
-    }
+    TRY_PRINT(0)          // fmt: new line
+    else TRY_PRINT(1)     //
+        else TRY_PRINT(2) //
+        else TRY_PRINT(3) //
+        else TRY_PRINT(4) //
+        else TRY_PRINT(5) //
+        else TRY_PRINT(6) //
+        else TRY_PRINT(7) //
+        else IT_TODO_HALT();
+
+#undef TRY_PRINT
 }
 
 bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
@@ -147,19 +92,27 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
     IT_ASSERT(rhs->getRuntime()->isCpu());
     if (size() != rhs->size())
         return false;
-    if (getDType() == DataType::UInt32)
-        return equalDataImpl(getRawDataPtr<uint32_t *>(),
-                             rhs->getRawDataPtr<uint32_t *>(), size(), 0);
-    else if (getDType() == DataType::Float32)
-        return equalDataImpl(getRawDataPtr<float *>(),
-                             rhs->getRawDataPtr<float *>(), size(),
-                             relativeError);
-    else
-        IT_TODO_HALT();
+
+#define TEST_EQUAL(N)                                                          \
+    if (dtype == DataType(N))                                                  \
+        return equalDataImpl(getRawDataPtr<DT<N>::t *>(),                      \
+                             rhs->getRawDataPtr<DT<N>::t *>(), size());
+
+    TEST_EQUAL(0)          // fmt: new line
+    else TEST_EQUAL(1)     //
+        else TEST_EQUAL(2) //
+        else TEST_EQUAL(3) //
+        else TEST_EQUAL(4) //
+        else TEST_EQUAL(5) //
+        else TEST_EQUAL(6) //
+        else TEST_EQUAL(7) //
+        else IT_TODO_HALT();
+
+#undef TEST_EQUAL
 }
 
 void TensorObj::dataMalloc() {
-    if (data == nullptr)
+    if (!data)
         data = runtime->allocBlob(getBytes());
 }
 
@@ -201,9 +154,9 @@ Shape TensorObj::getPosByOffset(size_t offset, Shape dim) const {
 size_t TensorObj::getOffsetByPos(Shape pos, Shape dim) const {
     int n = dim.size();
     size_t offset = pos.at(0);
-    for (auto i = 1; i < n; i++) {
+    for (auto i = 1; i < n; i++)
         offset = offset * dim.at(i) + pos.at(i);
-    }
+
     return offset;
 }
 
@@ -213,10 +166,10 @@ size_t TensorObj::getOffsetByBroadcastOffset(size_t bcOffset,
 
     Shape pos = bcPos;
     int n = shape.size();
-    for (auto i = 0; i < n; i++) {
+    for (auto i = 0; i < n; i++)
         if (shape.at(i) == 1)
             pos[i] = 0;
-    }
+
     return getOffsetByPos(pos, shape);
 }
 }; // namespace infini
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
index 94c7744a..37b5e7cf 100644
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -1,6 +1,7 @@
 #include "cuda/cuda_runtime.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
+#include "core/runtime.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
 namespace infini {
@@ -16,10 +17,11 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
         // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
-        if (perfData)
+        if (perfData) {
             kernel->compute(op, perfData, this);
-        else
+        } else {
             kernel->compute(op, this);
+        }
     }
 }
 
@@ -73,4 +75,4 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
 
 string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 1b58abd5..6bb77256 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -3,15 +3,24 @@
 #include "operators/concat.h"
 #include "operators/conv.h"
 #include "operators/gather.h"
+#include "operators/matmul.h"
+#include "operators/pad.h"
 #include "operators/pooling.h"
 #include "operators/reduce_mean.h"
 #include "operators/reshape.h"
+#include "operators/split.h"
+#include "operators/transpose.h"
+#include "operators/unary.h"
+#include <algorithm>
 #include <pybind11/stl.h>
 
 #ifdef USE_CUDA
 #include "cuda/cuda_runtime.h"
 #include "cuda/operator_timer.h"
 #endif
+#ifdef USE_BANG
+#include "bang/bang_runtime.h"
+#endif
 #ifdef USE_INTELCPU
 #include "intelcpu/mkl_runtime.h"
 #include "intelcpu/operator_timer.h"
@@ -57,6 +66,7 @@ void export_values(py::module &m) {
         .VALUE(OpType, G2BMM)
         .VALUE(OpType, GBMM)
         .VALUE(OpType, Pad)
+        .VALUE(OpType, Clip)
         .VALUE(OpType, Slice)
         .VALUE(OpType, Concat)
         .VALUE(OpType, Split)
@@ -78,11 +88,12 @@ void export_values(py::module &m) {
         .VALUE(OpType, Softmax)
         .VALUE(OpType, Activation)
         .VALUE(OpType, Relu)
+        .VALUE(OpType, PRelu)
         .VALUE(OpType, Sigmoid)
         .VALUE(OpType, Tanh)
         .VALUE(OpType, Abs)
         .VALUE(OpType, Resize)
-        .VALUE(OpType, MemBound)
+        .VALUE(OpType, Dropout)
         .export_values();
 
 #undef VALUE
@@ -112,6 +123,10 @@ static int tensor_dtype(Tensor t) {
 static Ref<CudaRuntimeObj> cuda_runtime() { return make_ref<CudaRuntimeObj>(); }
 #endif
 
+#ifdef USE_BANG
+static Ref<BangRuntimeObj> bang_runtime() { return make_ref<BangRuntimeObj>(); }
+#endif
+
 #ifdef USE_INTELCPU
 static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
 #endif
@@ -123,11 +138,27 @@ static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
                            conv->getDw(), conv->getSh(), conv->getSw());
 }
 
+static std::tuple<int, int, int, int, int, int, int, int>
+conv_trans_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::ConvTrans);
+    auto conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
+    auto [oph, opw] = conv->getOutputPadding();
+    return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
+                           conv->getDw(), conv->getSh(), conv->getSw(), oph,
+                           opw);
+}
+
+static std::tuple<bool, bool> matmul_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Matmul);
+    auto matmul = dynamic_cast<const MatmulObj *>(op.get());
+    return std::make_tuple(matmul->getTransA(), matmul->getTransB());
+}
+
 static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
     IT_ASSERT(op->getOpType() == OpType::BatchNorm);
     auto batchnorm = dynamic_cast<const BatchNormObj *>(op.get());
     return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(),
-                           batchnorm->getTraining());
+                           batchnorm->getTrainingMode());
 }
 
 static std::tuple<int, int, int, int, int, int, int, int>
@@ -140,45 +171,88 @@ pool_attrs_of(Operator op) {
                            pool->getSh(), pool->getSw());
 }
 
+static std::tuple<std::optional<float>, std::optional<float>>
+clip_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Clip);
+    auto clip = dynamic_cast<const ClipObj *>(op.get());
+    return std::make_tuple(clip->getMin(), clip->getMax());
+}
+
+static std::tuple<vector<int>, bool> reduce_mean_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::ReduceMean);
+    auto reduce_mean = dynamic_cast<const ReduceMeanObj *>(op.get());
+    auto &set = reduce_mean->getAxes();
+    return std::make_tuple(vector(set.begin(), set.end()),
+                           reduce_mean->getKeepDims());
+}
+
 static int concat_axis_of(Operator op) {
     IT_ASSERT(op->getOpType() == OpType::Concat);
     return dynamic_cast<const ConcatObj *>(op.get())->getDim();
 }
 
+static int split_axis_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Split);
+    return dynamic_cast<const SplitObj *>(op.get())->getDim();
+}
+
 static int gather_axis_of(Operator op) {
     IT_ASSERT(op->getOpType() == OpType::Gather);
     return dynamic_cast<const GatherObj *>(op.get())->getAxis();
 }
 
-static vector<int> reduce_mean_axes_of(Operator op) {
-    IT_ASSERT(op->getOpType() == OpType::ReduceMean);
-    auto &set = dynamic_cast<const ReduceMeanObj *>(op.get())->getAxes();
-    return vector(set.begin(), set.end());
+static vector<int64_t> reshape_shape_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Reshape);
+    auto shape = dynamic_cast<const ReshapeObj *>(op.get())->getShape();
+    vector<int64_t> ans(shape.size());
+    std::transform(shape.begin(), shape.end(), ans.begin(),
+                   [](auto x) { return static_cast<int64_t>(x); });
+    return ans;
 }
 
-static Shape reshape_shape_of(Operator op) {
-    IT_ASSERT(op->getOpType() == OpType::Reshape);
-    return dynamic_cast<const ReshapeObj *>(op.get())->getShape();
+static vector<int64_t> pad_pads_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Pad);
+    auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
+    vector<int64_t> ans(shape.size());
+    std::transform(shape.begin(), shape.end(), ans.begin(),
+                   [](auto x) { return static_cast<int64_t>(x); });
+    return ans;
+}
+
+static vector<int> transpose_permute_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Transpose);
+    return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
 }
 
 void export_functions(py::module &m) {
 #define FUNCTION(NAME) def(#NAME, &NAME)
+    m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
 #ifdef USE_CUDA
-    m.def("runtime", cuda_runtime)
-#elif USE_INTELCPU
-    m.def("runtime", intelcpu_runtime)
-#else
-    m.def("runtime", &NativeCpuRuntimeObj::getInstance)
+        .def("cuda_runtime", cuda_runtime)
+#endif
+#ifdef USE_INTELCPU
+        .def("intelcpu_runtime", intelcpu_runtime)
+#endif
+#ifdef USE_CUDA
+        .FUNCTION(cuda_runtime)
+#endif
+#ifdef USE_BANG
+        .FUNCTION(bang_runtime)
 #endif
-
         .FUNCTION(conv_attrs_of)
+        .FUNCTION(conv_trans_attrs_of)
+        .FUNCTION(matmul_attrs_of)
         .FUNCTION(batch_norm_attrs_of)
         .FUNCTION(pool_attrs_of)
+        .FUNCTION(clip_attrs_of)
+        .FUNCTION(reduce_mean_attrs_of)
         .FUNCTION(tensor_dtype)
         .FUNCTION(reshape_shape_of)
+        .FUNCTION(pad_pads_of)
+        .FUNCTION(transpose_permute_of)
         .FUNCTION(concat_axis_of)
-        .FUNCTION(gather_axis_of)
-        .FUNCTION(reduce_mean_axes_of);
+        .FUNCTION(split_axis_of)
+        .FUNCTION(gather_axis_of);
 #undef FUNCTION
 }
 
@@ -191,6 +265,10 @@ void init_graph_builder(py::module &m) {
 #ifdef USE_CUDA
     py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
         m, "CudaRuntime");
+#endif
+#ifdef USE_BANG
+    py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
+        m, "BangRuntime");
 #endif
     py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
         .def("fuid", &TensorObj::getFuid, policy::automatic)
@@ -215,6 +293,7 @@ void init_graph_builder(py::module &m) {
         .def(py::init<Runtime>())
         .def("tensor", &Handler::tensor, policy::move)
         .def("conv", &Handler::conv, policy::move)
+        .def("convTransposed2d", &Handler::convTransposed2d, policy::move)
         .def("matmul", &Handler::matmul, policy::move)
         .def("batchNorm", &Handler::batchNorm, policy::move)
         .def("maxPool", &Handler::maxPool, policy::move)
@@ -229,15 +308,21 @@ void init_graph_builder(py::module &m) {
         .def("tanh", &Handler::tanh, policy::move)
         .def("softmax", &Handler::softmax, policy::move)
         .def("abs", &Handler::abs, policy::move)
+        .def("shape", &Handler::shape, policy::move)
         .def("identity", &Handler::identity, policy::move)
         .def("flatten", &Handler::flatten, policy::move)
+        .def("pRelu", &Handler::pRelu, policy::move)
+        .def("clip", &Handler::clip, policy::move)
+        .def("transpose", &Handler::transpose, policy::move)
         .def("reshape", &Handler::reshape, policy::move)
         .def("concat", &Handler::concat, policy::move)
+        .def("split", &Handler::split, policy::move)
         .def("gather", &Handler::gather, policy::move)
         .def("reduce_mean", &Handler::reduceMean, policy::move)
         .def("slice", &Handler::slice, policy::move)
         .def("pad", &Handler::pad, policy::move)
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
+        .def("optimize", &Handler::optimize, policy::automatic)
         .def("operators", &Handler::operators, policy::move)
         .def("data_malloc", &Handler::data_malloc, policy::automatic)
         .def("run", &Handler::run, policy::automatic);
diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc
new file mode 100644
index 00000000..935e2746
--- /dev/null
+++ b/src/kernels/bang/activation.cc
@@ -0,0 +1,208 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class UnaryCnnl : public BangKernelWithoutConfig {
+    virtual cnnlActivationMode_t getOpType() const = 0;
+    virtual float getCoef() const = 0;
+    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        int len = dim.size();
+        int size = 1;
+        for (int i = 0; i < len; ++i) {
+            size *= dim[i];
+        }
+
+        int dim_array[1] = {size};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, 1, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, 1, dim_array));
+
+        // get op descriptor
+        cnnlActivationDescriptor_t opDesc;
+        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
+        checkCnnlError(cnnlSetActivationDescriptor(
+            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
+
+        auto [alpha, beta] = getAlphBeta();
+        cnnlStatus_t stat =
+            cnnlActivationForward(context->cnnlHandle(), opDesc, &alpha, aDesc,
+                                  aData, &beta, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
+    }
+};
+
+class RoundCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class SquareCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlSquare(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class PReluCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PReluObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        int alpha_array[4] = {1, 1, 1, 1};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, alpha_array));
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
+                                      bDesc, bData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class ReluCnnl : public UnaryCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_RELU;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+class SigmoidCnnl : public UnaryCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_SIGMOID;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+class TanhCnnl : public UnaryCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_TANH;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
+                "Relu_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl,
+                "PRelu_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
+                "Sigmoid_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Tanh, DataType::Float32, TanhCnnl,
+                "Tanh_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
+                "Round_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Square, DataType::Float32, SquareCnnl,
+                "Square_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/activation_backward.cc b/src/kernels/bang/activation_backward.cc
new file mode 100644
index 00000000..78be8c0d
--- /dev/null
+++ b/src/kernels/bang/activation_backward.cc
@@ -0,0 +1,94 @@
+#include "operators/activation_backward.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class ActivationBackwardCnnl : public BangKernelWithoutConfig {
+    virtual cnnlActivationMode_t getOpType() const = 0;
+    virtual float getCoef() const = 0;
+    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ActivationBackwardObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const diffYData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const xData = (op->getInputs(2)->getRawDataPtr<void *>());
+        void *const diffXData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get op descriptor
+        cnnlActivationDescriptor_t opDesc;
+        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
+        checkCnnlError(cnnlSetActivationDescriptor(
+            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
+
+        auto [alpha, beta] = getAlphBeta();
+        cnnlStatus_t stat = cnnlActivationBackward(
+            context->cnnlHandle(), opDesc, &alpha, yDesc, yData, diffYDesc,
+            diffYData, xDesc, xData, &beta, diffXDesc, diffXData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(yDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(xDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(diffXDesc));
+        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
+    }
+};
+
+class ReluBackwardCnnl : public ActivationBackwardCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_RELU;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+class SigmoidBackwardCnnl : public ActivationBackwardCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_SIGMOID;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+class TanhBackwardCnnl : public ActivationBackwardCnnl {
+    cnnlActivationMode_t getOpType() const override {
+        return CNNL_ACTIVATION_TANH;
+    }
+    float getCoef() const override { return 0.0; }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
+                ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
+                SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
+                TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/cast.cc b/src/kernels/bang/cast.cc
new file mode 100644
index 00000000..35da0190
--- /dev/null
+++ b/src/kernels/bang/cast.cc
@@ -0,0 +1,185 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class CastCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<CastObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        cnnlCastDataType_t NlCastType;
+        CastObj::CastType type = op->getType();
+        switch (type) {
+        case CastObj::Float2Int64:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_INT64;
+            break;
+        case CastObj::Float2Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_INT32;
+            break;
+        case CastObj::Float2Int16:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_INT16;
+            break;
+        case CastObj::Float2Int8:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_INT8;
+            break;
+        case CastObj::Int322Float:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            NlCastType = CNNL_CAST_INT32_TO_FLOAT;
+            break;
+        case CastObj::Int322Int8:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            NlCastType = CNNL_CAST_INT32_TO_INT8;
+            break;
+        case CastObj::Int322Int16:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            NlCastType = CNNL_CAST_INT32_TO_INT16;
+            break;
+        case CastObj::Int162Float:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            NlCastType = CNNL_CAST_INT16_TO_FLOAT;
+            break;
+        case CastObj::Int162Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_INT16_TO_INT32;
+            break;
+        case CastObj::Int82Float:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            NlCastType = CNNL_CAST_INT8_TO_FLOAT;
+            break;
+        case CastObj::Int82Int16:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            NlCastType = CNNL_CAST_INT8_TO_INT16;
+            break;
+        case CastObj::Int82Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_INT8_TO_INT32;
+            break;
+        case CastObj::Uint82Float:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            NlCastType = CNNL_CAST_UINT8_TO_FLOAT;
+            break;
+        case CastObj::Uint82Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_UINT8_TO_INT32;
+            break;
+        case CastObj::Uint82Int64:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            NlCastType = CNNL_CAST_UINT8_TO_INT64;
+            break;
+        case CastObj::Int322Int64:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            NlCastType = CNNL_CAST_INT32_TO_INT64;
+            break;
+        case CastObj::Int642Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_INT64_TO_INT32;
+            break;
+        case CastObj::Int642Uint32:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
+            NlCastType = CNNL_CAST_INT64_TO_UINT32;
+            break;
+        case CastObj::Int642Float:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            NlCastType = CNNL_CAST_INT64_TO_FLOAT;
+            break;
+        case CastObj::Uint322Int64:
+            checkCnnlError(cnnlSetTensorDescriptor(
+                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            NlCastType = CNNL_CAST_UINT32_TO_INT64;
+            break;
+        default:
+            IT_TODO_HALT();
+        }
+        cnnlStatus_t stat = cnnlCastDataType(context->cnnlHandle(), aDesc,
+                                             aData, NlCastType, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
+                "Cast_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/ceil.cc b/src/kernels/bang/ceil.cc
new file mode 100644
index 00000000..5770f412
--- /dev/null
+++ b/src/kernels/bang/ceil.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class CeilCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
+                "Ceil_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/clip.cc b/src/kernels/bang/clip.cc
new file mode 100644
index 00000000..bdfb473b
--- /dev/null
+++ b/src/kernels/bang/clip.cc
@@ -0,0 +1,42 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class ClipCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ClipObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        float min = op->getMin().value();
+        float max = op->getMax().value();
+
+        cnnlTensorDescriptor_t aDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
+                "Clip_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/concat.cc b/src/kernels/bang/concat.cc
new file mode 100644
index 00000000..1bfc1d33
--- /dev/null
+++ b/src/kernels/bang/concat.cc
@@ -0,0 +1,68 @@
+#include "operators/concat.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class ConcatCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConcatObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+        int num = op->numInputs();
+        int axis = op->getDim();
+        void *argv[num];
+        for (int i = 0; i < num; ++i) {
+            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
+        }
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t desc;
+
+        int dim_array[num][4];
+        for (int i = 0; i < num; ++i) {
+            auto dim = op->getInputs(i)->getDims();
+            if (dim.size() != 4) {
+                IT_TODO_HALT();
+            }
+            dim_array[i][0] = dim[0];
+            dim_array[i][1] = dim[1];
+            dim_array[i][2] = dim[2];
+            dim_array[i][3] = dim[3];
+        }
+
+        auto dim = op->getOutput()->getDims();
+        int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array));
+        cnnlTensorDescriptor_t descArray[num];
+        for (int i = 0; i < num; ++i) {
+            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
+            checkCnnlError(
+                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
+                                        CNNL_DTYPE_FLOAT, 4, dim_array[i]));
+        }
+
+        size_t wsSize;
+        cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlConcat(context->cnnlHandle(), num, axis, descArray, argv,
+                       wsData, wsSize, desc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        for (int i = 0; i < num; ++i) {
+            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
+        }
+        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
+                "Concat_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/conv_trans.cc b/src/kernels/bang/conv_trans.cc
new file mode 100644
index 00000000..3e22c03d
--- /dev/null
+++ b/src/kernels/bang/conv_trans.cc
@@ -0,0 +1,88 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/conv.h"
+
+namespace infini {
+class ConvTransCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvBaseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+
+        int pad[4] = {ph, ph, pw, pw};
+        int stride[2] = {sh, sw};
+        int dilation[2] = {dh, dw};
+
+        cnnlConvolutionDescriptor_t convDesc;
+        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
+        checkCnnlError(cnnlSetConvolutionDescriptor(
+            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dimInputs0 = op->getInputs(0)->getDims();
+        auto dimInputs1 = op->getInputs(1)->getDims();
+        auto dimOutput = op->getOutput()->getDims();
+
+        if (dimInputs0.size() != 4)
+            IT_TODO_HALT();
+        if (dimInputs1.size() != 4)
+            IT_TODO_HALT();
+        if (dimOutput.size() != 4)
+            IT_TODO_HALT();
+
+        int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
+                          dimInputs0[3]};
+        int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
+                          dimInputs1[3]};
+        int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
+                         dimOutput[3]};
+
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs0));
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inputs1));
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, output));
+
+        cnnlConvolutionBwdDataAlgo_t algo;
+        cnnlGetConvolutionBackwardDataAlgorithm(
+            context->cnnlHandle(), aDesc, bDesc, convDesc, cDesc,
+            CNNL_CONVOLUTION_BWD_DATA_FASTEST, &algo);
+        size_t wsSize;
+        cnnlGetConvolutionBackwardDataWorkspaceSize(context->cnnlHandle(),
+                                                    aDesc, bDesc, convDesc,
+                                                    cDesc, algo, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat = cnnlConvolutionBackwardData(
+            context->cnnlHandle(), NULL, aDesc, aData, bDesc, bData, convDesc,
+            algo, wsData, wsSize, NULL, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::ConvTrans, DataType::Float32,
+                ConvTransCnnl, "ConvTrans_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/convbpfilter.cc b/src/kernels/bang/convbpfilter.cc
new file mode 100644
index 00000000..b360cedb
--- /dev/null
+++ b/src/kernels/bang/convbpfilter.cc
@@ -0,0 +1,159 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/conv.h"
+
+namespace infini {
+class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvBackwardFilterObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+
+        int pad[4] = {ph, ph, pw, pw};
+        int stride[2] = {sh, sw};
+        int dilation[2] = {dh, dw};
+
+        cnnlConvolutionDescriptor_t convDesc;
+        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
+        checkCnnlError(cnnlSetConvolutionDescriptor(
+            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, aDescTrans, bDescTrans,
+            cDescTrans;
+        auto dimInputs0 = op->getInputs(0)->getDims();
+        auto dimInputs1 = op->getInputs(1)->getDims();
+        auto dimOutput = op->getOutput()->getDims();
+
+        if (dimInputs0.size() != 4)
+            IT_TODO_HALT();
+        if (dimInputs1.size() != 4)
+            IT_TODO_HALT();
+        if (dimOutput.size() != 4)
+            IT_TODO_HALT();
+
+        int inputs0Array[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
+                               dimInputs0[3]};
+        int inputs1Array[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
+                               dimInputs1[3]};
+        int outputArray[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
+                              dimOutput[3]};
+
+        int inputs0ArrayTrans[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
+                                    dimInputs0[1]};
+        int inputs1ArrayTrans[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
+                                    dimInputs1[1]};
+        int outputArrayTrans[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
+                                   dimOutput[1]};
+
+        int transMode[4] = {0, 2, 3, 1};
+        cnnlTransposeDescriptor_t transDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&transDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(transDesc, 4, transMode));
+
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
+        checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
+                                               CNNL_DTYPE_FLOAT, 4,
+                                               inputs0ArrayTrans));
+
+        size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
+                              dimInputs0[3] * sizeof(float);
+        BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
+
+        cnnlStatus_t stat =
+            cnnlTranspose(context->cnnlHandle(), transDesc, aDesc, aData,
+                          aDescTrans, wsTrans1Data);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
+        checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
+                                               CNNL_DTYPE_FLOAT, 4,
+                                               inputs1ArrayTrans));
+
+        size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
+                              dimInputs1[3] * sizeof(float);
+        BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
+
+        stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
+                             bDescTrans, wsTrans2Data);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
+        checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
+                                               CNNL_DTYPE_FLOAT, 4,
+                                               outputArrayTrans));
+
+        size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
+                              dimOutput[3] * sizeof(float);
+        BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
+
+        cnnlConvolutionBwdFilterAlgo_t algo;
+        cnnlGetConvolutionBackwardFilterAlgorithm(
+            context->cnnlHandle(), convDesc, aDescTrans, bDescTrans, cDescTrans,
+            CNNL_CONVOLUTION_BWD_FILTER_FASTEST, &algo);
+
+        size_t wsSize;
+        cnnlGetConvolutionBackwardFilterWorkspaceSize(
+            context->cnnlHandle(), aDescTrans, bDescTrans, cDescTrans, convDesc,
+            algo, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        stat = cnnlConvolutionBackwardFilter(
+            context->cnnlHandle(), NULL, aDescTrans, wsTrans1Data, bDescTrans,
+            wsTrans2Data, convDesc, algo, wsData, wsSize, NULL, cDescTrans,
+            wsTrans3Data);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        int transMode2[4] = {0, 3, 1, 2};
+        cnnlTransposeDescriptor_t transOutputDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&transOutputDesc));
+        checkCnnlError(
+            cnnlSetTransposeDescriptor(transOutputDesc, 4, transMode2));
+
+        stat = cnnlTranspose(context->cnnlHandle(), transOutputDesc, cDescTrans,
+                             wsTrans3Data, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDescTrans));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDescTrans));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDescTrans));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(transDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(transOutputDesc));
+        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
+                ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/copy.cc b/src/kernels/bang/copy.cc
new file mode 100644
index 00000000..37987729
--- /dev/null
+++ b/src/kernels/bang/copy.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class CopyCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlCopy(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Copy, DataType::Float32, CopyCnnl,
+                "Copy_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/det.cc b/src/kernels/bang/det.cc
new file mode 100644
index 00000000..02726e69
--- /dev/null
+++ b/src/kernels/bang/det.cc
@@ -0,0 +1,53 @@
+#include "operators/det.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class DetCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<DetObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        DetObj::Mode mode = op->getMode();
+        cnnlDetMode_t nlMode;
+        if (mode == DetObj::LogDet) {
+            nlMode = CNNL_DET_MODE_LOGDET;
+        } else {
+            nlMode = CNNL_DET_MODE_DET;
+        }
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dimin = op->getInputs(0)->getDims();
+        auto dimout = op->getOutput()->getDims();
+        if (dimin.size() != 4 || dimout.size() != 2)
+            IT_TODO_HALT();
+
+        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
+        int dimout_array[2] = {dimout[0], dimout[1]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array));
+
+        cnnlStatus_t stat =
+            cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
+                "Det_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc
index 74505e9b..bca41e6f 100644
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@@ -1,5 +1,4 @@
 #include "operators/element_wise.h"
-#include "bang/bang_element_wise.h"
 #include "bang/bang_kernel_without_config.h"
 #include "bang/bang_runtime.h"
 
@@ -66,6 +65,514 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
     }
 };
 
+class LogicOpCnnl : public BangKernelWithoutConfig {
+    virtual cnnlLogicOp_t getOpType() const = 0;
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+                                    &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlLogicOp(context->cnnlHandle(), getOpType(), aDesc, aData, bDesc,
+                        bData, wsData, wsSize, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class BitComputeCnnl : public BangKernelWithoutConfig {
+    virtual cnnlBitComputeOp_t getOpType() const = 0;
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_INT32, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_INT32, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_INT32, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
+                                       cDesc, &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlBitCompute_v2(context->cnnlHandle(), getOpType(), aDesc, aData,
+                              bDesc, bData, cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class DivCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+                                &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat = cnnlDiv_v2(
+            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
+            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class MaximumCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get op descriptor
+        size_t wsSize;
+        cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlMaximum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
+                        cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class MinimumCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get op descriptor
+        size_t wsSize;
+        cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlMinimum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
+                        cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class MSELossCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<MSELossObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        MSELossObj::Reduction reduction = op->getReduction();
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        int dim_out[4] = {1, 1, 1, 1};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        if (reduction == MSELossObj::None) {
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+        } else {
+            checkCnnlError(cnnlSetTensorDescriptor(
+                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out));
+        }
+        cnnlStatus_t stat;
+        if (reduction == MSELossObj::None) {
+            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
+                               aData, bDesc, bData, cDesc, cData);
+        } else if (reduction == MSELossObj::Sum) {
+            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_SUM, aDesc,
+                               aData, bDesc, bData, cDesc, cData);
+        } else {
+            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_MEAN, aDesc,
+                               aData, bDesc, bData, cDesc, cData);
+        }
+
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class PowerCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get op descriptor
+        size_t wsSize;
+        cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+                                &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlPow(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                    aDesc, aData, bDesc, bData, wsData, wsSize, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class FloorDivCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+                                     &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat = cnnlFloorDiv_v2(
+            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
+            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class FloorModCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+                                     &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData,
+                         cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        size_t wsSize;
+        cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
+                                              bDesc, cDesc, &wsSize);
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlSquaredDifference(context->cnnlHandle(), aDesc, aData, bDesc,
+                                  bData, cDesc, cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
 class AddCnnl : public ElementWiseCnnl {
     cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
 };
@@ -81,12 +588,57 @@ class MulCnnl : public ElementWiseCnnl {
     cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; }
 };
 
-class ElementWiseBang : public BangKernelWithoutConfig {
-    void compute(const Operator &_op,
-                 const RuntimeObj *_context) const override {
-        element_wise_kernel(_context, _op);
-    }
+class EqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_EQ; }
 };
+class NotEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NE; }
+};
+class GreaterThanCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GT; }
+};
+class GreaterEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GE; }
+};
+class LessThanCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LT; }
+};
+class LessEqualCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LE; }
+};
+class AndCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_AND; }
+};
+class OrCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_OR; }
+};
+class XorCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_XOR; }
+};
+class NotCnnl : public LogicOpCnnl {
+    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NOT; }
+};
+
+class BitAndCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BAND_OP; }
+};
+class BitOrCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BOR_OP; }
+};
+class BitXorCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BXOR_OP; }
+};
+class BitNotCnnl : public BitComputeCnnl {
+    cnnlBitComputeOp_t getOpType() const override { return CNNL_BNOT_OP; }
+};
+// class BitLeftShiftCnnl : public BitComputeCnnl {
+//     cnnlBitComputeOp_t getOpType() const override { return
+//     CNNL_BLEFT_SHIFT_OP_V2; }
+// };
+// class BitRightShiftCnnl : public BitComputeCnnl {
+//     cnnlBitComputeOp_t getOpType() const override { return
+//     CNNL_BLEFT_SHIFT_OP_V2; }
+// };
 
 REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                 "Add_cnnl_BANG_Float32");
@@ -95,8 +647,56 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
 REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                 "Mul_cnnl_BANG_Float32");
 
-REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
-                "Div_Bang_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
+                "Div_cnnl_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Maximum, DataType::Float32, MaximumCnnl,
+                "Maximum_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Minimum, DataType::Float32, MinimumCnnl,
+                "Minimum_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
+                "MSELoss_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Power, DataType::Float32, PowerCnnl,
+                "Power_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
+                "FloorDiv_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
+                "FloorMod_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
+                SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
+                "Equal_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::NotEqual, DataType::Float32, NotEqualCnnl,
+                "NotEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::GreaterThan, DataType::Float32,
+                GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::GreaterEqual, DataType::Float32,
+                GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::LessThan, DataType::Float32, LessThanCnnl,
+                "LessThan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::LessEqual, DataType::Float32,
+                LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
+                "And_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
+                "Or_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
+                "Xor_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
+                "Not_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitAnd, DataType::Float32, BitAndCnnl,
+                "BitAnd_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitOr, DataType::Float32, BitOrCnnl,
+                "BitOr_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitXor, DataType::Float32, BitXorCnnl,
+                "BitXor_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::BitNot, DataType::Float32, BitNotCnnl,
+                "BitNot_cnnl_BANG_Float32");
+// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
+// BitLeftShiftCnnl,
+//                 "BitLeftShift_cnnl_BANG_Float32");
+// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
+// BitRightShiftCnnl,
+//                 "BitRightShift_cnnl_BANG_Float32");
 // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
 // ElementWiseBang,
 //                 "Pow_Bang_Float32");
diff --git a/src/kernels/bang/erf.cc b/src/kernels/bang/erf.cc
new file mode 100644
index 00000000..86c1e3a3
--- /dev/null
+++ b/src/kernels/bang/erf.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class ErfCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                       aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
+                "Erf_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/exp.cc b/src/kernels/bang/exp.cc
new file mode 100644
index 00000000..9d7d31f4
--- /dev/null
+++ b/src/kernels/bang/exp.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class ExpCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                       aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
+                "Exp_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/fill.cc b/src/kernels/bang/fill.cc
new file mode 100644
index 00000000..0f8fb846
--- /dev/null
+++ b/src/kernels/bang/fill.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class FillCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<FillObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        float value = op->getValue();
+
+        cnnlTensorDescriptor_t cDesc;
+        auto dim = op->getOutput()->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlFill(context->cnnlHandle(), value, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Fill, DataType::Float32, FillCnnl,
+                "Fill_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/floor.cc b/src/kernels/bang/floor.cc
new file mode 100644
index 00000000..a0f2a082
--- /dev/null
+++ b/src/kernels/bang/floor.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class FloorCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Floor, DataType::Float32, FloorCnnl,
+                "Floor_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/hardtanh.cc b/src/kernels/bang/hardtanh.cc
new file mode 100644
index 00000000..1c4ad697
--- /dev/null
+++ b/src/kernels/bang/hardtanh.cc
@@ -0,0 +1,42 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class HardtanhCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<HardtanhObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        float min = op->getMin();
+        float max = op->getMax();
+
+        cnnlTensorDescriptor_t aDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
+                                         max, min, aDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Hardtanh, DataType::Float32, HardtanhCnnl,
+                "Hardtanh_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/l2loss.cc b/src/kernels/bang/l2loss.cc
new file mode 100644
index 00000000..d7c66859
--- /dev/null
+++ b/src/kernels/bang/l2loss.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class L2LossCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<L2LossObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::L2Loss, DataType::Float32, L2LossCnnl,
+                "L2Loss_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/log.cc b/src/kernels/bang/log.cc
new file mode 100644
index 00000000..4976b1ca
--- /dev/null
+++ b/src/kernels/bang/log.cc
@@ -0,0 +1,62 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class LogCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<LogObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        auto type = op->getType();
+        cnnlLogBase_t base;
+        switch (type) {
+        case LogObj::Log2:
+            base = CNNL_LOG_2;
+            break;
+        case LogObj::LogE:
+            base = CNNL_LOG_E;
+            break;
+        case LogObj::Log10:
+            base = CNNL_LOG_10;
+            break;
+        default:
+            IT_TODO_HALT();
+        }
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                       base, aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Log, DataType::Float32, LogCnnl,
+                "Log_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/matmul.cc b/src/kernels/bang/matmul.cc
index b30ecb87..98f30947 100644
--- a/src/kernels/bang/matmul.cc
+++ b/src/kernels/bang/matmul.cc
@@ -18,19 +18,27 @@ class MatmulCnnl : public BangKernelWithoutConfig {
         auto dimInputs0 = op->getInputs(0)->getDims();
         auto dimInputs1 = op->getInputs(1)->getDims();
         auto dimOutput = op->getOutput()->getDims();
-        if (dimInputs0.size() != 3)
-            IT_TODO_HALT();
-        if (dimInputs1.size() != 3)
-            IT_TODO_HALT();
-        if (dimOutput.size() != 3)
-            IT_TODO_HALT();
+        int input0_batch_size = 1;
+        int input1_batch_size = 1;
+        int output_batch_size = 1;
+        for (size_t i = 0; i < dimInputs0.size() - 2; ++i) {
+            input0_batch_size *= dimInputs0[i];
+            input1_batch_size *= dimInputs1[i];
+            output_batch_size *= dimOutput[i];
+        }
 
         bool transA = op->getTransA();
         bool transB = op->getTransB();
 
-        int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
-        int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
-        int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
+        int inputs0Array[3] = {input0_batch_size,
+                               dimInputs0[dimInputs0.size() - 2],
+                               dimInputs0[dimInputs0.size() - 1]};
+        int inputs1Array[3] = {input1_batch_size,
+                               dimInputs1[dimInputs1.size() - 2],
+                               dimInputs1[dimInputs1.size() - 1]};
+        int outputArray[3] = {output_batch_size,
+                              dimOutput[dimOutput.size() - 2],
+                              dimOutput[dimOutput.size() - 1]};
 
         // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
diff --git a/src/kernels/bang/negtensor.cc b/src/kernels/bang/negtensor.cc
new file mode 100644
index 00000000..b0171120
--- /dev/null
+++ b/src/kernels/bang/negtensor.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class NegTensorCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Neg, DataType::Float32, NegTensorCnnl,
+                "Neg_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/pad.cc b/src/kernels/bang/pad.cc
new file mode 100644
index 00000000..e211ee93
--- /dev/null
+++ b/src/kernels/bang/pad.cc
@@ -0,0 +1,65 @@
+#include "operators/pad.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class PadCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PadObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getOutput()->getDims();
+        int dim_size = dim.size();
+        int dim_array[dim_size];
+        for (int i = 0; i < dim_size; ++i) {
+            dim_array[i] = dim[i];
+        }
+        int paddings[dim_size * 2];
+        std::vector<int> pads = op->getPads();
+        if (pads.size() == 2 && dim_size != 1) {
+            for (int i = 0; i < dim_size * 2; i += 2) {
+                paddings[i] = pads[0];
+                paddings[i + 1] = pads[1];
+            }
+        } else {
+            for (int i = 0; i < dim_size * 2; i += 2) {
+                paddings[i] = pads[i / 2];
+                paddings[i + 1] = pads[i / 2 + dim_size];
+            }
+        }
+        int dimout_array[dim_size];
+        for (int i = 0; i < dim_size; ++i) {
+            dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1];
+        }
+        float paddingValue = 0.0;
+        // input
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array));
+        // output
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dim_size,
+                                               dimout_array));
+
+        cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
+                                    paddings, &paddingValue, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Pad, DataType::Float32, PadCnnl,
+                "Pad_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/pooling.cc b/src/kernels/bang/pooling.cc
new file mode 100644
index 00000000..5abbbf56
--- /dev/null
+++ b/src/kernels/bang/pooling.cc
@@ -0,0 +1,73 @@
+#include "operators/pooling.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class PoolingCnnl : public BangKernelWithoutConfig {
+    virtual cnnlPoolingMode_t getPoolingMode() const = 0;
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PoolingObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
+
+        const auto [n, c, h, w, kh, kw] = op->getNCHWRS();
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+
+        // get inputs
+        int inArray[4] = {n, c, h, w};
+        cnnlTensorDescriptor_t inDesc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, inArray));
+
+        // get maxpool descriptor
+        cnnlPoolingDescriptor_t poolingDesc;
+        checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
+        checkCnnlError(cnnlSetPooling2dDescriptor_v2(
+            poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
+            ph, pw, pw, sh, sw, dh, dw, false));
+
+        // get outputs
+        auto outVec = op->getOutput()->getDims();
+        int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
+        cnnlTensorDescriptor_t outDesc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, outArray));
+        size_t wsSize;
+        cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
+                                    outVec[3], outVec[2], &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        float alpha = 1.f, beta = 0.f;
+        checkCnnlError(cnnlPoolingForward(context->cnnlHandle(), poolingDesc,
+                                          &alpha, inDesc, inData, &beta,
+                                          outDesc, outData, wsData, wsSize));
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
+        checkCnnlError(cnnlDestroyPoolingDescriptor(poolingDesc));
+    }
+};
+
+class maxPoolCnnl : public PoolingCnnl {
+    cnnlPoolingMode_t getPoolingMode() const override {
+        return CNNL_POOLING_MAX;
+    }
+};
+
+class avgPoolCnnl : public PoolingCnnl {
+    cnnlPoolingMode_t getPoolingMode() const override {
+        return CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::MaxPool, DataType::Float32, maxPoolCnnl,
+                "MaxPool_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::AvgPool, DataType::Float32, avgPoolCnnl,
+                "AvgPool_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/reciprocal.cc b/src/kernels/bang/reciprocal.cc
new file mode 100644
index 00000000..38a22fd6
--- /dev/null
+++ b/src/kernels/bang/reciprocal.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class ReciprocalCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Reciprocal, DataType::Float32,
+                ReciprocalCnnl, "Reciprocal_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/reshape.cc b/src/kernels/bang/reshape.cc
new file mode 100644
index 00000000..5b4e8613
--- /dev/null
+++ b/src/kernels/bang/reshape.cc
@@ -0,0 +1,42 @@
+#include "operators/reshape.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class CopyBang : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ReshapeObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
+        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
+        cnnlTensorDescriptor_t aDesc;
+        auto dim = op->getInputs(0)->getDims();
+        int len = dim.size();
+        int size = 1;
+        for (int i = 0; i < len; ++i) {
+            size *= dim[i];
+        }
+
+        int dim_array[1] = {size};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, 1, dim_array));
+        cnnlStatus_t stat =
+            cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+    }
+};
+// reshape/flatten/identity all act as copying from input to output.
+REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Float32, CopyBang,
+                "Reshape_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Flatten, DataType::Float32, CopyBang,
+                "Flatten_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Identity, DataType::Float32, CopyBang,
+                "Identity_BANG_Float32");
+
+} // namespace infini
diff --git a/src/kernels/bang/rsqrt.cc b/src/kernels/bang/rsqrt.cc
new file mode 100644
index 00000000..fea06e13
--- /dev/null
+++ b/src/kernels/bang/rsqrt.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class RsqrtCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                         aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Rsqrt, DataType::Float32, RsqrtCnnl,
+                "Rsqrt_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/split.cc b/src/kernels/bang/split.cc
new file mode 100644
index 00000000..bfa842bc
--- /dev/null
+++ b/src/kernels/bang/split.cc
@@ -0,0 +1,69 @@
+#include "operators/split.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class SplitCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SplitObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+        int num = op->numOutputs();
+        int axis = op->getDim();
+        void *argv[num];
+        for (int i = 0; i < num; ++i) {
+            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
+        }
+        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t desc;
+
+        int dimout_array[num][4];
+        for (int i = 0; i < num; ++i) {
+            auto dim = op->getOutput(i)->getDims();
+            if (dim.size() != 4) {
+                IT_TODO_HALT();
+            }
+            dimout_array[i][0] = dim[0];
+            dimout_array[i][1] = dim[1];
+            dimout_array[i][2] = dim[2];
+            dimout_array[i][3] = dim[3];
+        }
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4) {
+            IT_TODO_HALT();
+        }
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
+        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        cnnlTensorDescriptor_t descArray[num];
+        for (int i = 0; i < num; ++i) {
+            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
+            checkCnnlError(
+                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
+                                        CNNL_DTYPE_FLOAT, 4, dimout_array[i]));
+        }
+
+        size_t wsSize;
+        cnnlGetSplitWorkspaceSize(context->cnnlHandle(), num, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlSplit(context->cnnlHandle(), num, axis, desc, inputData, wsData,
+                      wsSize, descArray, argv);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        for (int i = 0; i < num; ++i) {
+            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
+        }
+        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Split, DataType::Float32, SplitCnnl,
+                "Split_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/sqrt.cc b/src/kernels/bang/sqrt.cc
new file mode 100644
index 00000000..68715912
--- /dev/null
+++ b/src/kernels/bang/sqrt.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class SqrtCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        cnnlStatus_t stat =
+            cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
+                        aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Sqrt, DataType::Float32, SqrtCnnl,
+                "Sqrt_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/transpose.cc b/src/kernels/bang/transpose.cc
new file mode 100644
index 00000000..c484824c
--- /dev/null
+++ b/src/kernels/bang/transpose.cc
@@ -0,0 +1,60 @@
+#include "operators/transpose.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class TransposeCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<TransposeObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dimin = op->getInputs(0)->getDims();
+        auto dimout = op->getOutput()->getDims();
+        if (dimin.size() != 4 || dimout.size() != 4)
+            IT_TODO_HALT();
+
+        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
+        int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array));
+
+        // get op descriptor
+        auto permute = op->getPermute();
+        cnnlTransposeDescriptor_t opDesc;
+        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
+        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute.data()));
+
+        size_t wsSize;
+        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc,
+                                      &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlTranspose_v2(context->cnnlHandle(), opDesc, aDesc, aData, cDesc,
+                             cData, wsData, wsSize);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Transpose, DataType::Float32,
+                TransposeCnnl, "Transpose_cnnl_BANG_Float32");
+}; // namespace infini
diff --git a/src/kernels/bang/trigon.cc b/src/kernels/bang/trigon.cc
new file mode 100644
index 00000000..4378aa6e
--- /dev/null
+++ b/src/kernels/bang/trigon.cc
@@ -0,0 +1,184 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class TrigonCnnl : public BangKernelWithoutConfig {
+    virtual cnnlTrigonFunctionMode_t getOpType() const = 0;
+    virtual cnnlComputationPreference_t getPrefer() const = 0;
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get op descriptor
+        cnnlTrigonDescriptor_t opDesc;
+        checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
+        checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
+
+        cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
+                                              aDesc, aData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destories in BANG does not require sync. But cnnl does not state
+        // whether sync is required before destories.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc));
+    }
+};
+
+class SinCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_SIN;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class CosCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_COS;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class TanCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_TAN;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ASinCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ASIN;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ACosCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ACOS;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ATanCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ATAN;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class SinHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_SINH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class CosHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_COSH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class TanHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_TANH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ASinHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ASINH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ACosHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ACOSH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+class ATanHCnnl : public TrigonCnnl {
+    cnnlTrigonFunctionMode_t getOpType() const override {
+        return CNNL_TRIGON_ATANH;
+    }
+    cnnlComputationPreference_t getPrefer() const override {
+        return CNNL_COMPUTATION_HIGH_PRECISION;
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Sin, DataType::Float32, SinCnnl,
+                "Sin_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Cos, DataType::Float32, CosCnnl,
+                "Cos_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Tan, DataType::Float32, TanCnnl,
+                "Tan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ASin, DataType::Float32, ASinCnnl,
+                "ASin_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ACos, DataType::Float32, ACosCnnl,
+                "ACos_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ATan, DataType::Float32, ATanCnnl,
+                "ATan_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::SinH, DataType::Float32, SinHCnnl,
+                "SinH_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::CosH, DataType::Float32, CosHCnnl,
+                "CosH_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::TanH, DataType::Float32, TanHCnnl,
+                "TanH_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ASinH, DataType::Float32, ASinHCnnl,
+                "ASinH_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ACosH, DataType::Float32, ACosHCnnl,
+                "ACosH_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::ATanH, DataType::Float32, ATanHCnnl,
+                "ATanH_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/cpu/element_wise.cc b/src/kernels/cpu/element_wise.cc
index 81fed9e4..8657a1fe 100644
--- a/src/kernels/cpu/element_wise.cc
+++ b/src/kernels/cpu/element_wise.cc
@@ -11,17 +11,37 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
         T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
         T *outptr = op->getOutput()->getRawDataPtr<T *>();
 
-        auto outDim = op->getOutput()->getDims();
+        int a[4] = {1, 1, 1, 1};
+        int b[4] = {1, 1, 1, 1};
+        int c[4] = {1, 1, 1, 1};
+        auto a_input = op->getInputs(0)->getDims();
+        auto b_input = op->getInputs(1)->getDims();
+        auto c_output = op->getOutput()->getDims();
+        std::copy(a_input.begin(), a_input.end(), a + (4 - a_input.size()));
+        std::copy(b_input.begin(), b_input.end(), b + (4 - b_input.size()));
+        std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size()));
+
         auto n = op->getOutput()->size();
-        for (size_t offset = 0; offset < n; offset++) {
-            // For now,we only process the same dims here, broardcast will be
-            // considered in the opt layer.
-            /*auto offset0 =
-                op->getInputs(0)->getOffsetByBroadcastOffset(offset, outDim);
-            auto offset1 =
-                op->getInputs(1)->getOffsetByBroadcastOffset(offset, outDim);
-            outptr[offset] = doCompute(inptr0[offset0], inptr1[offset1]);*/
-            outptr[offset] = doCompute(inptr0[offset], inptr1[offset]);
+        for (size_t i = 0; i < n; ++i) {
+            int c0_index = i / (c[1] * c[2] * c[3]);
+            int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]);
+            int c2_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) / c[3];
+            int c3_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) % c[3];
+
+            int a0_index = c0_index % a[0];
+            int a1_index = c1_index % a[1];
+            int a2_index = c2_index % a[2];
+            int a3_index = c3_index % a[3];
+
+            int b0_index = c0_index % b[0];
+            int b1_index = c1_index % b[1];
+            int b2_index = c2_index % b[2];
+            int b3_index = c3_index % b[3];
+            outptr[i] = doCompute(
+                inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] +
+                       a2_index * a[3] + a3_index],
+                inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] +
+                       b2_index * b[3] + b3_index]);
         }
     }
 };
@@ -55,4 +75,4 @@ REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv<uint32_t>,
                 "divNaive_CPU_uint32");
 REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>,
                 "divNaive_CPU_float32");
-}; // namespace infini
\ No newline at end of file
+}; // namespace infini
diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc
index 8464a0d9..c32e5652 100644
--- a/src/kernels/cpu/unary.cc
+++ b/src/kernels/cpu/unary.cc
@@ -56,6 +56,25 @@ template <typename T> class NaiveAbs : public NativeUnary<T> {
     T doCompute(T val) const override { return val < 0 ? -val : val; }
 };
 
+template <typename T> class Clip : public CpuKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *context) const override {
+        auto op = as<ClipObj>(_op);
+        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
+        T *outptr = op->getOutput()->getRawDataPtr<T *>();
+        auto minValue = op->getMin();
+        auto maxValue = op->getMax();
+
+        auto n = op->getOutput()->size();
+        for (size_t offset = 0; offset < n; offset++) {
+            auto val = *inptr++;
+            *outptr++ = (minValue && val < *minValue)   ? *minValue
+                        : (maxValue && val > *maxValue) ? *maxValue
+                                                        : val;
+        }
+    }
+};
+
 REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
                 NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
 REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
@@ -76,4 +95,6 @@ REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32,
                 NaiveSoftmax<uint32_t>, "softmaxNaive_CPU_uint32");
 REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
                 NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
+REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
+                "Clip_CPU_float32");
 }; // namespace infini
diff --git a/src/kernels/cuda/batch_norm.cc b/src/kernels/cuda/batch_norm.cc
index ce1aaf27..111137ff 100644
--- a/src/kernels/cuda/batch_norm.cc
+++ b/src/kernels/cuda/batch_norm.cc
@@ -20,18 +20,17 @@ class BatchNormCudnn : public CudaKernelWithoutConfig {
         auto dims = op->getInputs(0)->getDims();
         // Only 4D and 5D tensors are supported by
         // cudnnBatchNormalizationForwardInference
-        IT_ASSERT(dims.size() == 4 || dims.size() == 5);
+        IT_ASSERT(dims.size() == 4);
 
-        int dimArray[CUDNN_DIM_MAX], strideArray[CUDNN_DIM_MAX],
-            dimPArray[CUDNN_DIM_MAX], stridePArray[CUDNN_DIM_MAX];
+        int dimArray[4], strideArray[4], dimPArray[4], stridePArray[4];
         for (size_t i = 0; i < dims.size(); ++i) {
             dimArray[i] = dims[i];
             strideArray[i] = op->getInputs(0)->getStride()[i];
             dimPArray[i] = 1;
             stridePArray[i] = 1;
         }
-        dimPArray[1] = op->getInputs(0)->getDims()[1];
-        stridePArray[1] = op->getInputs(0)->getStride()[1];
+        dimPArray[1] = op->getInputs(1)->getDims()[0];
+        stridePArray[0] = op->getInputs(1)->getDims()[0];
         // get inputs
         cudnnTensorDescriptor_t inDesc;
         checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
diff --git a/src/kernels/cuda/clip.cc b/src/kernels/cuda/clip.cc
new file mode 100644
index 00000000..b4865504
--- /dev/null
+++ b/src/kernels/cuda/clip.cc
@@ -0,0 +1,27 @@
+#include "cuda/cuda_clip.h"
+#include "cuda/cuda_kernel_wihtout_config.h"
+#include "cuda/cuda_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+
+class ClipCuda : public CudaKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ClipObj>(_op);
+
+        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
+        auto min = op->getMin();
+        auto max = op->getMax();
+        auto dim = op->getInputs(0)->getDims();
+        int num = dim[0] * dim[1] * dim[2] * dim[3];
+        clip_kernel((float *)inputData, (float *)outputData, num,
+                    min ? *min : NAN, max ? *max : NAN);
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
+                "Clip_CUDA_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/cuda/clip.cu b/src/kernels/cuda/clip.cu
new file mode 100644
index 00000000..eabc4926
--- /dev/null
+++ b/src/kernels/cuda/clip.cu
@@ -0,0 +1,32 @@
+#include "core/common.h"
+#include "core/constants.h"
+#include "cuda/cuda_common.h"
+#include <math.h>
+
+using infini::E_CONSTANT;
+constexpr unsigned int num_threads() { return 32 * 4; }
+constexpr int thread_work_size() { return 4; }
+constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+
+__global__ void _clip_kernel(float *input, float *output, int n, float minValue,
+                             float maxValue) {
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    int stride = blockDim.x * gridDim.x;
+    for (int i = index; i < n; i += stride) {
+        output[i] = (!isnan(minValue) && input[i] < minValue)
+                        ? minValue
+                        : (!isnan(maxValue) && input[i] > maxValue)
+                        ? maxValue : input[i];
+    }
+}
+
+namespace infini {
+void clip_kernel(float *input, float *output, int num, float minValue,
+                 float maxValue) {
+    int blocksize = block_work_size();
+    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    _clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
+                                          maxValue);
+}
+
+}; // namespace infini
diff --git a/src/kernels/cuda/element_wise.cc b/src/kernels/cuda/element_wise.cc
index 97835746..a74d1bdf 100644
--- a/src/kernels/cuda/element_wise.cc
+++ b/src/kernels/cuda/element_wise.cc
@@ -19,24 +19,37 @@ class ElementWiseCudnn : public CudaKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
 
         cudnnTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
+
+        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
             IT_TODO_HALT();
-        int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
+
+        int a[4] = {1, 1, 1, 1};
+        int b[4] = {1, 1, 1, 1};
+        int c[4] = {1, 1, 1, 1};
+
+        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
 
         // get inputs
         checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
-        checkCudnnError(cudnnSetTensor4dDescriptor(
-            aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
+        checkCudnnError(cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW,
+                                                   CUDNN_DATA_FLOAT, a[0], a[1],
+                                                   a[2], a[3]));
 
         checkCudnnError(cudnnCreateTensorDescriptor(&bDesc));
-        checkCudnnError(cudnnSetTensor4dDescriptor(
-            bDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
+        checkCudnnError(cudnnSetTensor4dDescriptor(bDesc, CUDNN_TENSOR_NCHW,
+                                                   CUDNN_DATA_FLOAT, b[0], b[1],
+                                                   b[2], b[3]));
 
         // get outputs
         checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
-        checkCudnnError(cudnnSetTensor4dDescriptor(
-            cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
+        checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW,
+                                                   CUDNN_DATA_FLOAT, c[0], c[1],
+                                                   c[2], c[3]));
 
         // get op descriptor
         cudnnOpTensorDescriptor_t opDesc;
@@ -81,13 +94,27 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
         float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
         float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
         float *const cData = (op->getOutput()->getRawDataPtr<float *>());
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
+
+        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
+            IT_TODO_HALT();
+
+        int a[4] = {1, 1, 1, 1};
+        int b[4] = {1, 1, 1, 1};
+        int c[4] = {1, 1, 1, 1};
+
+        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
 
-        auto dim = op->getInputs(0)->getDims();
-        int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
         if (op->getOpType() == OpType::Div)
-            div_kernel(aData, bData, cData, n * c * h * w);
+            div_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
+                       b[2], b[3], c[0], c[1], c[2], c[3]);
         else if (op->getOpType() == OpType::Pow)
-            pow_kernel(aData, bData, cData, n * c * h * w);
+            pow_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
+                       b[2], b[3], c[0], c[1], c[2], c[3]);
         else
             IT_TODO_HALT();
     }
diff --git a/src/kernels/cuda/element_wise.cu b/src/kernels/cuda/element_wise.cu
index be7d4495..b28f0144 100644
--- a/src/kernels/cuda/element_wise.cu
+++ b/src/kernels/cuda/element_wise.cu
@@ -5,34 +5,75 @@ constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
 
-__global__ void _div_kernel(float *x, float *y, float *z, int n) {
+__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
+                                                          int b0, int b1, int b2, int b3,
+                                                          int c0, int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
     int stride = blockDim.x * gridDim.x;
+    int n = c0 * c1 * c2 * c3;
+
     for (int i = index; i < n; i += stride) {
-        z[i] = x[i] / y[i];
+        int c0_index = i/ (c1 * c2 * c3);
+        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
+        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
+        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+
+        int a0_index = c0_index % a0;
+        int a1_index = c1_index % a1;
+        int a2_index = c2_index % a2;
+        int a3_index = c3_index % a3;
+
+        int b0_index = c0_index % b0;
+        int b1_index = c1_index % b1;
+        int b2_index = c2_index % b2;
+        int b3_index = c3_index % b3;
+        z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
     }
 }
 
-__global__ void _pow_kernel(float *x, float *y, float *z, int n) {
+__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
+                                                          int b0, int b1, int b2, int b3,
+                                                          int c0, int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
     int stride = blockDim.x * gridDim.x;
+    int n = c0 * c1 * c2 * c3;
+
     for (int i = index; i < n; i += stride) {
-        z[i] = pow(x[i], y[i]);
+        int c0_index = i/ (c1 * c2 * c3);
+        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
+        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
+        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+
+        int a0_index = c0_index % a0;
+        int a1_index = c1_index % a1;
+        int a2_index = c2_index % a2;
+        int a3_index = c3_index % a3;
+
+        int b0_index = c0_index % b0;
+        int b1_index = c1_index % b1;
+        int b2_index = c2_index % b2;
+        int b3_index = c3_index % b3;
+        z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
     }
 }
 
 namespace infini {
-void div_kernel(float *a, float *b, float *c, int num) {
+void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                                              int b0, int b1, int b2, int b3,
+                                              int c0, int c1, int c2, int c3) {
 
     int blocksize = block_work_size();
+    int num = c0*c1*c2*c3;
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _div_kernel<<<blocksize, gridsize>>>(a, b, c, num);
+    _div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
 }
-void pow_kernel(float *a, float *b, float *c, int num) {
-
+void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                                              int b0, int b1, int b2, int b3,
+                                              int c0, int c1, int c2, int c3) {
     int blocksize = block_work_size();
+    int num = c0*c1*c2*c3;
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _pow_kernel<<<blocksize, gridsize>>>(a, b, c, num);
+    _pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
 }
 
 }; // namespace infini
diff --git a/src/kernels/cuda/pad_slice.cc b/src/kernels/cuda/pad_slice.cc
index 04982a41..561a0923 100644
--- a/src/kernels/cuda/pad_slice.cc
+++ b/src/kernels/cuda/pad_slice.cc
@@ -34,7 +34,7 @@ class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
     void compute(const Operator &op,
                  const RuntimeObj *_context) const override {
         do_compute(op->getOutput(), op->getInputs(0),
-                   as<SliceObj>(op)->getStart(), false);
+                   as<SliceObj>(op)->getStarts(), false);
     }
 };
 
diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc
index a944b8af..b4ac496f 100644
--- a/src/kernels/cuda/unary.cc
+++ b/src/kernels/cuda/unary.cc
@@ -60,6 +60,52 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
     }
 };
 
+class SoftmaxCudnn : public CudaKernelWithoutConfig {
+    virtual cudnnSoftmaxAlgorithm_t getAlgorithmType() const = 0;
+    virtual cudnnSoftmaxMode_t getModeType() const = 0;
+    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
+
+        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cudnnTensorDescriptor_t inputDesc, outputDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() > 4)
+            IT_TODO_HALT();
+        int dim_array[4] = {1, 1, 1, 1};
+        memcpy(dim_array + (4 - dim.size()), dim.data(),
+               dim.size() * sizeof(int));
+
+        // get inputs
+        checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
+            dim_array[1], dim_array[2], dim_array[3]));
+
+        // get outputs
+        checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
+            dim_array[1], dim_array[2], dim_array[3]));
+
+        auto [alpha, beta] = getAlphBeta();
+        cudnnStatus_t stat = cudnnSoftmaxForward(
+            context->cudnnHandle(), getAlgorithmType(), getModeType(), &alpha,
+            inputDesc, inputData, &beta, outputDesc, outputData);
+        if (stat != CUDNN_STATUS_SUCCESS)
+            return;
+
+        // Destories in CUDA does not require sync. But cuDNN does not state
+        // whether sync is required before destories.
+        checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
+        checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));
+    }
+};
+
 class ReluCudnn : public ActivationCudnn {
     cudnnActivationMode_t getOpType() const override {
         return CUDNN_ACTIVATION_RELU;
diff --git a/src/kernels/intelcpu/slice.cc b/src/kernels/intelcpu/slice.cc
index 8a5a489f..663897cc 100644
--- a/src/kernels/intelcpu/slice.cc
+++ b/src/kernels/intelcpu/slice.cc
@@ -23,7 +23,7 @@ class MklSlice : public MklKernelWithoutConfig {
         std::vector<dnnl_dim_t> sDims, offsets;
         for (int i = 0; i < ndim; ++i) {
             sDims.push_back(oDims.at(i));
-            offsets.push_back(op->getStart().at(i));
+            offsets.push_back(op->getStarts().at(i));
         }
         auto sliceMd = srcMd.submemory_desc(sDims, offsets);
         auto sliceMemory =
diff --git a/src/kernels/mlu/CMakeLists.txt b/src/kernels/mlu/CMakeLists.txt
deleted file mode 100644
index b8cd41b0..00000000
--- a/src/kernels/mlu/CMakeLists.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-cmake_minimum_required(VERSION 3.3)
-project(bangops)
-include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
-set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3")
-set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC")
-
-# check `NEUWARE_HOME` env
-message(${NEUWARE_HOME})
-if(EXISTS ${NEUWARE_HOME})
-  include_directories("${NEUWARE_HOME}/include")
-  link_directories("${NEUWARE_HOME}/lib64")
-  link_directories("${NEUWARE_HOME}/lib")
-  set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}")
-else()
-  message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.")
-endif()
-
-# setup cmake search path
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
-  "${CMAKE_SOURCE_DIR}/cmake"
-  "${NEUWARE_HOME}/cmake"
-  "${NEUWARE_HOME}/cmake/modules"
-)
-
-# include FindBANG.cmake and check cncc
-find_package(BANG)
-if(NOT BANG_FOUND)
-  message(FATAL_ERROR "BANG cannot be found.")
-elseif (NOT BANG_CNCC_EXECUTABLE)
-  message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake")
-endif()
-
-# setup cncc flags
-set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread")
-set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3")
-set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220"
-                                         "--bang-mlu-arch=mtp_270"
-                                         "--bang-mlu-arch=mtp_290"
-                                         "--bang-mlu-arch=mtp_372"
-)
-
-file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu")
-bang_add_library(bangops SHARED ${src_files})
-
diff --git a/src/kernels/mlu/include/bang_div.h b/src/kernels/mlu/include/bang_div.h
deleted file mode 100644
index 0f81c721..00000000
--- a/src/kernels/mlu/include/bang_div.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#pragma once
-#include "cnnl.h"
-namespace infini {
-void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2,
-                float *output, const uint32_t num);
-
-}; // namespace infini
diff --git a/src/kernels/mlu/include/div.h b/src/kernels/mlu/include/div.h
deleted file mode 100644
index 83520d1f..00000000
--- a/src/kernels/mlu/include/div.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
-#define BANG_KERNELS_DIVOPERATION_DIV_H_
-
-__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1,
-                                       float *input2, uint32_t num);
-
-#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
diff --git a/src/kernels/mlu/src/div.mlu b/src/kernels/mlu/src/div.mlu
deleted file mode 100644
index 63456f96..00000000
--- a/src/kernels/mlu/src/div.mlu
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "bang_div.h"
-#include "div.h"
-namespace infini {
-void div_kernel(cnnlHandle_t handle,
-                const float *input1,
-                const float *input2,
-                float *output,
-                const uint32_t num) {
-    // 任务类型和调度方法
-    cnrtDim3_t k_dim;
-    cnrtFunctionType_t k_type;
-    cnrtQueue_t queue;
-    cnnlGetQueue(handle, &queue);
-    k_dim.x = 4;
-    k_dim.y = 8;
-    k_dim.z = 1;
-    k_type = CNRT_FUNC_TYPE_UNION1;
-    // launch 任务
-    MLUDivKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
-                                                  (float*)input1,
-                                                  (float*)input2,
-                                                  num);
-}
-};
diff --git a/src/kernels/mlu/src/div_device.mlu b/src/kernels/mlu/src/div_device.mlu
deleted file mode 100644
index bee83eb8..00000000
--- a/src/kernels/mlu/src/div_device.mlu
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "div.h"
-
-#define NRAM_USE_SIZE 102400
-
-__nram__ char left[NRAM_USE_SIZE];
-__nram__ char right[NRAM_USE_SIZE];
-__nram__ char output[NRAM_USE_SIZE];
-
-template<typename T>
-__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) {
-    int use_nram_size = NRAM_USE_SIZE;
-    int deal_align = use_nram_size / sizeof(T);
-    int num_per_core = num / taskDim;
-    int num_rem = num % taskDim;
-    int easy = num_per_core;
-    int hard = num_per_core + (num_rem != 0 ? 1 : 0);
-    int my = taskId < num_rem ? hard : easy;
-    int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
-    char* input1_start = (char*)input1 + start * sizeof(T);
-    char* input2_start = (char*)input2 + start * sizeof(T);
-    char* output_start = (char*)output1 + start * sizeof(T);
-
-    int my_repeat = my / deal_align;
-    int my_rem = my % deal_align;
-    for(int i = 0; i < my_repeat; ++i) {
-        __memcpy(left, input1_start, use_nram_size, GDRAM2NRAM);
-        __memcpy(right, input2_start, use_nram_size, GDRAM2NRAM);
-        __bang_active_recip((T*)right, (T*)right, deal_align);
-        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
-        __memcpy(output_start, output, use_nram_size, NRAM2GDRAM);
-        input1_start += use_nram_size;
-        input2_start += use_nram_size;
-        output_start += use_nram_size;
-    }
-    if(my_rem) {
-        __memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM);
-        __memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM);
-        __bang_active_recip((T*)right, (T*)right, deal_align);
-        __bang_mul((T*)output, (T*)left, (T*)right, deal_align);
-        __memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM);
-    }
-}
-
-__mlu_global__ void MLUDivKernelUnion1(float *output,
-                                       float *input1,
-                                       float *input2,
-                                       uint32_t num) {
-    DivFunction((float*)output, (float*)input1, (float*)input2, num);
-}
-
diff --git a/src/operators/activation_backward.cc b/src/operators/activation_backward.cc
new file mode 100644
index 00000000..47a42086
--- /dev/null
+++ b/src/operators/activation_backward.cc
@@ -0,0 +1,37 @@
+#include "operators/activation_backward.h"
+
+namespace infini {
+ActivationBackwardObj::ActivationBackwardObj(OpType type, GraphObj *graph,
+                                             Tensor y, Tensor diff_y, Tensor x,
+                                             Tensor diff_x)
+    : OperatorObj(type, {y, diff_y, x}, {diff_x}) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>>
+ActivationBackwardObj::inferShape(const TensorVec &inputs) const {
+    return {{inputs[0]->getDims()}};
+}
+
+std::string ActivationBackwardObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> ActivationBackwardObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> ActivationBackwardObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+}; // namespace infini
diff --git a/src/operators/batch_norm.cc b/src/operators/batch_norm.cc
index b744df5a..f85b72f1 100644
--- a/src/operators/batch_norm.cc
+++ b/src/operators/batch_norm.cc
@@ -3,10 +3,10 @@
 namespace infini {
 BatchNormObj::BatchNormObj(GraphObj *graph, Tensor input, Tensor output,
                            Tensor mean, Tensor var, Tensor scale, Tensor bias,
-                           float momentum, float eps, bool training)
+                           float momentum, float eps, bool trainingMode)
     : OperatorObj(OpType::BatchNorm, {input, mean, var, scale, bias}, {output}),
-      momentum(momentum), eps(eps), training(training) {
-    if (training)
+      momentum(momentum), eps(eps), trainingMode(trainingMode) {
+    if (trainingMode)
         IT_TODO_HALT();
 
     IT_ASSERT(checkValid(graph));
diff --git a/src/operators/conv.cc b/src/operators/conv.cc
index 78f0a891..36d97081 100644
--- a/src/operators/conv.cc
+++ b/src/operators/conv.cc
@@ -178,6 +178,81 @@ void ConvTransposed2dObj::setAuxilaryAttributes(PaddingMode mode) {
     }
 }
 
+void ConvBackwardFilterObj::setAuxilaryAttributes(PaddingMode mode) {
+    const Tensor &inputX = inputs[0];
+    const Tensor &diffY = inputs[1];
+    n = inputX->getDims()[0], c = inputX->getDims()[1],
+    h = inputX->getDims()[2], w = inputX->getDims()[3], f = diffY->getDims()[0],
+    r = diffY->getDims()[2], s = diffY->getDims()[3];
+    if (mode == PaddingMode::Same) {
+        int oh = h / sh;
+        int ow = w / sw;
+        ph = (h - oh * sh + (r - sh) * dh) / 2;
+        pw = (w - ow * sw + (s - sw) * dw) / 2;
+    } else if (mode == PaddingMode::Valid) {
+        ph = pw = 0;
+    }
+}
+
+ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
+                                             Tensor diffY, Tensor diffW, int ph,
+                                             int pw, int sh, int sw, int dh,
+                                             int dw, Tensor bias, ActType act)
+    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, ph, pw, sh, sw, dh, dw,
+                  inputX, diffY),
+      act(act) {
+    if (bias)
+        IT_TODO_HALT();
+    setAuxilaryAttributes(PaddingMode::Other);
+    IT_ASSERT(checkValid(graph));
+}
+
+ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
+                                             Tensor diffY, Tensor diffW,
+                                             PaddingMode mode, int sh, int sw,
+                                             int dh, int dw, Tensor bias,
+                                             ActType act)
+    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, mode, sh, sw, dh, dw,
+                  inputX, diffY),
+      act(act) {
+    if (bias)
+        IT_TODO_HALT();
+    setAuxilaryAttributes(mode);
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>>
+ConvBackwardFilterObj::inferShape(const TensorVec &inputs) const {
+    const auto &inputX = inputs[0], &diffY = inputs[1];
+    auto n = inputX->getDims()[0];
+    auto h = inputX->getDims()[2];
+    auto w = inputX->getDims()[3];
+    auto f = diffY->getDims()[0];
+    auto r = diffY->getDims()[2];
+    auto s = diffY->getDims()[3];
+    int on = n, oc = f;
+    int oh = 0, ow = 0;
+    // For NCHW+FCRS layout, C of input is divisable by C of weight
+    if (inputX->getDims()[1] % diffY->getDims()[1] != 0)
+        return {};
+    // Set padding size
+    if (padding == PaddingMode::Other) {
+        oh = (h - (r - sh) * dh + ph * 2) / sh;
+        ow = (w - (s - sw) * dw + pw * 2) / sw;
+    } else if (padding == PaddingMode::Same) {
+        oh = h / sh;
+        ow = w / sw;
+        // ph = (h - oh * sh + (r - sh) * dh) / 2;
+        // pw = (w - ow * sw + (s - sw) * dw) / 2;
+    } else if (padding == PaddingMode::Valid) {
+        int ph = 0;
+        int pw = 0;
+        oh = (h - (r - sh) * dh + ph * 2) / sh;
+        ow = (w - (s - sw) * dw + pw * 2) / sw;
+    }
+    return {{{on, oc, oh, ow}}};
+}
+
 ConvTransposed2dNHWCObj::ConvTransposed2dNHWCObj(GraphObj *graph, Tensor input,
                                                  Tensor weight, Tensor output,
                                                  int ph, int pw, int sh, int sw,
diff --git a/src/operators/det.cc b/src/operators/det.cc
new file mode 100644
index 00000000..e83f67ed
--- /dev/null
+++ b/src/operators/det.cc
@@ -0,0 +1,43 @@
+#include "operators/det.h"
+
+namespace infini {
+DetObj::DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode)
+    : OperatorObj(OpType::Det, {input}, {output}), modeValue(mode) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> DetObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    auto input = A->getDims();
+    int length = input.size();
+    if (length == 2) {
+        std::vector<int> output = {1};
+        return {{output}};
+    } else {
+        std::vector<int> output(input.begin(), input.end() - 2);
+        return {{output}};
+    }
+}
+
+std::string DetObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> DetObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> DetObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+}; // namespace infini
diff --git a/src/operators/dropout.cc b/src/operators/dropout.cc
new file mode 100644
index 00000000..9a59942c
--- /dev/null
+++ b/src/operators/dropout.cc
@@ -0,0 +1,40 @@
+#include "operators/dropout.h"
+
+namespace infini {
+
+DropoutObj::DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
+                       float ratio, bool training_mode)
+    : OperatorObj(OpType::Dropout, {data}, {output, mask}), ratio(ratio) {
+    IT_ASSERT(0 <= ratio && ratio < 1);
+    IT_ASSERT(!training_mode);
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> DropoutObj::inferShape(const TensorVec &inputs) const {
+    auto shape = inputs[0]->getDims();
+    return {{shape, shape}};
+}
+std::string DropoutObj::toString() const {
+    std::ostringstream os;
+    os << "Dropout[" << getGuid() << "](" << vecToString(inputs[0]->getDims())
+       << ", "
+       << "ratio=" << ratio << ", "
+       << "training_mode=false, "
+       << "input=" << inputs[0]->getGuid() << ", "
+       << "outputs=" << outputs[0]->getGuid() << ", " << outputs[1]->getGuid()
+       << ")";
+    return os.str();
+}
+
+vector<int> DropoutObj::getWorkloadVector() const {
+    vector<int> ret = inputs[0]->getDims();
+    ret.emplace_back(static_cast<int>(ratio));
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+vector<int> DropoutObj::getOpAttrVector() const {
+    return {enum_to_underlying(type), static_cast<int>(ratio), false};
+}
+
+} // namespace infini
diff --git a/src/operators/element_wise.cc b/src/operators/element_wise.cc
index bb13586a..b2f9e0cc 100644
--- a/src/operators/element_wise.cc
+++ b/src/operators/element_wise.cc
@@ -12,23 +12,29 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const {
     // For now,we only process the same dims here, broardcast will be considered
     // in the opt layer.
     const auto A = inputs[0], B = inputs[1];
-    if (A->getDims().size() != B->getDims().size() ||
-        A->getDims() != B->getDims())
-        return {};
+    int max_len = std::max(A->getDims().size(), B->getDims().size());
+    std::vector<int> A_(max_len, 1);
+    std::vector<int> B_(max_len, 1);
+    std::vector<int> res(max_len, 1);
+    memcpy(A_.data() + max_len - A->getDims().size(), A->getDims().data(),
+           A->getDims().size() * sizeof(int));
+    memcpy(B_.data() + max_len - B->getDims().size(), B->getDims().data(),
+           B->getDims().size() * sizeof(int));
+    // std::copy(A->getDims().begin(), A->getDims().end(), A_.begin() + (max_len
+    // - A->getDims().size())); std::copy(B->getDims().begin(),
+    // B->getDims().end(), B_.begin() + (max_len - B->getDims().size()));
+    // std::copy(A->getDims().rbegin(), A->getDims().rend(), A_.rbegin());
+    // std::copy(B->getDims().rbegin(), B->getDims().rend(), B_.rbegin());
 
-    return {{A->getDims()}};
-    /*
-    int n = A->getDims().size();
-    Shape shape;
-    for (int i = 0; i < n; i++) {
-        auto dimA = A->getDims().at(i);
-        auto dimB = B->getDims().at(i);
-        if (!(dimA == dimB || dimA == 1 || dimB == 1))
+    for (int i = 0; i < max_len; ++i) {
+        if (A_[i] == B_[i] || (A_[i] == 1 || B_[i] == 1)) {
+            res[i] = std::max(A_[i], B_[i]);
+        } else {
             return {};
-        auto dimI = dimA > dimB ? dimA : dimB;
-        shape.emplace_back(dimI);
+        }
     }
-    return {{shape}};*/
+
+    return {{res}};
 }
 
 std::string ElementWiseObj::toString() const {
@@ -54,4 +60,48 @@ vector<int> ElementWiseObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
+MSELossObj::MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
+                       Reduction reduction, Tensor output)
+    : OperatorObj(OpType::MSELoss, {input0, input1}, {output}),
+      reductionMode(reduction) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> MSELossObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0], B = inputs[1];
+    if (A->getDims().size() != B->getDims().size() ||
+        A->getDims() != B->getDims())
+        return {};
+
+    if (reductionMode == None) {
+        return {{A->getDims()}};
+    } else {
+        Shape temp = {1};
+        return {{temp}};
+    }
+}
+
+std::string MSELossObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << vecToString(inputs[1]->getDims()) << ",";
+    os << "input0=" << inputs[0]->getGuid() << ",";
+    os << "input1=" << inputs[1]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+// use output dim or inputs dim?
+vector<int> MSELossObj::getWorkloadVector() const {
+    vector<int> ret = outputs[0]->getDims();
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+vector<int> MSELossObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
 }; // namespace infini
diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc
index 4587efa8..3fb75371 100644
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@@ -4,8 +4,9 @@ namespace infini {
 
 MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
                      bool transB, [[maybe_unused]] Tensor bias, ActType act)
-    : OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
-      act(act) {
+    : OperatorObj(OpType::Matmul,
+                  bias ? TensorVec{A, B, bias} : TensorVec{A, B}, {C}),
+      transA(transA), transB(transB), act(act), b(1) {
     auto shape_a = A->getDims();
     auto shape_b = B->getDims();
     int dimA = shape_a.size(), dimB = shape_b.size();
diff --git a/src/operators/pad.cc b/src/operators/pad.cc
index 1624236e..2b853769 100644
--- a/src/operators/pad.cc
+++ b/src/operators/pad.cc
@@ -13,7 +13,8 @@ PadObj::PadObj(GraphObj *graph, Tensor input, Tensor output,
         pads = vector<int>(nDims * 2, 0);
 
         for (size_t i = 0; i < nAxis; ++i) {
-            auto j = (*axes)[i];
+            auto k = (*axes)[i];
+            auto j = k < 0 ? nDims + k : k;
             pads[j] = _pads[i];
             pads[j + nDims] = _pads[i + nAxis];
         }
diff --git a/src/operators/reduce_mean.cc b/src/operators/reduce_mean.cc
index 95e09dc5..d55377df 100644
--- a/src/operators/reduce_mean.cc
+++ b/src/operators/reduce_mean.cc
@@ -37,9 +37,10 @@ ReduceMeanObj::inferShape(const TensorVec &inputs) const {
             if (!isReduced(i))
                 ret.emplace_back(dims[i]);
         }
-        if (ret.size() == (size_t)0)
-            ret.emplace_back(1);
-        return {{ret}};
+        if (ret.empty())
+            return {{{1}}};
+        else
+            return {{ret}};
     }
 }
 
diff --git a/src/operators/slice.cc b/src/operators/slice.cc
index e5a5ec8d..a61d9d85 100644
--- a/src/operators/slice.cc
+++ b/src/operators/slice.cc
@@ -3,77 +3,103 @@
 namespace infini {
 SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output,
                    const vector<int> &starts, const vector<int> &ends,
-                   const optional<vector<int>> &axes,
-                   const optional<vector<int>> &steps)
+                   const optional<vector<int>> &_axes,
+                   const optional<vector<int>> &_steps)
     : OperatorObj(OpType::Slice, {input}, {output}) {
-    if (steps)
-        IT_TODO_HALT();
-    IT_ASSERT(starts.size() == ends.size());
+    auto shape = input->getDims(); // shape of input
+    map<size_t, size_t> axes;
+    vector<int> steps;
+    {
+        auto size = starts.size();      // size of starts
+        IT_ASSERT(size == ends.size()); // size of ends
 
-    if (!axes) {
-        this->starts = starts;
-        this->ends = ends;
-    } else {
-        auto nAxis = (*axes).size();
-        IT_ASSERT(starts.size() == nAxis);
+        if (_axes) {
+            IT_ASSERT(size == _axes->size());
+            // onnx doc: "Behavior is undefined if an axis is repeated."
+            IT_ASSERT(size == std::set(_axes->begin(), _axes->end()).size());
 
-        auto dims = input->getDims();
-        this->starts = vector<int>(dims.size(), 0);
-        this->ends.resize(dims.size());
-        std::transform(dims.begin(), dims.end(), this->ends.begin(),
-                       [](auto x) { return x - 1; });
+            for (size_t i = 0; i < size; ++i) {
+                auto index = _axes->at(i);
+                if (index < 0)
+                    index += shape.size();
+                axes[index] = i;
+            }
+        } else
+            for (size_t i = 0; i < size; ++i)
+                axes[i] = i;
 
-        for (size_t j = 0; j < nAxis; ++j) {
-            auto i = (*axes)[j];
-            if (i < 0)
-                IT_TODO_HALT();
-            this->starts[i] = starts[j];
-            this->ends[i] = ends[j];
+        if (_steps) {
+            IT_ASSERT(size == _steps->size());
+            // onnx doc: "‘steps’ cannot be 0."
+            IT_ASSERT(std::find(_steps->begin(), _steps->end(), 0) ==
+                      _steps->end());
+            steps = *_steps;
+        } else {
+            steps.reserve(size);
+            for (size_t i = 0; i < size; ++i)
+                steps.push_back(1);
         }
     }
+
+    auto size = shape.size();
+    this->axes.reserve(size);
+    for (size_t i = 0; i < size; ++i)
+        if (auto _i = axes.find(i); _i != axes.end()) {
+            auto __i = _i->second;
+            auto start = starts[__i];
+            auto end = ends[__i];
+            this->axes.push_back({start >= 0 ? start : start + shape[__i],
+                                  end >= 0 ? end : end + shape[__i],
+                                  steps[__i]});
+        } else {
+            this->axes.push_back({0, shape[i], 1});
+        }
     IT_ASSERT(checkValid(graph));
 }
 
 optional<vector<Shape>> SliceObj::inferShape(const TensorVec &inputs) const {
-    auto dims = inputs[0]->getDims();
-    int nDims = dims.size();
-    if (nDims != (int)starts.size())
-        return {};
-    for (int i = 0; i < nDims; ++i) {
-        if (starts[i] < 0 || ends[i] >= dims[i] || starts[i] > ends[i])
-            return {};
-        dims[i] = ends[i] - starts[i] + 1;
+    Shape ans;
+    ans.reserve(axes.size());
+    for (const auto &range : axes) {
+        auto step = std::abs(range.step);
+        ans.push_back((range.end - range.start + step - 1) / step);
     }
-
-    return {{dims}};
+    return {{ans}};
 }
 
 std::string SliceObj::toString() const {
     std::ostringstream os;
-    os << "Slice"
-       << "[" << getGuid() << "]";
-    os << "(";
-    os << vecToString(inputs[0]->getDims()) << ",";
-    os << "starts=" << vecToString(starts) << ",";
-    os << "ends=" << vecToString(ends) << ",";
-    os << "input=" << inputs[0]->getGuid() << ",";
-    os << "output=" << outputs[0]->getGuid() << ")";
+    os << "Slice[" << getGuid() << "][";
+    for (const auto &range : axes) {
+        os << range.start << ':' << range.step << ':' << range.end << ", ";
+    }
+    os << "]("
+       << "input=" << inputs[0]->getGuid() << ", "
+       << "output=" << outputs[0]->getGuid() << ")";
     return os.str();
 }
 
 vector<int> SliceObj::getWorkloadVector() const {
-    vector<int> ret = inputs[0]->getDims();
-    ret.insert(ret.end(), starts.begin(), starts.end());
-    ret.insert(ret.end(), ends.begin(), ends.end());
-    ret.emplace(ret.begin(), enum_to_underlying(type));
-    return ret;
+    auto ans = getOpAttrVector();
+    {
+        auto i = inputs[0]->getDims();
+        ans.insert(ans.end(), i.begin(), i.end());
+    }
+    if (!outputs.empty()) {
+        auto o = outputs[0]->getDims();
+        ans.insert(ans.end(), o.begin(), o.end());
+    }
+    return ans;
 }
 
 vector<int> SliceObj::getOpAttrVector() const {
-    vector<int> ret = starts;
-    ret.insert(ret.end(), ends.begin(), ends.end());
-    ret.emplace(ret.begin(), enum_to_underlying(type));
-    return ret;
+    vector<int> ans{enum_to_underlying(type)};
+    for (const auto &range : axes) {
+        ans.push_back(range.start);
+        ans.push_back(range.end);
+        ans.push_back(range.step);
+    }
+    return ans;
 }
 
 } // namespace infini
diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc
new file mode 100644
index 00000000..616eeb14
--- /dev/null
+++ b/src/operators/transpose.cc
@@ -0,0 +1,50 @@
+#include "operators/transpose.h"
+
+namespace infini {
+TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
+                           vector<int> permute)
+    : OperatorObj(OpType::Transpose, {input}, {output}) {
+    if (permute.size() != 4) {
+        IT_TODO_HALT();
+    }
+    transposePermute[0] = permute[0];
+    transposePermute[1] = permute[1];
+    transposePermute[2] = permute[2];
+    transposePermute[3] = permute[3];
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>>
+TransposeObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    auto input = A->getDims();
+    auto output = input;
+
+    for (int i = 0; i < 4; ++i) {
+        output[i] = input[transposePermute[i]];
+    }
+    return {{output}};
+}
+
+std::string TransposeObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> TransposeObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> TransposeObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+}; // namespace infini
diff --git a/src/operators/unary.cc b/src/operators/unary.cc
index cecf0e33..6f85cecf 100644
--- a/src/operators/unary.cc
+++ b/src/operators/unary.cc
@@ -32,4 +32,288 @@ vector<int> UnaryObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
+ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output,
+                 std::optional<float> min, std::optional<float> max)
+    : OperatorObj(OpType::Clip, {input}, {output}), minValue(min),
+      maxValue(max) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> ClipObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string ClipObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> ClipObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> ClipObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+HardtanhObj::HardtanhObj(GraphObj *graph, Tensor input, Tensor output,
+                         float min, float max)
+    : OperatorObj(OpType::Hardtanh, {input}, {output}), minValue(min),
+      maxValue(max) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> HardtanhObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string HardtanhObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> HardtanhObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> HardtanhObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+FillObj::FillObj(GraphObj *graph, Tensor input, Tensor output, float value)
+    : OperatorObj(OpType::Fill, {input}, {output}), setValue(value) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> FillObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string FillObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> FillObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> FillObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+L2LossObj::L2LossObj(GraphObj *graph, Tensor input, Tensor output)
+    : OperatorObj(OpType::L2Loss, {input}, {output}) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> L2LossObj::inferShape(const TensorVec &inputs) const {
+    Shape temp = {1};
+    return {{temp}};
+}
+
+std::string L2LossObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> L2LossObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> L2LossObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type)
+    : OperatorObj(OpType::Cast, {input}, {output}), castType(type) {
+    IT_ASSERT(checkValid(graph));
+}
+
+vector<DataType> CastObj::inferDataType(const TensorVec &inputs) const {
+    auto input_dataType = inputs[0]->getDType();
+    auto output_dataType = getOutputDataType();
+    for (const auto &tensor : inputs)
+        IT_ASSERT(input_dataType == tensor->getDType());
+    return vector(numOutputs(), output_dataType);
+}
+
+optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string CastObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> CastObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> CastObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+DataType CastObj::getOutputDataType() const {
+    switch (castType) {
+    case CastObj::Float2Int64:
+        return DataType::Int64;
+    case CastObj::Float2Int32:
+        return DataType::Int32;
+    case CastObj::Float2Int16:
+        return DataType::Int16;
+    case CastObj::Float2Int8:
+        return DataType::Int8;
+    case CastObj::Int322Float:
+        return DataType::Float32;
+    case CastObj::Int322Int8:
+        return DataType::Int8;
+    case CastObj::Int322Int16:
+        return DataType::Int16;
+    case CastObj::Int162Float:
+        return DataType::Float32;
+    case CastObj::Int162Int32:
+        return DataType::Int32;
+    case CastObj::Int82Float:
+        return DataType::Float32;
+    case CastObj::Int82Int16:
+        return DataType::Int16;
+    case CastObj::Int82Int32:
+        return DataType::Int32;
+    case CastObj::Uint82Float:
+        return DataType::Float32;
+    case CastObj::Uint82Int32:
+        return DataType::Int32;
+    case CastObj::Uint82Int64:
+        return DataType::Int64;
+    case CastObj::Int322Int64:
+        return DataType::Int64;
+    case CastObj::Int642Int32:
+        return DataType::Int32;
+    case CastObj::Int642Uint32:
+        return DataType::UInt32;
+    case CastObj::Int642Float:
+        return DataType::Float32;
+    case CastObj::Uint322Int64:
+        return DataType::Int64;
+    default:
+        IT_TODO_HALT();
+    }
+}
+
+ShapeObj::ShapeObj(GraphObj *graph, Tensor input, Tensor output)
+    : OperatorObj(OpType::Shape, {input}, {output}) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> ShapeObj::inferShape(const TensorVec &inputs) const {
+    return {{{static_cast<int>(inputs[0]->getDims().size())}}};
+}
+
+std::string ShapeObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]("
+       << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+PReluObj::PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output)
+    : OperatorObj(OpType::PRelu, {input, alpha}, {output}) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> PReluObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string PReluObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> PReluObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> PReluObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
+LogObj::LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type)
+    : OperatorObj(OpType::Log, {input}, {output}), logType(type) {
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> LogObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string LogObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> LogObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> LogObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
 }; // namespace infini
diff --git a/test/core/test_graph_replace.cc b/test/core/test_graph_replace.cc
index 5959bc7d..a77624ec 100644
--- a/test/core/test_graph_replace.cc
+++ b/test/core/test_graph_replace.cc
@@ -281,7 +281,7 @@ TEST(MatchGraph, multi_input_output) {
     SubGraph subg0 = make_ref<SubGraphObj>(runtime, TensorVec{i0, i1});
     {
         auto slice = subg0->addOp<SliceObj>(i0, nullptr, vector<int>{0, 0},
-                                            vector<int>{55, 55},
+                                            vector<int>{56, 56},
                                             vector<int>{2, 3}, std::nullopt);
         auto relu0 = subg0->addOp<ReluObj>(slice->getOutput(0), nullptr);
         Tensor w0 = subg0->addTensor(Shape{256, 64, 1, 1}, DataType::UInt32);
@@ -303,7 +303,7 @@ TEST(MatchGraph, multi_input_output) {
     SubGraph subg1 = make_ref<SubGraphObj>(runtime, TensorVec{i1, i0});
     {
         auto slice = subg1->addOp<SliceObj>(i0, nullptr, vector<int>{0, 0},
-                                            vector<int>{55, 55},
+                                            vector<int>{56, 56},
                                             vector<int>{2, 3}, std::nullopt);
         auto relu0 = subg1->addOp<ReluObj>(slice->getOutput(0), nullptr);
         Tensor w0 = subg1->addTensor(Shape{256, 64, 1, 1}, DataType::UInt32);
@@ -328,7 +328,7 @@ TEST(MatchGraph, multi_input_output) {
 
         auto slice = subg2->addOp<SliceObj>(
             extend->getOutput(0), nullptr, vector<int>{0, 0},
-            vector<int>{55, 55}, vector<int>{2, 3}, std::nullopt);
+            vector<int>{56, 56}, vector<int>{2, 3}, std::nullopt);
 
         auto extend1 = subg2->addOp<ExtendObj>(i1, nullptr, 1, 3);
         auto add = subg2->addOp<AddObj>(extend1->getOutput(0),
diff --git a/test/core/test_tensor_save.cc b/test/core/test_tensor_save.cc
index f5a84618..87dfda24 100644
--- a/test/core/test_tensor_save.cc
+++ b/test/core/test_tensor_save.cc
@@ -7,6 +7,7 @@
 namespace infini {
 
 TEST(Prtotbuf, save_and_load) {
+#ifdef TENSOR_PROTOBUF
     Runtime runtime = NativeCpuRuntimeObj::getInstance();
     Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 3, 4}, DataType::Float32);
@@ -28,6 +29,7 @@ TEST(Prtotbuf, save_and_load) {
     u1->load("u.pb");
     u1->printData();
     EXPECT_TRUE(u1->equalData(u0));
+#endif
 }
 
 } // namespace infini
diff --git a/test/kernels/bang/test_bang_activation_backward.cc b/test/kernels/bang/test_bang_activation_backward.cc
new file mode 100644
index 00000000..3c90dc4e
--- /dev/null
+++ b/test/kernels/bang/test_bang_activation_backward.cc
@@ -0,0 +1,56 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/activation_backward.h"
+#include "operators/element_wise.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T, class D>
+void testActivationBackward(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor yCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    Tensor diffYCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    Tensor xCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+
+    yCpu->dataMalloc();
+    diffYCpu->dataMalloc();
+    xCpu->dataMalloc();
+
+    yCpu->setData(generator);
+    diffYCpu->setData(generator);
+    xCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto yGpu = bangGraph->cloneTensor(yCpu);
+    auto diffYGpu = bangGraph->cloneTensor(diffYCpu);
+    auto xGpu = bangGraph->cloneTensor(xCpu);
+    auto gpuOp = bangGraph->addOp<T>(yGpu, diffYGpu, xGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto diffXGpu = gpuOp->getOutput();
+
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_ActivationBackward, run) {
+    testActivationBackward<ReluBackwardObj, ReluObj>(IncrementalGenerator(),
+                                                     Shape{1, 2, 2, 3});
+    testActivationBackward<SigmoidBackwardObj, SigmoidObj>(
+        IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testActivationBackward<TanhBackwardObj, TanhObj>(IncrementalGenerator(),
+                                                     Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_bitcompute.cc b/test/kernels/bang/test_bang_bitcompute.cc
new file mode 100644
index 00000000..ebe1710c
--- /dev/null
+++ b/test/kernels/bang/test_bang_bitcompute.cc
@@ -0,0 +1,51 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testBitCompute(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu1->printData();
+    inputCpu2->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_BitCompute, run) {
+    testBitCompute<BitAndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testBitCompute<BitOrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testBitCompute<BitXorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testBitCompute<BitNotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_cast.cc b/test/kernels/bang/test_bang_cast.cc
new file mode 100644
index 00000000..7bcc44ea
--- /dev/null
+++ b/test/kernels/bang/test_bang_cast.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testCast(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastObj::Float2Int32);
+    auto outputGpu = gpuOp->getOutput();
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Cast, run) {
+    testCast<CastObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_ceil.cc b/test/kernels/bang/test_bang_ceil.cc
new file mode 100644
index 00000000..696fb5ba
--- /dev/null
+++ b/test/kernels/bang/test_bang_ceil.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testCeil(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Ceil, run) {
+    testCeil<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_clip.cc b/test/kernels/bang/test_bang_clip.cc
new file mode 100644
index 00000000..3ae9bbf7
--- /dev/null
+++ b/test/kernels/bang/test_bang_clip.cc
@@ -0,0 +1,42 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testClip(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    float min = 1.0;
+    float max = 4.0;
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Clip, run) {
+    testClip<ClipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_concat.cc b/test/kernels/bang/test_bang_concat.cc
new file mode 100644
index 00000000..4cf130e3
--- /dev/null
+++ b/test/kernels/bang/test_bang_concat.cc
@@ -0,0 +1,52 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/concat.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
+                const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp =
+        bangGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu1->print();
+    inputCpu1->printData();
+    inputCpu2->print();
+    inputCpu2->printData();
+    outputGpu2Cpu->print();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Concat, run) {
+    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_copy.cc b/test/kernels/bang/test_bang_copy.cc
new file mode 100644
index 00000000..7b5ca90f
--- /dev/null
+++ b/test/kernels/bang/test_bang_copy.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testCopy(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(outputGpu2Cpu->equalData(inputCpu));
+}
+
+TEST(cnnl_Copy, run) {
+    testCopy<CopyObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_det.cc b/test/kernels/bang/test_bang_det.cc
new file mode 100644
index 00000000..a65747ba
--- /dev/null
+++ b/test/kernels/bang/test_bang_det.cc
@@ -0,0 +1,41 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/det.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testDet(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, DetObj::NormalDet);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Det, run) {
+    testDet<DetObj>(IncrementalGenerator(), Shape{1, 1, 2, 2});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_erf.cc b/test/kernels/bang/test_bang_erf.cc
new file mode 100644
index 00000000..13d4b471
--- /dev/null
+++ b/test/kernels/bang/test_bang_erf.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testErf(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Erf, run) {
+    testErf<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_exp.cc b/test/kernels/bang/test_bang_exp.cc
new file mode 100644
index 00000000..08b6ef0b
--- /dev/null
+++ b/test/kernels/bang/test_bang_exp.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testExp(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Exp, run) {
+    testExp<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_fill.cc b/test/kernels/bang/test_bang_fill.cc
new file mode 100644
index 00000000..f7c8f463
--- /dev/null
+++ b/test/kernels/bang/test_bang_fill.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testFill(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    float value = 1.0;
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, value);
+    auto outputGpu = gpuOp->getOutput();
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Fill, run) {
+    testFill<FillObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_floor.cc b/test/kernels/bang/test_bang_floor.cc
new file mode 100644
index 00000000..aeb3306a
--- /dev/null
+++ b/test/kernels/bang/test_bang_floor.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testFloor(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Floor, run) {
+    testFloor<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_floordiv.cc b/test/kernels/bang/test_bang_floordiv.cc
new file mode 100644
index 00000000..cf4539a9
--- /dev/null
+++ b/test/kernels/bang/test_bang_floordiv.cc
@@ -0,0 +1,49 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testFloorDiv(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu1->printData();
+    inputCpu2->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_FloorDiv, run) {
+    testFloorDiv<FloorDivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_floormod.cc b/test/kernels/bang/test_bang_floormod.cc
new file mode 100644
index 00000000..2d1eaa1b
--- /dev/null
+++ b/test/kernels/bang/test_bang_floormod.cc
@@ -0,0 +1,49 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testFloorMod(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu1->printData();
+    inputCpu2->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_FloorMod, run) {
+    testFloorMod<FloorModObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_hardtanh.cc b/test/kernels/bang/test_bang_hardtanh.cc
new file mode 100644
index 00000000..0381d195
--- /dev/null
+++ b/test/kernels/bang/test_bang_hardtanh.cc
@@ -0,0 +1,43 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testHardtanh(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    float min = 1.0;
+    float max = 4.0;
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Hardtanh, run) {
+    testHardtanh<HardtanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_l2loss.cc b/test/kernels/bang/test_bang_l2loss.cc
new file mode 100644
index 00000000..3cde0874
--- /dev/null
+++ b/test/kernels/bang/test_bang_l2loss.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testL2Loss(const std::function<void(void *, size_t, DataType)> &generator,
+                const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_L2Loss, run) {
+    testL2Loss<L2LossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_log.cc b/test/kernels/bang/test_bang_log.cc
new file mode 100644
index 00000000..2ba6df9b
--- /dev/null
+++ b/test/kernels/bang/test_bang_log.cc
@@ -0,0 +1,42 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testLog(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape, LogObj::LogType type) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, type);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Log, run) {
+    testLog<LogObj>(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::Log2);
+    testLog<LogObj>(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::LogE);
+    testLog<LogObj>(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::Log10);
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_logic.cc b/test/kernels/bang/test_bang_logic.cc
new file mode 100644
index 00000000..b9bf73d5
--- /dev/null
+++ b/test/kernels/bang/test_bang_logic.cc
@@ -0,0 +1,56 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testLogicOp(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu1->printData();
+    inputCpu2->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_LogicOp, run) {
+    testLogicOp<EqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<NotEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<GreaterThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<GreaterEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<LessThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<LessEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<AndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<OrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<XorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testLogicOp<NotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_maximum.cc b/test/kernels/bang/test_bang_maximum.cc
new file mode 100644
index 00000000..e147ef69
--- /dev/null
+++ b/test/kernels/bang/test_bang_maximum.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testMaximum(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Maximum, run) {
+    testMaximum<MaximumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_minimum.cc b/test/kernels/bang/test_bang_minimum.cc
new file mode 100644
index 00000000..6575e10b
--- /dev/null
+++ b/test/kernels/bang/test_bang_minimum.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testMinimum(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Minimum, run) {
+    testMinimum<MinimumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_mseloss.cc b/test/kernels/bang/test_bang_mseloss.cc
new file mode 100644
index 00000000..b2c25f7c
--- /dev/null
+++ b/test/kernels/bang/test_bang_mseloss.cc
@@ -0,0 +1,57 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testMSELoss(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp1 =
+        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::None, nullptr);
+    auto gpuOp2 =
+        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Sum, nullptr);
+    auto gpuOp3 =
+        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Mean, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu1 = gpuOp1->getOutput();
+    auto outputGpu2 = gpuOp2->getOutput();
+    auto outputGpu3 = gpuOp3->getOutput();
+    auto outputGpu2Cpu1 = outputGpu1->clone(cpuRuntime);
+    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
+    auto outputGpu2Cpu3 = outputGpu3->clone(cpuRuntime);
+    // Check
+    outputGpu2Cpu1->printData();
+    outputGpu2Cpu2->printData();
+    outputGpu2Cpu3->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_MSELoss, run) {
+    testMSELoss<MSELossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_neg.cc b/test/kernels/bang/test_bang_neg.cc
new file mode 100644
index 00000000..2d93ca2f
--- /dev/null
+++ b/test/kernels/bang/test_bang_neg.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testNeg(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Neg, run) {
+    testNeg<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_net.cc b/test/kernels/bang/test_bang_net.cc
new file mode 100644
index 00000000..b0fe11d4
--- /dev/null
+++ b/test/kernels/bang/test_bang_net.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+void testNet(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<AddObj>(inputGpu1, inputGpu2, nullptr);
+    auto outputGpu = gpuOp->getOutput();
+    auto gpuOp2 = bangGraph->addOp<SigmoidObj>(outputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu2 = gpuOp2->getOutput();
+    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
+    // Check
+    inputCpu2->printData();
+    outputGpu2Cpu2->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Net, run) { testNet(IncrementalGenerator(), Shape{1, 2, 2, 3}); }
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_optensor.cc b/test/kernels/bang/test_bang_optensor.cc
index 436ab6dc..c46f80d3 100644
--- a/test/kernels/bang/test_bang_optensor.cc
+++ b/test/kernels/bang/test_bang_optensor.cc
@@ -42,6 +42,8 @@ void testOptensor(
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
+    outputCpu->printData();
+    outputGpu2Cpu->printData();
     EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
 }
 
@@ -49,6 +51,7 @@ TEST(cuDNN_OpTensor, run) {
     testOptensor<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     testOptensor<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     testOptensor<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testOptensor<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
 }
 
 } // namespace infini
diff --git a/test/kernels/bang/test_bang_pad.cc b/test/kernels/bang/test_bang_pad.cc
new file mode 100644
index 00000000..2abf1eec
--- /dev/null
+++ b/test/kernels/bang/test_bang_pad.cc
@@ -0,0 +1,44 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/pad.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testPad(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, vector<int>{1, 1, 1, 1},
+                                     vector<int>{0, 3});
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu->print();
+    inputCpu->printData();
+    outputGpu2Cpu->print();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Pad, run) {
+    testPad<PadObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_pooling.cc b/test/kernels/bang/test_bang_pooling.cc
new file mode 100644
index 00000000..20347ae9
--- /dev/null
+++ b/test/kernels/bang/test_bang_pooling.cc
@@ -0,0 +1,41 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/pooling.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Pooling, run) {
+    testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
+    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_pow.cc b/test/kernels/bang/test_bang_pow.cc
new file mode 100644
index 00000000..06dfc6dd
--- /dev/null
+++ b/test/kernels/bang/test_bang_pow.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testPow(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Pow, run) {
+    testPow<PowerObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_prelu.cc b/test/kernels/bang/test_bang_prelu.cc
new file mode 100644
index 00000000..bb85e8b1
--- /dev/null
+++ b/test/kernels/bang/test_bang_prelu.cc
@@ -0,0 +1,46 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testPRelu(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    Tensor alphaCpu =
+        make_ref<TensorObj>(Shape{1, 1, 1, 1}, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+    alphaCpu->dataMalloc();
+    alphaCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto alphaGpu = bangGraph->cloneTensor(alphaCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, alphaGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    alphaCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_PRelu, run) {
+    testPRelu<PReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_reciprocal.cc b/test/kernels/bang/test_bang_reciprocal.cc
new file mode 100644
index 00000000..a2a12723
--- /dev/null
+++ b/test/kernels/bang/test_bang_reciprocal.cc
@@ -0,0 +1,41 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testReciprocal(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Reciprocal, run) {
+    testReciprocal<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_round.cc b/test/kernels/bang/test_bang_round.cc
new file mode 100644
index 00000000..26d0e56e
--- /dev/null
+++ b/test/kernels/bang/test_bang_round.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testRound(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Round, run) {
+    testRound<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_rsqrt.cc b/test/kernels/bang/test_bang_rsqrt.cc
new file mode 100644
index 00000000..a420a638
--- /dev/null
+++ b/test/kernels/bang/test_bang_rsqrt.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testRsqrt(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Rsqrt, run) {
+    testRsqrt<RsqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_split.cc b/test/kernels/bang/test_bang_split.cc
new file mode 100644
index 00000000..09bcac0a
--- /dev/null
+++ b/test/kernels/bang/test_bang_split.cc
@@ -0,0 +1,48 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/split.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testSplit(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, std::nullopt, 3, 3);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto o0Cpu = gpuOp->getOutput(0)->clone(cpuRuntime);
+    auto o1Cpu = gpuOp->getOutput(1)->clone(cpuRuntime);
+    auto o2Cpu = gpuOp->getOutput(2)->clone(cpuRuntime);
+    // Check
+    inputCpu1->print();
+    inputCpu1->printData();
+    o0Cpu->print();
+    o0Cpu->printData();
+    o1Cpu->print();
+    o1Cpu->printData();
+    o2Cpu->print();
+    o2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Split, run) {
+    testSplit<SplitObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_sqrt.cc b/test/kernels/bang/test_bang_sqrt.cc
new file mode 100644
index 00000000..da9e8338
--- /dev/null
+++ b/test/kernels/bang/test_bang_sqrt.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testSqrt(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Sqrt, run) {
+    testSqrt<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_square.cc b/test/kernels/bang/test_bang_square.cc
new file mode 100644
index 00000000..f759f790
--- /dev/null
+++ b/test/kernels/bang/test_bang_square.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testSquare(const std::function<void(void *, size_t, DataType)> &generator,
+                const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Square, run) {
+    testSquare<SquareObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_squaredDifference.cc b/test/kernels/bang/test_bang_squaredDifference.cc
new file mode 100644
index 00000000..33efa755
--- /dev/null
+++ b/test/kernels/bang/test_bang_squaredDifference.cc
@@ -0,0 +1,48 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testSquaredDifference(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu1 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_SquaredDifference, run) {
+    testSquaredDifference<SquaredDifferenceObj>(IncrementalGenerator(),
+                                                Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_transpose.cc b/test/kernels/bang/test_bang_transpose.cc
new file mode 100644
index 00000000..b9dd6d4b
--- /dev/null
+++ b/test/kernels/bang/test_bang_transpose.cc
@@ -0,0 +1,43 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testTranspose(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    vector<int> permute = {0, 1, 3, 2};
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, permute);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Check
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Transpose, run) {
+    testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_trigon.cc b/test/kernels/bang/test_bang_trigon.cc
new file mode 100644
index 00000000..59fec809
--- /dev/null
+++ b/test/kernels/bang/test_bang_trigon.cc
@@ -0,0 +1,52 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testTrigon(const std::function<void(void *, size_t, DataType)> &generator,
+                const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // CPU
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Trigon, run) {
+    testTrigon<SinObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<CosObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<TanObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ASinObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ACosObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ATanObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<SinHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<CosHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<TanHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ASinHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ACosHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+    testTrigon<ATanHObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/bang/test_bang_unary.cc b/test/kernels/bang/test_bang_unary.cc
new file mode 100644
index 00000000..68534a3f
--- /dev/null
+++ b/test/kernels/bang/test_bang_unary.cc
@@ -0,0 +1,47 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    // Check
+    EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
+}
+
+TEST(cnnl_Unary, run) {
+    testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+    testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/cuda/test_cuda_clip.cc b/test/kernels/cuda/test_cuda_clip.cc
new file mode 100644
index 00000000..2c6abaf7
--- /dev/null
+++ b/test/kernels/cuda/test_cuda_clip.cc
@@ -0,0 +1,48 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testClip(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
+    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
+    float min = 2.0;
+    float max = 4.0;
+    auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, min, max);
+    cudaGraph->dataMalloc();
+    cudaRuntime->run(cudaGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // CPU
+    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr, min, max);
+    cpuGraph->dataMalloc();
+    cpuRuntime->run(cpuGraph);
+    auto outputCpu = cpuOp->getOutput();
+    // Check
+    EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
+}
+
+TEST(cuDNN_Unary, run) {
+    testClip<ClipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/kernels/cuda/test_cuda_slice.cc b/test/kernels/cuda/test_cuda_slice.cc
index 14926ea3..850fc2a8 100644
--- a/test/kernels/cuda/test_cuda_slice.cc
+++ b/test/kernels/cuda/test_cuda_slice.cc
@@ -20,7 +20,7 @@ TEST(CUDA_Slice, run) {
     Graph g = make_ref<GraphObj>(cudaRuntime);
     auto i = g->cloneTensor(icpu);
     auto op =
-        g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{1, 4},
+        g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{2, 5},
                            vector<int>{0, 3}, std::nullopt);
 
     // allocate CUDA memory
diff --git a/test/kernels/intelcpu/test_mkl_slice.cc b/test/kernels/intelcpu/test_mkl_slice.cc
index 04a5ae86..b840adbf 100644
--- a/test/kernels/intelcpu/test_mkl_slice.cc
+++ b/test/kernels/intelcpu/test_mkl_slice.cc
@@ -12,7 +12,7 @@ TEST(MKL_Slice, run) {
     // Build input data
     Tensor i = g->addTensor(Shape{3, 2, 1, 5}, DataType::Float32);
     auto op =
-        g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{1, 4},
+        g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{2, 5},
                            vector<int>{0, 3}, std::nullopt);
     g->dataMalloc();
     i->setData(IncrementalGenerator());
diff --git a/test/operators/test_clip.cc b/test/operators/test_clip.cc
new file mode 100644
index 00000000..424f0c60
--- /dev/null
+++ b/test/operators/test_clip.cc
@@ -0,0 +1,38 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testClip(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph Graph = make_ref<GraphObj>(cpuRuntime);
+    float min = 1.0;
+    float max = 4.0;
+    auto Op = Graph->addOp<T>(inputCpu, nullptr, min, max);
+    Graph->dataMalloc();
+    cpuRuntime->run(Graph);
+    auto output = Op->getOutput();
+    inputCpu->printData();
+    output->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Clip, run) {
+    testClip<ClipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
diff --git a/test/operators/test_slice.cc b/test/operators/test_slice.cc
index 436deada..02bd8bb7 100644
--- a/test/operators/test_slice.cc
+++ b/test/operators/test_slice.cc
@@ -9,10 +9,10 @@ TEST(Slice, ShapeInference) {
     {
         Graph g = make_ref<GraphObj>(cpuRuntime);
         Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32);
-        auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 10, 1, 5},
+        auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 9, 1, 5},
                                      vector<int>{3, 10, 100, 100}, std::nullopt,
                                      std::nullopt);
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 99, 95}));
     }
     {
         Graph g = make_ref<GraphObj>(cpuRuntime);
@@ -20,7 +20,7 @@ TEST(Slice, ShapeInference) {
         auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 5},
                                      vector<int>{3, 100}, vector<int>{1, 3},
                                      std::nullopt);
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 2, 162, 96}));
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 1, 162, 95}));
     }
 }