From feccd4f318d58d019508c60026de6f1fbee91bda Mon Sep 17 00:00:00 2001 From: constroy Li Date: Mon, 30 Oct 2023 15:04:16 +0800 Subject: [PATCH 01/28] fix tensor parallel for llama (#159) * fix Slice * change default rounds of timeit to 10 to reduce time * fix slice with large ends * Reshape support Int64 * support position_ids as input * skip last MatMul in Llama * skip infer_shapes to parse large model * update launch.py * fix split_concat_kernel * print more message in launch.py * Reshape supports both Int32 and Int64 * try infer_shapes and warn about failure * fix format --------- Co-authored-by: whjthu --- examples/distributed/launch.py | 43 +++++++++++++---------- examples/distributed/parallel_opt.py | 10 ++++-- include/core/common.h | 4 +-- pyinfinitensor/src/pyinfinitensor/onnx.py | 19 +++++++--- src/kernels/cuda/matmul.cc | 1 + src/kernels/cuda/reshape.cc | 4 +++ src/kernels/cuda/split_concat.cu | 8 +++-- src/operators/concat.cc | 6 ++-- src/operators/slice.cc | 15 +++++--- 9 files changed, 70 insertions(+), 40 deletions(-) diff --git a/examples/distributed/launch.py b/examples/distributed/launch.py index 64930e6e..58f7efb3 100644 --- a/examples/distributed/launch.py +++ b/examples/distributed/launch.py @@ -5,6 +5,7 @@ import multiprocessing as mp from pyinfinitensor.onnx import OnnxStub, backend import onnx from onnx.external_data_helper import convert_model_to_external_data +from onnx.shape_inference import infer_shapes_path import numpy as np from parallel_opt import parallel_model @@ -44,16 +45,18 @@ def parse_args(): ) -def run_model(model, runtime, inputs: np.array, n=20): +def run_model(model, runtime, inputs, n=10): stub = OnnxStub(model, runtime) - next(stub.inputs.items().__iter__())[1].copyin_numpy(inputs) - stub.tune() + for tensor, input in zip(stub.inputs.values(), inputs): + tensor.copyin_numpy(input) + # stub.tune() stub.run() # get outputs - outputs = np.array(next(stub.outputs.items().__iter__())[1].copyout_float()) + outputs = next(stub.outputs.values().__iter__()).copyout_numpy() # bench - next(stub.inputs.items().__iter__())[1].copyin_numpy(inputs) + for tensor, input in zip(stub.inputs.values(), inputs): + tensor.copyin_numpy(input) begin = time.time() for _ in range(n): stub.run() @@ -64,13 +67,12 @@ def run_model(model, runtime, inputs: np.array, n=20): def run_and_compare(name, model, runtime): - data = np.load(f"{name}_inputs.npy") + input_ids = np.load(f"{name}_inputs.npy") + position_ids = np.arange(input_ids.shape[-1]) results = np.load(f"{name}_results.npy") - outputs = run_model(model, runtime, data) - print("outputs sum:", outputs.sum()) - print("max abs diff:", abs(outputs - results).max()) - print("max rel diff:", abs((outputs - results) / results).max()) - # assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6) + outputs = run_model(model, runtime, (input_ids, position_ids)) + print("outputs abs mean:", abs(outputs).mean()) + np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3) def start_worker( @@ -81,14 +83,13 @@ def start_worker( extern_path = f"./{dist_name}_rank{rank}.pb" if os.path.exists(extern_path): os.remove(extern_path) - convert_model_to_external_data( + onnx.save_model( model, - all_tensors_to_one_file=True, + f"./{dist_name}_rank{rank}.onnx", + save_as_external_data=True, location=extern_path, - size_threshold=1024, - convert_attribute=False, ) - onnx.save(model, f"./{dist_name}_rank{rank}.onnx") + infer_shapes_path(f"./{dist_name}_rank{rank}.onnx") runtime = backend.CudaRuntime(local_rank) # print("init comm") 
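    # join the collective communicator shared by all ranks (dist_name, world_size, rank)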
runtime.init_comm( @@ -106,10 +107,12 @@ def start_single(name, model): def gen_standard(name, model, voc_size, bs, len): # generate standard results - data = np.random.randint(0, voc_size, (bs, len), dtype=np.int32) - np.save(f"{name}_inputs", data) + input_ids = np.random.randint(0, voc_size, (bs, len)) + position_ids = np.arange(len) + np.save(f"{name}_inputs", input_ids) runtime = backend.CudaRuntime(0) - outputs = run_model(model, runtime, data, 1) + outputs = run_model(model, runtime, (input_ids, position_ids), 1) + print("outputs abs mean:", abs(outputs).mean()) np.save(f"{name}_results", outputs) @@ -128,12 +131,14 @@ def main(): # run single process. # use standalone process to isolate cuda. + print("run model by single GPU.") p = mp.Process(target=start_single, args=(name, model)) p.start() p.join() # run distributed parallel. world_size = nnodes * nproc_per_node + print(f"run model by {world_size} GPU in parallel.") workers = [ mp.Process( target=start_worker, diff --git a/examples/distributed/parallel_opt.py b/examples/distributed/parallel_opt.py index 42465a69..3ddf2ead 100644 --- a/examples/distributed/parallel_opt.py +++ b/examples/distributed/parallel_opt.py @@ -11,6 +11,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0): vinfo = {info.name: info for info in model.graph.value_info} vinfo.update({info.name: info for info in model.graph.input}) vinfo.update({info.name: info for info in model.graph.output}) + output = {info.name: info for info in model.graph.output} place: Dict[str, Placement] = {} nodes: List[NodeProto] = [] @@ -56,7 +57,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0): ndim = len(vinfo[output].type.tensor_type.shape.dim) out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial() place[node.output[0]] = out_plc - + def shard_concat(node: NodeProto): # hack for kvcache in_plc = place[node.input[1]] @@ -154,7 +155,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0): ), f"{place[node.input[0]]} != {place[node.input[1]]}" place[node.output[0]] = place[node.input[0]] elif node.op_type == "Concat": - shard_concat(node) + shard_concat(node) def find_successor(op_type: str, idx: int, search_limit: int = 1): for node in model.graph.node[idx + 1 : idx + 1 + search_limit]: @@ -175,6 +176,9 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0): if (node.op_type == "MatMul" or node.op_type == "Gemm") and any( input in data for input in node.input ): + # FIXME(constroy): the last MatMul should not be sharded as TP. 
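+            # In Llama this last MatMul is the lm_head / vocab projection; its output
+            # is a graph output, so it is left replicated (skipped just below).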
+ if node.output[0] in output: + continue groups = 1 # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups split_node = find_successor("Split", index, search_limit=2) @@ -218,7 +222,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0): new_input = [] for info in model.graph.input: new_input.append(vinfo[info.name]) - + graph = helper.make_graph( nodes, model.graph.name + f"_{tp_rank}", diff --git a/include/core/common.h b/include/core/common.h index 749caff2..81e704f8 100644 --- a/include/core/common.h +++ b/include/core/common.h @@ -75,7 +75,7 @@ template std::string vecToString(const std::vector &vec) { double timeit( const std::function &func, - const std::function &sync = []() {}, int warmupRounds = 200, - int timingRounds = 200); + const std::function &sync = []() {}, int warmupRounds = 10, + int timingRounds = 10); } // namespace infini diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index d11fbb90..6d0da9f8 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -28,6 +28,7 @@ from typing import Dict, List, Any, Tuple, Sequence, Union, Optional from functools import reduce from onnxsim import simplify import copy +import warnings class OnnxStub: @@ -48,7 +49,10 @@ class OnnxStub: self.inputs: Dict[str, backend.Tensor] = {} self.outputs: Dict[str, backend.Tensor] = {} self.initializer: Dict[int, TensorProto] = {} - model = infer_shapes(model) + try: + model = infer_shapes(model) + except: + warnings.warn("infer_shapes failed.") self.handler = backend.GraphHandler(runtime) tensors: Dict[str, backend.Tensor] = dict() @@ -603,15 +607,20 @@ class OnnxStub: != 0, ) elif node.op_type == "Slice": + + def clamp(nums): + MAX_INT = 0x7FFFFFFF + return [min(x, MAX_INT) for x in nums] + tensors[node.output[0]] = self.handler.slice( tensors[node.input[0]], tensors.get(node.output[0]), - _parse_data(data[node.input[1]]), - _parse_data(data[node.input[2]]), - _parse_data(data[node.input[3]]) + clamp(_parse_data(data[node.input[1]])), + clamp(_parse_data(data[node.input[2]])), + clamp(_parse_data(data[node.input[3]])) if len(node.input) > 3 else None, - _parse_data(data[node.input[4]]) + clamp(_parse_data(data[node.input[4]])) if len(node.input) > 4 else None, ) diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc index 9cd4b0b3..2d457cbc 100644 --- a/src/kernels/cuda/matmul.cc +++ b/src/kernels/cuda/matmul.cc @@ -58,6 +58,7 @@ class matmulCublas : public Kernel { SmallArray inputShape, outputShape; int nDims = out->getRank(); IT_ASSERT(nDims <= SMALL_ARRAY_SIZE); + // FIXME(constroy): use size_t for outputsize. int outputsize = 1; // the length of the output vector after flatten int offset = nDims - inC->getRank(); for (int i = 0; i < offset; ++i) diff --git a/src/kernels/cuda/reshape.cc b/src/kernels/cuda/reshape.cc index 77070c23..7be6aca8 100644 --- a/src/kernels/cuda/reshape.cc +++ b/src/kernels/cuda/reshape.cc @@ -13,6 +13,10 @@ class CopyCuda : public CudaKernelWithoutConfig { // reshape/flatten/identity all act as copying from input to output. 
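// Integer tensors (e.g. position_ids in Llama) also go through Reshape, so the
// copy kernel is registered for Int64 and Int32 below in addition to Float32.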
REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Float32, CopyCuda, "Reshape_CUDA_Float32"); +REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Int64, CopyCuda, + "Reshape_CUDA_Int64"); +REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Int32, CopyCuda, + "Reshape_CUDA_Int32"); REGISTER_KERNEL(Device::CUDA, OpType::Flatten, DataType::Float32, CopyCuda, "Flatten_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Identity, DataType::Float32, CopyCuda, diff --git a/src/kernels/cuda/split_concat.cu b/src/kernels/cuda/split_concat.cu index 73f29482..193501e0 100644 --- a/src/kernels/cuda/split_concat.cu +++ b/src/kernels/cuda/split_concat.cu @@ -51,13 +51,15 @@ __global__ void _split_concat_kernel(ElementTensorMetadata elemMeta, namespace infini { -// TODO: when dim=0, the operation can be executed in-place +// TODO: when dim=0, the operation can be executed in-place void split_concat_kernel(const ElementTensorMetadata &eleMeta, const ComposedTensorMetadata &compMeta, int dim, int batchSize, int nDims, bool isSplit) { dim3 blockSize = dim3(32 * 16); - // gridsize =n_elements / blockSize - int gridDimX = (eleMeta.nElements[0] - 1) / (32 * 16) + 1; + // gridsize = max_n_elements / blockSize + int max_n_elements = + *std::max_element(eleMeta.nElements, eleMeta.nElements + batchSize); + int gridDimX = (max_n_elements - 1) / (32 * 16) + 1; // each y is a split among the batch dim3 gridSize(gridDimX, batchSize); diff --git a/src/operators/concat.cc b/src/operators/concat.cc index de836d58..95535233 100644 --- a/src/operators/concat.cc +++ b/src/operators/concat.cc @@ -2,10 +2,10 @@ #include "utils/operator_utils.h" namespace infini { -ConcatObj::ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim) - : OperatorObj(OpType::Concat, inputs, {output}), dim(dim) { +ConcatObj::ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int _dim) + : OperatorObj(OpType::Concat, inputs, {output}) { int rank = inputs[0]->getRank(); - dim = get_real_axis(dim, rank); + dim = get_real_axis(_dim, rank); IT_ASSERT(checkValid(graph)); } diff --git a/src/operators/slice.cc b/src/operators/slice.cc index 1ded2745..0db3b1a2 100644 --- a/src/operators/slice.cc +++ b/src/operators/slice.cc @@ -43,17 +43,22 @@ SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output, auto size = shape.size(); this->axes.reserve(size); - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < size; ++i) { + auto len = shape[i]; if (auto _i = axes.find(i); _i != axes.end()) { auto __i = _i->second; auto start = starts[__i]; auto end = ends[__i]; - this->axes.push_back({start >= 0 ? start : start + shape[__i], - end >= 0 ? end : end + shape[__i], - steps[__i]}); + if (start > len) + start = len; + if (end > len) + end = len; + this->axes.push_back({start >= 0 ? start : start + len, + end >= 0 ? 
end : end + len, steps[__i]}); } else { - this->axes.push_back({0, shape[i], 1}); + this->axes.push_back({0, len, 1}); } + } IT_ASSERT(checkValid(graph)); } From 23b825efc48879974819eb5d6d2edd5618b55845 Mon Sep 17 00:00:00 2001 From: Bolun Zhang <48948016+Chamberlain0w0@users.noreply.github.com> Date: Mon, 30 Oct 2023 16:01:05 +0800 Subject: [PATCH 02/28] Xpu task4 support: add softmax (#172) * add softmax on kunlun * format --------- Co-authored-by: Bolun Co-authored-by: Haojie Wang --- src/kernels/kunlun/softmax.cc | 26 ++++ test/kernels/kunlun/test_kunlun_softmax.cc | 136 +++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 src/kernels/kunlun/softmax.cc create mode 100644 test/kernels/kunlun/test_kunlun_softmax.cc diff --git a/src/kernels/kunlun/softmax.cc b/src/kernels/kunlun/softmax.cc new file mode 100644 index 00000000..56374766 --- /dev/null +++ b/src/kernels/kunlun/softmax.cc @@ -0,0 +1,26 @@ +#include "operators/softmax.h" +#include "kunlun/kunlun_kernel_without_config.h" +#include "kunlun/kunlun_runtime.h" + +namespace infini { +class SoftmaxXdnn : public KUNLUNKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + auto dim = op->getInputs(0)->getDims(); + auto axis = op->getAxis(); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + auto ret = baidu::xpu::api::softmax( + context->KUNLUNHandle(), (float *)aData, (float *)cData, dim, axis); + assert(ret == 0); + return; + } +}; + +REGISTER_KERNEL(Device::KUNLUN, OpType::Softmax, DataType::Float32, SoftmaxXdnn, + "Softmax_xdnn_KUNLUN_Float32"); +}; // namespace infini diff --git a/test/kernels/kunlun/test_kunlun_softmax.cc b/test/kernels/kunlun/test_kunlun_softmax.cc new file mode 100644 index 00000000..77d6dbd8 --- /dev/null +++ b/test/kernels/kunlun/test_kunlun_softmax.cc @@ -0,0 +1,136 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "kunlun/kunlun_runtime.h" +#include "operators/softmax.h" +#include "test.h" +#include +namespace infini { + +TEST(XDNN_Softmax, run_axis1) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto kunlunRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = + make_ref(Shape{2, 4}, DataType::Float32, cpuRuntime); + + // KUNLUN XPU + Graph kunlunGraph = make_ref(kunlunRuntime); + auto inputKunlun = kunlunGraph->cloneTensor(inputCpu); + auto kunlunOp = kunlunGraph->addOp(inputKunlun, nullptr, 1); + kunlunGraph->dataMalloc(); + inputKunlun->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 10003}); + kunlunRuntime->run(kunlunGraph); + auto outputKunlun = kunlunOp->getOutput(); + auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime); + + // Check + EXPECT_TRUE(outputKunlun2Cpu->equalData( + vector{0.032058604, 0.08714432, 0.23688284, 0.6439143, + 0.032058604, 0.08714432, 0.23688284, 0.6439143})); +} + +TEST(XDNN_Softmax, run_axis0) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto kunlunRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = + make_ref(Shape{2, 4}, DataType::Float32, cpuRuntime); + + // KUNLUN XPU + Graph kunlunGraph = make_ref(kunlunRuntime); + auto inputKunlun = kunlunGraph->cloneTensor(inputCpu); + auto kunlunOp = kunlunGraph->addOp(inputKunlun, nullptr, 0); + kunlunGraph->dataMalloc(); + inputKunlun->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 
10003}); + kunlunRuntime->run(kunlunGraph); + auto outputKunlun = kunlunOp->getOutput(); + auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime); + + // Check + EXPECT_TRUE( + outputKunlun2Cpu->equalData(vector{0., 0., 0., 0., 1, 1, 1, 1})); +} + +TEST(XDNN_Softmax2, run_axis1) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto kunlunRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = + make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); + + // KUNLUN XPU + Graph kunlunGraph = make_ref(kunlunRuntime); + auto inputKunlun = kunlunGraph->cloneTensor(inputCpu); + auto kunlunOp = kunlunGraph->addOp(inputKunlun, nullptr, 1); + kunlunGraph->dataMalloc(); + inputKunlun->setData(IncrementalGenerator()); + kunlunRuntime->run(kunlunGraph); + auto outputKunlun = kunlunOp->getOutput(); + auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime); + + // Check + EXPECT_TRUE(outputKunlun2Cpu->equalData(vector{ + 0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138, + 0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862, + 0.9820138, 0.9820138, 0.9820138, 0.9820138})); +} + +TEST(XDNN_Softmax2, run_axis2) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto kunlunRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = + make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); + + // KUNLUN XPU + Graph kunlunGraph = make_ref(kunlunRuntime); + auto inputKunlun = kunlunGraph->cloneTensor(inputCpu); + auto kunlunOp = kunlunGraph->addOp(inputKunlun, nullptr, 2); + kunlunGraph->dataMalloc(); + inputKunlun->setData(IncrementalGenerator()); + kunlunRuntime->run(kunlunGraph); + auto outputKunlun = kunlunOp->getOutput(); + auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime); + + // Check + EXPECT_TRUE(outputKunlun2Cpu->equalData(vector{ + 0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029, + 0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971, + 0.1192029, 0.1192029, 0.8807971, 0.8807971})); +} + +TEST(XDNN_Softmax2, run_axis3) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto kunlunRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = + make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); + + // KUNLUN XPU + Graph kunlunGraph = make_ref(kunlunRuntime); + auto inputKunlun = kunlunGraph->cloneTensor(inputCpu); + auto kunlunOp = kunlunGraph->addOp(inputKunlun, nullptr, 3); + kunlunGraph->dataMalloc(); + inputKunlun->setData(IncrementalGenerator()); + kunlunRuntime->run(kunlunGraph); + auto outputKunlun = kunlunOp->getOutput(); + auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime); + + // Check + EXPECT_TRUE(outputKunlun2Cpu->equalData(vector{ + 0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586, + 0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586, + 0.2689414, 0.7310586, 0.2689414, 0.7310586})); +} +} // namespace infini From ec3adf6fa73cc6390f09a9bbd23910640d9ed000 Mon Sep 17 00:00:00 2001 From: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Date: Tue, 31 Oct 2023 10:47:36 +0800 Subject: [PATCH 03/28] support 8D tensor, add test example (#170) Co-authored-by: Haojie Wang --- include/cuda/cuda_split_concat.h | 2 +- test/kernels/cuda/test_cuda_concat.cc | 36 +++++++++++++++++++++++++++ test/kernels/cuda/test_cuda_split.cc | 34 +++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/include/cuda/cuda_split_concat.h 
b/include/cuda/cuda_split_concat.h index 699f15bc..58bdf330 100644 --- a/include/cuda/cuda_split_concat.h +++ b/include/cuda/cuda_split_concat.h @@ -3,7 +3,7 @@ #include const int BATCH_SIZE = 32; // parallel tensor number. -const int DIM_MAX_SIZE = 4; +const int DIM_MAX_SIZE = 8; // Concat operator acts like element tensors composing to one big tensor,and // split operator acts like one big tensor being composed by element diff --git a/test/kernels/cuda/test_cuda_concat.cc b/test/kernels/cuda/test_cuda_concat.cc index 013d25b5..2c76f405 100644 --- a/test/kernels/cuda/test_cuda_concat.cc +++ b/test/kernels/cuda/test_cuda_concat.cc @@ -122,4 +122,40 @@ TEST(Concat, Cuda_dim0) { EXPECT_TRUE(oCpu->equalData(vector{0, 1, 2, 1, 1, 1, 0, 1, 2})); } +TEST(Concat, CudaHigh) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(runtime); + + auto t1 = gCpu->addTensor({2, 2, 3, 1, 2}, DataType::Float32); + auto t2 = gCpu->addTensor({2, 2, 1, 1, 2}, DataType::Float32); + auto t3 = gCpu->addTensor({2, 2, 2, 1, 2}, DataType::Float32); + gCpu->dataMalloc(); + t1->setData(IncrementalGenerator()); + t2->setData(OneGenerator()); + t3->setData(OneGenerator()); + + auto cudaRuntime = make_ref(); + Graph gCuda = make_ref(cudaRuntime); + + auto t1Gpu = gCuda->cloneTensor(t1); + auto t2Gpu = gCuda->cloneTensor(t2); + auto t3Gpu = gCuda->cloneTensor(t3); + + auto op = + gCuda->addOp(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2); + gCuda->dataMalloc(); + t1Gpu->setData(IncrementalGenerator()); + t2Gpu->setData(OneGenerator()); + t3Gpu->setData(OneGenerator()); + cudaRuntime->run(gCuda); + + // cudaPrintTensor(op->getOutput()); + // copy output from CUDA to CPU + auto oCpu = gCpu->cloneTensor(op->getOutput()); + EXPECT_TRUE(oCpu->equalData( + vector{0., 1., 2., 3., 4., 5., 1., 1., 1., 1., 1., 1., + 6., 7., 8., 9., 10., 11., 1., 1., 1., 1., 1., 1., + 12., 13., 14., 15., 16., 17., 1., 1., 1., 1., 1., 1., + 18., 19., 20., 21., 22., 23., 1., 1., 1., 1., 1., 1.})); +} } // namespace infini diff --git a/test/kernels/cuda/test_cuda_split.cc b/test/kernels/cuda/test_cuda_split.cc index 5a32f27f..2cab944e 100644 --- a/test/kernels/cuda/test_cuda_split.cc +++ b/test/kernels/cuda/test_cuda_split.cc @@ -39,6 +39,40 @@ TEST(Split, Cuda) { 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38, 39})); } +TEST(Split, CudaHigh) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(runtime); + + auto input = gCpu->addTensor({2, 6, 2, 1, 2}, DataType::Float32); + gCpu->dataMalloc(); + input->setData(IncrementalGenerator()); + + auto cudaRuntime = make_ref(); + Graph gCuda = make_ref(cudaRuntime); + + auto inputGpu = gCuda->cloneTensor(input); + auto op = gCuda->addOp(inputGpu, std::nullopt, 1, 3); + gCuda->dataMalloc(); + inputGpu->setData(IncrementalGenerator()); + + cudaRuntime->run(gCuda); + + // copy output from CUDA to CPU + EXPECT_EQ(op->getOutputs().size(), (size_t)3); + auto o0Cpu = gCpu->cloneTensor(op->getOutput(0)); + auto o1Cpu = gCpu->cloneTensor(op->getOutput(1)); + auto o2Cpu = gCpu->cloneTensor(op->getOutput(2)); + EXPECT_TRUE( + o0Cpu->equalData(vector{0., 1., 2., 3., 4., 5., 6., 7., 24., 25., + 26., 27., 28., 29., 30., 31.})); + EXPECT_TRUE(o1Cpu->equalData(vector{8., 9., 10., 11., 12., 13., 14., + 15., 32., 33., 34., 35., 36., + 37., 38., 39.})); + EXPECT_TRUE(o2Cpu->equalData(vector{16., 17., 18., 19., 20., 21., + 22., 23., 40., 41., 42., 43., + 44., 45., 46., 47.})); +} + TEST(Split, Cuda_dim0) { Runtime runtime = NativeCpuRuntimeObj::getInstance(); 
Graph gCpu = make_ref(runtime); From 1a6fccccbe884b5217524f235e7bcc09ebabae60 Mon Sep 17 00:00:00 2001 From: Derui Yang Date: Fri, 3 Nov 2023 13:21:49 +0800 Subject: [PATCH 04/28] =?UTF-8?q?test:=20=E6=94=AF=E6=8C=81=E7=BC=96?= =?UTF-8?q?=E8=AF=91=20einnet=20=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=EF=BC=8C=E4=BD=86=E4=B8=8D=E6=98=AF=E6=89=80=E6=9C=89=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E9=83=BD=E8=83=BD=E9=80=9A=E8=BF=87=20(#174)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: 支持编译 einnet 单元测试,但不是所有测试都能通过 Signed-off-by: YdrMaster * Fix: locating resource files and skip codegen - Change the path parameters in `matchExprResult` and `checkExprLogSame` to paths relative to the project home - Skip NNetMemboundOp tests as they require codegen --------- Signed-off-by: YdrMaster Co-authored-by: Haojie Wang Co-authored-by: Liyan Zheng --- CMakeLists.txt | 20 +++++++++++--------- Makefile | 9 ++++----- include/nnet/test.h | 4 ++-- src/nnet/test.cc | 33 +++++++++++++++++++++++++++------ test/nnet/test_OpSearch.cc | 11 +++++------ test/nnet/test_TConv2gemm.cc | 6 +++--- test/nnet/test_conv2conv.cc | 12 ++++-------- test/nnet/test_conv2gemm.cc | 8 ++++---- test/nnet/test_g2bmm.cc | 6 ++---- test/nnet/test_memboundOp.cc | 25 ++++++++++++++++++++++--- test/nnet/test_mutator.cc | 4 ++++ 11 files changed, 88 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d942fcd4..b08d9f85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ project(InfiniTensor C CXX) cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF) cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF) -cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF) set(DEFAULT_BUILD_TYPE "RelWithDebInfo") # Build Type @@ -95,16 +94,17 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent) include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include) # TVM backend -if(BUILD_TEST_EINNET) - if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR) - message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON") - endif() +if(BUILD_NNET AND BUILD_TEST) # TVM and DMLC for invoking TVM packed functions include_directories(${TVM_INCLUDE_DIR}) include_directories(${DMLC_INCLUDE_DIR}) include_directories(${DLPACK_INCLUDE_DIR}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels + if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND DLPACK_INCLUDE_DIR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels + else() + # message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON") + endif() endif() if(BUILD_TEST) @@ -130,6 +130,8 @@ if(BUILD_NNET) add_compile_definitions(BUILD_NNET=1) file(GLOB_RECURSE SRC_NNET src/nnet/*.cc) list (APPEND SRC ${SRC_NNET}) + # For locating resource files + set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}") endif() if(USE_CUDA) @@ -161,7 +163,7 @@ 
endif() target_link_libraries(InfiniTensor pybind11::embed) # TVM backend -if(BUILD_TEST_EINNET) +if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR) target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so) endif() @@ -333,7 +335,7 @@ if(BUILD_TEST) if(BUILD_TEST_PET) build_test(test/pet/*.cc) endif() - if(BUILD_TEST_EINNET) + if(BUILD_NNET AND BUILD_TEST) build_test(test/nnet/test_*.cc) # Build expression reader diff --git a/Makefile b/Makefile index 19f1b353..302f47b8 100644 --- a/Makefile +++ b/Makefile @@ -7,12 +7,13 @@ KUNLUN ?= OFF INTELCPU ?= off BACKTRACE ?= ON TEST ?= ON +NNET ?= OFF FORMAT_ORIGIN ?= # Docker build options DOCKER_NAME ?= infinitensor DOCKER_IMAGE_NAME ?= infinitensor DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile -DOCKER_RUN_OPTION ?= +DOCKER_RUN_OPTION ?= # CUDA option. ifeq ($(CUDA), ON) @@ -22,13 +23,13 @@ ifeq ($(CUDA), ON) DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd` endif - CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) CMAKE_OPT += -DUSE_CUDA=$(CUDA) CMAKE_OPT += -DUSE_BANG=$(BANG) CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN) CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE) CMAKE_OPT += -DBUILD_TEST=$(TEST) +CMAKE_OPT += -DBUILD_NNET=$(NNET) ifeq ($(INTELCPU), ON) CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp @@ -60,7 +61,7 @@ test-api: @echo python3 pyinfinitensor/tests/test_api.py -docker-build: +docker-build: docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) . docker-run: @@ -71,5 +72,3 @@ docker-start: docker-exec: docker exec -it $(DOCKER_IMAGE_NAME) bash - - diff --git a/include/nnet/test.h b/include/nnet/test.h index 6cf75873..1d24bbd4 100644 --- a/include/nnet/test.h +++ b/include/nnet/test.h @@ -24,7 +24,7 @@ // clang-format on namespace nnet { -int matchExprResult(Derivator &derivator, string fn); -bool checkExprLogSame(string fnPrefix, int start, int end); +int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome); +bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end); bool checkExprsEquvivalence(VecExpr exprs); } // namespace nnet diff --git a/src/nnet/test.cc b/src/nnet/test.cc index 6c9738f2..fbf41e20 100644 --- a/src/nnet/test.cc +++ b/src/nnet/test.cc @@ -3,10 +3,30 @@ #include "nnet/Visitor/HashVisitor.h" #include "nnet/Visitor/Interpreter.h" #include "nnet/Visitor/Serializer.h" +#include namespace nnet { -int matchExprResult(Derivator &derivator, string fn) { - auto ans = Serializer().deserialize(fn); +std::filesystem::path getProjectHome() { +#ifndef INFINI_PROJECT_HOME +#error INFINI_PROJECT_HOME is not defined +#endif + +#define Q(x) #x +#define QUOTE(x) Q(x) +#define PROJECT_HOME QUOTE(INFINI_PROJECT_HOME) + return std::filesystem::path(PROJECT_HOME); +#undef PROJECT_HOME +#undef QUOTE +#undef Q +} + +string getResourceFilePath(string path) { + return (getProjectHome() / path).string(); +} + +int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome) { + auto fullPath = getResourceFilePath(pathRelativeToProjectHome); + auto ans = Serializer().deserialize(fullPath); auto hashAns = HashVisitor()(ans); int match = 0; for (const auto &candidate : derivator.getCandidates()) { @@ -16,16 +36,17 @@ int matchExprResult(Derivator &derivator, string fn) { return match; } -bool checkExprLogSame(string fnPrefix, int start, int end) { +bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end) { + auto fullPath = getResourceFilePath(pathRelativeToProjectHome); Serializer serializer; - string fn0 = 
fnPrefix + to_string(start) + ".expr"; + string fn0 = fullPath + to_string(start) + ".expr"; Expr expr0 = serializer.deserialize(fn0); RangeOp range0 = as(expr0); Interpreter interpreter(range0); auto ans0 = interpreter.interpretUniformSample(range0); dbg(expr0, ans0); for (int i = start + 1; i < end; ++i) { - string fn1 = fnPrefix + to_string(i) + ".expr"; + string fn1 = fullPath + to_string(i) + ".expr"; Expr expr1 = serializer.deserialize(fn1); RangeOp range1 = as(expr1); dbg(fn1, expr1); @@ -67,4 +88,4 @@ bool checkExprsEquvivalence(VecExpr exprs) { return true; } -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/test/nnet/test_OpSearch.cc b/test/nnet/test_OpSearch.cc index f969ccf7..14a6c737 100644 --- a/test/nnet/test_OpSearch.cc +++ b/test/nnet/test_OpSearch.cc @@ -85,7 +85,7 @@ TEST_F(OpSearch, Conv2gemm_NCHW_FCRS_search) { EXPECT_GE(derivator.getNumCandidates(), 1); int nMatches = matchExprResult( - derivator, "../test/nnet/log/conv2gemm/Conv2gemm_NCHW_FCRS_11.expr"); + derivator, "test/nnet/log/conv2gemm/Conv2gemm_NCHW_FCRS_11.expr"); EXPECT_GE(nMatches, 1); // derivator.print(); derivator.printStatistics(); @@ -160,7 +160,7 @@ TEST_F(OpSearch, TConv2gemm_TConv4x4_NHWF_RSFC_search) { EXPECT_GE(derivator.getNumCandidates(), 1); int nMatches = matchExprResult( derivator, - "../test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_18.expr"); + "test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_18.expr"); EXPECT_GE(nMatches, 1); derivator.printStatistics(); } @@ -197,8 +197,7 @@ TEST_F(OpSearch, Conv2conv_5x5_RuleBased_NCHW_FCRS) { derivator.search(conv_9x9, 0); int nMatches = matchExprResult( - derivator, - "../test/nnet/log/conv2conv/Conv2conv_5x5_NCHW_FCRS_15.expr"); + derivator, "test/nnet/log/conv2conv/Conv2conv_5x5_NCHW_FCRS_15.expr"); // derivator.print(); derivator.printStatistics(); EXPECT_GE(nMatches, 1); @@ -236,8 +235,8 @@ TEST_F(OpSearch, G2BMM_RuleBased) { EXPECT_GE(derivator.getNumCandidates(), 1); int nMatches = - matchExprResult(derivator, "../test/nnet/log/g2bmm/G2BMM_9.expr"); + matchExprResult(derivator, "test/nnet/log/g2bmm/G2BMM_9.expr"); EXPECT_GE(nMatches, 1); // derivator.print(); derivator.printStatistics(); -} \ No newline at end of file +} diff --git a/test/nnet/test_TConv2gemm.cc b/test/nnet/test_TConv2gemm.cc index 73f32088..3b84b843 100644 --- a/test/nnet/test_TConv2gemm.cc +++ b/test/nnet/test_TConv2gemm.cc @@ -365,14 +365,14 @@ TEST(TConv2gemm, TConv4x4_NHWF_RSFC_search) { // } int nMatches = matchExprResult( derivator, - "../test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_18.expr"); + "test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_18.expr"); EXPECT_GE(nMatches, 1); derivator.printStatistics(); } TEST(TConv2gemm, TConv4x4_NHWF_FRSC_CheckDerivationCorrectness_log) { const string fnPrefix = - "../test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_"; + "test/nnet/log/TConv4x4_NHWF_RSFC/TConv4x4_NHWF_RSFC_"; EXPECT_TRUE(checkExprLogSame(fnPrefix, 0, 11)); } @@ -388,4 +388,4 @@ TEST(Conv2conv, InfoGAN_ConvTranspose_3_OOB_Test) { dbg(expr); Derivator derivator; derivator.checkOOB(as(expr)); -} \ No newline at end of file +} diff --git a/test/nnet/test_conv2conv.cc b/test/nnet/test_conv2conv.cc index 8e961e95..a77ab39a 100644 --- a/test/nnet/test_conv2conv.cc +++ b/test/nnet/test_conv2conv.cc @@ -37,10 +37,8 @@ TEST(Conv2conv, 9x9_NCHW_FCRS) { derivator.ruleBasedDFS(conv_9x9, 0, rules, {}, true); } else derivator.search(conv_9x9, 0); - int nMatches = matchExprResult( - derivator, - 
"../test/nnet/log/conv2conv/Conv2conv_9x9_NCHW_FCRS_14.expr"); + derivator, "test/nnet/log/conv2conv/Conv2conv_9x9_NCHW_FCRS_14.expr"); derivator.print(); derivator.printStatistics(); EXPECT_GE(nMatches, 1); @@ -81,8 +79,7 @@ TEST(Conv2conv, 6x6_RuleBased_NCHW_FCRS) { ASSERT_GE(derivator.getNumCandidates(), 1); int nMatches = matchExprResult( - derivator, - "../test/nnet/log/conv2conv/Conv2conv_6x6_NCHW_FCRS_14.expr"); + derivator, "test/nnet/log/conv2conv/Conv2conv_6x6_NCHW_FCRS_14.expr"); derivator.print(); derivator.printStatistics(); EXPECT_GE(nMatches, 1); @@ -121,9 +118,8 @@ TEST(Conv2conv, 5x5_RuleBased_NCHW_FCRS) { derivator.search(conv_9x9, 0); int nMatches = matchExprResult( - derivator, - "../test/nnet/log/conv2conv/Conv2conv_5x5_NCHW_FCRS_15.expr"); + derivator, "test/nnet/log/conv2conv/Conv2conv_5x5_NCHW_FCRS_15.expr"); derivator.print(); derivator.printStatistics(); EXPECT_GE(nMatches, 1); -} \ No newline at end of file +} diff --git a/test/nnet/test_conv2gemm.cc b/test/nnet/test_conv2gemm.cc index 9827497b..3f15d328 100644 --- a/test/nnet/test_conv2gemm.cc +++ b/test/nnet/test_conv2gemm.cc @@ -147,7 +147,7 @@ TEST(Conv2gemm, timing_NHWC_RSFC_search) { // Conv2gemm requires thorough update, this is disabled temporarily TEST(Conv2gemm, CheckCorrectness) { - const string fnPrefix = "../test/nnet/log/conv2gemm/Conv2gemm_NCHW_RSFC_"; + const string fnPrefix = "test/nnet/log/conv2gemm/Conv2gemm_NCHW_RSFC_"; // conv2gemm_7 has T3 EXPECT_TRUE(checkExprLogSame(fnPrefix, 0, 7)); } @@ -182,7 +182,7 @@ TEST(Conv2gemm, NCHW_RSFC_search) { ASSERT_GE(derivator.getNumCandidates(), 1); int nMatches = matchExprResult( - derivator, "../test/nnet/log/conv2gemm/Conv2gemm_NCHW_RSFC_11.expr"); + derivator, "test/nnet/log/conv2gemm/Conv2gemm_NCHW_RSFC_11.expr"); EXPECT_GE(nMatches, 1); // derivator.print(); derivator.printStatistics(); @@ -278,6 +278,6 @@ TEST(Conv2gemm1x7, NCHW_FCRS_search) { ASSERT_GE(derivator.getNumCandidates(), 1); int nMatches = matchExprResult( derivator, - "../test/nnet/log/conv2gemm_1x7/Conv2gemm_1x7_NCHW_FCRS_11.expr"); + "test/nnet/log/conv2gemm_1x7/Conv2gemm_1x7_NCHW_FCRS_11.expr"); EXPECT_GE(nMatches, 1); -} \ No newline at end of file +} diff --git a/test/nnet/test_g2bmm.cc b/test/nnet/test_g2bmm.cc index e285fb98..6ae1150e 100644 --- a/test/nnet/test_g2bmm.cc +++ b/test/nnet/test_g2bmm.cc @@ -36,10 +36,8 @@ TEST(GBMM, RuleBased) { } else { derivator.search(dialted_g2bmm, 0); } - ASSERT_GE(derivator.getNumCandidates(), 1); - int nMatches = - matchExprResult(derivator, "../test/nnet/log/gbmm/GBMM_9.expr"); + int nMatches = matchExprResult(derivator, "test/nnet/log/gbmm/GBMM_9.expr"); EXPECT_GE(nMatches, 1); derivator.print(); derivator.printStatistics(); @@ -78,7 +76,7 @@ TEST(G2BMM, RuleBased) { ASSERT_GE(derivator.getNumCandidates(), 1); int nMatches = - matchExprResult(derivator, "../test/nnet/log/g2bmm/G2BMM_9.expr"); + matchExprResult(derivator, "test/nnet/log/g2bmm/G2BMM_9.expr"); EXPECT_GE(nMatches, 1); derivator.print(); derivator.printStatistics(); diff --git a/test/nnet/test_memboundOp.cc b/test/nnet/test_memboundOp.cc index 910344f2..49716161 100644 --- a/test/nnet/test_memboundOp.cc +++ b/test/nnet/test_memboundOp.cc @@ -1,3 +1,5 @@ +#ifdef USE_CUDA + #include "core/graph.h" #include "core/runtime.h" #include "cuda/cuda_runtime.h" @@ -12,7 +14,22 @@ using namespace infini; using namespace std; -TEST(nnet, MemboundOpInterpretation) { +class NNetMemboundOp : public ::testing::Test { + protected: + void SetUp() override { + if (!hasTVMBackend()) + GTEST_SKIP() 
<< "Skipping test since no TVM backend.\n"; + } + + private: + static bool hasTVMBackend() { + // TODO: as the dispatch mechanism of backend is going to change, this + // function is to be implemented. + return false; + } +}; + +TEST_F(NNetMemboundOp, MemboundOpInterpretation) { Runtime runtime = NativeCpuRuntimeObj::getInstance(); Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32); @@ -41,7 +58,7 @@ TEST(nnet, MemboundOpInterpretation) { EXPECT_TRUE(membound->getOutput()->equalData(ans)); } -TEST(nnet, MemboundOp_Ansor_Codegen) { +TEST_F(NNetMemboundOp, MemboundOp_Ansor_Codegen) { auto runtime = make_ref(); Runtime cpu = NativeCpuRuntimeObj::getInstance(); Graph gCpu = make_ref(cpu); @@ -91,7 +108,7 @@ pair, nnet::Expr> getPReluExpr(int size) { return {{A, B}, ret}; } -TEST(nnet, PRelu_Ansor_Codegen) { +TEST_F(NNetMemboundOp, PRelu_Ansor_Codegen) { auto cuda = make_ref(); Runtime cpu = NativeCpuRuntimeObj::getInstance(); Graph g = make_ref(cuda); @@ -116,3 +133,5 @@ TEST(nnet, PRelu_Ansor_Codegen) { auto oCpu = gCpu->cloneTensor(o0); EXPECT_TRUE(oCpu->equalData(ans)); } + +#endif diff --git a/test/nnet/test_mutator.cc b/test/nnet/test_mutator.cc index cf4d8ab2..abc3f604 100644 --- a/test/nnet/test_mutator.cc +++ b/test/nnet/test_mutator.cc @@ -1,3 +1,5 @@ +#ifdef USE_CUDA + #include "core/blob.h" #include "core/dummy_mutator.h" #include "core/graph.h" @@ -477,3 +479,5 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) { // EXPECT_TRUE(graph->verification(bestGraph.get(), true)); // } } // namespace infini + +#endif From d3e75432917addb80f97b851b1d681a0db52af89 Mon Sep 17 00:00:00 2001 From: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Date: Mon, 6 Nov 2023 08:56:23 +0800 Subject: [PATCH 05/28] Cuda softmax (#129) * "add softmax.cu,.cc,.h" * Modify cuda softmax * "modified the introduction of softmax.cu" * "add format of cuda_softmax.h" * "modified where.cc(.cu,.h) and softmax.cu" * "modified format" * Fix cpu softmax kernel * "modified the // introduction of softmax.cu" * "modified softmax.cu and use 1D block" * "modified softmax.cu,format, and use 1D block" * "introduce share mem to speed softmax" * "reduce the input of function" * modified the format * remodify 2D block softmax * remodify 1D block softmax * modified the share memory * add warp reduce * conflict solve two * remove extra space line * solve comment --------- Co-authored-by: Haojie Wang Co-authored-by: panzezhong --- include/cuda/cuda_softmax.h | 6 + include/cuda/softmax.h | 6 - src/kernels/cpu/unary.cc | 3 +- src/kernels/cuda/softmax.cc | 26 +-- src/kernels/cuda/softmax.cu | 236 +++++++++++++++++++-------- test/kernels/cuda/test_cuda_where.cc | 34 ++-- 6 files changed, 209 insertions(+), 102 deletions(-) create mode 100644 include/cuda/cuda_softmax.h delete mode 100644 include/cuda/softmax.h diff --git a/include/cuda/cuda_softmax.h b/include/cuda/cuda_softmax.h new file mode 100644 index 00000000..671f46f8 --- /dev/null +++ b/include/cuda/cuda_softmax.h @@ -0,0 +1,6 @@ +#pragma once +#include "utils/small_array.h" +namespace infini { +void softmax_kernel(int num_blocks, float *input, float *output, int size, + int dimsize, int stride); +} diff --git a/include/cuda/softmax.h b/include/cuda/softmax.h deleted file mode 100644 index 5c0eccf9..00000000 --- a/include/cuda/softmax.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -namespace infini { -void softmax_kernel(int max_threadblock_size, int batch_size, float *x, - float *y, int dim, int stride); -} diff --git 
a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc index 8975d7cd..3ea61b41 100644 --- a/src/kernels/cpu/unary.cc +++ b/src/kernels/cpu/unary.cc @@ -1,6 +1,7 @@ #include "operators/unary.h" #include "core/constants.h" #include "core/kernel.h" +#include "operators/softmax.h" namespace infini { template class NativeUnary : public CpuKernelWithoutConfig { @@ -22,7 +23,7 @@ template class NativeUnary : public CpuKernelWithoutConfig { template class NaiveSoftmax : public CpuKernelWithoutConfig { void compute(const Operator &_op, const RuntimeObj *context) const override { - auto op = as(_op); + auto op = as(_op); T *inptr = op->getInputs(0)->getRawDataPtr(); T *outptr = op->getOutput()->getRawDataPtr(); diff --git a/src/kernels/cuda/softmax.cc b/src/kernels/cuda/softmax.cc index 437ed849..024288c2 100644 --- a/src/kernels/cuda/softmax.cc +++ b/src/kernels/cuda/softmax.cc @@ -1,30 +1,30 @@ #include "operators/softmax.h" #include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_runtime.h" -#include "cuda/softmax.h" +#include "cuda/cuda_softmax.h" namespace infini { -class SoftmaxCudnn : public CudaKernelWithoutConfig { +class SoftmaxCuda : public CudaKernelWithoutConfig { void compute(const Operator &_op, const RuntimeObj *_context) const override { auto op = as(_op); - auto x = op->getInputs(0)->getRawDataPtr(); - auto y = op->getOutput(0)->getRawDataPtr(); + auto input = op->getInputs(0)->getRawDataPtr(); + auto output = op->getOutput(0)->getRawDataPtr(); + const auto &inShape = op->getInputs(0)->getDims(); // input shape auto dims = op->getInputs(0)->getDims(); - int batch_size = 1; - for (size_t i = 0; i < dims.size(); ++i) - batch_size *= dims[i]; - int dim = dims[op->getAxis()]; + int size; // size = i(JKS) + j(KS) + k(S) + s + size = op->getOutput(0)->size(); + int dimsize = dims[op->getAxis()]; + int stride = op->getInputs(0)->getStride().at(op->getAxis()); - int block_num = batch_size / dim; - int max_threadblock_size = batch_size / block_num; - softmax_kernel(max_threadblock_size, block_num, x, y, dim, - op->getInputs(0)->getStride().at(op->getAxis())); + int num_blocks = size / dimsize; + softmax_kernel(num_blocks, (float *)input, (float *)output, size, + dimsize, stride); } }; -REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCudnn, +REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCuda, "Softmax_CUDA_Float32"); } // namespace infini diff --git a/src/kernels/cuda/softmax.cu b/src/kernels/cuda/softmax.cu index 1f7f39e6..7e85ec43 100644 --- a/src/kernels/cuda/softmax.cu +++ b/src/kernels/cuda/softmax.cu @@ -1,77 +1,183 @@ #include "cuda/cuda_common.h" -#include "cuda/softmax.h" #include -struct __align__(8) MD { - float data; - float d; +struct __align__(8) DataMaxSum { // update the global max and sum, store the + // output at max_tmp and sum_tmp + float max_tmp; // store max + float sum_tmp; // store sum +}; +__device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a, + DataMaxSum b) { + bool a_bigger = (a.max_tmp > b.max_tmp); + DataMaxSum bigger = a_bigger ? a : b; + DataMaxSum smaller = a_bigger ? 
b : a; + bigger.sum_tmp = bigger.sum_tmp + + smaller.sum_tmp * __expf(smaller.max_tmp - bigger.max_tmp); + + return bigger; +} +template +__launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel( + float *__restrict input, float *__restrict output, int size, int dimsize, + int stride) { // if set axis = 1, inputShape=[I,J,K,S] + // tid = i(JKS) + j(KS) + k(S) + s + + // blockDim.x = size/dimsize = IKS + // blockIdx.x = i(KS) + k(S) + s,blockIdx.x%stride = k(S) + s + + int tid = + blockIdx.x % stride + (blockIdx.x - blockIdx.x % stride) * + dimsize; // now, tid = i(JKS) + k(S) + s; + + DataMaxSum dms_partial; + dms_partial.max_tmp = -__FLT_MAX__; + dms_partial.sum_tmp = 0.0f; + DataMaxSum dms_input; + for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { + + dms_input.max_tmp = + input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride]; + + dms_input.sum_tmp = 1.0f; + dms_partial = reduce_dms_op(dms_partial, + dms_input); // reduce the data to one block + } + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ DataMaxSum dms_total; + DataMaxSum dms_block = + BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op); + if (threadIdx.x == + 0) { // must set threadIdx.x = 0 write the output to memory + dms_total = dms_block; + } + __syncthreads(); + //----------------- + + for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { + output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = + __expf(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - + dms_total.max_tmp) * + __fdividef(1.0F, dms_total.sum_tmp); + } +} + +template struct SumOp { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a + b; + } }; -__device__ __forceinline__ MD reduce_md_op(MD a, MD b) { - bool a_bigger = (a.data > b.data); - MD bigger_m = a_bigger ? a : b; - MD smaller_m = a_bigger ? b : a; - MD res; - res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.data - bigger_m.data); - res.data = bigger_m.data; - return res; -} - -template -__launch_bounds__(THREADBLOCK_SIZE) __global__ - void online_softmax(const float *__restrict in, float *__restrict out, - int dimSize, int stride) { - - // reposition in and out to data for the current vector - int blockOffset = blockIdx.x; - if (blockIdx.x >= stride) { - int tmp = blockIdx.x % stride; - blockOffset = tmp + (blockIdx.x - tmp) * dimSize; +template struct MaxOp { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return max(a, b); } - in += blockOffset; - out += blockOffset; - - MD md_partial; - md_partial.data = -FLT_MAX; - md_partial.d = 0.0F; - - for (int elem_id = threadIdx.x; elem_id < dimSize; - elem_id += THREADBLOCK_SIZE) { - MD new_elem; - new_elem.data = in[elem_id * stride]; - new_elem.d = 1.0F; - md_partial = reduce_md_op(md_partial, new_elem); +}; +template