Merge branch 'ascend' of github.com:InfiniTensor/InfiniTensor into ascend

OdinaryWord 2024-04-25 17:28:18 +08:00
commit a765cd2a3d
155 changed files with 4262 additions and 1098 deletions


@ -20,7 +20,6 @@ endif()
include(CMakeDependentOption)
project(InfiniTensor C CXX)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@ -55,11 +54,13 @@ endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
add_compile_options(-Wno-error=unused-variable)
find_package(
Python
COMPONENTS Interpreter Development
REQUIRED)
# OpenMP
find_package(OpenMP)
if(OpenMP_C_FOUND)
@ -252,7 +253,6 @@ if(USE_BANG)
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
@ -269,12 +269,13 @@ if(USE_BANG)
# BangC Kernels
################################################################################
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
if (BUILD_DIST)
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
add_compile_definitions(INFINI_USE_CNCL=1)
else()
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
endif()
endif()
@ -289,10 +290,11 @@ if(USE_KUNLUN)
endif()
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
include_directories("${KUNLUN_HOME}/XTDK/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/XTDK/shlib")
include_directories("${KUNLUN_HOME}/include/")
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
@ -301,7 +303,16 @@ if(USE_KUNLUN)
else()
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
endif()
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
if (BUILD_DIST)
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(XCCL REQUIRED)
add_compile_definitions(INFINI_USE_XCCL=1)
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
endif()
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
endif()
@ -376,6 +387,7 @@ if(BUILD_TEST)
endif()
if (USE_KUNLUN)
build_test(test/kernels/kunlun/*.cc)
build_test(test/kunlun/*.cc)
endif()
if (USE_ASCEND)
build_test(test/kernels/ascend/*.cc)
@ -394,4 +406,4 @@ if(BUILD_TEST)
add_executable(nnet_reader test/nnet/readlog.cc)
target_link_libraries(nnet_reader InfiniTensor)
endif()
endif()
endif()


@ -8,7 +8,9 @@ ASCEND ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
DIST ?= OFF
NNET ?= OFF
DIST ?= OFF
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
@ -31,6 +33,7 @@ CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
CMAKE_OPT += -DUSE_ASCEND=$(ASCEND)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBUILD_DIST=$(DIST)
CMAKE_OPT += -DBUILD_NNET=$(NNET)
ifeq ($(INTELCPU), ON)

cmake/FindXCCL.cmake Normal file

@ -0,0 +1,27 @@
# Find the xccl libraries
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
NAMES xpu/bkcl.h
HINTS XCCL_INCLUDE_DIR)
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
NAMES lib64/libbkcl.so
HINTS XCCL_LIB_DIR)
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
if (XCCL_FOUND)
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
message (STATUS "Determining XCCL version from ${XCCL_HEADER_FILE}...")
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
endif()


@ -2,6 +2,7 @@
## Table of Contents
- [Environment Support](#环境支持)
- [Supported Neural Networks](#神经网络支持)
- [Technical Support](#技术支持)
@ -19,10 +20,10 @@
The neural network models verified so far are:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
## Technical Support


@ -3,9 +3,10 @@
## Table of Contents
- [Usage](#使用方法)
- [Python Frontend Guide](#python-前端应用指南)
- [Importing ONNX Models](#导入-onnx-模型)
- [Exporting ONNX Models](#导出-onnx-模型)
- [Python Frontend Guide](#python-前端应用指南)
- [Importing ONNX Models](#导入-onnx-模型)
- [Optimization](#优化)
- [Exporting ONNX Models](#导出-onnx-模型)
- [Running Inference](#执行推理)
- [Sample Code](#样例代码)
- [Technical Support](#技术支持)
@ -13,7 +14,7 @@
## Usage
The project management functions are provided in the [Makefile](Makefile) and support the following:
The project management functions are provided in the [Makefile](../Makefile) and support the following:
- Build the project: `make`/`make build`
- Clean generated files: `make clean`
@ -39,10 +40,10 @@
Supported models:
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
```python
import onnx
@ -97,7 +98,7 @@ for name, tensor in stub.inputs.items():
print(name, tensor.shape(), tensor)
```
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
For [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx), this prints:
```plaintext
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -138,7 +139,7 @@ for name, tensor in stub.outputs.items():
### Sample Code
You can refer to the sample code in [./example/Resnet/resnet.py](./example/ResNet/resnet.py) to learn more, and try running it. In that file we build a ResNet network with PyTorch. You can check the script's usage with:
You can refer to the sample code in [resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py) to learn more, and try running it. In that file we build a ResNet network with PyTorch. You can check the script's usage with:
```python
python resnet.py -h


@ -0,0 +1,17 @@
# Distributed Scripts
#### 1. Run the PyTorch model and generate inputs and reference outputs (optionally exporting ONNX)
Use `--export_onnx` to set the directory the ONNX model is exported to (defaults to the current path `./`); without this flag, the script only runs the computation and generates the inputs and outputs.
```bash
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
```
This generates the input and output files `test_inputs.npy` and `test_results.npy` in the current directory; currently only a single input and a single output are supported.
#### 2. Run the InfiniTensor distributed script
```bash
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
```
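
The reference pair written by step 1 can also be checked by hand against any backend's output. A minimal sketch, assuming the single-input/single-output convention above; `my_outputs.npy` is a hypothetical result file from whatever backend you ran:

```python
import numpy as np

reference = np.load("test_results.npy")  # written by run_pytorch.py
outputs = np.load("my_outputs.npy")      # hypothetical: your backend's output

# same tolerances the CUDA launcher uses for its comparison
print("max abs diff:", np.abs(outputs - reference).max())
np.testing.assert_allclose(outputs, reference, rtol=1e-6, atol=1e-3)
```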


@ -10,9 +10,6 @@ import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@ -32,6 +29,9 @@ def parse_args():
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
@ -42,12 +42,13 @@ def parse_args():
args.batch_size,
args.length,
args.gen_std,
args.type,
)
def run_model(model, runtime, inputs, n=10):
stub = OnnxStub(model, runtime)
for tensor, input in zip(stub.inputs.values(), inputs):
def run_model(model, runtime, inputs, n=10, data_type = "default"):
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
# stub.tune()
stub.run()
@ -55,7 +56,7 @@ def run_model(model, runtime, inputs, n=10):
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
for tensor, input in zip(stub.inputs.values(), inputs):
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
tensor.copyin_numpy(input)
begin = time.time()
for _ in range(n):
@ -66,17 +67,17 @@ def run_model(model, runtime, inputs, n=10):
return outputs
def run_and_compare(name, model, runtime):
def run_and_compare(name, model, runtime, data_type):
input_ids = np.load(f"{name}_inputs.npy")
position_ids = np.arange(input_ids.shape[-1])
results = np.load(f"{name}_results.npy")
outputs = run_model(model, runtime, (input_ids, position_ids))
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
print("outputs abs mean:", abs(outputs).mean())
np.testing.assert_allclose(outputs, results, rtol=1e-6, atol=1e-3)
print("max abs diff:", abs(outputs - results).max())
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
@ -89,7 +90,7 @@ def start_worker(
save_as_external_data=True,
location=extern_path,
)
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.CudaRuntime(local_rank)
# print("init comm")
runtime.init_comm(
@ -97,12 +98,12 @@ def start_worker(
world_size,
rank,
)
run_and_compare(name, model, runtime)
run_and_compare(name, model, runtime, data_type)
def start_single(name, model):
def start_single(name, model, data_type):
runtime = backend.CudaRuntime(0)
run_and_compare(name, model, runtime)
run_and_compare(name, model, runtime, data_type)
def gen_standard(name, model, voc_size, bs, len):
@ -117,8 +118,10 @@ def gen_standard(name, model, voc_size, bs, len):
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
data_type = "default" if data_type == "fp32" else data_type
if data_type != "tf32":
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model = onnx.load(model_path)
# generate standard output
@ -132,7 +135,7 @@ def main():
# run single process.
# use standalone process to isolate cuda.
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model))
p = mp.Process(target=start_single, args=(name, model, data_type))
p.start()
p.join()
@ -142,7 +145,7 @@ def main():
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model),
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
)
for rank in range(world_size)
]


@ -0,0 +1,213 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
from onnx.shape_inference import infer_shapes_path
import numpy as np
from parallel_opt import parallel_model
st_input_dir = "standard/inputs/"
st_output_dir = "standard/outputs/"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=2, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
default=False,
action="store_true",
help="whether to generate the standard results.",
)
parser.add_argument(
"--run_single",
default=False,
action="store_true",
help="whether run model with single process with standard inputs"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model,
args.batch_size,
args.length,
args.gen_std,
args.run_single
)
def run_model(model, runtime, world_size=1, rank=0, n=10):
stub = OnnxStub(model, runtime)
load_inputs(stub, world_size, rank)
# stub.tune()
stub.run()
# get outputs
time.sleep(0.01)
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
# bench
begin = time.time()
for _ in range(n):
stub.run()
end = time.time()
avg_time = (end - begin) / n
print(f"average time: {avg_time}")
return outputs
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
results = np.load(os.path.join(st_output_dir,f"output.npy"))
outputs = run_model(model, runtime, world_size, rank)
print(outputs[:100])
if np.isnan(outputs).any():
print("Nan in output")
print("answer argmax:", np.argmax(results))
print("output argmax:", np.argmax(outputs))
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
getDiff(results, outputs)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
dist_name = name + "_dist"
model = parallel_model(model, world_size, rank)
extern_path = f"./{dist_name}_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
onnx.save_model(
model,
f"./{dist_name}_rank{rank}.onnx",
save_as_external_data=True,
location=extern_path,
)
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
runtime = backend.KUNLUNRuntime(local_rank)
# print("init comm")
runtime.init_comm(
dist_name,
world_size,
rank,
)
run_and_compare(name, model, runtime, world_size, rank)
def start_single(name, model):
runtime = backend.KUNLUNRuntime(0)
run_and_compare(name, model, runtime)
def generate_input_output(model):
runtime = backend.KUNLUNRuntime(0)
stub = OnnxStub(model, runtime)
position_id = 0
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = tensor.copyout_numpy()
if np.issubdtype(input.dtype, np.integer):
if input.size == 1:
# input = np.array([position_id])
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
else:
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
elif input.dtype == np.bool_:
input = np.random.randint(0,2,size=input.shape) > 0
else:
if i == 0:
input = np.ones(input.shape).astype(input.dtype)
position_id = input.shape[-1] - 1
else:
input = np.random.rand(*input.shape).astype(input.dtype)
tensor.copyin_numpy(input)
np.save(os.path.join(st_input_dir, f"input_{i}"), input)
stub.run()
# print(stub.outputs)
time.sleep(0.01)
output = next(stub.outputs.values().__iter__()).copyout_numpy()
print(output[:100])
if np.isnan(output).any():
print("Nan in output")
np.save(os.path.join(st_output_dir, f"output"), output)
def load_inputs(stub, world_size=1, rank=0):
for i, (name, tensor) in enumerate(stub.inputs.items()):
input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
if all(x == y for x,y in zip(input.shape,tensor.shape())):
tensor.copyin_numpy(input)
else:
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
def getDiff(base, test):
absolute_diff = np.abs(np.subtract(base, test))
max_absolute_diff = np.max(absolute_diff)
baseCopy = base.astype(np.float64).ravel()
testCopy = test.astype(np.float64).ravel()
upValue = np.sum(np.abs(baseCopy - testCopy))
downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
max_relative_diff = upValue / downValue
print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
return max_absolute_diff, max_relative_diff
def main():
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
model = onnx.load(model_path)
# generate standard output
if gen_std:
print("Generate inputs and outputs.")
p = mp.Process(target=generate_input_output, args=[model])
p.start()
p.join()
return
# # run single process.
# # use standalone process to isolate cuda.
if run_single:
print("run model by single GPU.")
p = mp.Process(target=start_single, args=(name, model))
p.start()
p.join()
return
# run distributed parallel.
world_size = nnodes * nproc_per_node
print(f"run model by {world_size} GPU in parallel.")
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()
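
Note that `load_inputs` above shards a full-size saved input across ranks with `np.hsplit` whenever the local tensor is narrower than the array on disk. A small illustration of that splitting (shapes chosen arbitrarily):

```python
import numpy as np

full = np.arange(8).reshape(2, 4)   # the saved, full-size input
world_size, rank = 2, 1             # two ranks; this process is rank 1
shard = np.hsplit(full, world_size)[rank]
print(shard.shape)                  # (2, 2): columns 2..3 go to rank 1
```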


@ -244,5 +244,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
if tt.HasField("shape"):
tt.ClearField("shape")
model = helper.make_model(graph)
model = onnx.shape_inference.infer_shapes(model)
#model = onnx.shape_inference.infer_shapes(model)
return model


@ -0,0 +1,188 @@
import argparse
import torch
from transformers import BertModel, BertConfig
from transformers import GPT2Model, GPT2Config
from transformers import OPTModel, OPTConfig
import time
import numpy as np
import onnx
import os
from onnx.external_data_helper import convert_model_to_external_data
from onnxsim import simplify
def parse_args():
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
parser.add_argument(
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--export_onnx",
type=str,
nargs="?",
default=None,
const="./",
help="whether and where to export onnx file",
)
parser.add_argument(
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.model,
args.batch_size,
args.length,
args.export_onnx,
args.type,
)
def get_model(modelname):
match modelname:
case "bert":
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
voc_size = BertConfig().vocab_size
case "gpt2":
model = GPT2Model.from_pretrained("gpt2")
voc_size = GPT2Config().vocab_size
case "opt":
model = OPTModel.from_pretrained("./opt-125m")
voc_size = OPTConfig().vocab_size
case _:
raise KeyError(modelname)
model = model.eval()
return model, voc_size
def run_pytorch(torch_model, voc_size, batchsize, len):
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
np.save("test_inputs", data)
inputs = torch.from_numpy(data).to("cuda")
torch_model = torch_model.to("cuda")
n_iter = 20
with torch.no_grad():
for _ in range(10):
outputs = torch_model(inputs)
torch.cuda.synchronize()
begin = time.time()
with torch.no_grad():
for _ in range(n_iter):
torch.cuda.synchronize()
outputs = torch_model(inputs)
#
torch.cuda.synchronize()
torch.cuda.synchronize()
end = time.time()
avg_time = (end - begin) / n_iter
outputs = outputs.last_hidden_state.to("cpu")
print("outputs abs mean:", abs(np.array(outputs)).mean())
print(f"average time: {avg_time}")
torch.cuda.memory.empty_cache()
np.save("test_results", np.array(outputs, dtype=np.float32))
print("Save input & output as test_inputs.npy and test_results.npy")
def export_onnx(model, data, path, extern=False):
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
onnx_model = onnx.load(path)
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
assert check
add_value_info_for_constants(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
if extern:
extern_path = path.replace('.onnx', '.pb')
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
onnx_model,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(onnx_model, path)
def add_value_info_for_constants(model : onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def main():
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
modelname, batchsize, seqlen, export_path, data_type = parse_args()
if data_type == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
else:
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
model, voc_size = get_model(modelname)
if export_path is not None:
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
path = os.path.join(export_path, filename)
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
export_onnx(model, param, path, True)
if data_type == "fp16":
model = model.half()
run_pytorch(model, voc_size, batchsize, seqlen)
if __name__ == "__main__":
main()


@ -0,0 +1,145 @@
import os
from pyinfinitensor.onnx import OnnxStub, backend
import numpy as np
import onnx
import torch
from transformers import LlamaModel, LlamaForCausalLM
from tqdm import tqdm
import onnx_graphsurgeon as gs
from onnxsim import simplify
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
default="/data1/shared/llama")
args = parser.parse_args()
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
param = torch.zeros(
(args.batchsize, 1024), dtype=torch.long)
logits = model(param, past_key_values=None)
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
do_constant_folding=True,)
onnx_model = onnx.load(ONNX_MODEL_PATH)
print("simplifing onnx model")
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
'eliminate_duplicate_initializer'])
assert check
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
print("simlifing finished.")
@gs.Graph.register()
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
for inp in inputs:
inp.outputs.clear()
for out in outputs:
out.inputs.clear()
for inp in inputs_added:
inputs.append(inp)
for out in outputs_removed:
out.inputs.clear()
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
def replace_onnx_with_attention_op():
graph = gs.import_onnx(
onnx.load(ONNX_MODEL_PATH))
tmap = graph.tensors()
for i in range(args.n_layers):
inputs = [
tmap["onnx::Concat_" + str((i+1)*2)],
tmap["onnx::Concat_" + str((i+1)*2+1)],
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
outputs = [
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
inputs_added = [graph.inputs[1]]
outputs_removed = []
graph.replace_with_attention(
inputs, outputs, inputs_added, outputs_removed)
graph.outputs = [tmap[graph.outputs[0].name]]
graph.cleanup(True).toposort()
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
if __name__ == "__main__":
kvcache_torch = None
torch_model = LlamaForCausalLM.from_pretrained(
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
n_heads = torch_model.config.num_attention_heads
n_dims = torch_model.config.hidden_size // n_heads
if not os.path.exists(ONNX_MODEL_PATH):
print("exporting onnx graph")
export_onnx(torch_model, ONNX_MODEL_PATH)
replace_onnx_with_attention_op()
else:
print("will use exsiting onnx graph")
onnx_model = onnx.load(ONNX_MODEL_PATH)
stub = OnnxStub(onnx_model, backend.cuda_runtime())
count_wrong = 0
for i in tqdm(range(0, args.n_max_length)):
query = np.random.randint(
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
####################################
# pytorch
####################################
outputs_torch = torch_model(
torch.tensor(query), past_key_values=kvcache_torch)
logit_torch = outputs_torch['logits']
kvcache_torch = outputs_torch['past_key_values']
####################################
# infinitensor
####################################
# copyin input
(list(stub.inputs.items()))[0][1].copyin_int64(
query.reshape(-1).tolist())
(list(stub.inputs.items()))[1][1].copyin_int64(
position_id.reshape(-1).tolist())
stub.run()
####################################
# validation
####################################
# copyout output
logits_it = np.array((list(stub.outputs.items()))
[0][1].copyout_float())
try:
np.testing.assert_allclose(
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
except Exception as e:
try:
np.testing.assert_allclose(
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
except:
count_wrong = count_wrong + 1
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
print(result)
del stub


@ -2,6 +2,10 @@
#include "cnnl.h"
#include "cnrt.h"
#include "core/common.h"
#include "core/data_type.h"
#ifdef INFINI_USE_CNCL
#include "cncl.h"
#endif
#define checkBangError(call) \
{ \
@ -27,4 +31,70 @@ namespace infini {
using BangPtr = void *;
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CNNL_DTYPE_FLOAT;
}
if (dataType == DataType::Float16) {
return CNNL_DTYPE_HALF;
}
if (dataType == DataType::Double) {
return CNNL_DTYPE_DOUBLE;
}
if (dataType == DataType::Int8) {
return CNNL_DTYPE_INT8;
}
if (dataType == DataType::Int32) {
return CNNL_DTYPE_INT32;
}
if (dataType == DataType::UInt8) {
return CNNL_DTYPE_UINT8;
}
if (dataType == DataType::BFloat16) {
return CNNL_DTYPE_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CNNL_DTYPE_INT64;
}
if (dataType == DataType::Bool) {
return CNNL_DTYPE_BOOL;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNNL.");
}
#ifdef INFINI_USE_CNCL
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return cnclFloat32;
}
if (dataType == DataType::Float16) {
return cnclHalf;
}
if (dataType == DataType::Int8) {
return cnclInt8;
}
if (dataType == DataType::Int16) {
return cnclInt16;
}
if (dataType == DataType::Int32) {
return cnclInt32;
}
if (dataType == DataType::UInt8) {
return cnclUint8;
}
if (dataType == DataType::UInt16) {
return cnclUint16;
}
if (dataType == DataType::UInt32) {
return cnclUint32;
}
if (dataType == DataType::BFloat16) {
return cnclBfloat16;
}
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
" not supported in CNCL.");
}
#endif
} // namespace infini


@ -61,16 +61,30 @@ template <typename T> auto enum_to_underlying(T e) {
}
template <typename T> std::string vecToString(const std::vector<T> &vec) {
std::string ret;
ret.append("[");
for (auto d : vec) {
ret.append(std::to_string(d));
ret.append(",");
std::stringstream ss;
ss << "[";
for (size_t i = 0; i < vec.size(); ++i) {
ss << vec.at(i);
if (i < vec.size() - 1) {
ss << ",";
}
}
if (!vec.empty())
ret.pop_back();
ret.append("]");
return ret;
ss << "]";
return ss.str();
}
template <typename T> std::string vecToString(const T *st, size_t length) {
std::stringstream ss;
ss << "[";
size_t i = 0;
for (i = 0; i < length; i++) {
ss << *(st + i);
if (i < length - 1) {
ss << ",";
}
}
ss << "]";
return ss.str();
}
double timeit(


@ -5,6 +5,10 @@
#include <cstdint>
#include <iostream>
#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#endif
namespace infini {
class GraphHandlerObj {
@ -26,12 +30,14 @@ class GraphHandlerObj {
int pw, int sh, int sw, int dh, int dw, int oph,
int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act);
Tensor bias, ActType act,
std::string matmul_compute_type = "default");
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
Tensor var, Tensor scale, Tensor bias,
float momentum, float eps, bool training);
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
Tensor bias, float eps, int axis, int stash_type);
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw, int ceilMode);
@ -47,6 +53,7 @@ class GraphHandlerObj {
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor silu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor hardSigmoid(Tensor x, Tensor y);
@ -77,6 +84,7 @@ class GraphHandlerObj {
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
Tensor input_q, Tensor input_k, Tensor input_v,
Tensor position_id, Tensor output_matmul);
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
std::variant<int, vector<int>> numOrRatio);
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
@ -135,6 +143,12 @@ class GraphHandlerObj {
inline void run() { g->getRuntime()->run(g); }
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
#ifdef USE_CUDA
inline void run_with_cudagraph() {
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
}
#endif
};
} // namespace infini


@ -5,8 +5,8 @@
#include "utils/operator_utils.h"
#include <functional>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace infini {
using json = nlohmann::json;
class RuntimeObj; // Forward declaration for Kernel::compute


@ -151,11 +151,14 @@ struct OpType {
ReduceSum, // Reduce
ReduceSumSquare, // Reduce
Relu, // Unary
Silu, // Unary
Reshape,
Resize,
ReverseSequence,
RoiAlign,
Round, // Unary
RoPE, // Fusion
Round, // Unary
RMSNorm, // Fusion
STFT,
Scan,
Scatter,


@ -2,8 +2,8 @@
#include "core/graph.h"
#include "core/kernel.h"
#include <nlohmann/json_fwd.hpp>
using json = nlohmann::json;
namespace infini {
using json = nlohmann::json;
class PerfEngine {
public:


@ -15,6 +15,7 @@ class GraphObj;
class GraphHandlerObj;
class RuntimeObj;
class BlobObj;
template <typename T> class WorkspaceObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
@ -23,6 +24,7 @@ using Graph = Ref<GraphObj>;
using GraphHandler = Ref<GraphHandlerObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;


@ -4,6 +4,7 @@
#include "utils/data_convert.h"
#include <cmath>
#include <cstring>
#include <fstream>
#if USE_CUDA
#include "cuda/cuda_runtime.h"
@ -143,6 +144,7 @@ class TensorObj : public TensorBaseObj {
}
void printData() const;
void dumpData(std::ofstream &ofs) const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
@ -198,13 +200,20 @@ class TensorObj : public TensorBaseObj {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
fabs(a[i] - b[i]) > relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
fabs(a[i] - b[i]) /
std::max(fabs(a[i]), fabs(b[i])) >
relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else
} else {
static_assert(!sizeof(T), "Unsupported data type");
}
}
return true;
}
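
The reworked tolerance check above falls back to a plain absolute-error comparison whenever either value is zero, because relative error degenerates there. The same rule, sketched in Python purely for illustration:

```python
def values_close(a: float, b: float, relative_error: float = 1e-6) -> bool:
    # when either value is 0, |a - b| / max(|a|, |b|) is useless as a
    # relative measure, so compare the absolute difference instead
    if min(abs(a), abs(b)) == 0.0:
        return abs(a - b) <= relative_error
    return abs(a - b) / max(abs(a), abs(b)) <= relative_error
```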
@ -239,8 +248,8 @@ class TensorObj : public TensorBaseObj {
// // std::cerr << "Init beginned " << std::endl;
// #pragma omp parallel for
// for (size_t i = 0; i < iEnd; ++i)
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
// 10000;
// data[i] = fastrand(random_seed[omp_get_thread_num() *
// 16]) % 10000;
// // std::cerr << "Init finished" << std::endl;
// computed = ComputedFull;
// return true;
@ -285,8 +294,8 @@ class TensorObj : public TensorBaseObj {
// auto nDim = dims.size();
// auto nBroadcastDim = ds.size() - nDim;
// for (size_t i = 0; i < nDim; ++i)
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
// dims[i])
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
// i] >= dims[i])
// return (size_t)-1;
// size_t idx = 0;
// for (size_t i = 0; i < nDim; ++i)
@ -345,12 +354,14 @@ class TensorObj : public TensorBaseObj {
// return (g_seed >> 16) & 0x7FFF;
// }
// std::vector<std::vector<int>> const *getSplittingPoints() const {
// std::vector<std::vector<int>> const *getSplittingPoints()
// const {
// assert(!splittingPoints.empty());
// return &splittingPoints;
// }
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
// bool setSplittingPoints(std::vector<std::vector<int>> value)
// {
// assert(!value.empty());
// splittingPoints = value;
// return true;

include/core/workspace.h Normal file

@ -0,0 +1,42 @@
#pragma once
#include "core/runtime.h"
namespace infini {
template <class T> class WorkspaceObj {
private:
T workspace; // workspace pointer
size_t workspaceSize; // Size of workspace
size_t workspaceAlloc; // currently use workspace size
public:
WorkspaceObj(T workspace_, size_t workspaceSize_)
: workspace(workspace_), workspaceSize(workspaceSize_) {
workspaceAlloc = 0;
}
virtual ~WorkspaceObj() {
// Dealloc workspace in RuntimeObj
// Set workspace = nullptr here
workspace = nullptr;
}
size_t getWorkspaceSize() const { return workspaceSize; }
T getWorkspace(size_t size) {
// Get unused workspace
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
workspaceAlloc += size;
return ret;
}
T getWorkspace() {
// Override getWorkspace in order to dealloc in runtime
return workspace;
}
void resetWorkspace() {
// Reset workspaceAlloc every time end kernel
workspaceAlloc = 0;
}
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
};
} // namespace infini


@ -1,4 +1,5 @@
#pragma once
#include "core/common.h"
#include <cstdio>
struct AttentionKVCacheMetadata {
@ -10,6 +11,7 @@ namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k, float *input_v,
int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta);
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp);
} // namespace infini


@ -5,6 +5,7 @@
#include <cuda_profiler_api.h>
#include <cudnn.h>
#include <curand.h>
#include <memory>
#define checkCudaError(call) \
if (auto err = call; err != cudaSuccess) \
@ -111,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {
using CudaPtr = void *;
class CUDAStream {
public:
CUDAStream(const CUDAStream &) = delete;
CUDAStream(CUDAStream &&) = delete;
void operator=(const CUDAStream &) = delete;
void operator=(CUDAStream &&) = delete;
static cudaStream_t getCurrentStream() { return _stream; }
static void Init() { CUDAStream::_stream = 0; };
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
private:
CUDAStream(){};
static cudaStream_t _stream;
};
} // namespace infini


@ -13,4 +13,8 @@ void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
int c2, int c3);
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
}; // namespace infini


@ -7,4 +7,6 @@ void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape);
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len);
}; // namespace infini


@ -0,0 +1,10 @@
#pragma once
#include "operators/rms_norm.h"
namespace infini {
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
int num_tokens, int hidden_size);
}; // namespace infini

include/cuda/cuda_rope.h Normal file

@ -0,0 +1,12 @@
#pragma once
#include "operators/rope.h"
#include "utils/small_array.h"
namespace infini {
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
int dim_model, int dim_head, int hidden_stride,
int pos_stride);
}; // namespace infini


@ -14,6 +14,9 @@ class CudaRuntimeObj : public RuntimeObj {
std::unique_ptr<CommunicatorObj> comm;
CudaPtr workspace;
size_t workspaceSize;
bool isCudaGraphCreated;
cudaGraph_t cudaGraph;
cudaGraphExec_t cudaGraphInstance;
public:
explicit CudaRuntimeObj(int deviceId = 0)
@ -26,9 +29,16 @@ class CudaRuntimeObj : public RuntimeObj {
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
isCudaGraphCreated = false;
CUDAStream::Init();
}
virtual ~CudaRuntimeObj() {
try {
if (isCudaGraphCreated) {
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
checkCudaError(cudaGraphDestroy(cudaGraph));
CUDAStream::destroyStream();
}
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
@ -75,6 +85,8 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
void runWithCudaGraph(const Graph &graph);
// init communicator
void initComm(const string &name, int worldSize, int rank) final;


@ -5,6 +5,7 @@
namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
template <typename T> void relu_kernel(T *input, T *output, size_t num);
template <typename T> void silu_kernel(T *input, T *output, size_t num);
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
template <typename T> void abs_kernel(T *input, T *output, size_t num);


@ -0,0 +1,23 @@
#include "core/op_type.h"
#include "kunlun/kunlun_common.h"
namespace infini {
using KunlunActType = xdnn::Activation_t;
KunlunActType parseActType(ActType act) {
switch (act) {
case ActType::None:
return KunlunActType::LINEAR;
case ActType::Tanh:
return KunlunActType::TANH;
case ActType::Sigmoid:
return KunlunActType::SIGMOID;
case ActType::Relu:
return KunlunActType::RELU6;
default:
fprintf(stderr, "Activation type not supported yet!\n");
break;
}
return KunlunActType::LINEAR;
}
}; // namespace infini


@ -3,6 +3,8 @@
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xdnn = baidu::xpu::api;
#define checkKUNLUNError(call) \
{ \
auto err = call; \


@ -1,28 +1,35 @@
#pragma once
#include "core/runtime.h"
#include "core/workspace.h"
#include "kunlun/kunlun_common.h"
#ifdef INFINI_USE_XCCL
#include "kunlun/xccl_communicator.h"
#endif
namespace infini {
class KUNLUNRuntimeObj : public RuntimeObj {
private:
baidu::xpu::api::Context *xdnn;
KUNLUNPtr workspace;
size_t workspaceSize;
xdnn::Context *ctx;
std::unique_ptr<CommunicatorObj> comm;
// KUNLUNPtr workspace;
// size_t workspaceSize;
Workspace<KUNLUNPtr> workspace;
public:
KUNLUNRuntimeObj() : RuntimeObj(Device::KUNLUN) {
xdnn = baidu::xpu::api::create_context();
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
xpu_set_device(deviceId);
ctx = xdnn::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 3ll << 30; // 3 GB
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
size_t workspaceSize = 3llu << 30; // 3 GB
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
workspace =
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
}
virtual ~KUNLUNRuntimeObj() {
dealloc(workspace);
baidu::xpu::api::destroy_context(xdnn);
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
dealloc(wkspacePtr);
xdnn::destroy_context(ctx);
}
string toString() const override;
@ -31,6 +38,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
KUNLUNPtr alloc(size_t size) override {
void *ptr;
checkKUNLUNError(
@ -38,33 +46,33 @@ class KUNLUNRuntimeObj : public RuntimeObj {
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
baidu::xpu::api::Context *KUNLUNHandle() const { return xdnn; }
xdnn::Context *KUNLUNHandle() const { return ctx; }
// Get `size` bytes of unused workspace
KUNLUNPtr getWorkspace(size_t size) const {
IT_ASSERT(size <= workspaceSize);
return workspace;
auto ret = workspace->getWorkspace(size);
return ret;
}
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
void initComm(const string &name, int worldSize, int rank) final;
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const final { return *comm; }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;


@ -0,0 +1,60 @@
#pragma once
#include "core/communicator.h"
#include "xpu/bkcl.h"
#include <chrono>
#include <filesystem>
#include <fstream>
#include <thread>
#define checkXcclError(call) \
{ \
auto err = call; \
if (BKCL_SUCCESS != err) { \
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
namespace infini {
class XcclCommunicatorObj final : public CommunicatorObj {
private:
BKCLContext_t comm;
public:
XcclCommunicatorObj(const string &name, int worldSize, int rank)
: CommunicatorObj(worldSize, rank) {
const std::string filePath("./" + name + "_xccl_id.bin");
BKCLUniqueId commId;
if (rank == 0) {
checkXcclError(bkcl_get_unique_id(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
}
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
if (rank == 0) {
std::filesystem::remove(filePath);
}
}
BKCLContext_t getXcclComm() { return comm; }
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
virtual string toString() const final {
std::ostringstream oss;
oss << "XCCL communicator";
return oss.str();
}
};
} // namespace infini


@ -17,6 +17,9 @@ class MatmulObj : public OperatorObj {
// Auxiliary attributes which are not a part of operator attributes.
int b, m, n, k;
// Specifies the data precision for the matrix multiply.
std::string computeType = "default";
public:
/**
* @brief Matmul operator with batch broadcast and tensor transpose
@ -38,10 +41,11 @@ class MatmulObj : public OperatorObj {
* @param transB If matrix B should be transposed when computing.
* @param bias The bias tensor.
* @param act The activation function.
* @param computeType Specifies the data precision for the matrix multiply.
*/
MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
bool transA = false, bool transB = false, Tensor bias = nullptr,
ActType act = ActType::None);
ActType act = ActType::None, std::string computeType = "default");
OP_CLONE(MatmulObj);
std::string toString() const override;
@ -60,6 +64,7 @@ class MatmulObj : public OperatorObj {
int getN() const { return n; }
int getK() const { return k; }
auto getBMNK() const { return tuple{b, m, n, k}; }
std::string getComputeType() const { return computeType; }
private:
vector<int> getWorkloadVector() const override;


@ -0,0 +1,34 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Fused RMSNorm Operator
*
*/
class RMSNormObj : public OperatorObj {
int dim;
public:
/**
* @brief Construct a new RMSNorm object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
*/
RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
OP_CLONE(RMSNormObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
int getDim() const { return dim; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

include/operators/rope.h Normal file

@ -0,0 +1,29 @@
#pragma once
#include "core/operator.h"
namespace infini {
class RoPEObj : public OperatorObj {
public:
/**
* @brief Construct a new RotaryEmbedding object.
*
* @param graph The computation graph that this operator belongs to.
* @param pos The position id of the query.
* @param input The input tensor.
* @param output The output tensor.
*/
RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
OP_CLONE(RoPEObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
DataType getDType() const { return getInputs(1)->getDType(); }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini


@ -7,9 +7,7 @@ namespace infini {
*
*/
class SliceObj : public OperatorObj {
template <class T> struct range_t {
T start, end, step;
};
template <class T> struct range_t { T start, end, step; };
vector<range_t<int>> axes;
public:


@ -159,6 +159,7 @@ enum class CastType {
Uint322Int64,
Float162Float,
BFloat162Float,
Float2Float,
};
class CastObj : public OperatorObj {
@ -258,6 +259,7 @@ class LogObj : public OperatorObj {
};
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Silu, OpType::Silu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)


@ -1,14 +0,0 @@
#pragma once
namespace infini {
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size) {
for (int i = nDims - size - 1; i >= 0; --i) {
modifyShape.data[i] = 1;
}
for (int i = nDims - 1; i >= nDims - size; --i) {
modifyShape.data[i] = originShape[i - nDims + size];
}
}
} // namespace infini


@ -5,6 +5,9 @@
#include "core/operator.h"
#include "core/tensor.h"
#include "utils/small_array.h"
#include <numeric>
namespace infini {
// Launch a broadcast shape based on the shape of input A and B
@ -20,6 +23,12 @@ size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
const Shape &stride);
// Convert KernelAttrs to a string representation
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
// VectorProd
int shapeProd(std::vector<int>::iterator start, std::vector<int>::iterator end);
void broadcastShape(const Shape &originShape, SmallArray &modifyShape,
int nDims, int size);
void broadcastShape(const Shape &tempShape, Shape &modifyShape);
} // namespace infini
#endif


@ -4,6 +4,14 @@ namespace infini {
#define SMALL_ARRAY_SIZE 8
struct SmallArray {
int data[SMALL_ARRAY_SIZE];
int prod(int start, int end) {
int result = 1;
for (int i = start; i < end; ++i) {
result *= data[i];
}
return result;
}
};
} // namespace infini


@ -1,4 +1,4 @@
import backend
import backend
from onnx import (
ModelProto,
TensorProto,
@ -23,12 +23,13 @@ from onnx.checker import (
ValidationError,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from onnx.numpy_helper import to_array, from_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
from onnxsim import simplify
import copy
import warnings
import numpy as np
class OnnxStub:
@ -37,7 +38,13 @@ class OnnxStub:
It can be generated from an Onnx model object.
"""
def __init__(self, model: ModelProto, runtime, use_naive_allocator: bool = False):
def __init__(
self,
model: ModelProto,
runtime,
use_naive_allocator: bool = False,
matmul_compute_type: str = "default",
):
# We use some user-defined operators for distributed inference
try:
# onnx simplifier performs inplace simplify
@ -105,12 +112,6 @@ class OnnxStub:
)
tensors[input.name].set_input()
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
dims, output.type.tensor_type.elem_type
)
tensors[output.name].set_output()
for node_idx in sorted_nodes:
node = model.graph.node[node_idx]
@ -252,13 +253,14 @@ class OnnxStub:
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors[node.input[0]], # input
tensors[node.input[1]], # weight
tensors.get(node.output[0]),
False,
False,
None,
backend.ActType.Linear,
matmul_compute_type,
)
elif node.op_type == "Gemm":
attributes = _parse_attribute(
@ -278,6 +280,7 @@ class OnnxStub:
transB == 1,
tensors[node.input[2]] if len(node.input) > 2 else None,
backend.ActType.Linear,
matmul_compute_type,
)
elif node.op_type == "BatchNormalization":
(input, mean, var, scale, bias) = (
@ -321,6 +324,12 @@ class OnnxStub:
axis,
stash_type,
)
elif node.op_type == "RMSNorm":
tensors[node.output[0]] = self.handler.RMSNorm(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "MaxPool":
attributes = _parse_attribute(
node,
@ -482,6 +491,11 @@ class OnnxStub:
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Silu":
tensors[node.output[0]] = self.handler.silu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Gelu":
tensors[node.output[0]] = self.handler.gelu(
tensors[node.input[0]],
@ -657,7 +671,7 @@ class OnnxStub:
keep_aspect_ratio_policy,
nearest_mode,
coordinate_transformation_mode,
)
)
elif node.op_type == "Squeeze":
axes = (
_parse_data(data[node.input[1]])
@ -713,6 +727,12 @@ class OnnxStub:
tensors[node.input[5]],
tensors.get(node.output[0]),
)
elif node.op_type == "RoPE":
tensors[node.output[0]] = self.handler.RoPE(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Split":
split = (
_parse_data(data[node.input[1]])
@ -966,6 +986,25 @@ class OnnxStub:
tensors.get(node.output[0]),
)
elif node.op_type == "Where":
## If Y is single -inf, treat Where as Add
## TODO: deal with cases where Y is single inf or 0
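## Rationale: Where(cond, X, -inf) == X + np.where(cond, 0, -inf) for finite X,
## since adding 0 keeps X wherever cond holds and adding -inf masks the rest,
## which is why the Where node can be lowered to an Add on a precomputed mask.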
if node.input[0] in data and node.input[2] in data:
where_condition = to_array(data[node.input[0]])
where_alt = to_array(data[node.input[2]])
if where_alt.size == 1:
if np.isneginf(where_alt) or np.all(where_alt < -3e38):
node.input[0] = node.input[0] + "_alt"
if node.input[0] not in data:
where_value = np.where(where_condition, 0, -np.inf).astype(where_alt.dtype)
data[node.input[0]] = from_array(where_value, node.input[0])
tensors[node.input[0]] = self.handler.tensor(list(where_value.shape), data[node.input[0]].data_type)
tensors[node.input[0]].set_weight()
tensors[node.output[0]] = self.handler.add(
tensors[node.input[1]],
tensors[node.input[0]],
tensors.get(node.output[0]),
)
continue
tensors[node.output[0]] = self.handler.where(
tensors[node.input[1]],
tensors[node.input[2]],
@ -995,10 +1034,12 @@ class OnnxStub:
beta,
bias,
size,
)
)
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
for output in model.graph.output:
tensors[output.name].set_output()
################################
# Allocate memory space for data
################################
@ -1280,7 +1321,7 @@ class OnnxStub:
axes,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpTypeId.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
@ -1409,6 +1450,9 @@ class OnnxStub:
def run(self) -> None:
self.handler.run()
def run_with_cudagraph(self) -> None:
self.handler.run_with_cudagraph()
def get_perf_time(self) -> float:
return self.handler.get_perf_time()
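
The two methods above pair up for CUDA-graph replay: the first run_with_cudagraph() call captures the launch sequence into a CUDA graph (kernels are recorded rather than run, per runWithCudaGraph further down), and every later call replays it. A hedged usage sketch, assuming a CUDA build and an OnnxStub instance named stub whose inputs are already filled; the loop is illustrative:

stub.run_with_cudagraph()      # first call captures and instantiates the CUDA graph
for _ in range(100):
    stub.run_with_cudagraph()  # later calls only replay the captured graph
print(stub.get_perf_time())    # returns the profiled time as a float
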

View File

@ -463,13 +463,20 @@ class TestStringMethods(unittest.TestCase):
def test_split(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
make_and_import_model(make_graph([split], "split", [input], []))
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
make_and_import_model(make_graph([split], "split", [input], [output]))
def test_split1(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
splitAttr = make_tensor_value_info("split", TensorProto.INT64, [2, 1])
split = make_node("Split", ["input", "split"], ["output"], name="split", axis=1)
make_and_import_model(make_graph([split], "split", [input, splitAttr], []))
splitAttr = make_tensor("split", TensorProto.INT64, [2], [2, 1])
output1 = make_tensor_value_info("output1", TensorProto.FLOAT, [1, 2, 2, 4])
output2 = make_tensor_value_info("output2", TensorProto.FLOAT, [1, 1, 2, 4])
split = make_node(
"Split", ["input", "split"], ["output1", "output2"], name="split", axis=1
)
make_and_import_model(
make_graph([split], "split", [input], [output1, output2], [splitAttr])
)
def test_allBroadcast(self):
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])

View File

@ -18,6 +18,8 @@
#include "operators/reduce.h"
#include "operators/reshape.h"
#include "operators/resize.h"
#include "operators/rms_norm.h"
#include "operators/rope.h"
#include "operators/send.h"
#include "operators/slice.h"
#include "operators/softmax.h"
@ -72,15 +74,17 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
}
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act) {
bool transB, Tensor bias, ActType act,
std::string matmul_compute_type) {
if (y) {
g->addOpWithOutputs<MatmulObj>(std::move(a), std::move(b), y, transA,
transB, std::move(bias), act);
transB, std::move(bias), act,
matmul_compute_type);
return y;
} else {
return g
->addOp<MatmulObj>(std::move(a), std::move(b), y, transA, transB,
std::move(bias), act)
std::move(bias), act, matmul_compute_type)
->getOutput();
}
}
@ -121,6 +125,17 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
}
}
Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
if (output) {
g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
output);
return output;
} else {
return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
->getOutput();
}
}
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) {
@ -181,6 +196,7 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
} \
}
DEFINE_UNARY_METHOD(silu, Silu)
DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
@ -345,6 +361,16 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
}
}
Tensor GraphHandlerObj::RoPE(Tensor pos, Tensor input, Tensor output) {
if (output) {
g->addOpWithOutputs<RoPEObj>(std::move(pos), std::move(input), output);
return output;
} else {
return g->addOp<RoPEObj>(std::move(pos), std::move(input), output)
->getOutput();
}
}
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
int axis,
std::variant<int, vector<int>> numOrRatio) {
@ -683,6 +709,8 @@ static CastType inferCastType(Tensor input, int to) {
return CastType::Float162Float;
} else if (iType == DataType::BFloat16 && oType == DataType::Float32) {
return CastType::BFloat162Float;
} else if (iType == DataType::Float32 && oType == DataType::Float32) {
return CastType::Float2Float;
} else {
IT_TODO_HALT_MSG("Unsupported CastType : input_type is " +
iType.toString() + " output_type is " +

View File

@ -66,6 +66,36 @@ void TensorObj::setShape(Shape shape_) {
_size = size;
}
void TensorObj::dumpData(std::ofstream &ofs) const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
#define TRY_DUMP(N) \
if (dtype == DataType(N)) \
ofs << dataToString<DT<N>::t>() << std::endl;
TRY_DUMP(0) // fmt: new line
else TRY_DUMP(1) //
else TRY_DUMP(2) //
else TRY_DUMP(3) //
else TRY_DUMP(4) //
else TRY_DUMP(5) //
else TRY_DUMP(6) //
else TRY_DUMP(7) //
else TRY_DUMP(8) //
else TRY_DUMP(9) //
else TRY_DUMP(10) //
else TRY_DUMP(11) //
else TRY_DUMP(12) //
else TRY_DUMP(13) //
else TRY_DUMP(16) //
else IT_TODO_HALT();
ofs.flush();
#undef TRY_DUMP
}
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())

View File

@ -19,7 +19,6 @@ void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
}
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -39,6 +38,27 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
}
}
void CudaRuntimeObj::runWithCudaGraph(const Graph &graph) {
if (!isCudaGraphCreated) {
CUDAStream::createStream();
checkCudnnError(cudnnSetStream(cudnn, CUDAStream::getCurrentStream()));
checkCublasError(
cublasSetStream(cublas, CUDAStream::getCurrentStream()));
checkCudaError(cudaStreamBeginCapture(CUDAStream::getCurrentStream(),
cudaStreamCaptureModeGlobal));
runWithoutSync(graph);
checkCudaError(
cudaStreamEndCapture(CUDAStream::getCurrentStream(), &cudaGraph));
checkCudaError(
cudaGraphInstantiate(&cudaGraphInstance, cudaGraph, NULL, NULL, 0));
isCudaGraphCreated = true;
} else {
checkCudaError(
cudaGraphLaunch(cudaGraphInstance, CUDAStream::getCurrentStream()));
}
checkCudaError(cudaStreamSynchronize(CUDAStream::getCurrentStream()));
}
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -102,4 +122,5 @@ void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
#endif
}
cudaStream_t CUDAStream::_stream = 0;
} // namespace infini

View File

@ -16,7 +16,8 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
namespace infini {
void cudaPrintFloat(float *x, int len) {
cudaPrintFloatImpl<<<1, 1>>>(x, len);
cudaPrintFloatImpl
<<<1, 1, 0, CUDAStream::getCurrentStream()>>>(x, len);
cudaDeviceSynchronize();
}

View File

@ -442,7 +442,9 @@ void init_graph_builder(py::module &m) {
#endif
#ifdef USE_KUNLUN
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
m, "KUNLUNRuntime");
m, "KUNLUNRuntime")
.def(py::init<int>(), py::arg("device") = 0)
.def("init_comm", &KUNLUNRuntimeObj::initComm);
#endif
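
The KUNLUN runtime is now constructible from Python with a device index, and init_comm is exposed for multi-device runs. A hedged sketch, assuming the pybind11 module is imported as backend (as the frontend does) and that init_comm takes (name, world_size, rank) like initComm on the CUDA runtime; the group name is arbitrary but must match across ranks:

import backend

rt = backend.KUNLUNRuntime(0)    # device index
rt.init_comm("xccl-demo", 2, 0)  # rank 0 of a 2-rank group (assumed signature)
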
#ifdef USE_ASCEND
@ -525,6 +527,7 @@ void init_graph_builder(py::module &m) {
.def("matmul", &Handler::matmul, policy::move)
.def("batchNormalization", &Handler::batchNormalization, policy::move)
.def("layerNormalization", &Handler::layerNormalization, policy::move)
.def("RMSNorm", &Handler::rmsNorm, policy::move)
.def("maxPool", &Handler::maxPool, policy::move)
.def("avgPool", &Handler::avgPool, policy::move)
.def("add", &Handler::add, policy::move)
@ -536,6 +539,7 @@ void init_graph_builder(py::module &m) {
.def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move)
.def("silu", &Handler::silu, policy::move)
.def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move)
@ -558,6 +562,7 @@ void init_graph_builder(py::module &m) {
.def("unsqueeze", &Handler::unsqueeze, policy::move)
.def("concat", &Handler::concat, policy::move)
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
.def("RoPE", &Handler::RoPE, policy::move)
.def("split", &Handler::split, policy::move)
.def("gather", &Handler::gather, policy::move)
.def("gatherElements", &Handler::gatherElements, policy::move)
@ -590,6 +595,10 @@ void init_graph_builder(py::module &m) {
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
.def("tune", &Handler::tune, policy::automatic)
.def("run", &Handler::run, policy::automatic)
#ifdef USE_CUDA
.def("run_with_cudagraph", &Handler::run_with_cudagraph,
policy::automatic)
#endif
.def("shape_infer", &Handler::shape_infer, policy::automatic)
.def("change_shape", &Handler::change_shape, policy::automatic)
.def("getDims", &Handler::getDims, policy::automatic)

View File

@ -21,12 +21,12 @@ namespace infini {
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>()); \
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
\
auto a = op->getInputs(0) -> getDims(); \
auto aS = op->getInputs(0) -> getStride(); \
auto b = op->getInputs(1) -> getDims(); \
auto bS = op->getInputs(1) -> getStride(); \
auto c = op->getOutput() -> getDims(); \
auto cS = op->getOutput() -> getStride(); \
auto a = op->getInputs(0)->getDims(); \
auto aS = op->getInputs(0)->getStride(); \
auto b = op->getInputs(1)->getDims(); \
auto bS = op->getInputs(1)->getStride(); \
auto c = op->getOutput()->getDims(); \
auto cS = op->getOutput()->getStride(); \
\
std::vector<int64_t> aDim = castTo64(a); \
std::vector<int64_t> aStride = castTo64(aS); \

View File

@ -92,22 +92,22 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
\
auto a = op->getInputs(0) -> getDims(); \
auto a = op->getInputs(0)->getDims(); \
std::vector<int64_t> aDim(a.size(), 1); \
for (size_t i = 0; i < a.size(); ++i) { \
aDim[i] = int64_t(a[i]); \
} \
auto aS = op->getInputs(0) -> getStride(); \
auto aS = op->getInputs(0)->getStride(); \
std::vector<int64_t> aStride(aS.size(), 1); \
for (size_t i = 0; i < aS.size(); ++i) { \
aStride[i] = int64_t(aS[i]); \
} \
auto c = op->getInputs(0) -> getDims(); \
auto c = op->getInputs(0)->getDims(); \
std::vector<int64_t> cDim(c.size(), 1); \
for (size_t i = 0; i < c.size(); ++i) { \
cDim[i] = int64_t(c[i]); \
} \
auto cS = op->getInputs(0) -> getStride(); \
auto cS = op->getInputs(0)->getStride(); \
std::vector<int64_t> cStride(cS.size(), 1); \
for (size_t i = 0; i < cS.size(); ++i) { \
cStride[i] = int64_t(cS[i]); \

View File

@ -2,16 +2,20 @@
#include "bang/bang_runtime.h"
#include "operators/softmax.h"
#include "operators/unary.h"
#include <iostream>
namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
virtual cnnlActivationMode_t getOpType() const = 0;
virtual float getCoef() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
virtual float getSlicedDim() const { return 0.0; }
virtual float getGamma() const { return 0.0; }
virtual float getScale() const { return 0.0; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -22,18 +26,19 @@ class UnaryCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
checkCnnlError(cnnlSetActivationDescriptor_v2(
checkCnnlError(cnnlSetActivationDescriptor_v5(
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, getCoef()));
CNNL_NOT_PROPAGATE_NAN, getCoef(), getSlicedDim(), getGamma(),
getScale(), true));
auto [alpha, beta] = getAlphBeta();
cnnlStatus_t stat =
@ -51,7 +56,6 @@ class RoundCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -62,13 +66,13 @@ class RoundCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
@ -82,7 +86,6 @@ class PReluCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PReluObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -94,18 +97,22 @@ class PReluCnnl : public BangKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
auto cDim = op->getOutput()->getDims();
if (auto alignSize = aDim.size() - bDim.size(); alignSize) {
bDim.insert(bDim.begin(), alignSize, 1);
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
bDesc, bData, cDesc, cData);
@ -122,7 +129,6 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SoftmaxObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -185,13 +191,13 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, inDim.size(),
inDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
inDim.size(), inDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, outDim.size(),
outDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDim.size(), outDim.data()));
float alpha = 1.0;
float beta = 0.0;
cnnlStatus_t stat =
@ -219,6 +225,22 @@ class SigmoidCnnl : public UnaryCnnl {
float getCoef() const override { return 0.0; }
};
class HardSwishCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSWISH;
}
float getCoef() const override { return 0.0; }
};
class HardSigmoidCnnl : public UnaryCnnl {
cnnlActivationMode_t getOpType() const override {
return CNNL_ACTIVATION_HARDSIGMOID;
}
float getCoef() const override { return 0.0; }
float getGamma() const override { return 1.f / 6.f; }
float getScale() const override { return 0.5f; }
};
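
The gamma and scale above match the common hard-sigmoid definition, clip(x/6 + 1/2, 0, 1), and HardSwish is x times that. A NumPy reference, illustrative only:

import numpy as np

def hardsigmoid_reference(x: np.ndarray) -> np.ndarray:
    return np.clip(x / 6.0 + 0.5, 0.0, 1.0)  # gamma = 1/6, scale = 0.5

def hardswish_reference(x: np.ndarray) -> np.ndarray:
    return x * hardsigmoid_reference(x)
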
REGISTER_KERNEL(Device::BANG, OpType::Relu, ReluCnnl, "Relu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
@ -226,5 +248,9 @@ REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
"HardSwish_cnnl_BANG");
}; // namespace infini

View File

@ -10,7 +10,6 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ActivationBackwardObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -25,21 +24,21 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
auto diffxDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, yDim.size(),
yDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
yDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
yDim.size(), yDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
diffyDim.data()));
diffYDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffyDim.size(), diffyDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, xDim.size(),
xDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
xDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
xDim.size(), xDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
checkCnnlError(cnnlSetTensorDescriptor(
diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
diffxDim.data()));
diffXDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
diffxDim.size(), diffxDim.data()));
// get op descriptor
cnnlActivationDescriptor_t opDesc;
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));

View File

@ -19,7 +19,6 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
BangPtr output_temp =
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
// void *output = op->getOutput()->getRawDataPtr<void *>();
// IT_ASSERT(op->getDType() == DataType::Float32);
checkBangError(cnrtMalloc(&output_temp,
op->getInputs(0)->getBytes() * world_size));
size_t bytes = op->getInputs(0)->getBytes();
@ -29,8 +28,9 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
CNCL_CHECK(
cnclAllGather(input, output_temp, count, cnclFloat32, comm, queue));
CNCL_CHECK(cnclAllGather(input, output_temp, count,
cnclDataTypeConvert(op->getDType()), comm,
queue));
checkBangError(cnrtQueueSync(queue));
for (int i = 0; i < world_size; ++i) {
Tensor output = op->getOutput(i);
@ -42,8 +42,8 @@ class AllGatherCNCL : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::AllGather, DataType::Float32,
AllGatherCNCL, "AllGather_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllGather, AllGatherCNCL,
"AllGather_CNCL_BANG");
} // namespace infini
#endif
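
The AllGather kernel above collects every rank's input into one contiguous workspace and then slices it back into the per-rank output tensors. A NumPy analogue of that gather-then-split, with made-up values:

import numpy as np

world_size = 4
inputs = [np.array([r, r + 0.5], dtype=np.float32) for r in range(world_size)]
gathered = np.concatenate(inputs)         # what cnclAllGather leaves in the workspace
outputs = np.split(gathered, world_size)  # the cnrtMemcpy loop over output tensors
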

View File

@ -13,15 +13,16 @@ class AllReduceCNCL : public BangKernelWithoutConfig {
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
IT_ASSERT(op->getDType() == DataType::Float32);
size_t count = op->getInputs(0)->size();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// checkBangError(cnrtQueueSync(queue));
CNCL_CHECK(cnclAllReduce(input, output, count, cnclFloat, getRedOp(),
comm, queue));
CNCL_CHECK(cnclAllReduce(input, output, count,
cnclDataTypeConvert(op->getDType()),
getRedOp(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
@ -41,13 +42,13 @@ class AllReduceMaxCNCL : public AllReduceCNCL {
cnclReduceOp_t getRedOp() const override { return cnclMax; }
};
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, DataType::Float32,
AllReduceSumCNCL, "AllReduce_Sum_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, DataType::Float32,
AllReduceProdCNCL, "AllReduce_Prod_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, DataType::Float32,
AllReduceMinCNCL, "AllReduce_Min_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, DataType::Float32,
AllReduceMaxCNCL, "AllReduce_Max_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, AllReduceSumCNCL,
"AllReduce_Sum_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, AllReduceProdCNCL,
"AllReduce_Prod_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, AllReduceMinCNCL,
"AllReduce_Min_CNCL_BANG");
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, AllReduceMaxCNCL,
"AllReduce_Max_CNCL_BANG");
} // namespace infini
#endif

View File

@ -7,7 +7,6 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BatchNormObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
@ -33,18 +32,18 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, dims.size(),
dims.data()));
checkCnnlError(cnnlSetTensorDescriptor(intransDesc, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, dims.size(),
dimsTrans));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, outDims.size(),
outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(outtransDesc, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, outDims.size(),
dimsOutTrans));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dims.size(), dims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
intransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
dims.size(), dimsTrans));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outtransDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
outDims.size(), dimsOutTrans));
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
@ -53,9 +52,9 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr inputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * sizeof(float));
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
BangPtr outputTrans = context->getWorkspace(
cnnlGetTensorElementNum(inDesc) * sizeof(float));
cnnlGetTensorElementNum(inDesc) * op->getDType().getSize());
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
intransDesc, inputTrans, wsData, wsSize);
@ -67,7 +66,7 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t paraDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&paraDesc));
checkCnnlError(cnnlSetTensorDescriptor(
paraDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
paraDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
float alpha = 1.f, beta = 0.f;

View File

@ -13,22 +13,23 @@ class BroadcastCNCL : public BangKernelWithoutConfig {
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *input = op->getInputs(0)->getRawDataPtr<void *>();
void *output = op->getOutput()->getRawDataPtr<void *>();
IT_ASSERT(op->getDType() == DataType::Float32);
size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();
size_t bytes = op->getInputs(0)->getBytes();
size_t count = bytes / op->getDType().getSize();
cnclComm_t comm =
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
.getCnclComm();
cnrtQueue_t queue = context->getBangQueue();
// TODO: Using default stream 0 for now.
CNCL_CHECK(cnclBroadcast(input, output, count, cnclFloat32,
CNCL_CHECK(cnclBroadcast(input, output, count,
cnclDataTypeConvert(op->getDType()),
op->getRoot(), comm, queue));
checkBangError(cnrtQueueSync(queue));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, DataType::Float32,
BroadcastCNCL, "Broadcast_CNCL_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, BroadcastCNCL,
"Broadcast_CNCL_BANG");
} // namespace infini
#endif

View File

@ -7,7 +7,6 @@ class CeilCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class CeilCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class ClipCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ClipObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -19,9 +18,9 @@ class ClipCnnl : public BangKernelWithoutConfig {
auto aDim = op->getInputs(0)->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
cnnlStatus_t stat =
cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
if (stat != CNNL_STATUS_SUCCESS)

View File

@ -7,7 +7,6 @@ class ConcatCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConcatObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int num = op->numInputs();
int axis = op->getDim();
@ -15,17 +14,18 @@ class ConcatCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
cnnlTensorDescriptor_t desc;
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlTensorDescriptor_t descArray[num];
for (int i = 0; i < num; ++i) {
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
checkCnnlError(cnnlSetTensorDescriptor(
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
checkCnnlError(
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
cnnlDataTypeConvert(op->getDType()),
op->getInputs(i)->getDims().size(),
op->getInputs(i)->getDims().data()));
}
void *argv[num];

View File

@ -7,7 +7,6 @@ class ConvCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -55,20 +55,24 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs0));
checkCnnlError(cnnlSetTensorDescriptor(
aInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0));
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
aDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs1));
checkCnnlError(cnnlSetTensorDescriptor(
bInDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
bDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
int permute[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t opDesc;
@ -80,7 +84,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr aDataOut = context->getWorkspace(
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
cnnlGetTensorElementNum(aInDesc) * op->getDType().getSize());
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
aDesc, aDataOut, wsData, wsSize);
@ -91,7 +95,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
&wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr bDataOut = context->getWorkspace(
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
cnnlGetTensorElementNum(bInDesc) * op->getDType().getSize());
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
bDesc, bDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
@ -100,11 +104,13 @@ class ConvCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
cInDesc, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, output));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
output));
cnnlConvolutionForwardAlgo_t algo;
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
@ -116,7 +122,7 @@ class ConvCnnl : public BangKernelWithoutConfig {
algo, &wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr cDataIn = context->getWorkspace(
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
cnnlGetTensorElementNum(cInDesc) * op->getDType().getSize());
stat = cnnlConvolutionForward(
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aDataOut, bDesc,

View File

@ -7,7 +7,6 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -43,14 +43,17 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs0.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimInputs1.data()));
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
dimOutput.data()));
cnnlConvolutionBwdDataAlgo_t algo;
cnnlGetConvolutionBackwardDataAlgorithm(

View File

@ -7,7 +7,6 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBackwardFilterObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
@ -21,8 +20,9 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
checkCnnlError(
cnnlSetConvolutionDescriptor(convDesc, 4, pad, stride, dilation, g,
cnnlDataTypeConvert(op->getDType())));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -63,15 +63,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs0ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
aDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs0ArrayTrans));
size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
dimInputs0[3] * sizeof(float);
dimInputs0[3] * op->getDType().getSize();
BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);
cnnlStatus_t stat =
@ -82,15 +83,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inputs1Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
inputs1ArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
bDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, inputs1ArrayTrans));
size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
dimInputs1[3] * sizeof(float);
dimInputs1[3] * op->getDType().getSize();
BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);
stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
@ -101,15 +103,16 @@ class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
CNNL_DTYPE_FLOAT, 4,
outputArrayTrans));
checkCnnlError(cnnlSetTensorDescriptor(
cDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(op->getDType()),
4, outputArrayTrans));
size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
dimOutput[3] * sizeof(float);
dimOutput[3] * op->getDType().getSize();
BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);
cnnlConvolutionBwdFilterAlgo_t algo;

View File

@ -7,7 +7,6 @@ class DetCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DetObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -24,14 +23,14 @@ class DetCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimin.size(),
dimin.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimin.size(), dimin.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimout.size(),
dimout.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimout.size(), dimout.data()));
cnnlStatus_t stat =
cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);

View File

@ -11,8 +11,8 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -31,32 +31,33 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
cnnlOpTensorDescriptor_t opDesc;
checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
checkCnnlError(cnnlSetOpTensorDescriptor(
opDesc, getOpType(), CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN));
opDesc, getOpType(), cnnlDataTypeConvert(op->getDType()),
CNNL_NOT_PROPAGATE_NAN));
size_t wsSize;
cnnlGetOpTensorWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
&wsSize);
cnnlGetOpTensorWorkspaceSize_v2(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
&beta, cDesc, cData, &wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
auto [aAlpha, bAlpha, beta] = getAlphBeta();
cnnlStatus_t stat = cnnlOpTensor(context->cnnlHandle(), opDesc, &aAlpha,
aDesc, aData, &bAlpha, bDesc, bData,
wsData, wsSize, &beta, cDesc, cData);
@ -75,7 +76,6 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -95,17 +95,17 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -129,7 +129,6 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -182,7 +181,6 @@ class DivCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -202,17 +200,17 @@ class DivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -235,7 +233,6 @@ class MaximumCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -255,17 +252,17 @@ class MaximumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -287,7 +284,6 @@ class MinimumCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -307,17 +303,17 @@ class MinimumCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
@ -339,7 +335,6 @@ class MSELossCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MSELossObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -359,18 +354,18 @@ class MSELossCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
cnnlStatus_t stat;
if (reduction == MSELossObj::None) {
stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
@ -396,7 +391,6 @@ class PowerCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -417,17 +411,17 @@ class PowerCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -450,7 +444,6 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -470,17 +463,17 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -503,7 +496,6 @@ class FloorModCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -523,17 +515,17 @@ class FloorModCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
@ -556,7 +548,6 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -576,17 +567,17 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, a_dim.size(),
a_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
a_dim.size(), a_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, b_dim.size(),
b_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
b_dim.size(), b_dim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, c_dim.size(),
c_dim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
c_dim.size(), c_dim.data()));
size_t wsSize;
cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,

View File

@ -7,7 +7,6 @@ class ErfCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ErfCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class ExpCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ExpCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class FillCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<FillObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
@ -17,9 +16,9 @@ class FillCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlFill(context->cnnlHandle(), value, cDesc, cData);

View File

@ -7,7 +7,6 @@ class FloorCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class FloorCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class GatherCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<GatherObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,27 +19,56 @@ class GatherCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(
cnnlSetTensorDescriptorPointerMode(bDesc, CNNL_POINTER_MODE_HOST));
if (bDim.size() == 0) {
bDim.push_back(1);
}
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_INT32, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
BangPtr wsData = context->getWorkspace(aDim.size() * 4);
context->copyBlobFromCPU(wsData, aDim.data(), aDim.size() * 4);
BangPtr indices;
DataType indicesDataType = op->getInputs(1)->getDType();
if (indicesDataType == DataType::Int64) {
// cnnlGatherV2 does not support int64 indices
int indicesSize =
op->getInputs(1)->getBytes() / indicesDataType.getSize();
indices = context->getWorkspace(indicesSize * sizeof(int));
cnnlTensorDescriptor_t bDescInt64;
checkCnnlError(cnnlCreateTensorDescriptor(&bDescInt64));
checkCnnlError(cnnlSetTensorDescriptor(
bDescInt64, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT64, bDim.size(),
bDim.data()));
checkCnnlError(cnnlCastDataType(context->cnnlHandle(), bDescInt64,
bData, CNNL_CAST_INT64_TO_INT32,
bDesc, indices));
cnrtQueueSync(context->getBangQueue());
checkCnnlError(cnnlDestroyTensorDescriptor(bDescInt64));
} else if (indicesDataType == DataType::Int32) {
indices = bData;
} else {
IT_TODO_HALT_MSG("Unsupported data type of indices: " +
indicesDataType.toString());
}
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
BangPtr wsData = context->getWorkspace(aDim.size() * sizeof(int));
context->copyBlobFromCPU(wsData, aDim.data(),
aDim.size() * sizeof(int));
auto axis = op->getAxis();
cnnlStatus_t stat =
cnnlGatherV2(context->cnnlHandle(), axis, aDesc, aData,
(int *)wsData, bDesc, (int *)bData, cDesc, cData);
reinterpret_cast<const int *>(wsData), bDesc,
reinterpret_cast<const int *>(indices), cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
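
cnnlGatherV2 only accepts int32 indices, so the rewritten Gather kernel casts int64 index tensors into a workspace buffer via cnnlCastDataType(CNNL_CAST_INT64_TO_INT32) before gathering, and passes int32 tensors through untouched. The same narrowing expressed host-side; the range check is an illustrative addition, not something the device cast performs:

    #include <cstdint>
    #include <limits>
    #include <stdexcept>
    #include <vector>

    // Sketch: narrow gather indices from int64 to int32, as the BANG
    // kernel does on-device. Indices outside int32 range cannot be
    // represented and are rejected here instead of silently truncated.
    std::vector<int32_t> downcastIndices(const std::vector<int64_t> &idx) {
        std::vector<int32_t> out;
        out.reserve(idx.size());
        for (int64_t v : idx) {
            if (v > std::numeric_limits<int32_t>::max() ||
                v < std::numeric_limits<int32_t>::min())
                throw std::out_of_range("gather index does not fit in int32");
            out.push_back(static_cast<int32_t>(v));
        }
        return out;
    }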

View File

@ -7,7 +7,6 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<HardtanhObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,7 +19,8 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
max, min, aDesc, cData);

View File

@ -7,7 +7,6 @@ class L2LossCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<L2LossObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,7 +17,8 @@ class L2LossCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat =
cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);

View File

@ -8,7 +8,6 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LayerNormObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -29,17 +28,17 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, inDims.size(),
inDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
inDims.size(), inDims.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&fiterDesc));
checkCnnlError(cnnlSetTensorDescriptor(
fiterDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, fiterDims.size(),
fiterDims.data()));
fiterDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
fiterDims.size(), fiterDims.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, outDims.size(),
outDims.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
outDims.size(), outDims.data()));
size_t wsSize;
cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
&wsSize);

View File

@ -7,7 +7,6 @@ class LogCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LogObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -33,13 +32,13 @@ class LogCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class LRNCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LRNObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,13 +19,13 @@ class LRNCnnl : public BangKernelWithoutConfig {
auto size = op->getSize();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
size_t extra_size;
cnnlGetLrnExtraInputSize_v2(context->cnnlHandle(), cDesc,

View File

@ -8,7 +8,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
auto input_num = op->numInputs();
@ -38,25 +37,26 @@ class MatmulCnnl : public BangKernelWithoutConfig {
int32_t transB = op->getTransB();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(
cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimInputs0.size(), dimInputs0.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimInputs0.size(), dimInputs0.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(
cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimInputs1.size(), dimInputs1.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimInputs1.size(), dimInputs1.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
dimOutput.size(), dimOutput.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimOutput.size(), dimOutput.data()));
if (input_num > 2) {
checkCnnlError(cnnlCreateTensorDescriptor(&biasDesc));
checkCnnlError(cnnlSetTensorDescriptor(
biasDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dimBias.size(),
dimBias.data()));
checkCnnlError(
cnnlSetTensorDescriptor(biasDesc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(op->getDType()),
dimBias.size(), dimBias.data()));
}
cnnlMatMulDescriptor_t bmm_desc;

View File

@ -7,7 +7,6 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class PadCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PadObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -37,14 +36,14 @@ class PadCnnl : public BangKernelWithoutConfig {
float paddingValue = 0.0;
// input
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimIn.size(),
dimIn.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimIn.size(), dimIn.data()));
// output
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimOut.size(),
dimOut.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimOut.size(), dimOut.data()));
cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
paddings, &paddingValue, cDesc, cData);

View File

@ -8,7 +8,6 @@ class PoolingCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PoolingObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
@ -20,8 +19,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
int inArray[4] = {n, c, h, w};
cnnlTensorDescriptor_t inDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inArray));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
inArray));
bool mode = op->getCeilMode();
// get maxpool descriptor
@ -37,8 +37,9 @@ class PoolingCnnl : public BangKernelWithoutConfig {
int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
cnnlTensorDescriptor_t outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, outArray));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()), 4,
outArray));
size_t wsSize;
cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
outVec[3], outVec[2], &wsSize);

View File

@ -7,7 +7,6 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);

View File

@ -9,7 +9,6 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ReduceBaseObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
@ -26,20 +25,20 @@ class ReduceCnnlBase : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t inDesc, outDesc;
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
// get reduce descriptor
cnnlReduceDescriptor_t reduceDesc;
checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
checkCnnlError(cnnlSetReduceDescriptor_v2(
reduceDesc, axes.data(), axes.size(), getReduceOp(),
CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES, 0.0));
cnnlDataTypeConvert(op->getDType()), CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0));
// get workspace
size_t workspaceSize = 0;

View File

@ -14,8 +14,8 @@ class CopyBang : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT8,
dim.size() * op->getDType().getSize(), dim.data()));
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlStatus_t stat =
cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
if (stat != CNNL_STATUS_SUCCESS)
@ -28,5 +28,7 @@ class CopyBang : public BangKernelWithoutConfig {
REGISTER_KERNEL(Device::BANG, OpType::Reshape, CopyBang, "Reshape_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Flatten, CopyBang, "Flatten_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Identity, CopyBang, "Identity_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Squeeze, CopyBang, "Squeeze_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Unsqueeze, CopyBang, "Unsqueeze_BANG");
} // namespace infini
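
The old CopyBang descriptor was wrong in an easy-to-miss way: it passed dim.size() * op->getDType().getSize(), the rank scaled by the element size, as the descriptor's dimension count, while still supplying only dim.size() extents. The fix describes the buffer with its real dtype and true rank. An equivalent byte-view formulation, shown here only as a sketch of the alternative:

    // Sketch: describe the tensor as one flat INT8 extent of totalBytes,
    // another correct way to drive cnnlCopy for any element type.
    int64_t numElems = 1;
    for (int d : dim) // `dim` is the tensor shape used above
        numElems *= d;
    int flatShape[1] = {
        static_cast<int>(numElems * op->getDType().getSize())};
    checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
                                           CNNL_DTYPE_INT8, 1, flatShape));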

View File

@ -7,7 +7,6 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -42,11 +42,13 @@ class SliceCnnl : public BangKernelWithoutConfig {
// input
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, aDim_size, aDim_array));
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim_size, aDim_array));
// output
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, cDim_size, cDim_array));
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
cDim_size, cDim_array));
cnnlStatus_t stat =
cnnlStridedSlice(context->cnnlHandle(), aDesc, aData, starts_array,
@ -59,6 +61,6 @@ class SliceCnnl : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Slice, DataType::Float32, SliceCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Slice, SliceCnnl,
"Slice_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -7,7 +7,6 @@ class SplitCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SplitObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
int num = op->numOutputs();
int axis = op->getDim();
@ -16,15 +15,17 @@ class SplitCnnl : public BangKernelWithoutConfig {
cnnlTensorDescriptor_t desc;
checkCnnlError(cnnlCreateTensorDescriptor(&desc));
checkCnnlError(cnnlSetTensorDescriptor(
desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
desc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
dim.size(), dim.data()));
cnnlTensorDescriptor_t descArray[num];
for (int i = 0; i < num; ++i) {
checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
checkCnnlError(cnnlSetTensorDescriptor(
descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
op->getOutput(i)->getDims().size(),
op->getOutput(i)->getDims().data()));
checkCnnlError(
cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
cnnlDataTypeConvert(op->getDType()),
op->getOutput(i)->getDims().size(),
op->getOutput(i)->getDims().data()));
}
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());

View File

@ -7,7 +7,6 @@ class SqrtCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class SqrtCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlStatus_t stat =
cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,

View File

@ -7,7 +7,6 @@ class TransposeCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -18,13 +17,13 @@ class TransposeCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimin.size(),
dimin.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimin.size(), dimin.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dimout.size(),
dimout.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dimout.size(), dimout.data()));
auto permute = op->getPermute();
cnnlTransposeDescriptor_t opDesc;
@ -53,7 +52,6 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DepthToSpaceObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -73,13 +71,13 @@ class DepthToSpaceCnnl : public BangKernelWithoutConfig {
auto dimout = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, reshape.size(),
reshape.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
reshape.size(), reshape.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
transpose.size(), transpose.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
transpose.size(), transpose.data()));
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));

View File

@ -9,7 +9,6 @@ class TrigonCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -20,17 +19,18 @@ class TrigonCnnl : public BangKernelWithoutConfig {
auto cDim = op->getOutput()->getDims();
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, cDim.size(),
cDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(op->getDType()),
cDim.size(), cDim.data()));
cnnlTrigonDescriptor_t opDesc;
checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
checkCnnlError(
cnnlSetTrigonDescriptor_v2(opDesc, getOpType(), getPrefer()));
cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
aDesc, aData, cDesc, cData);

View File

@ -7,7 +7,6 @@ class WhereCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<WhereObj>(_op);
IT_ASSERT(op->getDType() == DataType::Float32);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
@ -35,21 +34,21 @@ class WhereCnnl : public BangKernelWithoutConfig {
}
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, aDim.size(),
aDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
aDim.size(), aDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, bDim.size(),
bDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
bDim.size(), bDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_BOOL, cDim.size(),
cDim.data()));
checkCnnlError(cnnlCreateTensorDescriptor(&dDesc));
checkCnnlError(cnnlSetTensorDescriptor(dDesc, CNNL_LAYOUT_ARRAY,
CNNL_DTYPE_FLOAT, dDim.size(),
dDim.data()));
checkCnnlError(cnnlSetTensorDescriptor(
dDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
dDim.size(), dDim.data()));
size_t wsSize;
cnnlGetSelectV2WorkspaceSize(context->cnnlHandle(), cDesc, aDesc, bDesc,
&wsSize);
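
Note the asymmetry in the Where descriptors: the two value tensors and the output now follow the operator dtype, while the condition descriptor stays CNNL_DTYPE_BOOL. Element-wise (broadcasting aside), SelectV2 computes the usual select; a scalar reference:

    #include <cstddef>

    // Reference semantics of Where/SelectV2, broadcasting omitted:
    // out[i] = cond[i] ? x[i] : y[i].
    template <typename T>
    void whereRef(const bool *cond, const T *x, const T *y, T *out,
                  size_t n) {
        for (size_t i = 0; i < n; ++i)
            out[i] = cond[i] ? x[i] : y[i];
    }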

View File

@ -0,0 +1,29 @@
#include "operators/reshape.h"
#include "core/kernel.h"
#include "operators/squeeze.h"
#include "operators/unsqueeze.h"
namespace infini {
class NaiveIdentity : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto size = _op->getInputs()[0]->getBytes();
void *inptr = _op->getInputs(0)->getRawDataPtr<void *>();
void *outptr = _op->getOutput()->getRawDataPtr<void *>();
std::memcpy(outptr, inptr, size);
}
};
REGISTER_KERNEL(Device::CPU, OpType::Reshape, NaiveIdentity,
"ReshapeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Identity, NaiveIdentity,
"IdentityNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Unsqueeze, NaiveIdentity,
"UnsqueezeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Squeeze, NaiveIdentity,
"SqueezeNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Flatten, NaiveIdentity,
"FlattenNaive_CPU");
} // namespace infini
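
Reshape, Flatten, Identity, Squeeze, and Unsqueeze all preserve the row-major element order, which is why a single memcpy kernel can back all five registrations. A small illustration of why a reshape moves no data:

    #include <cstring>
    #include <vector>

    int main() {
        // A 2x3 tensor and its 3x2 reshape share the same six-element
        // row-major buffer; only the shape metadata differs.
        std::vector<float> src = {1, 2, 3, 4, 5, 6}; // viewed as 2x3
        std::vector<float> dst(src.size());          // viewed as 3x2
        std::memcpy(dst.data(), src.data(), src.size() * sizeof(float));
        return 0;
    }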

View File

@ -47,6 +47,10 @@ class NativeUnary : public CpuKernelWithoutConfig {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
template <typename T> static T siluCompute(T val) {
return val / (1 + pow(E_CONSTANT, -val));
}
template <typename T> static T erfCompute(T val) { return std::erf(val); }
template <typename T> static T aCosCompute(T val) { return std::acos(val); }
@ -84,6 +88,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
case OpType::Gelu:
_doCompute = geluCompute<T>;
break;
case OpType::Silu:
_doCompute = siluCompute<T>;
break;
case OpType::Sigmoid:
_doCompute = sigmoidCompute<T>;
break;
@ -138,6 +145,9 @@ class NativeUnary : public CpuKernelWithoutConfig {
case OpType::Atanh:
_doCompute = aTanhCompute<T>;
break;
case OpType::Acosh:
_doCompute = aCoshCompute<T>;
break;
default:
IT_TODO_HALT();
}
@ -289,6 +299,7 @@ class Log : public CpuKernelWithoutConfig {
REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, NativeUnary, "geluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Silu, NativeUnary, "siluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, NativeUnary, "sigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, NativeUnary,
"hardSigmoidNaive_CPU");

View File

@ -28,9 +28,8 @@ class AllReduceNCCL : public CudaKernelWithoutConfig {
ncclComm_t comm =
dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
.getNcclComm();
// TODO: Using default stream 0 for now.
checkNcclError(
ncclAllReduce(input, output, count, ncclType, getRedOp(), comm, 0));
checkNcclError(ncclAllReduce(input, output, count, ncclType, getRedOp(),
comm, CUDAStream::getCurrentStream()));
}
virtual ncclRedOp_t getRedOp() const = 0;
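
Launching ncclAllReduce on stream 0 serialized the collective against the legacy default stream; passing CUDAStream::getCurrentStream() keeps it ordered with the kernels the runtime actually enqueues. CUDAStream is the project's stream holder; its real shape may differ, but the idea is a process-wide current stream, roughly:

    #include <cuda_runtime.h>

    // Sketch of a current-stream holder like the CUDAStream used above;
    // an assumption about its interface, not the project's definition.
    class CUDAStream {
      public:
        static void create() { cudaStreamCreate(&stream_); }
        static cudaStream_t getCurrentStream() { return stream_; }

      private:
        // nullptr falls back to the default stream until create() runs.
        static inline cudaStream_t stream_ = nullptr;
    };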

View File

@ -21,7 +21,7 @@ class AttentionKVCacheCompute {
public:
void do_compute(Tensor input_k_cache, Tensor input_v_cache, Tensor input_q,
Tensor input_k, Tensor input_v, Tensor position_id,
Tensor output_matmul) const {
Tensor output_matmul, CudaPtr p_workspace) const {
AttentionKVCacheMetadata metadata;
initAttentionKVCacheMetadata(metadata, input_v_cache);
@ -32,7 +32,8 @@ class AttentionKVCacheCompute {
input_v->getRawDataPtr<float *>(),
position_id->getRawDataPtr<int *>(),
output_matmul->getRawDataPtr<float *>(),
metadata);
metadata, (float *)p_workspace,
(float *)(p_workspace + (1ll << 30)));
}
};
@ -41,10 +42,14 @@ class AttentionKVCacheCuda : private AttentionKVCacheCompute,
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(_op->getDType() == DataType::Float32);
size_t workspaceSize = 2ll << 30;
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
CudaPtr idxWsData = context->getWorkspace(workspaceSize);
do_compute(_op->getInputs()[0], _op->getInputs()[1],
_op->getInputs()[2], _op->getInputs()[3],
_op->getInputs()[4], _op->getInputs()[5],
_op->getOutputs()[0]);
_op->getOutputs()[0], idxWsData);
}
};
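
The rewritten attention kernel needs scratch space for per-chunk partial results, so compute() now takes a single 2 GiB workspace (2ll << 30 bytes) and do_compute splits it by pointer arithmetic: the first half holds the partial outputs, and the half starting (1ll << 30) bytes in holds the partial softmax sums (assuming CudaPtr is byte-addressed). The same split, spelled out:

    #include <cstddef>

    // Sketch: partition one workspace allocation into the two float
    // regions used by the two-pass attention kernel below.
    struct AttnWorkspace {
        float *partialOut; // per-chunk normalized outputs
        float *partialSum; // per-chunk softmax denominators
    };

    AttnWorkspace splitWorkspace(void *base) {
        const size_t half = 1ull << 30; // 1 GiB each, 2 GiB total
        auto *bytes = static_cast<char *>(base);
        return {reinterpret_cast<float *>(bytes),
                reinterpret_cast<float *>(bytes + half)};
    }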

View File

@ -2,127 +2,170 @@
#include "cuda/cuda_attention_kvcache.h"
#define WARP_SIZE 32
#define BLOCKSIZE WARP_SIZE
#define SEQ_UNIT 64
#define SEQ_UNIT 16
__global__ void _attention_kvcache_kernel(float* input_k_cache,
// ASSUME SEQ_LEN OF Q IS 1
__global__ void _attention_kvcache_kernel_128_1(float* input_k_cache,
float* input_v_cache,
float* input_q,
float* input_k,
float* input_v,
int* position_id,
float* output_matmul,
AttentionKVCacheMetadata compMeta) {
AttentionKVCacheMetadata compMeta,
float* output_O_temp,
float* output_sum_temp) {
int seq_length = position_id[0] + 1;
int stride = (seq_length + SEQ_UNIT - 1) / SEQ_UNIT;
if(blockIdx.y >= stride)
return;
int lane_id = threadIdx.x % WARP_SIZE;
int group_id = threadIdx.x / WARP_SIZE;
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
int idx_seq = blockIdx.y * SEQ_UNIT;
if(parallel_idx >= compMeta.dimSize[0] * compMeta.dimSize[1])
return;
float ptr_V[SEQ_UNIT*2];
float ptr_K[SEQ_UNIT*2];
float ptr_Q[2];
float ptr_P[SEQ_UNIT];
float ptr_V[SEQ_UNIT*4]; // V
float ptr_K[SEQ_UNIT*4]; // K
float ptr_Q[4]; // Q
float ptr_P[SEQ_UNIT] = {0};
float ptr_O[2];
float ptr_max[1];
float ptr_sum[1];
float ptr_O[4] = {0};
float ptr_sum[1] = {0};
float ptr_max_last[1];
float ptr_sum_last[1];
float ptr_O_last[2];
// readin Q
(float4 &)ptr_Q[0] = (float4 &)input_q[(lane_id * 4) + (parallel_idx * 128)];
int common_idx = (lane_id * 4) + (parallel_idx * compMeta.stride[1]);
(float2 &)ptr_Q[0] = (float2 &)input_q[(lane_id * 2) + (parallel_idx * 64)];
int SEQ_LENGTH = position_id[0] + 1;
int common_idx = (lane_id * 2) + (parallel_idx * compMeta.stride[1]);
for (int idx_seq = 0; idx_seq < SEQ_LENGTH; idx_seq += SEQ_UNIT){
ptr_max_last[0] = ptr_max[0];
ptr_sum_last[0] = ptr_sum[0];
(float2 &)ptr_O_last[0] = (float2 &)ptr_O[0];
// Q*K
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
= (float4 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float4 &)ptr_K[idx_SEQ_UNIT * 4]
= (float4 &) input_k[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
(float4 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float4 &)ptr_K[idx_SEQ_UNIT * 4];
}
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
= (float2 &) input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float2 &)ptr_K[idx_SEQ_UNIT * 2]
= (float2 &) input_k[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
(float2 &)input_k_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float2 &)ptr_K[idx_SEQ_UNIT * 2];
}
ptr_K[idx_SEQ_UNIT * 2] = ptr_Q[0] * ptr_K[idx_SEQ_UNIT * 2];
ptr_K[idx_SEQ_UNIT * 2 + 1] = ptr_Q[1] * ptr_K[idx_SEQ_UNIT * 2 + 1];
for (int i = 0; i < 4; i ++){
ptr_K[idx_SEQ_UNIT * 4 + i] = ptr_Q[i] * ptr_K[idx_SEQ_UNIT * 4 + i];
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2) {
ptr_K[idx_SEQ_UNIT * 2] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 2], offset);
ptr_K[idx_SEQ_UNIT * 4 + i] += __shfl_down_sync(0xffffffff, ptr_K[idx_SEQ_UNIT * 4 + i], offset);
}
ptr_P[idx_SEQ_UNIT] = ptr_K[idx_SEQ_UNIT * 2];
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2){
ptr_K[((idx_SEQ_UNIT * 2) + 1)] += __shfl_down_sync(0xffffffff, ptr_K[((idx_SEQ_UNIT * 2) + 1)], offset);
}
ptr_P[idx_SEQ_UNIT] += ptr_K[((idx_SEQ_UNIT * 2) + 1)];
ptr_P[idx_SEQ_UNIT] += ptr_K[idx_SEQ_UNIT * 4 + i];
}
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
ptr_P[idx_SEQ_UNIT] /= 8;
ptr_max[0] = (idx_SEQ_UNIT == 0) ? ptr_P[0] : max(ptr_max[0], ptr_P[idx_SEQ_UNIT]);
}
ptr_max[0] = (idx_seq == 0) ? ptr_max[0] : max(ptr_max[0], ptr_max_last[0]);
ptr_sum[0] = 0;
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT] - ptr_max[0]);
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
}
ptr_sum[0] = (idx_seq == 0) ? ptr_sum[0] : expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] + ptr_sum[0];
ptr_O[0] = 0;
ptr_O[1] = 0;
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < SEQ_LENGTH; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < SEQ_LENGTH - 1){
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
= (float2 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float2 &)ptr_V[idx_SEQ_UNIT * 2]
= (float2 &) input_v[((lane_id * 2) + parallel_idx * compMeta.stride[2])];
(float2 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])] =
(float2 &)ptr_V[idx_SEQ_UNIT * 2];
}
ptr_P[idx_SEQ_UNIT] /= ptr_sum[0];
ptr_O[0] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2)], ptr_O[0]);
ptr_O[1] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 2) + 1], ptr_O[1]);
}
ptr_O[0] = (idx_seq == 0) ? ptr_O[0] : ptr_O[0] + ptr_O_last[0] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
ptr_O[1] = (idx_seq == 0) ? ptr_O[1] : ptr_O[1] + ptr_O_last[1] * expf(ptr_max_last[0] - ptr_max[0]) * ptr_sum_last[0] / ptr_sum[0];
}
(float2 &)output_matmul[(lane_id * 2) + (parallel_idx * compMeta.dimSize[3])] = (float2 &)ptr_O[0];
// div sqrt(d)
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = __shfl_sync(0xffffffff, ptr_P[idx_SEQ_UNIT], 0);
ptr_P[idx_SEQ_UNIT] /= sqrt(128.0);
}
// softmax
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
ptr_P[idx_SEQ_UNIT] = expf(ptr_P[idx_SEQ_UNIT]);
ptr_sum[0] += ptr_P[idx_SEQ_UNIT];
}
// * V
#pragma unroll
for (int idx_SEQ_UNIT = 0; idx_SEQ_UNIT < SEQ_UNIT && idx_SEQ_UNIT + idx_seq < seq_length; idx_SEQ_UNIT ++) {
if(idx_SEQ_UNIT + idx_seq < seq_length - 1){
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
= (float4 &) input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])];
}
else{
(float4 &)ptr_V[idx_SEQ_UNIT * 4]
= (float4 &) input_v[((lane_id * 4) + parallel_idx * compMeta.stride[2])];
(float4 &)input_v_cache[common_idx + ((idx_SEQ_UNIT + idx_seq) * compMeta.stride[2])]
= (float4 &)ptr_V[idx_SEQ_UNIT * 4];
}
#pragma unroll
for (int i = 0; i < 4; i ++)
ptr_O[i] = fmaf(ptr_P[idx_SEQ_UNIT], ptr_V[(idx_SEQ_UNIT * 4 + i)], ptr_O[i]);
}
#pragma unroll
for (int i = 0; i < 4; i ++)
ptr_O[i] /= ptr_sum[0];
(float4 &)output_O_temp[(lane_id * 4) + (blockIdx.y * compMeta.dimSize[3]) + (parallel_idx * compMeta.dimSize[3] * stride)] = (float4 &)ptr_O[0];
if(lane_id == 0){
output_sum_temp[blockIdx.y + parallel_idx * stride] = ptr_sum[0];
}
}
__global__ void _attention_kvcache_kernel_128_2(int* position_id,
float* output_matmul,
AttentionKVCacheMetadata compMeta,
float* output_O_temp,
float* output_sum_temp) {
int lane_id = threadIdx.x % WARP_SIZE;
int group_id = threadIdx.x / WARP_SIZE;
int parallel_idx = blockIdx.x * (blockDim.x / WARP_SIZE) + group_id;
float ptr_O[4] = {0};
float ptr_O_sum[4] = {0};
float ptr_sum = 0;
float ptr_sum_temp;
int size = (position_id[0] + SEQ_UNIT) / SEQ_UNIT;
#pragma unroll
for(int i = 0; i < size; i ++){
(float4 &)ptr_O[0]
= (float4 &)output_O_temp[(lane_id * 4) + (i * compMeta.dimSize[3]) + parallel_idx * compMeta.dimSize[3] * size];
ptr_sum_temp = output_sum_temp[i + parallel_idx * size];
#pragma unroll
for(int k = 0; k < 4; k ++)
ptr_O_sum[k] += ptr_O[k] * ptr_sum_temp;
ptr_sum += ptr_sum_temp;
}
#pragma unroll
for(int k = 0; k < 4; k ++)
ptr_O_sum[k] = ptr_O_sum[k] / ptr_sum;
(float4 &)output_matmul[(lane_id * 4) + (parallel_idx * compMeta.dimSize[3])] = (float4 &)ptr_O_sum[0];
}
namespace infini {
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache, float *input_q, float *input_k,
float *input_v, int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta) {
IT_ASSERT(compMeta.dimSize[3] == 64);
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), 1);
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
float *input_q, float *input_k,
float *input_v, int *position_id, float *output_matmul,
const AttentionKVCacheMetadata &compMeta,
float *output_O_temp, float *output_sum_temp) {
IT_ASSERT(compMeta.dimSize[3] == 128);
int gridsize_y = (compMeta.dimSize[2] - 1 + SEQ_UNIT) / SEQ_UNIT;
dim3 gridDim(compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), gridsize_y);
dim3 blockDim(BLOCKSIZE, 1);
_attention_kvcache_kernel<<<gridDim, blockDim>>>(
input_k_cache, input_v_cache, input_q, input_k, input_v, position_id, output_matmul, compMeta);
_attention_kvcache_kernel_128_1
<<<gridDim, blockDim, 0, CUDAStream::getCurrentStream()>>>
(input_k_cache, input_v_cache, input_q, input_k, input_v, position_id,
compMeta, output_O_temp, output_sum_temp);
_attention_kvcache_kernel_128_2
<<<compMeta.dimSize[0]*compMeta.dimSize[1]/(BLOCKSIZE/WARP_SIZE), WARP_SIZE,
0, CUDAStream::getCurrentStream()>>>
(position_id, output_matmul, compMeta, output_O_temp, output_sum_temp);
}
} // namespace infini
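
The split into _128_1 and _128_2 is a two-pass scheme: pass one walks the sequence in SEQ_UNIT-sized chunks, storing each chunk's softmax-normalized partial output O_i and its denominator s_i; pass two recombines them as O = (sum of O_i * s_i) / (sum of s_i), which equals the softmax-weighted value sum over the whole sequence. (Unlike the old single-pass kernel, the exponentials are no longer shifted by a running max.) A host-side check of the recombination identity:

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Verify that chunked softmax recombination matches the monolithic
    // result; scores/values stand in for one Q.K row and the V column.
    int main() {
        std::vector<double> scores = {0.3, -1.2, 2.0, 0.7, -0.5, 1.1};
        std::vector<double> values = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
        const size_t chunk = 2; // SEQ_UNIT stand-in

        double num = 0, den = 0;
        for (size_t c = 0; c < scores.size(); c += chunk) {
            double s_i = 0, o_i = 0;
            for (size_t j = c; j < c + chunk; ++j) {
                double e = std::exp(scores[j]);
                s_i += e;
                o_i += e * values[j];
            }
            o_i /= s_i;       // pass 1 stores normalized O_i and s_i
            num += o_i * s_i; // pass 2 re-weights each chunk by s_i
            den += s_i;
        }

        double refNum = 0, refDen = 0;
        for (size_t j = 0; j < scores.size(); ++j) {
            double e = std::exp(scores[j]);
            refNum += e * values[j];
            refDen += e;
        }
        assert(std::abs(num / den - refNum / refDen) < 1e-12);
        return 0;
    }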

View File

@ -25,8 +25,9 @@ void clip_kernel(float *input, float *output, int num, float minValue,
float maxValue) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
maxValue);
_clip_kernel
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
input, output, num, minValue, maxValue);
}
}; // namespace infini
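
The launch configuration used throughout these files is a ceiling division: gridsize = (num + blocksize - 1) / blocksize guarantees gridsize * blocksize >= num, and the tid < num guard inside each kernel masks off the tail. For example, num = 1000 with blocksize = 512 launches 2 blocks (1024 threads, 24 idle):

    #include <cassert>

    constexpr int ceilDiv(int num, int blocksize) {
        return (num + blocksize - 1) / blocksize;
    }

    int main() {
        static_assert(ceilDiv(1000, 512) == 2, "tail gets its own block");
        static_assert(ceilDiv(1024, 512) == 2, "exact fit adds no block");
        assert(ceilDiv(1, 512) == 1);
        return 0;
    }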

View File

@ -115,6 +115,20 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
auto a_dim = op->getInputs(0)->getDims();
auto b_dim = op->getInputs(1)->getDims();
auto c_dim = op->getOutput()->getDims();
const int dType = _op->getDType().getIndex();
// Fast path: when b is a 0-dim (scalar) tensor, skip broadcast indexing
if (b_dim.size() == 0) {
if (op->getOpType() == OpType::Div) {
div_const_kernel(dType, aData, bData, cData,
op->getOutput()->size());
return;
} else if (op->getOpType() == OpType::Pow) {
pow_const_kernel(dType, aData, bData, cData,
op->getOutput()->size());
return;
}
}
if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
IT_TODO_HALT();
@ -127,7 +141,6 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
const int dType = _op->getDType().getIndex();
if (op->getOpType() == OpType::Div) {
div_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
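
The new fast path fires when b has rank 0: a scalar divisor or exponent needs none of the 4-D broadcast index arithmetic, so div_const_kernel and pow_const_kernel (defined in the element-wise CUDA diff that follows) read *y once per thread. The reference semantics, assuming b points at a single element:

    #include <cstddef>

    // Reference for the scalar fast path: z[i] = x[i] / *y. The general
    // kernel instead decodes a 4-D broadcast index for every element.
    template <typename T>
    void divConstRef(const T *x, const T *y, T *z, size_t n) {
        const T scalar = *y; // one load instead of per-element indexing
        for (size_t i = 0; i < n; ++i)
            z[i] = x[i] / scalar;
    }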

View File

@ -131,8 +131,9 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
}
#define CASE(OP, T) \
_##OP##_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_##OP##_kernel<DT_CUDA<T>::t> \
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
#define SWITCH_DTYPE(OP, DTYPE) \
switch (DTYPE) { \
@ -176,7 +177,92 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
IT_TODO_HALT(); \
}
template <class T>
__global__ void _div_const_kernel(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((T *)z)[tid] = ((T *)x)[tid] / *((T *)y);
}
}
template <class T>
__global__ void _pow_const_kernel(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((T *)z)[tid] = pow(((T *)x)[tid], *((T *)y));
}
}
template <>
__global__ void _pow_const_kernel<half>(void const *__restrict__ x,
void const *__restrict__ y,
void *__restrict__ z, const size_t n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
((half *)z)[tid] = pow(((float)((half *)x)[tid]), *((half *)y));
}
}
#define CASE_CONST(OP, T) \
_##OP##_const_kernel<DT_CUDA<T>::t> \
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(a, b, c, \
n);
#define SWITCH_DTYPE_CONST(OP, DTYPE) \
switch (DTYPE) { \
case 1: \
CASE_CONST(OP, 1) \
break; \
case 2: \
CASE_CONST(OP, 2) \
break; \
case 3: \
CASE_CONST(OP, 3) \
break; \
case 4: \
CASE_CONST(OP, 4) \
break; \
case 5: \
CASE_CONST(OP, 5) \
break; \
case 6: \
CASE_CONST(OP, 6) \
break; \
case 7: \
CASE_CONST(OP, 7) \
break; \
case 10: \
CASE_CONST(OP, 10) \
break; \
case 11: \
CASE_CONST(OP, 11) \
break; \
case 12: \
CASE_CONST(OP, 12) \
break; \
case 13: \
CASE_CONST(OP, 13) \
break; \
default: \
IT_TODO_HALT(); \
}
namespace infini {
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
size_t blocksize = block_work_size();
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
SWITCH_DTYPE_CONST(div, dType);
}
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n) {
size_t blocksize = block_work_size();
size_t gridsize = (n + block_work_size() - 1) / block_work_size();
SWITCH_DTYPE_CONST(pow, dType);
}
void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
@ -202,11 +288,13 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
if (dType == 1) {
_pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<float>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
} else if (dType == 3) {
_pow_kernel<int8_t><<<gridsize, blocksize>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<int8_t>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
} else if (dType == 10) {
int a_size = a0 * a1 * a2 * a3;
int b_size = b0 * b1 * b2 * b3;
@ -220,9 +308,10 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
for (int i = 0; i < b_size; ++i) {
b_float[i] = __half2float(((half *)b)[i]);
}
_pow_kernel<float><<<gridsize, blocksize>>>(
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3, b0,
b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<float>
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3,
b0, b1, b2, b3, c0, c1, c2, c3);
for (int i = 0; i < c_size; ++i) {
((half *)c)[i] = __float2half(c_float[i]);
}

View File

@ -39,10 +39,19 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
}
}
template <class T>
static __global__ void _expandRowKernel(void *__restrict__ dst,
void const *__restrict__ src) {
auto da = gridDim.x, db = blockDim.y, dx = blockDim.x, n = blockIdx.y,
a = blockIdx.x, b = threadIdx.y, x = threadIdx.x;
auto i = ((n * da + a) * db + b) * dx + x, j = (a * db + b) * dx + x;
reinterpret_cast<T *>(dst)[i] = reinterpret_cast<T const *>(src)[j];
}
namespace infini {
#define CASE(T) \
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
0, CUDAStream::getCurrentStream()>>>( \
input, output, nDims, outputsize, inputShape, outputShape);
#define SWITCH_DTYPE(DTYPE) \
@ -95,4 +104,67 @@ void expandKernel(int dType, void *input, void *output, int nDims,
SWITCH_DTYPE(dType)
}
#define CASE_ROW(T) \
    _expandRowKernel<DT_CUDA<T>::t>                                       \
<<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
#define SWITCH_DTYPE_ROW(DTYPE) \
switch (DTYPE) { \
case 1: \
CASE_ROW(1) \
break; \
case 2: \
CASE_ROW(2) \
break; \
case 3: \
CASE_ROW(3) \
break; \
case 4: \
CASE_ROW(4) \
break; \
case 5: \
CASE_ROW(5) \
break; \
case 6: \
CASE_ROW(6) \
break; \
case 7: \
CASE_ROW(7) \
break; \
case 10: \
CASE_ROW(10) \
break; \
case 11: \
CASE_ROW(11) \
break; \
case 12: \
CASE_ROW(12) \
break; \
case 13: \
CASE_ROW(13) \
break; \
case 16: \
CASE_ROW(16) \
break; \
default: \
IT_TODO_HALT(); \
}
// Optimization for expanding a row vector. The row length must be a multiple of 32
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len) {
// Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
// input: 1 x (a x b x 32 x sizeT)
// output: n_rows x (a x b x 32 x sizeT)
// grid: n_rows x a
// block: b x 32
auto c = row_len / 32, b = c;
if (b > 32) {
for (b = 32; c % b != 0; --b);
}
auto a = c / b;
dim3 grid(a, n_rows), block(32, b);
SWITCH_DTYPE_ROW(dType)
}
} // namespace infini
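
expandRowKernel maps the row broadcast 1 x row_len -> n_rows x row_len onto a grid of n_rows x a blocks of b x 32 threads, so the launcher must factor row_len = a * b * 32 with b <= 32: it picks the largest b <= 32 dividing c = row_len / 32, falling back to b = 1 when c is prime. The same factorization, host-side:

    #include <cassert>
    #include <utility>

    // Factor row_len = a * b * 32 with b <= 32, mirroring the launcher
    // above. row_len must be a multiple of the warp size (32).
    std::pair<int, int> factorRow(int row_len) {
        assert(row_len % 32 == 0);
        int c = row_len / 32, b = c;
        if (b > 32)
            for (b = 32; c % b != 0; --b) // largest divisor of c, <= 32
                ;
        return {c / b, b}; // {a, b}
    }

    int main() {
        auto [a, b] = factorRow(4096); // c = 128 -> a = 4, b = 32
        assert(a == 4 && b == 32);
        auto [a2, b2] = factorRow(32 * 37); // prime c = 37 -> b = 1
        assert(a2 == 37 && b2 == 1);
        return 0;
    }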

View File

@ -19,7 +19,8 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
int oSize) {
int blocksize = 32 * 16;
int gridsize = (oSize + blocksize - 1) / blocksize;
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
oSize);
_extend_kernel
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
in, out, blockSize, blockSizeOuter, oSize);
}
} // namespace infini

View File

@ -45,9 +45,12 @@ void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num) {
int gridSize = (num + blockSize - 1) / blockSize;
if (metaData.indexType == DataType::Int64) {
_gather_kernel<T, int64_t>
<<<gridSize, blockSize>>>(in, out, metaData, num);
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
(in, out, metaData, num);
} else {
_gather_kernel<T, int><<<gridSize, blockSize>>>(in, out, metaData, num);
_gather_kernel<T, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>
(in, out, metaData, num);
}
}
template void gather_kernel<float>(float *in, float *out,

View File

@ -40,22 +40,26 @@ void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
int gridSize = (num + blockSize - 1) / blockSize;
if (metaData.dataType == DataType::Float32 &&
metaData.indexType == DataType::Int64) {
_gather_elements_kernel<float, int64_t><<<gridSize, blockSize>>>(
_gather_elements_kernel<float, int64_t>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
metaData, num);
} else if (metaData.dataType == DataType::Int32 &&
metaData.indexType == DataType::Int64) {
_gather_elements_kernel<int, int64_t><<<gridSize, blockSize>>>(
_gather_elements_kernel<int, int64_t>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
num);
} else if (metaData.dataType == DataType::Float32 &&
metaData.indexType == DataType::Int32) {
_gather_elements_kernel<float, int><<<gridSize, blockSize>>>(
_gather_elements_kernel<float, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<float *>(in), reinterpret_cast<float *>(out),
metaData, num);
} else if (metaData.dataType == DataType::Int32 &&
metaData.indexType == DataType::Int32) {
_gather_elements_kernel<int, int><<<gridSize, blockSize>>>(
_gather_elements_kernel<int, int>
<<<gridSize, blockSize, 0, CUDAStream::getCurrentStream()>>>(
reinterpret_cast<int *>(in), reinterpret_cast<int *>(out), metaData,
num);
} else {

View File

@ -344,8 +344,8 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
int BLOCK_DIM = 1024;
blockLaynormKernel<float, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize);
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -353,9 +353,10 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
warpLaynormKernel<float, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -363,8 +364,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
@ -373,8 +375,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else {
int BLOCK_DIM_x = 4;
@ -383,8 +386,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<float, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
}
}
@ -396,8 +400,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize);
blockLaynormKernel<float, 1024>
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -405,8 +410,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -414,8 +420,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
@ -423,8 +430,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
@ -432,8 +440,9 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<float, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
}
}
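The dispatch strategy is the same in every overload: rows longer than 1024 elements get one 1024-thread block per row (`blockLaynormKernel`), while shorter rows fall through warp-level variants whose x-dimension shrinks (32, 16, 8, 4) as the y-dimension grows (32, 64, 128, 256), so each 1024-thread block covers several rows at once. A hedged sketch of that threshold ladder — the actual `num_block_x` computation sits outside the visible hunks, so the ceiling division below is an assumption:

    // Assumption: num_block rows are packed BLOCK_DIM_y at a time per block.
    static inline int gridBlocksFor(int num_block, int block_dim_y) {
        return (num_block + block_dim_y - 1) / block_dim_y; // ceil(num_block / y)
    }
    // dimsize > 1024 -> blockLaynormKernel<T, 1024>, one block per row
    // dimsize > 31   -> warpLaynormKernel<T, 32, 32>
    // dimsize > 15   -> warpLaynormKernel<T, 16, 64>
    // dimsize > 7    -> warpLaynormKernel<T, 8, 128>
    // else           -> warpLaynormKernel<T, 4, 256>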
//-----------------
@ -445,8 +454,8 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize);
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -454,8 +463,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
@ -464,8 +474,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
@ -474,8 +485,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else {
int BLOCK_DIM_x = 4;
@ -484,8 +496,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
warpLaynormKernel<half, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
}
}
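The half overloads are structurally identical to the float ones; only the template type argument changes, so the same stream-aware rewrite applies verbatim. A sketch of how one templated wrapper could cover both element types — `demoNorm` and `launchDemoNorm` are stand-ins, not this file's API:

    // Sketch: a templated wrapper over an illustrative kernel, showing how
    // the float and half paths share one launch shape.
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    template <typename T>
    __global__ void demoNorm(const T *in, T *out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = in[i]; // placeholder body; the real math lives in warpLaynormKernel
    }

    template <typename T>
    void launchDemoNorm(const T *in, T *out, int n, cudaStream_t stream) {
        int threads = 256;
        int blocks = (n + threads - 1) / threads;
        demoNorm<T><<<blocks, threads, 0, stream>>>(in, out, n);
    }

    // Explicit instantiations matching the two element types in this file.
    template void launchDemoNorm<float>(const float *, float *, int, cudaStream_t);
    template void launchDemoNorm<half>(const half *, half *, int, cudaStream_t);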
@ -497,8 +510,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize);
blockLaynormKernel<half, 1024>
<<<num_block, BLOCK_DIM, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
@ -506,8 +520,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 32, 32>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
@ -515,8 +530,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 16, 64>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
@ -524,8 +540,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 8, 128>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
@ -533,8 +550,9 @@ void LaynormKernel(const half *input, const half *scale, const half eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
warpLaynormKernel<half, 4, 256>
<<<grid_dim, block_dim, 0, CUDAStream::getCurrentStream()>>>
(input, scale, dimsize, stride, output, eps, scaleSize, num_block);
}
}
} // namespace infini
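For callers, the practical effect is that layer-norm launches now order correctly with other work queued on the same stream. A hypothetical usage sketch — the stream binding step is an InfiniTensor-side operation not shown in this diff:

    #include <cuda_runtime.h>

    int main() {
        cudaStream_t s;
        cudaStreamCreate(&s);
        // The launches above resolve their stream via
        // CUDAStream::getCurrentStream(); binding that current stream to `s`
        // (not shown here) would order the layer-norm launches after any
        // work already enqueued on `s`.
        cudaStreamSynchronize(s); // wait for everything enqueued on s
        cudaStreamDestroy(s);
        return 0;
    }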

Some files were not shown because too many files have changed in this diff