Compare commits

...

20 Commits

Author SHA1 Message Date
mazx 89398a1c57 update fused kernels. 2022-11-15 15:13:50 +08:00
mazx 6e3f0bbf9a update reduce kernel 2022-11-02 20:02:22 +08:00
mazx 2aadcb6e9c add: onnx ok. 2022-11-02 17:49:37 +08:00
mazx 3046dd5901 add: graph for bert. 2022-11-02 17:49:37 +08:00
mazx 5ed540be6e add: optimization pass for metaGraph. 2022-11-02 17:49:37 +08:00
mazx ec58c85505 add: add metaGaph for codegen. 2022-11-02 17:49:37 +08:00
mazx 18b79903ee add: codegen for all metaOps. 2022-11-02 17:49:37 +08:00
mazx 2c8bd3729b add: generate transpose, unary, and binary. 2022-11-02 17:49:37 +08:00
mazx 254d23b3c0 Add: transpose operator 2022-11-02 17:49:37 +08:00
mazx 05d39439db add: init power fusion. 2022-11-02 17:49:37 +08:00
mazx 48e986d377 add: fix bert to transpose. 2022-11-02 17:49:37 +08:00
mazx 09365c81f4 add: graph build for pf. 2022-11-02 17:49:37 +08:00
Pairshoe 94faefb0ef Add: pytest for import_onnx 2022-11-02 17:49:37 +08:00
Pairshoe 970c77d0f4 Update: Rename GraphFactory -> GraphBuilder && Remove unnecessary outputs 2022-11-02 17:49:37 +08:00
Pairshoe 7cf2d8f78f Add: python interfaced for importing onnx 2022-11-02 17:49:37 +08:00
Pairshoe ff90c4c7d5 Add: test for class GraphFactoryObj 2022-11-02 17:49:37 +08:00
Pairshoe 9e45e51279 Add: class GraphFactory and pybind11 interfaces 2022-11-02 17:49:37 +08:00
Liyan Zheng 1b1fc2585b Add: save optime result 2022-11-02 17:38:08 +08:00
Liyan Zheng eb993f7829 Add: evaluate onnx script 2022-11-02 16:51:33 +08:00
Liyan Zheng 63e5df4227 Add: fused conv 2022-11-02 16:39:12 +08:00
85 changed files with 6129 additions and 53 deletions

CMakeLists.txt
View File

@ -12,10 +12,11 @@ option(BUILD_TEST "Build tests" ON)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PFUSION "Build tests for PFUSION" ON BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
@ -74,7 +75,7 @@ if(BUILD_TEST)
endif()
# Source files
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/pfusion/*.cc src/operators/*.cc src/utils/*.cc)
if(USE_CUDA)
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -114,7 +115,7 @@ if(USE_CUDA)
${CMAKE_CXX_COMPILER}
CACHE STRING "Set cuda host compiler path")
# CMP0104 requires CUDA_ARCHITECTURES
set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70;80")
set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70")
enable_language(CUDA)
find_package(CUDAToolkit) # For nvrtc and cuda driver
target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
@ -196,4 +197,7 @@ if(BUILD_TEST)
if(BUILD_TEST_EINNET)
build_test(test/nnet/test_*.cc)
endif()
if(BUILD_TEST_PFUSION)
build_test(test/pfusion/test_*.cc)
endif()
endif()

89 eval_pfusion/cuda_utils.h Normal file
View File

@ -0,0 +1,89 @@
#pragma once
#include <iostream>
#include <cublas.h>
#include <curand.h>
#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaSafeCall() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
return;
}
inline void __cudaCheckError(const char *file, const int line) {
auto err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
#ifdef DEBUG
// This checking will affect performance.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() with sync failed. "
<< cudaGetErrorString(err) << std::endl;
exit(-1);
}
#endif
return;
}
inline const char *cublasGetErrorString(cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
const int line) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": cublasSafeCall() failed. " << cublasGetErrorString(err)
<< std::endl;
exit(-1);
}
}
inline void __curandSafeCall(curandStatus_t err, const char *file,
const int line) {
if (err != CURAND_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": curandSafeCall() failed. " << err << std::endl;
exit(-1);
}
}
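A minimal usage sketch for these helpers (the kernel, names, and sizes below are illustrative, not part of this commit): wrap runtime API calls in cudaSafeCall and follow each kernel launch with cudaCheckError.

#include "cuda_utils.h"
__global__ void scale(float *p, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        p[i] *= s;
}
int main() {
    const int n = 1 << 20;
    float *p;
    cudaSafeCall(cudaMalloc((void **)&p, n * sizeof(float)));
    scale<<<(n + 255) / 256, 256>>>(p, 2.0f, n);
    cudaCheckError(); // catches launch errors; also syncs when DEBUG is defined
    cudaSafeCall(cudaFree(p));
    return 0;
}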

View File

@ -0,0 +1,21 @@
import os
def eval(filename, kernel, shape):
with open("../eval_pfusion/eval_kernel.tmp", "r") as f:
code = f.read()
code = code.replace("%%invoke_func%%", kernel)
code = code.replace("%%shape%%", shape)
with open("../generated_code/tmp.cu", "w") as f:
f.write(code)
# os.system("make -j && ./test_bias")
os.system(
"nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
os.system("./tmp")
if __name__ == "__main__":
eval("bias_0.cu", "invoke_func_0", "{28 * 28, 24}")
eval("bias_1.cu", "invoke_func_1", "{28 * 28, 58}")
eval("bias_2.cu", "invoke_func_2", "{14 * 14, 116}")
eval("bias_3.cu", "invoke_func_3", "{7 * 7, 232}")

41 eval_pfusion/eval_kernel.tmp Normal file
View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void %%invoke_func%%(float *tensor_ptr_0, float *tensor_ptr_1,
float *tensor_ptr_2);
int main() {
std::vector<int> shape = %%shape%%;
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
// double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
// std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}
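Each harness launches the kernel cnt times untimed to warm up, then times cnt more launches between two CUDA events and divides by cnt for the per-launch average. The same pattern as a reusable helper (a sketch; the helper name is mine, not from this commit):

#include <cuda_runtime.h>
template <class F> float timePerLaunchMs(F launch, int cnt = 128) {
    for (int t = 0; t < cnt; t++)
        launch(); // warm-up launches, excluded from the measurement
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++)
        launch();
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(ed); // wait until the timed launches finish
    float ms = 0;
    cudaEventElapsedTime(&ms, st, ed);
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    return ms / cnt; // average milliseconds per launch
}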

View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4);
int main() {
std::vector<int> shape = {1, 64, 512, 512};
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_2(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_2(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_5(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4);
int main() {
std::vector<int> shape = {1, 1, 512, 512};
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_5(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_5(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

8 eval_pfusion/eval_sardrn.sh Executable file
View File

@ -0,0 +1,8 @@
# rm ./eval_sar_drn_0 ./eval_sar_drn_1 ./eval_sar_drn_2
make -j && ./test_sar_drn
nvcc ../eval_pfusion/eval_sar_drn_0.cu ../generated_code/sar_drn_0.cu -I ../eval_pfusion -o eval_sar_drn_0
nvcc ../eval_pfusion/eval_sar_drn_1.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_1
# nvcc ../eval_pfusion/eval_sar_drn_2.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_2
./eval_sar_drn_0
./eval_sar_drn_1
# ./eval_sar_drn_2

84 eval_pfusion/eval_transpose.cu Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_0(float *src, float *dst);
int main() {
std::vector<int> shape = {31, 32, 32, 33};
std::vector<int> perm = {2, 0, 3, 1};
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(4), stride_dst(4);
stride_dst[0] = 1;
for (int i = 1; i < 4; i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
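// perm[j] names the source dimension feeding output dimension j; the double
// loop above derives the source strides, with dimension 0 fastest-varying.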
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
invoke_func_0(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < 4; j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_0(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_0(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

View File

@ -0,0 +1,21 @@
import os
def eval(filename, kernel, shape, perm):
with open("../eval_pfusion/eval_transpose.tmp", "r") as f:
code = f.read()
code = code.replace("%%invoke_func%%", kernel)
code = code.replace("%%shape%%", shape)
code = code.replace("%%perm%%", perm)
with open("../generated_code/tmp.cu", "w") as f:
f.write(code)
# os.system("make -j && ./test_bias")
os.system(
"nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
os.system("./tmp")
if __name__ == "__main__":
eval("transpose_0.cu", "invoke_func_0", "{28 * 28, 58, 2}", "{0, 2, 1}")
eval("transpose_1.cu", "invoke_func_1", "{14 * 14, 116, 2}", "{0, 2, 1}")
eval("transpose_2.cu", "invoke_func_2", "{7 * 7, 232, 2}", "{0, 2, 1}")

4 eval_pfusion/eval_transpose.sh Executable file
View File

@ -0,0 +1,4 @@
rm ./eval_transpose
make -j && ./test_transpose
nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
./eval_transpose

84 eval_pfusion/eval_transpose.tmp Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void %%invoke_func%%(float *src, float *dst);
int main() {
std::vector<int> shape = %%shape%%;
std::vector<int> perm = %%perm%%;
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
stride_dst[0] = 1;
for (int i = 1; i < shape.size(); i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < shape.size(); i++) {
for (int j = 0; j < shape.size(); j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
%%invoke_func%%(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < shape.size(); j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

148 generated_code/bert_0.cu Normal file
View File

@ -0,0 +1,148 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
float *tensor_ptr_4) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 15627264; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 15627264 * 32;
offset_src_buf /= 15627264;
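// NOTE: offsets are computed here, but no loads or stores follow; the body
// of this fused kernel (and of kernel_func_7 below) is empty in this revision.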
}
}
// Kernel
__global__ void kernel_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
float *tensor_ptr_6, float *tensor_ptr_7,
float *tensor_ptr_8) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[48];
for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 256 * 256;
offset_src_buf /= 256;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_7[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_8[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
// Kernel
__global__ void kernel_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
float *tensor_ptr_10) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[48];
for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 128 * 512;
offset_src_buf /= 128;
}
}
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
float *tensor_ptr_12) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 256 * 256;
offset_src_buf /= 256;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
float *tensor_ptr_4) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_3, tensor_ptr_2,
tensor_ptr_4);
cudaCheckError();
}
void invoke_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
float *tensor_ptr_6, float *tensor_ptr_7,
float *tensor_ptr_8) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_6<<<gridDim, blockDim>>>(
tensor_ptr_4, tensor_ptr_5, tensor_ptr_6, tensor_ptr_7, tensor_ptr_8);
cudaCheckError();
}
void invoke_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
float *tensor_ptr_10) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_7<<<gridDim, blockDim>>>(tensor_ptr_8, tensor_ptr_9,
tensor_ptr_10);
cudaCheckError();
}
void invoke_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
float *tensor_ptr_12) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_10, tensor_ptr_11,
tensor_ptr_12);
cudaCheckError();
}

42 generated_code/bias_0.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 144; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 24 * 784;
offset_input_buf /= 24;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 24 * 24;
offset_bias_buf /= 24;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_0(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_0<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}
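The generated kernels flatten the iteration space into a single loop_idx and recover each tensor's offset by repeated mod/div against its layout. A host-side mirror of the index math in kernel_func_0 above (illustrative only):

#include <cstdio>
int main() {
    for (int loop_idx = 0; loop_idx < 144; loop_idx++) {
        int b = loop_idx;
        int offset_input = b % 7 * 128; // 128-float chunk within the 28*28 plane
        b /= 7;
        offset_input += b % 24 * 784; // channel index times the 28*28 plane stride
        if (loop_idx < 3)
            printf("loop_idx=%d -> offset_input=%d\n", loop_idx, offset_input);
    }
    return 0;
}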

42 generated_code/bias_1.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 348; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 58 * 784;
offset_input_buf /= 58;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 58 * 58;
offset_bias_buf /= 58;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_1(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_1<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

42 generated_code/bias_2.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 116; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 2 * 128;
offset_input_buf /= 2;
offset_input += offset_input_buf % 116 * 196;
offset_input_buf /= 116;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 116 * 116;
offset_bias_buf /= 116;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_2(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_2<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

42 generated_code/bias_3.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_3(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 0; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 1 * 128;
offset_input_buf /= 1;
offset_input += offset_input_buf % 232 * 49;
offset_input_buf /= 232;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 232 * 232;
offset_bias_buf /= 232;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_3(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_3<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

View File

@ -0,0 +1,20 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 128 * 512;
offset_src_buf /= 128;
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}

View File

@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
float *tensor_ptr_5) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 65536 * 256;
offset_src_buf /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
float *tensor_ptr_5) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_4,
tensor_ptr_5);
cudaCheckError();
}

View File

@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
float *tensor_ptr_12) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 1024 * 256;
offset_src_buf /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
float *tensor_ptr_12) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_11,
tensor_ptr_12);
cudaCheckError();
}

24 generated_code/test.cu Normal file
View File

@ -0,0 +1,24 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 0 + warp_id;
float buf[0];
__shared__ float smem[0];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 0) {
if (lane_id < 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id] =
inst_idx;
}
}
}
}
void invoke_func(float *src, float *dst) {
dim3 gridDim(0, 1);
dim3 blockDim(0, 1);
kernel_func<<<gridDim, blockDim>>>(src, dst);
cudaCheckError();
}

84 generated_code/tmp.cu Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_2(float *src, float *dst);
int main() {
std::vector<int> shape = {7 * 7, 232, 2};
std::vector<int> perm = {0, 2, 1};
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
stride_dst[0] = 1;
for (int i = 1; i < shape.size(); i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < shape.size(); i++) {
for (int j = 0; j < shape.size(); j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
invoke_func_2(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < shape.size(); j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_2(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_2(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

52 generated_code/transpose.cu Normal file
View File

@ -0,0 +1,52 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
__shared__ float smem[8448];
for (int loop_idx = parallel_idx; loop_idx < 1056; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 32 * 32736;
offset_src_buf /= 32;
offset_src += offset_src_buf % 33 * 33;
offset_src_buf /= 33;
int offset_dst = 0;
int offset_dst_buf = loop_idx;
offset_dst += offset_dst_buf % 32 * 1024;
offset_dst_buf /= 32;
offset_dst += offset_dst_buf % 33 * 33792;
offset_dst_buf /= 33;
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
smem[warp_id * 32 * 33 + inst_idx * 33 + lane_id] = buf[inst_idx];
}
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
buf[inst_idx] =
smem[warp_id * 32 * 33 + lane_id * 33 + inst_idx];
}
}
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
buf[inst_idx];
}
}
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}
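The 33-column pitch of the shared-memory tile here (32 x 33 floats per warp, 8 warps per 256-thread block = 8448 floats) is the standard padding trick for transposes: offsetting each row by one element keeps the column-wise reads in distinct shared-memory banks.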

56 generated_code/transpose_0.cu Normal file
View File

@ -0,0 +1,56 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 812; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 58 * 1568;
offset_input_buf /= 58;
offset_input += offset_input_buf % 2 * 784;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 7 * 128;
offset_output_buf /= 7;
offset_output += offset_output_buf % 58 * 784;
offset_output_buf /= 58;
offset_output += offset_output_buf % 2 * 45472;
offset_output_buf /= 2;
if (loop_idx % 7 == 6) {
if (lane_id < 16) {
buf[0] = input[0 + offset_input + 0 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 7 == 6) {
if (lane_id < 16) {
output[0 + offset_output + 0 * 32 + lane_id] = buf[0];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_0(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_0<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

66 generated_code/transpose_1.cu Normal file
View File

@ -0,0 +1,66 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 2 * 128;
offset_input_buf /= 2;
offset_input += offset_input_buf % 116 * 392;
offset_input_buf /= 116;
offset_input += offset_input_buf % 2 * 196;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 2 * 128;
offset_output_buf /= 2;
offset_output += offset_output_buf % 116 * 196;
offset_output_buf /= 116;
offset_output += offset_output_buf % 2 * 22736;
offset_output_buf /= 2;
if (loop_idx % 2 == 1) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
if (lane_id < 4) {
buf[2] = input[0 + offset_input + 2 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 2 == 1) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
if (lane_id < 4) {
output[0 + offset_output + 2 * 32 + lane_id] = buf[2];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_1(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_1<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

62 generated_code/transpose_2.cu Normal file
View File

@ -0,0 +1,62 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 232 * 98;
offset_input_buf /= 232;
offset_input += offset_input_buf % 2 * 49;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 232 * 49;
offset_output_buf /= 232;
offset_output += offset_output_buf % 2 * 11368;
offset_output_buf /= 2;
if (loop_idx % 1 == 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
if (lane_id < 17) {
buf[1] = input[0 + offset_input + 1 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 1 == 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
if (lane_id < 17) {
output[0 + offset_output + 1 * 32 + lane_id] = buf[1];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_2(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_2<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

View File

@ -0,0 +1,169 @@
#pragma once
#include "core/common.h"
#include "core/graph.h"
#include "core/operator.h"
#include "core/tensor.h"
#include "operators/G2BMM.h"
#include "operators/GBMM.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
#include "operators/reshape.h"
#include "operators/slice.h"
#include "operators/split.h"
#include "operators/transpose.h"
#include "operators/unary.h"
namespace infini {
class GraphBuilderObj {
private:
Graph g;
public:
GraphBuilderObj(Runtime runtime) : g(make_ref<GraphObj>(runtime)) {}
// tensors
Tensor tensor(Shape dim, const std::string &dtype);
// operators
// conv op
Operator conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
int sw = 1, int dh = 1, int dw = 1, Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr);
// matmul op
Operator matmul(Tensor A, Tensor B, Tensor C, bool transA = false,
bool transB = false);
Operator matmul(Tensor A, Tensor B, bool transA = false,
bool transB = false);
// conv trans op
Operator convTrans(Tensor input, Tensor weight, Tensor output, int ph,
int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
int oph = 0, int opw = 0, int group = 1,
Tensor bias = nullptr, ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
int sw = 1, int dh = 1, int dw = 1, int oph = 0,
int opw = 0, int group = 1, Tensor bias = nullptr,
ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, int oph = 0, int opw = 0,
int group = 1, Tensor bias = nullptr,
ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
int oph = 0, int opw = 0, int group = 1,
Tensor bias = nullptr, ActType act = ActType::None);
// g2bmm op
Operator g2bmm(Tensor A, Tensor B, Tensor C, const int width,
const int dilation, Tensor bias = nullptr,
ActType act = ActType::None);
Operator g2bmm(Tensor A, Tensor B, const int width, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
// gbmm-like op
Operator gbmml(Tensor A, Tensor B, Tensor C, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
Operator gbmml(Tensor A, Tensor B, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
// pad op
Operator pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<const vector<int>> &axis);
Operator pad(Tensor input, const vector<int> &pads,
const optional<const vector<int>> &axis);
// slice op
Operator slice(Tensor input, Tensor output, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps);
Operator slice(Tensor input, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps);
// concat op
Operator concat(TensorVec inputs, Tensor output, int dim);
Operator concat(TensorVec inputs, int dim);
// split op
Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
int num);
Operator split(Tensor input, int dim, int num);
Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
const vector<int> &ratio);
Operator split(Tensor input, int dim, const vector<int> &ratio);
// transpose op
Operator transpose(Tensor input, Tensor output, const Shape &perm);
// TODO
// extend op
Operator extend(Tensor input, Tensor output, int dim, int num);
Operator extend(Tensor input, int dim, int num);
// max pool op
Operator maxpool(Tensor input, Tensor output, int kh, int kw, int dh,
int dw, int ph, int pw, int sh, int sw);
Operator maxpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
int pw, int sh, int sw);
// average pool op
Operator avgpool(Tensor input, Tensor output, int kh, int kw, int dh,
int dw, int ph, int pw, int sh, int sw);
Operator avgpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
int pw, int sh, int sw);
// element wise op
Operator add(Tensor input0, Tensor input1, Tensor output);
Operator add(Tensor input0, Tensor input1);
Operator sub(Tensor input0, Tensor input1, Tensor output);
Operator sub(Tensor input0, Tensor input1);
Operator mul(Tensor input0, Tensor input1, Tensor output);
Operator mul(Tensor input0, Tensor input1);
Operator div(Tensor input0, Tensor input1, Tensor output);
Operator div(Tensor input0, Tensor input1);
Operator pow(Tensor input0, Tensor input1, Tensor output);
Operator pow(Tensor input0, Tensor input1);
// gather op
Operator gather(Tensor input, Tensor index, Tensor output, int axis);
Operator gather(Tensor input, Tensor index, int axis);
// reduce mean op
// TODO
// reshape op
Operator reshape(Tensor input, Tensor output, const Shape &dims);
Operator reshape(Tensor input, const Shape &dims);
Operator flatten(Tensor input, Tensor output);
Operator flatten(Tensor input);
Operator identity(Tensor input, Tensor output);
Operator identity(Tensor input);
// unary op
// TODO: batch norm
Operator softmax(Tensor input, Tensor output);
Operator softmax(Tensor input);
// TODO: activation
Operator relu(Tensor input, Tensor output);
Operator relu(Tensor input);
Operator sigmoid(Tensor input, Tensor output);
Operator sigmoid(Tensor input);
Operator tanh(Tensor input, Tensor output);
Operator tanh(Tensor input);
Operator abs(Tensor input, Tensor output);
Operator abs(Tensor input);
Operator reduceMean(Tensor input, Tensor Output, int axis);
Operator erf(Tensor input, Tensor output);
// resize op
// TODO
// membound op
Operator memBound(const TensorVec &inputs, const TensorVec &outputs,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time, std::string hint = {});
};
} // namespace infini
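A sketch of the intended GraphBuilderObj usage (hypothetical snippet; the header path, dtype string, and getOutput() accessor are assumptions, not confirmed by this diff):

#include "core/graph_builder.h" // assumed header location
using namespace infini;
void buildToyGraph(Runtime runtime) {
    GraphBuilderObj builder(runtime);
    Tensor a = builder.tensor({1, 512}, "FLOAT"); // dtype string assumed
    Tensor b = builder.tensor({1, 512}, "FLOAT");
    Operator added = builder.add(a, b);  // output tensor created internally
    builder.relu(added->getOutput());    // getOutput() assumed on OperatorObj
}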

View File

@ -38,6 +38,7 @@ enum class OpType {
Tanh,
Abs,
Resize,
Erf,
//
MemBound = 300,
};

View File

@ -10,6 +10,7 @@ class TensorBaseObj;
class TensorObj;
class OperatorObj;
class GraphObj;
class GraphBuilderObj;
class RuntimeObj;
class BlobObj;
@ -17,6 +18,7 @@ using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
using Operator = Ref<OperatorObj>;
using Graph = Ref<GraphObj>;
using GraphBuilder = Ref<GraphBuilderObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
enum class OpType;

View File

@ -1,5 +1,5 @@
#pragma once
#include "core/common.h"
// #include "core/common.h"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_profiler_api.h>

View File

@ -1,10 +1,16 @@
#pragma once
#include "core/operator.h"
namespace infini {
namespace opTimer {
double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
int dilationh, int dilationw, int group);
double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
bool bias, string act);
double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
int s, int padh, int padw, int strideh,

View File

@ -25,19 +25,22 @@ class ConvBaseObj : public OperatorObj {
int h, w; // input shape (same for conv2d and convTranposed2d)
int f; // output/input channel for conv2d/convTransposed2d
int r, s; // weight shape
ActType act;
public:
// Constructors for explicitly setting padding size
ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output, int ph, int pw,
int sh, int sw, int dh, int dw, const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD);
const Tensor &weightInConvFWD, const ActType act);
ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
PaddingMode mode, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD, const Tensor &weightInConvFWD);
const Tensor &inputInConvFWD, const Tensor &weightInConvFWD,
const ActType act);
std::string toString() const override;
int numInputs() const override { return 2; }
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
bool hasBias() const { return inputs.size() == 3; }
Tensor getBias() const { return inputs[2]; }
PaddingMode getPaddingMode() const { return padding; }
@ -53,6 +56,7 @@ class ConvBaseObj : public OperatorObj {
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
int getChannelPerGroup() const { return inputs[1]->getDims()[1]; }
virtual int getNumGroups() const = 0;
ActType getAct() const { return act; }
private:
vector<int> getWorkloadVector() const override;
@ -65,8 +69,6 @@ class ConvBaseObj : public OperatorObj {
};
class ConvObj : public ConvBaseObj {
private:
ActType act;
public:
ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph,
@ -79,7 +81,6 @@ class ConvObj : public ConvBaseObj {
ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; }
int getNumGroups() const override { return c / getChannelPerGroup(); }
private:
@ -90,7 +91,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
private:
int oph, opw;
int group;
ActType act;
public:
ConvTransposed2dObj(GraphObj *graph, Tensor input, Tensor weight,
@ -106,7 +106,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; }
int getNumGroups() const override { return group; }
private:

View File

@ -0,0 +1,25 @@
#pragma once
#include "core/operator.h"
namespace infini {
class TransposeObj : public OperatorObj {
Shape perm;
public:
TransposeObj(GraphObj *graph, Tensor input, Tensor output,
const Shape &dims);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape getPerm() const { return perm; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -28,4 +28,5 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
DEFINE_UNARY_OBJ(Abs, OpType::Abs)
DEFINE_UNARY_OBJ(Erf, OpType::Erf)
}; // namespace infini

71 include/pfusion/common.h Normal file
View File

@ -0,0 +1,71 @@
#pragma once
#include "core/common.h"
#include <iostream>
#include <memory>
#include <string>
#include <vector>
namespace memb {
enum OpType {
NONE = 0,
EMPTY = 1,
READ,
WRITE,
RELU,
ADD,
SUB,
REDUCEMEAN,
REDUCESUM,
GATHER,
TRANSPOSE,
BROADCAST,
};
enum MemType {
DRAM = 1,
SRAM,
REG,
};
inline std::string getName(OpType opType) {
switch (opType) {
case (OpType::EMPTY):
return "EMPTY";
case (OpType::READ):
return "READ";
case (OpType::WRITE):
return "WRITE";
case (OpType::RELU):
return "RELU";
case (OpType::ADD):
return "ADD";
case (OpType::SUB):
return "SUB";
default:
IT_ASSERT(false);
}
return "";
}
inline std::string getName(MemType memType) {
switch (memType) {
case (MemType::DRAM):
return "DRAM";
case (MemType::SRAM):
return "SRAM";
case (MemType::REG):
return "REG";
default:
IT_ASSERT(false);
}
return "";
}
inline size_t hashAppend(size_t a, size_t b) {
return (a * 10000019 + b * 10000079) % 2147483647;
}
} // namespace memb
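hashAppend combines two hash values modulo 2147483647 (the Mersenne prime 2^31 - 1); TensorMapping::getHash in meta_op.h folds whole vectors through it. The same folding for a standalone shape (a sketch):

#include "pfusion/common.h"
#include <functional>
#include <vector>
size_t hashShape(const std::vector<size_t> &shape) {
    std::hash<size_t> hasher;
    size_t ret = hasher(shape.size());
    for (auto x : shape)
        ret = memb::hashAppend(ret, hasher(x)); // order-sensitive fold
    return ret;
}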

88 include/pfusion/cuda_utils.h Normal file
View File

@ -0,0 +1,88 @@
#pragma once
#include <cublas.h>
#include <curand.h>
#include <iostream>
#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaSafeCall() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
return;
}
inline void __cudaCheckError(const char *file, const int line) {
auto err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
#ifdef DEBUG
// This checking will affect performance.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() with sync failed. "
<< cudaGetErrorString(err) << std::endl;
exit(-1);
}
#endif
return;
}
inline const char *cublasGetErrorString(cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
const int line) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": cublasSafeCall() failed. " << cublasGetErrorString(err)
<< std::endl;
exit(-1);
}
}
inline void __curandSafeCall(curandStatus_t err, const char *file,
const int line) {
if (err != CURAND_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": curandSafeCall() failed. " << err << std::endl;
exit(-1);
}
}

View File

@ -0,0 +1,32 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/meta_op.h"
namespace memb {
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape);
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape);
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape, const std::vector<size_t> &perm);
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>> instantiateBroadcast(
const OpType opType, const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis, const size_t num);
} // namespace memb

View File

@ -0,0 +1,29 @@
#pragma once
#include "core/runtime.h"
namespace infini {
class MemoryCodegen {
private:
std::string generateGraph(Graph graph);
std::string generateBias(const std::vector<size_t> &shape);
std::string generateTranspose(const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
public:
MemoryCodegen() {}
~MemoryCodegen() {}
void exportGraph(Graph graph, std::string filename);
void exportBert_LN(const std::string &filename);
void exportBert_SM(const std::string &filename);
void exportBert_GELU(const std::string &filename);
void exportViT_LN(const std::string &filename);
void exportViT_SM(const std::string &filename);
void exportViT_GELU(const std::string &filename);
void exportBias(const std::string &filename,
const std::vector<size_t> &shape);
void exportTranspose(const std::string &filename,
const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
};
} // namespace infini
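A sketch tying these exports to the eval harnesses earlier in this diff (the include path is assumed; the shapes match the bias and transpose eval scripts above):

#include "pfusion/memory_codegen.h" // assumed header path
int main() {
    infini::MemoryCodegen codegen;
    codegen.exportBias("bias_0.cu", {28 * 28, 24});
    codegen.exportTranspose("transpose_0.cu", {28 * 28, 58, 2}, {0, 2, 1});
    return 0;
}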

View File

@ -0,0 +1,33 @@
#pragma once
#include "pfusion/meta_op.h"
namespace memb {
class MetaGraph {
private:
std::vector<std::shared_ptr<MetaOp>> metaOps;
std::vector<std::pair<size_t, size_t>> edges;
std::unordered_map<size_t, size_t> metaOpMap;
public:
MetaGraph() {}
~MetaGraph() {}
inline void addOp(std::shared_ptr<MetaOp> op) {
IT_ASSERT(metaOpMap.find(op->id) == metaOpMap.end());
metaOpMap[op->id] = metaOps.size();
metaOps.emplace_back(op);
}
inline void addEdge(std::shared_ptr<MetaOp> op1,
std::shared_ptr<MetaOp> op2) {
IT_ASSERT(metaOpMap.find(op1->id) != metaOpMap.end());
IT_ASSERT(metaOpMap.find(op2->id) != metaOpMap.end());
edges.emplace_back(metaOpMap[op1->id], metaOpMap[op2->id]);
}
void print();
void optimize();
std::string genHeader();
std::string genKernelFuncs();
std::string genInvokeFuncs();
};
} // namespace memb
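addEdge asserts that both endpoints were registered via addOp first; minimal wiring (a sketch against the API declared above):

#include "pfusion/meta_graph.h" // assumed header path
void wireTwoOps() {
    memb::MetaGraph graph;
    auto producer = std::make_shared<memb::MetaOp>();
    auto consumer = std::make_shared<memb::MetaOp>();
    graph.addOp(producer); // registers producer->id in metaOpMap
    graph.addOp(consumer);
    graph.addEdge(producer, consumer); // records the dependency by index
}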

111 include/pfusion/meta_op.h Normal file
View File

@ -0,0 +1,111 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/micro_op.h"
#include "pfusion/pointer.h"
namespace memb {
class TensorMapping {
private:
std::vector<size_t> shape, stride, map;
std::string name;
public:
TensorMapping(const std::string &_name, const std::vector<size_t> &_shape,
const std::vector<size_t> &_stride,
const std::vector<size_t> &_map) {
name = "offset_" + _name;
IT_ASSERT(_shape.size() > 0 && _shape.size() < 10);
for (auto x : _shape) {
shape.emplace_back(x);
}
IT_ASSERT(_stride.size() > 0 && _stride.size() < 10);
for (auto x : _stride) {
stride.emplace_back(x);
}
IT_ASSERT(_map.size() > 0 && _map.size() < 10);
for (auto x : _map) {
map.emplace_back(x);
}
}
~TensorMapping() {}
static inline std::shared_ptr<TensorMapping>
buildWithMap(const std::string &name, const std::vector<size_t> &shape,
const std::vector<size_t> &map) {
std::vector<size_t> stride(shape.size());
stride[0] = 1;
for (size_t i = 1; i < stride.size(); i++) {
stride[i] = shape[i] * stride[i - 1];
}
return std::make_shared<TensorMapping>(name, shape, stride, map);
}
static inline std::shared_ptr<TensorMapping>
build(const std::string &name, const std::vector<size_t> &shape,
const std::vector<size_t> &stride, const std::vector<size_t> &map) {
return std::make_shared<TensorMapping>(name, shape, stride, map);
}
inline std::string offset() { return name; }
inline size_t getHash() {
std::hash<size_t> hasher;
std::hash<std::string> stringHasher;
size_t ret = stringHasher(name);
ret = hashAppend(ret, hasher(shape.size()));
for (auto x : shape) {
ret = hashAppend(ret, hasher(x));
}
ret = hashAppend(ret, hasher(map.size()));
for (auto x : map) {
ret = hashAppend(ret, hasher(x));
}
return ret;
}
std::string genOffset();
};
class MetaOp {
public:
int id;
int main_loop_st, main_loop_ed, numBlocks, numGroups, numReg, numSmem,
numLanes;
std::vector<std::shared_ptr<MicroOp>> microOps;
std::vector<std::shared_ptr<Pointer>> ptrs;
std::vector<std::shared_ptr<TensorMapping>> mappings;
MetaOp() {
static int metaOpId = 0;
id = metaOpId++;
}
~MetaOp() {}
inline void setLoopSt(int _main_loop_st) { main_loop_st = _main_loop_st; }
inline void setLoopEd(int _main_loop_ed) { main_loop_ed = _main_loop_ed; }
inline int getLoopSt() { return main_loop_st; }
inline int getLoopEd() { return main_loop_ed; }
void optimize();
std::string genKernelFunc();
std::string genInvokeFunc();
static std::shared_ptr<MetaOp> merge(std::shared_ptr<MetaOp> metaOp0,
std::shared_ptr<MetaOp> metaOp1);
inline void print() {
std::cout << "MetaOp: " << id << std::endl;
for (auto microOp : microOps) {
microOp->print();
}
}
bool checkValid() {
// TODO: check valid
return true;
};
static std::shared_ptr<MetaOp>
buildBiasOp(const std::vector<size_t> &shape);
static std::shared_ptr<MetaOp>
buildTransposeOp(const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
};
} // namespace memb

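TensorMapping::getHash folds the mapping name, the shape vector, and the map vector into a single value through hashAppend, which lives in pfusion/common.h and is not shown in this diff. The sketch below substitutes a boost-style combiner, so it models the scheme rather than reproducing the exact function:

    # Model of getHash with a boost-style combiner standing in for hashAppend.
    def hash_append(seed: int, value: int) -> int:
        mask = (1 << 64) - 1
        return (seed ^ (value + 0x9E3779B97F4A7C15
                        + ((seed << 6) & mask) + (seed >> 2))) & mask

    def mapping_hash(name, shape, mapping):
        h = hash(name) & ((1 << 64) - 1)
        for xs in (shape, mapping):               # hash length, then elements
            h = hash_append(h, len(xs))
            for x in xs:
                h = hash_append(h, x)
        return h

    # Mappings that differ only in `map` hash differently.
    assert mapping_hash("offset_a", [2, 3], [0, 1]) != \
           mapping_hash("offset_a", [2, 3], [1, 0])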

@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class BinaryOp : public MicroOp {
private:
size_t num, width;
public:
BinaryOp(OpType _opType, std::shared_ptr<Pointer> _pSrc0,
std::shared_ptr<Pointer> _pSrc1, std::shared_ptr<Pointer> _pDst,
size_t _num, size_t _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_pSrc0, _pSrc1, _pDst};
}
~BinaryOp() {}
std::shared_ptr<Pointer> getSrc0() { return ptrs[0]; }
std::shared_ptr<Pointer> getSrc1() { return ptrs[1]; }
std::shared_ptr<Pointer> getDst() { return ptrs[2]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,17 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class EmptyOp : public MicroOp {
public:
EmptyOp() { opType = EMPTY; }
~EmptyOp() {}
// bool checkValid() override;
std::string generate() override { return ""; };
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,55 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class MemoryOp : public MicroOp {
private:
size_t num, width;
public:
MemoryOp(const OpType _opType, const std::shared_ptr<Pointer> _src,
const std::shared_ptr<Pointer> _dst, const size_t _num,
const size_t _width, const std::vector<size_t> &_cond)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_src, _dst};
cond = _cond;
}
// bool checkValid() override;
~MemoryOp() {}
static inline std::shared_ptr<MicroOp>
build(const OpType opType, const std::shared_ptr<Pointer> src,
const std::shared_ptr<Pointer> dst, const size_t num,
const size_t width) {
return std::make_shared<MemoryOp>(opType, src, dst, num, width,
std::vector<size_t>({}));
}
static inline std::shared_ptr<MicroOp>
build(const OpType opType, const std::shared_ptr<Pointer> src,
const std::shared_ptr<Pointer> dst, const size_t num,
const size_t width, const std::vector<size_t> &cond) {
return std::make_shared<MemoryOp>(opType, src, dst, num, width, cond);
}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
std::string generate() override;
std::string generateWithCond();
inline void print() override {
if (opType == READ) {
std::cout << id << " " << getName(opType) << " "
<< getName(getSrc()->getType()) << " "
<< getSrc()->getHash() << std::endl;
} else if (opType == WRITE) {
std::cout << id << " " << getName(opType) << " "
<< getName(getDst()->getType()) << " "
<< getDst()->getHash() << std::endl;
} else {
IT_ASSERT(false);
}
}
};
} // namespace memb


@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class ReduceOp : public MicroOp {
private:
size_t num, width;
public:
ReduceOp(OpType _opType, std::shared_ptr<Pointer> _pSrc,
std::shared_ptr<Pointer> _pDst, std::shared_ptr<Pointer> _pBuf,
size_t _num, size_t _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_pSrc, _pDst, _pBuf};
}
~ReduceOp() {}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
std::shared_ptr<Pointer> getBuf() { return ptrs[2]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class UnaryOp : public MicroOp {
private:
const int num, width;
public:
UnaryOp(OpType _opType, std::shared_ptr<Pointer> _src,
std::shared_ptr<Pointer> _dst, int _num, int _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_src, _dst};
}
~UnaryOp() {}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,43 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/pointer.h"
namespace memb {
class MicroOp {
protected:
size_t id;
OpType opType;
std::vector<std::shared_ptr<Pointer>> ptrs;
std::vector<size_t> cond;
public:
MicroOp() : opType(NONE), cond(0) {
static int microOpId = 0;
id = microOpId++;
}
virtual ~MicroOp() {}
inline OpType getType() { return opType; }
inline bool isMemoryOp() { return opType == READ || opType == WRITE; }
inline std::vector<std::shared_ptr<Pointer>> getPtrs() { return ptrs; }
// virtual bool checkValid() = 0;
virtual std::string generate() = 0;
virtual void print() = 0;
static std::shared_ptr<MicroOp> merge(std::shared_ptr<MicroOp> op0,
std::shared_ptr<MicroOp> op1);
};
class MicroGraph {
public:
MicroGraph() {}
~MicroGraph() {}
private:
std::vector<std::shared_ptr<MicroOp>> microOps;
std::vector<std::pair<int, int>> edges;
};
} // namespace memb

include/pfusion/pointer.h

@ -0,0 +1,71 @@
#pragma once
#include "pfusion/common.h"
namespace memb {
class Pointer {
private:
const MemType memType;
const std::string name, offset;
public:
Pointer(MemType _memType, std::string _name, std::string _offset)
: memType(_memType), name(_name), offset(_offset) {}
~Pointer() {}
static inline std::shared_ptr<Pointer> buildPtr(MemType memType,
std::string name) {
return std::make_shared<Pointer>(memType, name, "0");
}
static inline std::shared_ptr<Pointer>
buildPtr(MemType memType, std::string name, std::string offset) {
return std::make_shared<Pointer>(memType, name, offset);
}
static inline std::shared_ptr<Pointer>
buildPtr(std::shared_ptr<Pointer> ptr) {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset());
}
static inline std::shared_ptr<Pointer>
buildPtr(std::shared_ptr<Pointer> ptr, std::string offset) {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset() + " + " + offset);
}
static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
return std::make_shared<Pointer>(
MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
}
inline const MemType getType() { return memType; }
inline const std::string getName() { return name; }
inline const std::string getOffset() { return offset; }
inline const std::string generate() { return name + "[" + offset + "]"; }
inline const std::string generateWithInstIdx(std::string idx) {
std::string code = generate();
size_t pos = 0, lengthA = 8, lengthB = idx.size(); // lengthA == strlen("inst_idx")
while ((pos = code.find("inst_idx", pos)) != std::string::npos) {
code.replace(pos, lengthA, idx);
pos += lengthB;
}
std::cout << "[INFO] " << idx << " " << lengthB << " " << code
<< std::endl;
return code;
}
inline bool equal(std::shared_ptr<Pointer> ptr) {
if (name == ptr->getName() && offset == ptr->getOffset()) {
IT_ASSERT(memType == ptr->getType());
return true;
}
return false;
}
inline const size_t getHash() {
std::hash<MemType> memTypeHash;
std::hash<std::string> stringHash;
size_t ret = memTypeHash(memType);
ret = hashAppend(ret, stringHash(name));
ret = hashAppend(ret, stringHash(offset));
return ret;
}
};
} // namespace memb

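Pointer::generateWithInstIdx is a plain substring substitution: every occurrence of the token inst_idx inside the generated name[offset] expression is replaced by the supplied index string (the [INFO] line is debug output). The same loop in Python, for quick reference:

    # Python equivalent of the inst_idx substitution loop.
    def with_inst_idx(code: str, idx: str) -> str:
        token, pos = "inst_idx", 0
        while (pos := code.find(token, pos)) != -1:
            code = code[:pos] + idx + code[pos + len(token):]
            pos += len(idx)                       # skip past the replacement
        return code

    assert with_inst_idx("buf[inst_idx * 32 + lane]", "5") == "buf[5 * 32 + lane]"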

@ -0,0 +1,39 @@
#pragma once
#include "pfusion/meta_graph.h"
#include "pfusion/meta_op.h"
namespace memb {
class SearchGraph {
private:
class Node {
public:
int id;
std::vector<std::shared_ptr<MetaOp>> metaOps;
std::vector<int> pred;
std::vector<int> succ;
};
// each node is a vector of metaOps.
std::vector<Node> nodes;
std::vector<std::pair<int, int>> edges;
public:
SearchGraph() {}
~SearchGraph() {}
inline void addNode(std::vector<std::shared_ptr<MetaOp>> metaOps) {
Node node;
node.id = nodes.size();
for (auto metaOp : metaOps) {
node.metaOps.emplace_back(metaOp);
}
nodes.emplace_back(node);
}
inline void addEdge(int i, int j) {
edges.emplace_back(i, j);
nodes[i].succ.emplace_back(j);
nodes[j].pred.emplace_back(i);
}
std::shared_ptr<MetaGraph> exportFirstMetaGraph();
};
} // namespace memb

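Each SearchGraph node holds a vector of candidate MetaOps, and exportFirstMetaGraph, implemented elsewhere in the diff, presumably materializes a MetaGraph from every node's first candidate. A hypothetical Python sketch of that selection, under the assumption that edges are simply re-emitted between the chosen ops:

    # Hypothetical sketch: choose each node's first candidate, keep the edges.
    def export_first(candidates_by_node, edges):
        chosen = {nid: cands[0] for nid, cands in candidates_by_node.items()}
        return chosen, [(chosen[i], chosen[j]) for i, j in edges]

    ops, wires = export_first({0: ["opA0", "opA1"], 1: ["opB0"]}, [(0, 1)])
    assert wires == [("opA0", "opB0")]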

@ -0,0 +1 @@
from .import_onnx import import_onnx


@ -0,0 +1,483 @@
import functools
import numpy as np
import onnx
import onnx.checker
import onnx.numpy_helper
import onnx.shape_inference
from rules import conv_transposed2d_rules, conv_rules, print_result
def _add_value_info_for_constants(model: onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def _parse_attribute(attributes, defaults=dict()):
atts = defaults
for att in attributes:
if att.type == onnx.AttributeProto.INT:
atts[att.name] = att.i
elif att.type == onnx.AttributeProto.INTS:
atts[att.name] = att.ints
elif att.type == onnx.AttributeProto.FLOAT:
atts[att.name] = att.f
elif att.type == onnx.AttributeProto.STRING:
atts[att.name] = att.s
elif att.type == onnx.AttributeProto.TENSOR:
atts[att.name] = att.t
else:
assert False, "Unsupported Attribute Type: {}".format(att.type)
return atts
def _onnx_datatype_tostring(dtype):
if dtype == 0:
return 'UNDEFINED'
elif dtype == 1:
return 'FLOAT'
elif dtype == 2:
return 'UINT8'
elif dtype == 3:
return 'INT8'
elif dtype == 4:
return 'UINT16'
elif dtype == 5:
return 'INT16'
elif dtype == 6:
return 'INT32'
elif dtype == 7:
return 'INT64'
elif dtype == 8:
return 'STRING'
elif dtype == 9:
return 'BOOL'
elif dtype == 10:
return 'FLOAT16'
elif dtype == 11:
return 'DOUBLE'
elif dtype == 12:
return 'UINT32'
elif dtype == 13:
return 'UINT64'
elif dtype == 14:
return 'COMPLEX64'
elif dtype == 15:
return 'COMPLEX128'
elif dtype == 16:
return 'BFLOAT16'
else:
assert False, 'Unknown onnx datatype'
def import_onnx(model_path: str, bs: int):
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
model = onnx.load(model_path)
# Tensor_input
for input in model.graph.input:
if input.name not in ts:
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
# ts[input.name] = g.tensor(dims, _onnx_datatype_tostring(input.type.tensor_type.elem_type))
ds[input.name] = dims
# Tensor_weight
for weight in model.graph.initializer:
if weight.name not in ts:
# ts[weight.name] = g.tensor(weight.dims, _onnx_datatype_tostring(weight.data_type))
ds[weight.name] = weight.dims
# Tensor_inference
_add_value_info_for_constants(model)
infered_model = onnx.shape_inference.infer_shapes(model)
for v in infered_model.graph.value_info:
if v.name not in ts:
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
# ts[v.name] = g.tensor(dims, _onnx_datatype_tostring(v.type.tensor_type.elem_type))
ds[v.name] = dims
# Tensor_output
for output in model.graph.output:
if output.name not in ts:
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
# ts[output.name] = g.tensor(dims, _onnx_datatype_tostring(output.type.tensor_type.elem_type))
ds[output.name] = dims
# Op
for node in model.graph.node:
# if node.op_type == 'Add':
# assert len(node.output) == 1
# g.add([ts[item] for item in node.input], ts[node.output[0]])
# elif node.op_type == 'Cast':
# assert len(node.input) == 1
# assert len(node.output) == 1
# # Ignore for now (TODO)
# g.identity(ts[node.input[0]], ts[node.output[0]])
if node.op_type == 'Conv':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
assert ds[node.input[0]][1] % ds[node.input[1]][1] == 0
n, c, h, w = ds[node.input[0]][0], ds[node.input[0]][1], ds[node.input[0]][2], ds[node.input[0]][3]
f, r, s = ds[node.input[1]][0], ds[node.input[1]][2], ds[node.input[1]][3]
ph, pw = attrs["pads"][0], attrs["pads"][1]
sh, sw = attrs["strides"][0], attrs["strides"][1]
dh, dw = attrs["dilations"][0], attrs["dilations"][1]
group = ds[node.input[0]][1] // ds[node.input[1]][1]
# t = getPerfConv(n, c, h, w, f, r, s, ph, pw,
# sh, sw, dh, dw, group, "")
# print(node.name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group, f'{t:.3f}')
n = n*bs
for rule in conv_rules:
rule(node.name, n, c, h, w, f, r, s,
ph, pw, sh, sw, dh, dw, group)
elif node.op_type == 'ConvTranspose':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"group": 1})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
n, f, h, w = ds[node.input[0]]
_, c, r, s = ds[node.input[1]]
ph, pw, sh, sw, dh, dw = attrs["pads"][0], attrs["pads"][1], attrs["strides"][
0], attrs["strides"][1], attrs["dilations"][0], attrs["dilations"][1]
oph, opw = 0, 0
if "output_padding" in attrs:
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
assert attrs["output_padding"][0] == attrs["output_padding"][1]
group = attrs["group"]
n = n*bs
for rule in conv_transposed2d_rules:
rule(node.name, n, c, h, w, f, r, s, ph,
pw, sh, sw, dh, dw, oph, opw, group)
elif node.op_type == 'MatMul':
# MatMul timing is skipped in this script; everything in this branch
# below the `continue` is unreachable dead code.
print(f'{node.name} skipped')
continue
assert len(node.input) == 2
assert len(node.output) == 1
dimA = list(ds[node.input[0]])
dimB = list(ds[node.input[1]])
dimO = list(ds[node.output[0]])
# if len(dimA) == 2 and len(dimB) == 2:
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
# else:
# assert len(dimO) >= 3
# batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
# if len(dimA) == 3:
# tmpI0 = ts[node.input[0]]
# else:
# tmpI0 = g.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
# g.reshape(ts[node.input[0]], tmpI0)
# if len(dimB) == 3:
# tmpI1 = ts[node.input[1]]
# else:
# tmpI1 = g.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
# g.reshape(ts[node.input[1]], tmpI1)
# if len(dimO) == 3:
# tmpO = ts[node.output[0]]
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# else:
# tmpO = g.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# g.reshape(tmpO, ts[node.output[0]])
# elif node.op_type == 'Concat':
# assert len(node.output) == 1
# attrs = _parse_attribute(node.attribute, {})
# g.concat([ts[item] for item in node.input], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Constant':
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.output) == 1
# c = onnx.numpy_helper.to_array(attrs["value"])
# if c.ndim == 0:
# c = c[()]
# consts[node.output[0]] = c
# elif node.op_type == 'Flatten':
# attrs = _parse_attribute(node.attribute, {"axis": 1})
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.flatten(ts[node.input[0]], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Gather':
# attrs = _parse_attribute(node.attribute, {"axis": 0})
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.gather(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Gemm':
# attrs = _parse_attribute(node.attribute, {
# "alpha": 1.0,
# "beta": 1.0,
# "transA": 0,
# "transB": 0})
# assert len(node.input) == 2 or len(node.input) == 3
# assert len(node.output) == 1
# assert attrs["alpha"] == 1.0
# assert attrs["beta"] == 1.0 or len(node.input) == 2
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.matmul(tmpI0, tmpI1, tmpO,
# attrs["transA"], attrs["transB"],
# None if len(node.input) == 2 else ts[node.input[2]])
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
# elif node.op_type == 'Mul':
# assert len(node.output) == 1
# g.mul([ts[x] for x in node.input], ts[node.output[0]])
# elif node.op_type == 'GlobalAveragePool':
# assert len(node.input) == 1
# assert len(node.output) == 1
# dims = ds[node.input[0]]
# if len(dims) > 0:
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
# else:
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'MaxPool':
# attrs = _parse_attribute(node.attribute, {
# "auto_pad": "NOTSET",
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1]})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert len(attrs["kernel_shape"]) == 2
# assert len(attrs["pads"]) == 4
# assert len(attrs["strides"]) == 2
# assert len(attrs["dilations"]) == 2
# assert attrs["pads"][0] == attrs["pads"][2]
# assert attrs["pads"][1] == attrs["pads"][3]
# g.maxpool(ts[node.input[0]], ts[node.output[0]],
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
# attrs["dilations"][0], attrs["dilations"][1],
# attrs["pads"][0], attrs["pads"][1],
# attrs["strides"][0], attrs["strides"][1])
# elif node.op_type == 'AveragePool':
# attrs = _parse_attribute(node.attribute, {
# "auto_pad": "NOTSET",
# "count_include_pad": 0,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1]})
# # No dilation in ONNX
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert attrs["count_include_pad"] == 0 # To be consistent with operator.cc
# assert len(attrs["kernel_shape"]) == 2
# assert len(attrs["pads"]) == 4
# assert len(attrs["strides"]) == 2
# assert attrs["pads"][0] == attrs["pads"][2]
# assert attrs["pads"][1] == attrs["pads"][3]
# g.avgpool(ts[node.input[0]], ts[node.output[0]],
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
# attrs["pads"][0], attrs["pads"][1],
# attrs["strides"][0], attrs["strides"][1])
# elif node.op_type == 'Pad':
# attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
# assert attrs["mode"].decode("ascii") == "constant"
# assert len(attrs["pads"]) % 2 == 0
# assert attrs["value"] == 0
# nDim = len(attrs["pads"]) // 2
# begin = attrs["pads"][:nDim]
# end = attrs["pads"][nDim:]
# g.pad(ts[node.input[0]], ts[node.output[0]], begin, end)
# elif node.op_type == 'ReduceMean':
# attrs = _parse_attribute(node.attribute, {'keepdims': 1})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert len(attrs["axes"]) == 1
# axis = attrs["axes"][0]
# if axis < 0:
# axis = len(ds[node.input[0]]) - axis
# g.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
# elif node.op_type == 'Softmax':
# attrs = _parse_attribute(node.attribute)
# assert len(node.input) == 1
# assert len(node.output) == 1
# axis = attrs["axis"]
# if axis < 0:
# axis = len(ds[node.input[0]]) - axis
# g.softmax(ts[node.input[0]], ts[node.output[0]], axis)
# elif node.op_type == 'Reshape':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.reshape(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Relu':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.relu(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Tanh':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.tanh(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Sigmoid':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.sigmoid(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Shape':
# # Ignore for now, and no need to output anything (TODO)
# pass
# elif node.op_type == 'Sub':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.sub(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]])
# elif node.op_type == 'Transpose':
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert "perm" in attrs
# g.transpose(ts[node.input[0]], ts[node.output[0]], -1,
# Perm([PermItem(x) for x in attrs["perm"]]), 0)
# elif node.op_type == 'Unsqueeze':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.reshape(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == "BatchNormalization":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 5
# assert len(node.output) == 1
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
# ts[node.input[2]], ts[node.input[3]],
# ts[node.input[4]], ts[node.output[0]],
# epsilon, momentum)
# elif node.op_type == "Split":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 1
# assert len(node.output) > 1
# axis = attrs['axis']
# split = attrs['split']
# g.split(ts[node.input[0]], [ts[t] for t in node.output], axis, split)
# elif node.op_type == "Slice":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# g.slice(ts[node.input[0]], ts[node.output[0]],
# ts[node.input[1]], ts[node.input[2]])
# elif node.op_type == "Resize":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
# [], 'FLOAT')
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
# else:
# assert False, "Unsupported op: " + node.op_type
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("model", help="ONNX model file")
parser.add_argument("bs", help="batch size", type=int, default=1)
# parser.add_argument("--output", help="Output file")
args = parser.parse_args()
import_onnx(args.model, args.bs)
print_result(args.model)

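A note on _parse_attribute, which both importer scripts define identically: it merges a node's ONNX attributes over the defaults dict it is handed and mutates that dict in place, which is safe here only because every call site passes a fresh literal. A self-contained usage sketch (onnx.helper.make_node is the standard ONNX helper; _parse_attribute is assumed importable from the script above):

    # Usage sketch for _parse_attribute with a hand-built Conv node.
    import onnx.helper

    node = onnx.helper.make_node(
        "Conv", inputs=["x", "w"], outputs=["y"],
        strides=[2, 2], pads=[1, 1, 1, 1])

    attrs = _parse_attribute(node.attribute, {
        "auto_pad": "NOTSET",
        "dilations": [1, 1],
        "pads": [0, 0, 0, 0],
        "strides": [1, 1]})
    assert list(attrs["strides"]) == [2, 2]       # overridden by the node
    assert list(attrs["dilations"]) == [1, 1]     # default kept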

@ -0,0 +1,588 @@
from pyinfinitensor import *
import functools
import numpy as np
import onnx
import onnx.checker
import onnx.numpy_helper
import onnx.shape_inference
def _add_value_info_for_constants(model: onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def _parse_attribute(attributes, defaults=dict()):
atts = defaults
for att in attributes:
if att.type == onnx.AttributeProto.INT:
atts[att.name] = att.i
elif att.type == onnx.AttributeProto.INTS:
atts[att.name] = att.ints
elif att.type == onnx.AttributeProto.FLOAT:
atts[att.name] = att.f
elif att.type == onnx.AttributeProto.STRING:
atts[att.name] = att.s
elif att.type == onnx.AttributeProto.TENSOR:
atts[att.name] = att.t
else:
assert False, "Unsupported Attribute Type: {}".format(att.type)
return atts
def _onnx_datatype_tostring(dtype):
if dtype == 0:
return 'UNDEFINED'
elif dtype == 1:
return 'FLOAT'
elif dtype == 2:
return 'UINT8'
elif dtype == 3:
return 'INT8'
elif dtype == 4:
return 'UINT16'
elif dtype == 5:
return 'INT16'
elif dtype == 6:
return 'INT32'
elif dtype == 7:
return 'INT64'
elif dtype == 8:
return 'STRING'
elif dtype == 9:
return 'BOOL'
elif dtype == 10:
return 'FLOAT16'
elif dtype == 11:
return 'DOUBLE'
elif dtype == 12:
return 'UINT32'
elif dtype == 13:
return 'UINT64'
elif dtype == 14:
return 'COMPLEX64'
elif dtype == 15:
return 'COMPLEX128'
elif dtype == 16:
return 'BFLOAT16'
else:
assert False, 'Unknown onnx datatype'
def import_onnx(gf: GraphBuilder, net: str):
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
model = onnx.load(net)
# Tensor_input
for input in model.graph.input:
if input.name not in ts:
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
ts[input.name] = gf.tensor(dims, _onnx_datatype_tostring(
input.type.tensor_type.elem_type))
ds[input.name] = dims
# Tensor_weight
for weight in model.graph.initializer:
if weight.name not in ts:
ts[weight.name] = gf.tensor(
weight.dims, _onnx_datatype_tostring(weight.data_type))
ds[weight.name] = weight.dims
# Tensor_inference
_add_value_info_for_constants(model)
infered_model = onnx.shape_inference.infer_shapes(model)
for v in infered_model.graph.value_info:
if v.name not in ts:
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
ts[v.name] = gf.tensor(dims, _onnx_datatype_tostring(
v.type.tensor_type.elem_type))
ds[v.name] = dims
# Tensor_output
for output in model.graph.output:
if output.name not in ts:
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
ts[output.name] = gf.tensor(dims, _onnx_datatype_tostring(
output.type.tensor_type.elem_type))
ds[output.name] = dims
# Op
for node in model.graph.node:
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Conv
if node.op_type == 'Conv':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
# assert len(node.input) == 2 # bias is not implemented yet
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
gf.conv(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1],
attrs["dilations"][0], attrs["dilations"][1],
None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MatMul
elif node.op_type == 'MatMul':
assert len(node.input) == 2
assert len(node.output) == 1
dimA = list(ds[node.input[0]])
dimB = list(ds[node.input[1]])
dimO = list(ds[node.output[0]])
if len(dimA) == 2 and len(dimB) == 2:
tmpI0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
tmpI1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
tmpO = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
gf.reshape(ts[node.input[0]], tmpI0, [1] + list(ds[node.input[0]]))
gf.reshape(ts[node.input[1]], tmpI1, [1] + list(ds[node.input[1]]))
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
gf.reshape(tmpO, ts[node.output[0]], ds[node.output[0]])
else:
assert len(dimO) >= 3
batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
if len(dimA) == 3:
tmpI0 = ts[node.input[0]]
else:
tmpI0 = gf.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
gf.reshape(ts[node.input[0]], tmpI0, [
batch, dimA[-2], dimA[-1]])
if len(dimB) == 3:
tmpI1 = ts[node.input[1]]
else:
tmpI1 = gf.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
gf.reshape(ts[node.input[1]], tmpI1, [
batch, dimB[-2], dimB[-1]])
if len(dimO) == 3:
tmpO = ts[node.output[0]]
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
else:
tmpO = gf.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
gf.reshape(tmpO, ts[node.output[0]], ds[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose
elif node.op_type == 'ConvTranspose':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"group": 1})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
oph, opw = 0, 0
if "output_padding" in attrs:
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
assert attrs["output_padding"][0] == attrs["output_padding"][1]
group = attrs["group"]
gf.convTrans(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1],
attrs["dilations"][0], attrs["dilations"][1],
oph, opw, group,
None if len(node.input) == 2 else ts[node.input[2]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad
elif node.op_type == 'Pad':
attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
assert attrs["mode"].decode("ascii") == "constant"
assert len(attrs["pads"]) % 2 == 0
if "constant_value" in attrs:
assert attrs["constant_value"] == 0
gf.pad(ts[node.input[0]], ts[node.output[0]], attrs["pads"],
attrs["axes"] if axes in attrs else None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Slice
elif node.op_type == "Slice":
assert 3 <= len(node.input) <= 5
assert len(node.output) == 1
gf.slice(ts[node.input[0]], ts[node.output[0]],
ts[node.input[1]], ts[node.input[2]],
ts[node.input[3]] if len(node.input) == 4 else None,
ts[node.input[4]] if len(node.input) == 5 else None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Concat
elif node.op_type == 'Concat':
attrs = _parse_attribute(node.attribute, {})
assert len(node.output) == 1
gf.concat([ts[item] for item in node.input],
ts[node.output[0]], attrs["axis"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Split
elif node.op_type == "Split":
attrs = _parse_attribute(node.attribute, {'axis': 0})
assert len(node.input) == 1
assert len(node.output) > 1
dim = attrs['axis']
num = attrs['num_outputs']
gf.split(ts[node.input[0]], [ts[t] for t in node.output], dim, num)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
elif node.op_type == 'MaxPool':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 1
assert len(node.output) == 1
assert len(attrs["kernel_shape"]) == 2
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
gf.maxpool(ts[node.input[0]], ts[node.output[0]],
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
attrs["dilations"][0], attrs["dilations"][1],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool
# No dilation in ONNX
elif node.op_type == 'AveragePool':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"count_include_pad": 0,
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 1
assert len(node.output) == 1
# To be consistent with operator.cc
assert attrs["count_include_pad"] == 0
assert len(attrs["kernel_shape"]) == 2
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
dh, dw = 1, 1
gf.avgpool(ts[node.input[0]], ts[node.output[0]],
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
dh, dw,
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add
elif node.op_type == 'Add':
print(ds[node.input[0]], ds[node.input[1]], ds[node.output[0]])
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.add(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.add(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.add(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub
elif node.op_type == 'Sub':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.sub(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.sub(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.sub(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Mul
elif node.op_type == 'Mul':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.mul(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.mul(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.mul(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Div
elif node.op_type == 'Div':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.div(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.div(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.div(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pow
elif node.op_type == 'Pow':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.pow(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.pow(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.pow(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather
elif node.op_type == 'Gather':
attrs = _parse_attribute(node.attribute, {"axis": 0})
assert len(node.input) == 2
assert len(node.output) == 1
gf.gather(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]], attrs["axis"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape
elif node.op_type == 'Reshape':
attrs = _parse_attribute(node.attribute, {"allowzero": 0})
assert len(node.input) == 2
assert len(node.output) == 1
gf.reshape(ts[node.input[0]],
ts[node.output[0]], ds[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten
# Output is 2D in ONNX
elif node.op_type == 'Flatten':
assert len(node.input) == 1
assert len(node.output) == 1
gf.flatten(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity
elif node.op_type == 'Identity':
assert len(node.input) == 1
assert len(node.output) == 1
gf.identity(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Softmax
elif node.op_type == 'Softmax':
attrs = _parse_attribute(node.attribute, {"axis": -1})
assert len(node.input) == 1
assert len(node.output) == 1
gf.softmax(ts[node.input[0]], ts[node.output[0]])  # TODO: the parsed axis is currently ignored
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu
elif node.op_type == 'Relu':
assert len(node.input) == 1
assert len(node.output) == 1
gf.relu(ts[node.input[0]], ts[node.output[0]])
# TODO: Sqrt is not implemented yet; Relu is emitted as a placeholder.
elif node.op_type == 'Sqrt':
assert len(node.input) == 1
assert len(node.output) == 1
gf.relu(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid
elif node.op_type == 'Sigmoid':
assert len(node.input) == 1
assert len(node.output) == 1
gf.sigmoid(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tanh
elif node.op_type == 'Tanh':
assert len(node.input) == 1
assert len(node.output) == 1
gf.tanh(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Abs
elif node.op_type == 'Abs':
assert len(node.input) == 1
assert len(node.output) == 1
gf.abs(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cast
# Ignore for now (TODO)
elif node.op_type == 'Cast':
assert len(node.input) == 1
assert len(node.output) == 1
gf.identity(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Constant
elif node.op_type == 'Constant':
attrs = _parse_attribute(node.attribute, {})
assert len(node.output) == 1
c = onnx.numpy_helper.to_array(attrs["value"])
if c.ndim == 0:
c = c[()]
consts[node.output[0]] = c
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm
elif node.op_type == 'Gemm':
attrs = _parse_attribute(node.attribute, {
"alpha": 1.0,
"beta": 1.0,
"transA": 0,
"transB": 0})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["alpha"] == 1.0
assert attrs["beta"] == 1.0 or len(node.input) == 2
i0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
i1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
o0 = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
gf.reshape(ts[node.input[0]], i0, [1] + list(ds[node.input[0]]))
gf.reshape(ts[node.input[1]], i1, [1] + list(ds[node.input[1]]))
gf.matmul(i0, i1, o0, attrs["transA"], attrs["transB"])
o1 = gf.tensor(ds[node.output[0]], "FLOAT")
a0 = gf.tensor(ds[node.output[0]], "FLOAT")  # placeholder standing in for the optional bias input
gf.reshape(o0, o1, ds[node.output[0]])
gf.add(o1, a0, ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#GlobalAveragePool
# elif node.op_type == 'GlobalAveragePool':
# assert len(node.input) == 1
# assert len(node.output) == 1
# dims = ds[node.input[0]]
# if len(dims) > 0:
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
# else:
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMean
elif node.op_type == 'ReduceMean':
attrs = _parse_attribute(node.attribute, {'keepdims': 1})
assert len(node.input) == 1
assert len(node.output) == 1
assert len(attrs["axes"]) == 1
axis = attrs["axes"][0]
print(axis, len(ds[node.input[0]]))
if axis < 0:
axis = len(ds[node.input[0]]) + axis
gf.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape
# Ignore for now, and no need to output anything (TODO)
# elif node.op_type == 'Shape':
# pass
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose
elif node.op_type == 'Transpose':
attrs = _parse_attribute(node.attribute, {})
assert len(node.input) == 1
assert len(node.output) == 1
assert "perm" in attrs
gf.transpose(ts[node.input[0]], ts[node.output[0]], attrs["perm"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze
elif node.op_type == 'Unsqueeze':
assert len(node.input) == 2
assert len(node.output) == 1
gf.reshape(ts[node.input[0]],
ts[node.output[0]], ds[node.output[0]])
# TODO
elif node.op_type == 'Erf':
assert len(node.input) == 1
assert len(node.output) == 1
gf.erf(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#BatchNormalization
# elif node.op_type == "BatchNormalization":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 5
# assert len(node.output) == 1
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
# ts[node.input[2]], ts[node.input[3]],
# ts[node.input[4]], ts[node.output[0]],
# epsilon, momentum)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
# elif node.op_type == "Resize":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
# [], 'FLOAT')
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
else:
assert False, "Unsupported op: " + node.op_type

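The Add/Sub/Mul/Div/Pow branches above share one pattern: if the two operand shapes match, emit the op directly; otherwise allocate a temporary with the output shape to stand in for the operand that needs broadcasting. A hypothetical refactoring sketch of that pattern (emit stands for gf.add, gf.sub, and friends; this helper is not part of the source):

    # Hypothetical helper factoring the repeated elementwise/broadcast pattern.
    def binary_broadcast(gf, emit, ts, ds, a, b, out):
        if ds[a] == ds[b]:
            emit(ts[a], ts[b], ts[out])
        elif ds[a] == ds[out]:                    # b broadcast up to out's shape
            emit(ts[a], gf.tensor(ds[out], "FLOAT"), ts[out])
        elif ds[b] == ds[out]:                    # a broadcast up to out's shape
            emit(gf.tensor(ds[out], "FLOAT"), ts[b], ts[out])
        else:
            assert False, "unsupported broadcast"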

@ -1,10 +1,11 @@
import argparse
import pyinfinitensor # import getPerfConv, getPerfMatmul
def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group):
return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
strideh, stridew, dilationh, dilationw, group)
def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
@ -13,3 +14,43 @@ def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, strid
def getPerfMatmul(b, m, n, k, name=""):
return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
def getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias: bool, act="None"):
return pyinfinitensor.getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw,
strideh, stridew, dilationh, dilationw, group, bias, act)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('op', metavar='operator', type=str)
parser.add_argument('shape', nargs='+')
parser.add_argument('--pad', type=int, default=0)
parser.add_argument('--stride', type=int, default=1)
parser.add_argument('--dilation', type=int, default=1)
parser.add_argument('--group', type=int, default=1)
parser.add_argument('--bias', type=bool, default=False)  # note: type=bool treats any non-empty string as True
parser.add_argument('--act', type=str, default="None")
args = parser.parse_args()
print(args)
if args.op == 'gemm':
t = getPerfMatmul(int(args.shape[0]), int(
args.shape[1]), int(args.shape[2]), int(args.shape[3]))
print(
f'time {t:.3f} ms, {2*int(args.shape[0])*int(args.shape[1])*int(args.shape[2])*int(args.shape[3])/t/1e9:.3f} TFLOPS')
elif args.op == 'conv':
assert len(args.shape) == 7
n, c, h, w, f, r, s = [int(v) for v in args.shape]
padh = padw = int(args.pad)
strideh = stridew = int(args.stride)
dilationh = dilationw = int(args.dilation)
group = int(args.group)
bias = int(args.bias)
act = args.act
assert group == 1, "Unsupported"
t = pyinfinitensor.getPerfConvBiasActCudnn(
n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias, act)
print(
f'time {t:.3f} ms, {n*c*h*w*f*r*s/strideh/stridew*2/t/10**9:.3f} TFLOPS')
else:
assert False, "Not supported"

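The TFLOPS figures above count 2 FLOPs (one multiply, one add) per conv MAC, i.e. 2*n*c*h*w*f*r*s/(sh*sw) in total, then divide by the runtime in milliseconds and by 1e9, since FLOPs per millisecond divided by 1e9 is exactly TFLOPS. A standalone check of the unit conversion, with made-up shape and timing values:

    # Unit check: FLOPs / t_ms / 1e9 == TFLOPS.
    n, c, h, w, f, r, s = 1, 64, 56, 56, 64, 3, 3
    strideh = stridew = 1
    t_ms = 0.1                                    # hypothetical measured time
    flops = 2 * n * c * h * w * f * r * s / (strideh * stridew)
    print(f"{flops / t_ms / 1e9:.3f} TFLOPS")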

@ -0,0 +1,84 @@
import pandas as pd
import numpy as np
from operator_timer import *
from datetime import datetime
pd.options.display.float_format = '{:,.3f}'.format
df = pd.DataFrame(columns=['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group'])
def conv_original(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group
df.loc[name, 't_original'] = getPerfConv(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group)
df.loc[name, 't_bias'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group, bias=True)
df.loc[name, 't_bias_relu'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group, bias=True, act="Relu")
def conv_rule_5x5_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_5x5_to_3x3'
if r == 5 and s == 5:
df.loc[name, col] = getPerfConv(n, c, h, w, f*4, 3, 3, ph, pw,
sh, sw, dh, dw, group)
else:
df.loc[name, col] = np.inf
def conv_rule_9x9_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_9x9_to_3x3'
if r == 9 and s == 9:
df.loc[name, col] = getPerfConv(n, c, h, w, f*9, r//3, s//3, ph, pw,
sh, sw, dh, dw, group)
else:
df.loc[name, col] = np.inf
bandwidth = 200*10**6  # 200 GB/s, expressed in bytes per millisecond
def conv_rule_conv2gemm(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_conv2gemm'
if [sh, sw, dh, dw, group] == [1] * 5:
# b = group
# m = batch_size * input_height * input_width
# n = output_channel * kernel_height * kernel_width
# k = input_channel // group
t_reduce= group*n*h*w*f*r*s*4/bandwidth if r>1 or s>1 else 0
df.loc[name, '_'+col+'_mem'] = t_reduce
df.loc[name, col] = getPerfMatmul(group, n*h*w, f*r*s, c//group) + t_reduce
else:
df.loc[name, col] = np.inf
# conv_rules=[conv_original, conv_rule_9x9_to_3x3, conv_rule_5x5_to_3x3, conv_rule_conv2gemm]
conv_rules=[conv_original]
def conv_transposed2d_original(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group
df.loc[name, 't_original'] = getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group)
def conv_transposed2d_togemm(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
col = 't_conv2gemm'
if [dh, dw, group] == [1] * 3:
# ConvTransose2gemm
# b = 1
# m = batch_size * input_height*input_width
# n = output_channel*kernel_height*kernel_width
# k = input_channel
t_reduce= n*h*w*c*r*s*4/bandwidth if r>1 or s>1 else 0
df.loc[name, '_'+col+'_mem'] = t_reduce
print('t_conv2gemm', group, n*h*w, c*r*s, f)
df.loc[name, col] = getPerfMatmul(group, n*h*w, c*r*s, f) + t_reduce
else:
df.loc[name, col] = np.inf
conv_transposed2d_rules = [conv_transposed2d_original, conv_transposed2d_togemm]
def print_result(model_fn):
pd.set_option('display.max_rows', 500)
df['t_min'] = df.filter(regex=("^t_.*")).min(axis=1)
print(df)
print(f'Origin: {df["t_original"].sum():.3f} ms')
print(f'Min: {df["t_min"].sum():.3f} ms')
print(f'Speedup: {df["t_original"].sum()/df["t_min"].sum():.3f} x')
df.to_pickle(f'optime_{model_fn.split("/")[-1]}_{datetime.now().strftime("%m_%d_%H_%M_%S")}.pkl')

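conv_rule_conv2gemm charges the post-GEMM reduction purely as memory traffic: group*n*h*w*f*r*s float32 partial sums at 4 bytes each, streamed at bandwidth bytes per millisecond, so the quotient is directly a time in milliseconds. A standalone check with made-up dimensions:

    # Units: bytes / (bytes per ms) = ms.
    bandwidth = 200 * 10**6                       # 200 GB/s in bytes per ms
    group, n, h, w, f, r, s = 1, 1, 56, 56, 64, 3, 3
    t_reduce_ms = group * n * h * w * f * r * s * 4 / bandwidth
    print(f"reduction traffic: {t_reduce_ms:.3f} ms")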
python/test/run_test.py

@ -0,0 +1,5 @@
import sys
import pytest
if __name__ == "__main__":
sys.exit(pytest.main())


@ -0,0 +1,16 @@
from pyinfinitensor import *
from infinitensor import import_onnx
class Test_ImportOnnx:
def test_Netname(self):
runtime = CpuRuntimeObj.getInstance()
graphBuilder = GraphBuilderObj(runtime)
import_onnx(graphBuilder, '/home/mazx/git/pf-models/bert.bs1.onnx')
class Test_SARDRN:
def test_Netname(self):
runtime = CpuRuntimeObj.getInstance()
graphBuilder = GraphBuilderObj(runtime)
import_onnx(graphBuilder, '/home/mazx/git/pf-models/sardrn.bs1.onnx')

src/core/graph_builder.cc

@ -0,0 +1,545 @@
#include "core/graph_builder.h"
namespace infini {
Tensor GraphBuilderObj::tensor(Shape dim, const std::string &dtype) {
if (dtype == "FLOAT") {
return g->addTensor(dim, DataType::Float32);
}
if (dtype == "INT32") {
return g->addTensor(dim, DataType::UInt32);
}
if (dtype == "INT64") {
return g->addTensor(dim, DataType::UInt32);
}
IT_TODO_HALT_MSG("Unsupported data type");
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw,
Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ConvObj>(i0, w0, o0, ph, ph, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, int ph, int pw,
int sh, int sw, int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvObj>(i0, w0, nullptr, ph, ph, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ConvObj>(i0, w0, o0, pm, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvObj>(i0, w0, nullptr, pm, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, Tensor C, bool transA,
bool transB) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op = g->addOpWithOutputs<MatmulObj>(i0, i1, o0, transA, transB);
return op;
}
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, bool transA, bool transB) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<MatmulObj>(i0, i1, nullptr, transA, transB);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh,
int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
i0, w0, o0, ph, pw, sh, sw, dh, dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, int ph, int pw,
int sh, int sw, int dh, int dw, int oph,
int opw, int group, Tensor bias,
ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, ph, pw, sh, sw, dh,
dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
i0, w0, o0, pm, sh, sw, dh, dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, pm, sh, sw, dh, dw,
oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, Tensor C, const int width,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op =
g->addOpWithOutputs<G2BMMObj>(i0, i1, o0, width, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, const int width,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<G2BMMObj>(i0, i1, nullptr, width, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, Tensor C,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op = g->addOpWithOutputs<GBMMObj>(i0, i1, o0, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, const int dilation,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<GBMMObj>(i0, i1, nullptr, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::pad(Tensor input, Tensor output,
const vector<int> &pads,
const optional<const vector<int>> &axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<PadObj>(i0, o0, pads, axis);
return op;
}
Operator GraphBuilderObj::pad(Tensor input, const vector<int> &pads,
const optional<const vector<int>> &axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<PadObj>(i0, nullptr, pads, axis);
return op;
}
Operator GraphBuilderObj::slice(Tensor input, Tensor output,
const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SliceObj>(i0, o0, starts, ends, axis, steps);
return op;
}
Operator GraphBuilderObj::slice(Tensor input, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SliceObj>(i0, nullptr, starts, ends, axis, steps);
return op;
}
Operator GraphBuilderObj::concat(TensorVec inputs, Tensor output, int dim) {
TensorVec is;
for (auto input : inputs) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
Tensor o = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConcatObj>(is, o, dim);
return op;
}
Operator GraphBuilderObj::concat(TensorVec inputs, int dim) {
TensorVec is;
for (auto input : inputs) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
auto op = g->addOp<ConcatObj>(is, nullptr, dim);
return op;
}
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
int dim, int num) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
if (outputs.has_value()) {
TensorVec os;
for (auto output : outputs.value()) {
Tensor o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, num);
return op;
} else {
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
return op;
}
}
Operator GraphBuilderObj::split(Tensor input, int dim, int num) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
return op;
}
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
int dim, const vector<int> &ratio) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
if (outputs.has_value()) {
TensorVec os;
for (auto output : outputs.value()) {
Tensor o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, ratio);
return op;
} else {
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
return op;
}
}
Operator GraphBuilderObj::split(Tensor input, int dim,
const vector<int> &ratio) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
return op;
}
Operator GraphBuilderObj::transpose(Tensor input, Tensor output,
const Shape &perm) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
Tensor o = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<TransposeObj>(i, o, perm);
return op;
}
Operator GraphBuilderObj::extend(Tensor input, Tensor output, int dim,
int num) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ExtendObj>(i0, o0, dim, num);
return op;
}
Operator GraphBuilderObj::extend(Tensor input, int dim, int num) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ExtendObj>(i0, nullptr, dim, num);
return op;
}
Operator GraphBuilderObj::maxpool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<MaxPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::maxpool(Tensor input, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<MaxPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::avgpool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<AvgPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::avgpool(Tensor input, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<AvgPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::add(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<AddObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::add(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<AddObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1, Tensor output) {
std::cout << "Sub1" << std::endl;
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SubObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1) {
std::cout << "Sub2" << std::endl;
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<SubObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<MulObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<MulObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::div(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<DivObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::div(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<DivObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<PowObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<PowObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::gather(Tensor input, Tensor index, Tensor output,
int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<GatherObj>(i0, index, o0, axis);
return op;
}
Operator GraphBuilderObj::gather(Tensor input, Tensor index, int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<GatherObj>(i0, index, nullptr, axis);
return op;
}
Operator GraphBuilderObj::reshape(Tensor input, Tensor output,
const Shape &dims) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ReshapeObj>(i0, o0, dims);
return op;
}
Operator GraphBuilderObj::reshape(Tensor input, const Shape &dims) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ReshapeObj>(i0, nullptr, dims);
return op;
}
Operator GraphBuilderObj::flatten(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<FlattenObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::flatten(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<FlattenObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::identity(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<IdentityObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::identity(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<IdentityObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::softmax(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SoftmaxObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::softmax(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SoftmaxObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::relu(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ReluObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::relu(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ReluObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::sigmoid(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SigmoidObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::sigmoid(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SigmoidObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::tanh(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<TanhObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::tanh(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<TanhObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::abs(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<AbsObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::abs(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<AbsObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::reduceMean(Tensor input, Tensor output, int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ReduceMeanObj>(i0, o0, std::vector<int>({axis}));
return op;
}
Operator GraphBuilderObj::erf(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ErfObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::memBound(const TensorVec &inputs,
const TensorVec &outputs,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time,
std::string hint) {
TensorVec is;
for (auto input : inputs) {
auto i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
TensorVec os;
for (auto output : outputs) {
auto o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<MemBoundObj>(is, os, nnetInputs, expr,
exec_time, hint);
return op;
}
} // namespace infini
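Taken together, these overloads let a caller chain operators without pre-allocating output tensors. A minimal usage sketch follows (hypothetical shapes; the dtype string and make_ref construction are assumptions, not part of this diff):
Runtime cpu = CpuRuntimeObj::getInstance();
GraphBuilder builder = make_ref<GraphBuilderObj>(cpu);
Tensor a = builder->tensor({1, 64, 128}, "FLOAT"); // dtype name assumed
Tensor b = builder->tensor({1, 64, 128}, "FLOAT");
Operator sum = builder->add(a, b);              // output tensor inferred
Operator act = builder->relu(sum->getOutput()); // chain via the op's output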

View File

@ -1,3 +1,4 @@
#include "cuda/operator_timer.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
@ -12,8 +13,23 @@ namespace opTimer {
double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
int dilationh, int dilationw, int group) {
return getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh,
stridew, dilationh, dilationw, group, false,
"None");
}
double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
bool bias, string actName) {
ActType act = ActType::None;
if (actName == "None")
act = ActType::None;
else if (actName == "Relu")
act = ActType::Relu;
else
IT_ASSERT(false, "Unsupported activation");
// const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
// dilationh, dilationw, group] =
// tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
@ -25,17 +41,27 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
Tensor b0Cpu = gCpu->addTensor({f}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
b0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
Tensor b0Cuda = gCuda->cloneTensor(b0Cpu);
// Build CUDA graph
auto conv = gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
if (!bias) {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
stridew, dilationh, dilationw);
} else {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
stridew, dilationh, dilationw, b0Cuda, act);
}
// allocate CUDA memory
gCuda->dataMalloc();
// Execute on CUDA

View File

@ -2,6 +2,7 @@
#ifdef USE_CUDA
#include "cuda/operator_timer.h"
#endif
#include "core/graph_builder.h"
namespace py = pybind11;
namespace infini {
@ -13,11 +14,193 @@ void register_operator_timer(py::module &m) {
#ifdef USE_CUDA
using namespace opTimer;
m.def("getPerfConvCudnn", &getPerfConvCudnn);
m.def("getPerfConvBiasActCudnn", &getPerfConvBiasActCudnn);
m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
#endif
}
void init_graph_builder(py::module &m) {
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "RuntimeObj");
py::class_<CpuRuntimeObj, std::shared_ptr<CpuRuntimeObj>, RuntimeObj>(
m, "CpuRuntimeObj")
.def(py::init<>())
.def("getInstance", py::overload_cast<>(&CpuRuntimeObj::getInstance),
policy::reference_internal);
py::class_<Shape>(m, "Shape");
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "TensorObj");
py::class_<Tensor>(m, "Tensor");
py::class_<TensorVec>(m, "TensorVec");
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "OperatorObj");
py::class_<Operator>(m, "Operator");
py::class_<ActType>(m, "ActType");
py::class_<ConvObj, std::shared_ptr<ConvObj>, OperatorObj>(m, "ConvObj");
py::class_<MatmulObj, std::shared_ptr<MatmulObj>, OperatorObj>(m,
"MatmulObj");
py::class_<ConvTransposed2dObj, std::shared_ptr<ConvTransposed2dObj>,
OperatorObj>(m, "ConvTransposed2dObj");
py::class_<G2BMMObj, std::shared_ptr<G2BMMObj>, OperatorObj>(m, "G2BMMObj");
py::class_<GBMMObj, std::shared_ptr<GBMMObj>, OperatorObj>(m, "GBMMObj");
py::class_<PadObj, std::shared_ptr<PadObj>, OperatorObj>(m, "PadObj");
py::class_<SliceObj, std::shared_ptr<SliceObj>, OperatorObj>(m, "SliceObj");
py::class_<ConcatObj, std::shared_ptr<ConcatObj>, OperatorObj>(m,
"ConcatObj");
py::class_<SplitObj, std::shared_ptr<SplitObj>, OperatorObj>(m, "SplitObj");
py::class_<TransposeObj, std::shared_ptr<TransposeObj>, OperatorObj>(
m, "TransposeObj");
py::class_<ExtendObj, std::shared_ptr<ExtendObj>, OperatorObj>(m,
"ExtendObj");
py::class_<MaxPoolObj, std::shared_ptr<MaxPoolObj>, OperatorObj>(
m, "MaxPoolObj");
py::class_<AvgPoolObj, std::shared_ptr<AvgPoolObj>, OperatorObj>(
m, "AvgPoolObj");
py::class_<AddObj, std::shared_ptr<AddObj>, OperatorObj>(m, "AddObj");
py::class_<SubObj, std::shared_ptr<SubObj>, OperatorObj>(m, "SubObj");
py::class_<MulObj, std::shared_ptr<MulObj>, OperatorObj>(m, "MulObj");
py::class_<DivObj, std::shared_ptr<DivObj>, OperatorObj>(m, "DivObj");
py::class_<PowObj, std::shared_ptr<PowObj>, OperatorObj>(m, "PowObj");
py::class_<GatherObj, std::shared_ptr<GatherObj>, OperatorObj>(m,
"GatherObj");
py::class_<ReshapeObj, std::shared_ptr<ReshapeObj>, OperatorObj>(
m, "ReshapeObj");
py::class_<FlattenObj, std::shared_ptr<FlattenObj>, OperatorObj>(
m, "FlattenObj");
py::class_<IdentityObj, std::shared_ptr<IdentityObj>, OperatorObj>(
m, "IdentityObj");
py::class_<SoftmaxObj, std::shared_ptr<SoftmaxObj>, OperatorObj>(
m, "SoftmaxObj");
py::class_<ReluObj, std::shared_ptr<ReluObj>, OperatorObj>(m, "ReluObj");
py::class_<SigmoidObj, std::shared_ptr<SigmoidObj>, OperatorObj>(
m, "SigmoidObj");
py::class_<TanhObj, std::shared_ptr<TanhObj>, OperatorObj>(m, "TanhObj");
py::class_<AbsObj, std::shared_ptr<AbsObj>, OperatorObj>(m, "AbsObj");
py::class_<ReduceMeanObj, std::shared_ptr<ReduceMeanObj>, OperatorObj>(
m, "ReduceMeanObj");
py::class_<MemBoundObj, std::shared_ptr<MemBoundObj>, OperatorObj>(
m, "MemBoundObj");
py::class_<GraphBuilder>(m, "GraphBuilder");
py::class_<GraphBuilderObj>(m, "GraphBuilderObj")
.def(py::init<Runtime>())
.def("tensor",
py::overload_cast<Shape, const std::string &>(
&GraphBuilderObj::tensor),
policy::reference_internal)
.def("conv",
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
int, Tensor>(&GraphBuilderObj::conv),
policy::reference_internal)
.def("matmul",
py::overload_cast<Tensor, Tensor, Tensor, bool, bool>(
&GraphBuilderObj::matmul),
policy::reference_internal)
.def("convTrans",
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
int, int, int, int, Tensor, ActType>(
&GraphBuilderObj::convTrans),
policy::reference_internal)
.def("g2bmm",
py::overload_cast<Tensor, Tensor, Tensor, const int, const int,
Tensor, ActType>(&GraphBuilderObj::g2bmm),
policy::reference_internal)
.def("gbmml",
py::overload_cast<Tensor, Tensor, Tensor, const int, Tensor,
ActType>(&GraphBuilderObj::gbmml),
policy::reference_internal)
.def("pad",
py::overload_cast<Tensor, Tensor, const vector<int> &,
const optional<const vector<int>> &>(
&GraphBuilderObj::pad),
policy::reference_internal)
.def("slice",
py::overload_cast<Tensor, Tensor, const vector<int> &,
const vector<int> &,
const optional<const vector<int>> &,
const optional<const vector<int>> &>(
&GraphBuilderObj::slice),
policy::reference_internal)
.def(
"concat",
py::overload_cast<TensorVec, Tensor, int>(&GraphBuilderObj::concat),
policy::reference_internal)
.def("split",
py::overload_cast<Tensor, std::optional<TensorVec>, int, int>(
&GraphBuilderObj::split),
policy::reference_internal)
.def("transpose",
py::overload_cast<Tensor, Tensor, const vector<int> &>(
&GraphBuilderObj::transpose),
policy::reference_internal)
.def("extend",
py::overload_cast<Tensor, Tensor, int, int>(
&GraphBuilderObj::extend),
policy::reference_internal)
.def("maxpool",
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
int, int>(&GraphBuilderObj::maxpool),
policy::reference_internal)
.def("avgpool",
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
int, int>(&GraphBuilderObj::avgpool),
policy::reference_internal)
.def("add",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::add),
policy::reference_internal)
.def("sub",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::sub),
policy::reference_internal)
.def("mul",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::mul),
policy::reference_internal)
.def("div",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::div),
policy::reference_internal)
.def("pow",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::pow),
policy::reference_internal)
.def("gather",
py::overload_cast<Tensor, Tensor, Tensor, int>(
&GraphBuilderObj::gather),
policy::reference_internal)
.def("reshape",
py::overload_cast<Tensor, Tensor, const Shape &>(
&GraphBuilderObj::reshape),
policy::reference_internal)
.def("flatten",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::flatten),
policy::reference_internal)
.def("identity",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::identity),
policy::reference_internal)
.def("softmax",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::softmax),
policy::reference_internal)
.def("relu", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::relu),
policy::reference_internal)
.def("sigmoid",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::sigmoid),
policy::reference_internal)
.def("tanh", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::tanh),
policy::reference_internal)
.def("abs", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::abs),
policy::reference_internal)
.def("reduceMean",
py::overload_cast<Tensor, Tensor, int>(
&GraphBuilderObj::reduceMean),
policy::reference_internal)
.def("erf", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::erf),
policy::reference_internal)
.def("memBound",
py::overload_cast<const TensorVec &, const TensorVec &,
const std::vector<nnet::Tensor> &, nnet::Expr,
double, std::string>(&GraphBuilderObj::memBound),
policy::reference_internal);
}
} // namespace infini
PYBIND11_MODULE(pyinfinitensor, m) { infini::register_operator_timer(m); }
PYBIND11_MODULE(pyinfinitensor, m) {
infini::register_operator_timer(m);
infini::init_graph_builder(m);
}

View File

@ -8,6 +8,8 @@
namespace infini {
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
int kernel =
0; // 0 cudnnConvolutionForward, 1 cudnnConvolutionBiasActivationForward
int algo = 0; // cudnnConvolutionFwdAlgo_t
int mode = 1;
size_t workspaceSize = 100000;
@ -56,8 +58,6 @@ class convCudnn : public Kernel {
const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
if (op->getInputs().size() > 2) // Bias is not supported yet
IT_TODO_HALT();
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
@ -209,6 +209,36 @@ class convCudnn : public Kernel {
return true;
}
bool cuDNNfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, record);
size_t wsSize = record->workspaceSize;
CudaPtr wsData = context->getWorkspace(wsSize);
float alpha = 1.f, beta = 0.f;
// w/ bias & act; the fused path assumes a bias tensor is present,
// matching tuneFused below
void *const biasData = op->getBias()->getRawDataPtr<void *>();
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
convDesc, ALGOS[record->algo], wsData, wsSize, &beta, outDesc,
outData, biasDesc, biasData, actDesc, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
return false;
// Destroying descriptors in CUDA does not require a sync, but cuDNN does
// not state whether a sync is required before destruction.
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
return true;
}
void compute(const Operator &op, const RuntimeObj *context) const override {
auto record = make_ref<ConvCuDnnPerfRecordObj>(); // with parameters in
// default ctor
@ -217,10 +247,88 @@ class convCudnn : public Kernel {
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op);
printf("%s\n", op->toString().c_str());
if (op->hasBias() || op->getAct() != ActType::None)
return tuneFused(op, context);
else
return tuneUnfused(op, context);
}
PerfRecord tuneFused(const Ref<ConvObj> &op,
const CudaRuntimeObj *context) const {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef;
record.mode = mode;
record.algo = algo;
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, recordRef);
void *biasData = op->getBias()->getRawDataPtr<void *>();
// get workspace
stat = cudnnGetConvolutionForwardWorkspaceSize(
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
ALGOS[record.algo], &record.workspaceSize);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
if (record.workspaceSize > context->getWorkspaceSize())
continue;
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
float alpha = 1.f, beta = 0.f;
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData, biasDesc,
biasData, actDesc, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
record.time = timeit(
[&]() {
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData,
knDesc, knData, convDesc, ALGOS[record.algo],
wsData, record.workspaceSize, &beta, outDesc,
outData, biasDesc, biasData, actDesc, outDesc,
outData);
},
[&]() { context->sync(); });
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
record.time);
// Update the tune result
if (ret.time > record.time)
ret = record;
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
}
}
printf("tuneFused: the best algo is %d, the best conv mode is %d\n",
ret.algo, ret.mode);
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");
return make_ref<ConvCuDnnPerfRecordObj>(ret);
}
PerfRecord tuneUnfused(const Ref<ConvObj> &op,
const CudaRuntimeObj *context) const {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
@ -260,7 +368,8 @@ class convCudnn : public Kernel {
&beta, outDesc, outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
record.time);
// Update the tune result
if (ret.time > record.time)
@ -273,8 +382,8 @@ class convCudnn : public Kernel {
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
}
}
// printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
// ret.mode);
printf("tuneUnfused: the best algo is %d, the best conv mode is %d\n",
ret.algo, ret.mode);
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");

View File

@ -1,4 +1,3 @@
#include "core/common.h"
#include "core/constants.h"
#include "cuda/cuda_common.h"
#include <math.h>

View File

@ -5,15 +5,15 @@ namespace infini {
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
int ph, int pw, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD)
const Tensor &weightInConvFWD, const ActType act)
: OperatorObj(opType, inputs, {output}), ph(ph), pw(pw), sh(sh), sw(sw),
dh(dh), dw(dw), padding(PaddingMode::Other) {}
dh(dh), dw(dw), padding(PaddingMode::Other), act(act) {}
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
PaddingMode mode, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD)
const Tensor &weightInConvFWD, const ActType act)
: OperatorObj(opType, inputs, {output}), ph(-1), pw(-1), sh(sh), sw(sw),
dh(dh), dw(dw), padding(mode) {
dh(dh), dw(dw), padding(mode), act(act) {
IT_ASSERT(mode != PaddingMode::Other);
}
@ -21,28 +21,60 @@ string ConvBaseObj::toString() const {
std::ostringstream os;
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
os << "(";
if (inputs.size() == 2) {
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
if (inputs.size() > 2) {
os << vecToString(inputs[2]->getDims()) << ",";
}
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
os << "d=[" << dh << "," << dw << "],";
os << "act=" << enum_to_underlying(getAct()) << ",";
// os << "act=" << enum_to_underlying(act) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "weight=" << inputs[1]->getGuid() << ",";
os << "bias="
<< ((inputs.size() == 2) ? "nullptr"
: std::to_string(inputs[2]->getGuid()))
<< ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> ConvBaseObj::getWorkloadVector() const {
return {
enum_to_underlying(type), n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
return {enum_to_underlying(type),
n,
c,
h,
w,
f,
r,
s,
ph,
pw,
sh,
sw,
dh,
dw,
hasBias(),
enum_to_underlying(getAct())};
}
vector<int> ConvBaseObj::getOpAttrVector() const {
IT_TODO_HALT(); // should padding mode / ph+pw be in attrs?
return {enum_to_underlying(type), c, f, r, s, ph, pw, sh, sw, dh, dw};
return {enum_to_underlying(type),
c,
f,
r,
s,
ph,
pw,
sh,
sw,
dh,
dw,
hasBias(),
enum_to_underlying(getAct())};
}
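// Including hasBias() and the activation in both the workload and attribute
// keys keeps fused convs (bias and/or act) from sharing perf records or
// kernel matches with a plain conv of the same shape.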
void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
@ -64,11 +96,10 @@ void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::Conv, {input, weight}, output, ph, pw, sh, sw, dh, dw,
input, weight),
act(act) {
if (bias)
IT_TODO_HALT();
: ConvBaseObj(OpType::Conv,
((bias) ? (TensorVec{input, weight, bias})
: (TensorVec{input, weight})),
output, ph, pw, sh, sw, dh, dw, input, weight, act) {
setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph));
}
@ -76,11 +107,10 @@ ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::Conv, {input, weight}, output, mode, sh, sw, dh, dw,
input, weight),
act(act) {
if (bias)
IT_TODO_HALT();
: ConvBaseObj(OpType::Conv,
((bias) ? (TensorVec{input, weight, bias})
: (TensorVec{input, weight})),
output, mode, sh, sw, dh, dw, input, weight, act) {
setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph));
}
@ -98,6 +128,11 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
// For NCHW+FCRS layout, C of input is divisible by C of weight
if (input->getDims()[1] % weight->getDims()[1] != 0)
return {};
// check bias shape
if (inputs.size() == 3) {
if (inputs[2]->size() != (size_t)f)
return {};
}
// Set padding size
if (padding == PaddingMode::Other) {
oh = (h - (r - sh) * dh + ph * 2) / sh;
@ -122,8 +157,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
int oph, int opw, int group,
Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, ph, pw, sh, sw,
dh, dw, output, weight),
oph(oph), opw(opw), group(group), act(act) {
dh, dw, output, weight, act),
oph(oph), opw(opw), group(group) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(PaddingMode::Other);
@ -136,8 +171,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
int dh, int dw, int oph, int opw,
int group, Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, mode, sh, sw, dh,
dw, output, weight),
oph(oph), opw(opw), group(group), act(act) {
dw, output, weight, act),
oph(oph), opw(opw), group(group) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(mode);
@ -156,6 +191,8 @@ ConvTransposed2dObj::inferShape(const TensorVec &inputs) const {
auto s = weight->getDims()[3];
if (f != weight->getDims()[0])
return {};
if (inputs.size() != 2)
IT_TODO_HALT();
int on = n, oc = c * group;
int oh = 0, ow = 0;

View File

@ -11,6 +11,7 @@ optional<vector<Shape>>
ElementWiseObj::inferShape(const TensorVec &inputs) const {
// For now, we only process identical dims here; broadcast will be
// considered in the opt layer.
const auto A = inputs[0], B = inputs[1];
if (A->getDims().size() != B->getDims().size() ||
A->getDims() != B->getDims())

View File

@ -12,6 +12,7 @@ ReduceMeanObj::ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
int idx = (*_axis)[j];
if (idx < 0)
IT_TODO_HALT();
IT_ASSERT((size_t)idx < input->getDims().size());
axis.emplace(idx);
}

View File

@ -0,0 +1,52 @@
#include "operators/transpose.h"
namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
const Shape &perm)
: OperatorObj(OpType::Transpose, {input}, {output}), perm(perm) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
TransposeObj::inferShape(const TensorVec &inputs) const {
const Shape &dimsIn = inputs[0]->getDims();
Shape dimsOut;
std::unordered_set<size_t> dimSet;
for (size_t i = 0; i < perm.size(); ++i) {
if (size_t(perm[i]) >= dimsIn.size() ||
dimSet.find(perm[i]) != dimSet.end()) {
// reject out-of-range or duplicated axes in perm
return {};
}
dimsOut.emplace_back(dimsIn[perm[i]]);
dimSet.emplace(perm[i]);
}
std::cout << "transpose Ok" << std::endl;
return {{dimsOut}};
}
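// Example: with perm = {0, 2, 3, 1}, an input of dims {1, 64, 28, 28}
// infers output dims {1, 28, 28, 64}, since dimsOut[i] = dimsIn[perm[i]].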
std::string TransposeObj::toString() const {
std::ostringstream os;
os << "Transpose[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "perm=" << vecToString(perm) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> TransposeObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.insert(ret.end(), perm.begin(), perm.end());
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
vector<int> TransposeObj::getOpAttrVector() const {
vector<int> ret = perm;
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
} // namespace infini

View File

@ -0,0 +1,6 @@
project(PowerFusion)
aux_source_directory(. SRC_CXX)
aux_source_directory(micro_kernel/. SRC_CXX)
cuda_add_library(pf_base SHARED ${SRC_CXX})
set_target_properties(pf_base PROPERTIES CUDA_ARCHITECTURES "80")

27
src/pfusion/code.cpp Normal file
View File

@ -0,0 +1,27 @@
#include <iostream>
#include "code.h"
#include "memory_operator.h"
namespace memb {
std::string MemoryOperator::generate() {
CodeBuffer buf;
std::string inst;
buf.emit("#pragma unroll");
buf.emit("for (int inst_idx = 0; inst_idx < " + num + "; inst_idx++) {");
if (opType == READ) {
inst = "reg[inst_idx] = " + ptr.base_ptr + "[" + ptr.offset + " + " +
offset + "];";
buf.emit(inst);
} else if (opType == WRITE) {
inst = ptr.base_ptr + "[" + ptr.offset + " + " + offset +
"] = " + "reg[inst_idx];";
buf.emit(inst);
} else {
std::cout << "[ERROR]" << std::endl;
exit(-1);
}
buf.emit("}");
return buf.toString();
}
} // namespace memb
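For reference, a READ operator with num = "8", base pointer "src", pointer offset "off", and offset "inst_idx * 32 + lane_id" emits roughly the following text (a sketch; exact whitespace depends on CodeBuffer):
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
reg[inst_idx] = src[off + inst_idx * 32 + lane_id];
}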

90
src/pfusion/gen_relu.cpp Normal file
View File

@ -0,0 +1,90 @@
#include "codegen.h"
#include "memory_operator.h"
#include "micro_kernel/element.h"
#include "micro_kernel/transpose.h"
namespace memb {
std::string gen_relu(int dimn, int dimx, int dimy, int nblocks, int nthreads) {
std::string code = "\n";
MicroKernelElement relu("relu", "return element > 0 ? element : 0");
code += relu.gen_func();
code += "\n";
code += "// Kernel\n";
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
" + warp_id;\n";
code += "float reg[32];\n";
code += "__shared__ float smem[32 * 32 * 2 * " +
std::to_string(nthreads / 32) + "];\n";
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
"; loop_idx++) {\n";
MemoryOperator dram_read;
dram_read.memoryType = MemoryOperator::DRAM;
dram_read.opType = MemoryOperator::READ;
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
dram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_read.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_read.reg = "inst_idx";
code += dram_read.generate();
MemoryOperator sram_write;
sram_write.memoryType = MemoryOperator::SRAM;
sram_write.opType = MemoryOperator::WRITE;
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
sram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_write.offset = "inst_idx * 32 + lane_id";
sram_write.reg = "inst_idx";
code += sram_write.generate();
code += relu.gen_kernel(Ptr("smem", "warp_id * 32 * 32 * 2"),
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"), 32,
32, "32");
MemoryOperator sram_read;
sram_read.memoryType = MemoryOperator::SRAM;
sram_read.opType = MemoryOperator::READ;
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
sram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_read.offset = "inst_idx * 32 + lane_id";
sram_read.reg = "inst_idx";
code += sram_read.generate();
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
code += "}\n";
code += "}\n\n";
code += "void transpose(float *src, float *dst) {\n";
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
} // namespace memb
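Concretely, for dimx = dimy = 32 and nthreads = 128 the generator above produces a kernel of roughly this shape (a condensed sketch; the real emission routes every stage through the MemoryOperator and MicroKernelElement helpers):
__global__ void kernel_tmp(float *src, float *dst) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float reg[32];
__shared__ float smem[32 * 32 * 2 * 4];
// One 32x32 tile per iteration: DRAM -> reg -> smem, apply relu,
// then smem -> reg -> DRAM.
for (int loop_idx = 0; loop_idx < 1; loop_idx++) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
reg[inst_idx] = src[parallel_idx * 32 * 32 + inst_idx * 32 + lane_id];
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id] =
reg[inst_idx] > 0 ? reg[inst_idx] : 0;
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
dst[parallel_idx * 32 * 32 + inst_idx * 32 + lane_id] =
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id];
}
}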

View File

@ -0,0 +1,93 @@
#include "codegen.h"
#include "memory_operator.h"
#include "micro_kernel/transpose.h"
namespace memb {
std::string header =
std::string("// Header code\n") + std::string("#include \"cuda.h\"\n") +
std::string("#include \"cuda_utils.h\"\n\n") +
std::string("#define ROUND_UP(n, b) (((n) - 1) / (b) + 1)\n") +
std::string("#define INDEX(x, y, n) ((x) * (n) + (y))\n") +
std::string("#define MIN(x, y) ((x) < (y) ? (x) : (y))\n\n");
std::string gen_lowest_basic(int dimn, int dimx, int dimy, int nblocks,
int nthreads) {
std::string code = header;
code += "// Kernel\n";
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
" + warp_id;\n";
code += "float reg[32];\n";
code += "__shared__ float smem[32 * 32 * 2 * " +
std::to_string(nthreads / 32) + "];\n";
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
"; loop_idx++) {\n";
MemoryOperator dram_read;
dram_read.memoryType = MemoryOperator::DRAM;
dram_read.opType = MemoryOperator::READ;
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
dram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_read.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_read.reg = "inst_idx";
code += dram_read.generate();
MemoryOperator sram_write;
sram_write.memoryType = MemoryOperator::SRAM;
sram_write.opType = MemoryOperator::WRITE;
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
sram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_write.offset = "inst_idx * 32 + lane_id";
sram_write.reg = "inst_idx";
code += sram_write.generate();
MicroKernelTranspose transpose;
code += transpose.generate(Ptr("smem", "warp_id * 32 * 32 * 2"),
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"),
32, "32", 32, "32");
MemoryOperator sram_read;
sram_read.memoryType = MemoryOperator::SRAM;
sram_read.opType = MemoryOperator::READ;
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
sram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_read.offset = "inst_idx * 32 + lane_id";
sram_read.reg = "inst_idx";
code += sram_read.generate();
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
code += "}\n";
code += "}\n\n";
code += "void transpose(float *src, float *dst) {\n";
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
} // namespace memb

302
src/pfusion/instantiate.cc Normal file
View File

@ -0,0 +1,302 @@
#include "pfusion/instantiate.h"
#include "pfusion/meta_op.h"
#include "pfusion/micro_kernel/binary.h"
#include "pfusion/micro_kernel/memory.h"
#include "pfusion/micro_kernel/unary.h"
#include "pfusion/micro_op.h"
namespace memb {
size_t getSize(const std::vector<size_t> &shape) {
size_t size = 1;
for (auto x : shape) {
size *= x;
}
return size;
}
size_t min(size_t a, size_t b) { return (a < b) ? a : b; }
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = size / 32 / 8;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
metaOp->numReg = 8;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"), buf,
8, 32));
metaOp->microOps.emplace_back(
std::make_shared<UnaryOp>(opType, buf, buf, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf,
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"), 8,
32));
metaOps.emplace_back(metaOp);
return metaOps;
}
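// Launch-shape note: each warp (32 lanes) handles 8 elements per lane per
// iteration, so main_loop_ed = size / 32 / 8 assumes size is a multiple of
// 256; numBlocks = 108 matches the SM count of an A100 (sm_80), the
// architecture pf_base targets in the pfusion CMakeLists.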
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = size / 32 / 8;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"),
buf0, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"),
buf1, 8, 32));
metaOp->microOps.emplace_back(
std::make_shared<BinaryOp>(opType, buf0, buf1, buf2, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf2,
Pointer::buildPtr(ptrs[2], "offset_src + inst_idx * 32 + lane_id"), 8,
32));
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape, const std::vector<size_t> &perm) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
auto metaOp = std::make_shared<MetaOp>();
std::vector<size_t> srcShape(shape.size());
std::vector<size_t> srcMap;
size_t parallelSize = 1;
for (size_t i = 0; i < shape.size(); i++) {
srcShape[perm[i]] = shape[i];
if (i != 0 && perm[i] != 0) {
srcMap.emplace_back(perm[i]);
parallelSize *= shape[i];
}
}
metaOp->mappings.emplace_back(
TensorMapping::buildWithMap("src", srcShape, srcMap));
std::vector<size_t> dstMap;
for (size_t i = 0; i < shape.size(); i++) {
if (i != 0 && perm[i] != 0) {
dstMap.emplace_back(i);
}
}
metaOp->mappings.emplace_back(
TensorMapping::buildWithMap("dst", shape, dstMap));
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = parallelSize;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
int numTileA = (shape[perm[0]] - 1) / 32 + 1;
int numTileB = (shape[0] - 1) / 32 + 1;
// std::cout << numTileA << " " << numTileB << std::endl;
metaOp->numReg = 32;
metaOp->numSmem = 32 * 33;
size_t stride_src = 1, stride_dst = 1;
for (size_t i = 0; i < shape.size(); i++) {
if (perm[i] < perm[0]) {
stride_src *= shape[i];
}
}
for (size_t i = 0; perm[i] != 0; i++) {
stride_dst *= shape[i];
}
// TODO: tiling is a metaOp or microOps?
metaOp->ptrs = ptrs;
auto smem =
Pointer::buildPtr(SRAM, "smem",
"group_id * " + std::to_string(metaOp->numLanes) +
" * " + std::to_string(metaOp->numLanes + 1));
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
for (int i = 0; i < numTileA; i++) {
for (int j = 0; j < numTileB; j++) {
auto src_ptr = Pointer::buildPtr(
ptrs[0], "offset_src + " +
std::to_string(j * 32 * stride_src + i * 32) +
"+" + "inst_idx * " + std::to_string(stride_src) +
" + lane_id");
metaOp->microOps.emplace_back(
MemoryOp::build(READ, src_ptr, buf, min(32u, shape[0]),
min(32, shape[perm[0]])));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf, Pointer::buildPtr(smem, "inst_idx * 33 + lane_id"),
min(32, shape[0]), min(32, shape[perm[0]])));
metaOp->microOps.emplace_back(MemoryOp::build(
READ, Pointer::buildPtr(smem, "lane_id * 33 + inst_idx"), buf,
min(32, shape[perm[0]]), min(32, shape[0])));
auto dst_ptr = Pointer::buildPtr(
ptrs[1], "offset_dst + " +
std::to_string(i * 32 * stride_dst + j * 32) +
"+" + "inst_idx * " + std::to_string(stride_dst) +
" + lane_id");
metaOp->microOps.emplace_back(
MemoryOp::build(WRITE, buf, dst_ptr, min(32, shape[perm[0]]),
min(32, shape[0])));
}
}
metaOps.emplace_back(metaOp);
return metaOps;
}
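// The shared-memory staging tile is padded to a row stride of 33 floats
// (numSmem = 32 * 33) so that the column read "lane_id * 33 + inst_idx"
// maps each lane to a distinct bank, avoiding bank conflicts on the
// transposed access.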
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = 1;
for (size_t i = 0; i < outputShape.size() - 1; i++) {
par_size *= inputShape[i];
}
size_t seq_size = inputShape[outputShape.size() - 1];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 2;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis) {
IT_ASSERT(axis == 0);
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = 1;
for (size_t i = 0; i < inputShape.size(); i++) {
if (i != axis) {
par_size *= inputShape[i];
}
}
size_t seq_size = inputShape[axis];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 1;
metaOp->numLanes = 128;
metaOp->numReg = inputShape[0] / 128;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateBroadcast(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis,
const size_t num) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = getSize(inputShape);
size_t seq_size = num;
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 2;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
} // namespace memb

View File

@ -0,0 +1,257 @@
#include "core/graph.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "pfusion/instantiate.h"
#include "pfusion/memory_codegen.h"
#include "pfusion/pointer.h"
#include "pfusion/search_graph.h"
#include <cstdlib>
#include <filesystem>
#include <fstream>
void exportCode(const std::string &filename, const std::string &code) {
// check dir
if (std::filesystem::exists("../generated_code")) {
assert(std::filesystem::is_directory("../generated_code"));
} else {
auto ok = std::filesystem::create_directory("../generated_code");
assert(ok);
}
std::string dir = "../generated_code/" + filename;
std::ofstream fout(dir);
assert(fout.is_open());
fout << code;
fout.close();
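// Assumes clang-format is available on PATH; the return value of
// system() is ignored, so formatting failures are silent.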
system(std::string("clang-format -i " + dir).c_str());
}
void infini::MemoryCodegen::exportGraph(Graph graph, std::string filename) {
std::string code = generateGraph(graph);
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_LN(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_SM(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_GELU(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_LN(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_SM(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_GELU(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBias(const std::string &filename,
const std::vector<size_t> &shape) {
std::string code = generateBias(shape);
exportCode(filename, code);
}
void infini::MemoryCodegen::exportTranspose(const std::string &filename,
const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
std::string code = generateTranspose(shape, perm);
exportCode(filename, code);
}
std::vector<size_t> convertShape(const std::vector<int> &_shape) {
std::vector<size_t> shape;
for (int i = int(_shape.size()); i > 0; i--) {
shape.emplace_back(_shape[i - 1]);
}
return shape;
}
size_t convertIndex(const size_t idx, const size_t size) {
// Axes are reversed by convertShape, so axis idx maps to size - idx - 1,
// matching convertPerm below.
return size - idx - 1;
}
std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
std::vector<size_t> perm;
for (int i = int(_perm.size()); i > 0; i--) {
perm.emplace_back(_perm.size() - _perm[i - 1] - 1);
}
return perm;
}
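// Worked example: the ONNX-order shape {2, 3, 4} becomes the codegen-order
// shape {4, 3, 2}; a perm of {0, 2, 1} over three dims becomes {1, 0, 2}
// after both the axis positions and the axis values are reversed.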
void convertTranspose(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
searchGraph->addNode(memb::instantiateTranspose(
memb::TRANSPOSE,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims()),
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
}
void convertUnary(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
searchGraph->addNode(memb::instantiateUnary(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
}
void convertBinary(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
searchGraph->addNode(memb::instantiateBinary(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
}
void convertGather(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
searchGraph->addNode(memb::instantiateGather(
memb::GATHER,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()),
convertShape(op->getInputs()[1]->getDims()),
convertShape(op->getOutputs()[0]->getDims()),
convertIndex(infini::as<infini::GatherObj>(op)->getAxis(),
op->getInputs()[0]->getDims().size())));
}
void convertReduce(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
auto reduceMeanOp = infini::as<infini::ReduceMeanObj>(op);
int axis = -1;
int dimSize = int(reduceMeanOp->getInputs()[0]->getDims().size());
for (int i = 0; i < dimSize; i++) {
if (reduceMeanOp->isReduced(i)) {
IT_ASSERT(axis == -1); // only a single reduced axis is supported
axis = dimSize - i - 1;
}
}
IT_ASSERT(axis != -1);
searchGraph->addNode(memb::instantiateReduce(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()), axis));
}
void convertBroadcast(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
auto extendOp = infini::as<infini::ExtendObj>(op);
IT_ASSERT(op->getInputs()[0]->getDims()[extendOp->getDim()] == 1);
searchGraph->addNode(memb::instantiateBroadcast(
memb::BROADCAST,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()), extendOp->getDim(),
extendOp->getNum() + 1));
}
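// Lower every InfiniTensor operator to a memb search-graph node, then
// mirror the operator successor relations as edges between those nodes.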
std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
auto searchGraph = std::make_shared<memb::SearchGraph>();
std::unordered_map<int, int> opMap;
int id = 0;
for (auto op : graph->getOperators()) {
switch (op->getOpType()) {
case infini::OpType::Transpose:
convertTranspose(searchGraph, op);
break;
case infini::OpType::Relu:
convertUnary(searchGraph, op, memb::RELU);
break;
case infini::OpType::Add:
convertBinary(searchGraph, op, memb::ADD);
break;
case infini::OpType::Sub:
convertBinary(searchGraph, op, memb::SUB);
break;
case infini::OpType::Gather:
convertGather(searchGraph, op);
break;
case infini::OpType::ReduceMean:
convertReduce(searchGraph, op, memb::REDUCEMEAN);
break;
case infini::OpType::Extend:
convertBroadcast(searchGraph, op);
break;
default:
std::cout << int(op->getOpType()) << std::endl;
IT_ASSERT(false);
}
IT_ASSERT(opMap.find(op->getGuid()) == opMap.end());
opMap[op->getGuid()] = id;
id++;
}
for (auto op : graph->getOperators()) {
for (auto nextOp : op->getSuccessors()) {
assert(opMap.find(op->getGuid()) != opMap.end());
assert(opMap.find(nextOp->getGuid()) != opMap.end());
searchGraph->addEdge(opMap[op->getGuid()],
opMap[nextOp->getGuid()]);
}
}
return searchGraph;
}
std::string infini::MemoryCodegen::generateGraph(Graph graph) {
auto searchGraph = instantiateGraph(graph);
auto metaGraph = searchGraph->exportFirstMetaGraph();
std::string code = "";
std::cout << "[INFO] before opt." << std::endl;
metaGraph->print();
metaGraph->optimize();
std::cout << "[INFO] after opt." << std::endl;
metaGraph->print();
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}
std::string
infini::MemoryCodegen::generateBias(const std::vector<size_t> &shape) {
auto metaGraph = std::make_shared<memb::MetaGraph>();
metaGraph->addOp(memb::MetaOp::buildBiasOp(shape));
metaGraph->print();
std::string code = "";
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}
std::string
infini::MemoryCodegen::generateTranspose(const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
auto metaGraph = std::make_shared<memb::MetaGraph>();
metaGraph->addOp(memb::MetaOp::buildTransposeOp(shape, perm));
metaGraph->print();
std::string code = "";
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}

54
src/pfusion/meta_graph.cc Normal file
View File

@ -0,0 +1,54 @@
#include "pfusion/meta_graph.h"
namespace memb {
void MetaGraph::print() {
for (auto op : metaOps) {
op->print();
}
}
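// Greedy pairwise fusion: try to merge the current meta-op with the next
// one; a successful merge becomes the new current op, otherwise the
// current op is emitted and scanning resumes from the unmerged op.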
void MetaGraph::optimize() {
std::vector<std::shared_ptr<MetaOp>> ops;
int numOp = metaOps.size();
int cur = 0;
for (int i = 1; i < numOp; i++) {
auto next = MetaOp::merge(metaOps[cur], metaOps[i]);
if (next == nullptr) {
ops.emplace_back(metaOps[cur]);
cur = i;
} else {
cur = metaOps.size();
metaOps.emplace_back(next);
}
}
ops.emplace_back(metaOps[cur]);
metaOps.clear();
for (auto op : ops) {
op->optimize();
metaOps.emplace_back(op);
}
}
std::string MetaGraph::genHeader() {
std::string code = "#include \"cuda_utils.h\"\n";
return code;
}
std::string MetaGraph::genKernelFuncs() {
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genKernelFunc();
}
return code;
}
std::string MetaGraph::genInvokeFuncs() {
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genInvokeFunc();
}
return code;
}
} // namespace memb

303
src/pfusion/meta_op.cc Normal file
View File

@ -0,0 +1,303 @@
#include "pfusion/meta_op.h"
#include "pfusion/micro_kernel/binary.h"
#include "pfusion/micro_kernel/memory.h"
namespace memb {
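// Emits code that peels the flat loop_idx into the mapped dimensions
// (innermost first) and accumulates the tensor's strided element offset.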
std::string TensorMapping::genOffset() {
std::string code = "int " + offset() + " = 0;\n";
std::string bufName = name + "_buf";
code += "int " + bufName + " = loop_idx;\n";
for (size_t i = 0; i < map.size(); i++) {
code += offset() + " += " + bufName + " % " +
std::to_string(shape[map[i]]) + " * " +
std::to_string(stride[map[i]]) + ";\n";
code += bufName + " /= " + std::to_string(shape[map[i]]) + ";\n";
}
return code;
}
void MetaOp::optimize() {
if (microOps.size() == 0)
return;
std::vector<std::shared_ptr<MicroOp>> ops;
int numOp = microOps.size();
int cur = 0;
for (int i = 1; i < numOp; i++) {
auto next = MicroOp::merge(microOps[cur], microOps[i]);
if (next == nullptr) {
ops.emplace_back(microOps[cur]);
cur = i;
} else {
cur = microOps.size();
microOps.emplace_back(next);
}
}
ops.emplace_back(microOps[cur]);
microOps.clear();
std::unordered_set<std::string> ptrSet;
for (auto op : ops) {
for (auto ptr : op->getPtrs()) {
ptrSet.emplace(ptr->getName());
}
if (op->getType() != EMPTY) {
microOps.emplace_back(op);
}
}
std::vector<std::shared_ptr<Pointer>> newPtrs;
for (auto ptr : ptrs) {
if (ptrSet.find(ptr->getName()) != ptrSet.end()) {
newPtrs.emplace_back(ptr);
}
}
ptrs.clear();
for (auto ptr : newPtrs) {
ptrs.emplace_back(ptr);
}
}
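// Kernel skeleton: threads split into numGroups warps of numLanes lanes;
// each (block, warp) pair walks the flattened loop space with a
// grid-stride step of numBlocks * numGroups per iteration.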
std::string MetaOp::genKernelFunc() {
std::string code = "";
code += "// Kernel\n";
code += "__global__ void kernel_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
code += "int lane_id = threadIdx.x % " + std::to_string(numLanes) + ";\n";
code += "int warp_id = threadIdx.x / " + std::to_string(numLanes) + ";\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(numGroups) +
" + warp_id;\n";
if (numReg != 0) {
code += "float buf[" + std::to_string(numReg) + "];\n";
}
if (numSmem != 0) {
code += "__shared__ float smem[" + std::to_string(numSmem * numGroups) +
"];\n";
}
code += "for (int loop_idx = parallel_idx; loop_idx < " +
std::to_string(main_loop_ed) +
"; loop_idx += " + std::to_string(numBlocks * numGroups) + ") {\n";
// gen offset_src
for (auto mapping : mappings) {
code += mapping->genOffset();
}
for (auto microOp : microOps) {
code += microOp->generate();
}
code += "}\n}\n";
return code;
}
std::string MetaOp::genInvokeFunc() {
std::string code = "";
code += "void invoke_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
int numThreads = numGroups * numLanes;
code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
IT_ASSERT(ptrs.size() > 0);
code += ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", " + ptrs[i]->getName();
}
code += ");\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
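// For illustration: with id == 0, two pointers, numBlocks == 80,
// numGroups == 4, and numLanes == 32, the emitted wrapper reads
//   void invoke_func_0(float *input, float *output) {
//     dim3 gridDim(80, 1);
//     dim3 blockDim(128, 1);
//     kernel_func_0<<<gridDim, blockDim>>>(input, output);
//     cudaCheckError();
//   }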
std::shared_ptr<MetaOp> MetaOp::merge(std::shared_ptr<MetaOp> metaOp0,
std::shared_ptr<MetaOp> metaOp1) {
IT_ASSERT(metaOp0->checkValid());
IT_ASSERT(metaOp1->checkValid());
// Check unmergeable
if (metaOp0->main_loop_st != metaOp1->main_loop_st ||
metaOp0->main_loop_ed != metaOp1->main_loop_ed ||
metaOp0->numBlocks != metaOp1->numBlocks ||
metaOp0->numGroups != metaOp1->numGroups ||
metaOp0->numLanes != metaOp1->numLanes) {
return nullptr;
}
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = metaOp0->main_loop_st;
metaOp->main_loop_ed = metaOp0->main_loop_ed;
metaOp->numBlocks = metaOp0->numBlocks;
metaOp->numGroups = metaOp0->numGroups;
metaOp->numLanes = metaOp0->numLanes;
metaOp->numReg = metaOp0->numReg + metaOp1->numReg;
metaOp->numSmem = metaOp0->numSmem + metaOp1->numSmem;
// Merge ptr
std::unordered_set<size_t> ptrSet;
for (auto ptr : metaOp0->ptrs) {
IT_ASSERT(ptrSet.find(ptr->getHash()) == ptrSet.end());
metaOp->ptrs.emplace_back(ptr);
ptrSet.emplace(ptr->getHash());
}
for (auto ptr : metaOp1->ptrs) {
if (ptrSet.find(ptr->getHash()) == ptrSet.end()) {
metaOp->ptrs.emplace_back(ptr);
ptrSet.emplace(ptr->getHash());
}
}
// Merge mapping
std::unordered_set<size_t> mappingSet;
for (auto mapping : metaOp0->mappings) {
IT_ASSERT(mappingSet.find(mapping->getHash()) == mappingSet.end());
metaOp->mappings.emplace_back(mapping);
mappingSet.emplace(mapping->getHash());
}
for (auto mapping : metaOp1->mappings) {
if (mappingSet.find(mapping->getHash()) == mappingSet.end()) {
metaOp->mappings.emplace_back(mapping);
mappingSet.emplace(mapping->getHash());
}
}
// Merge microOps.
// TODO: make it a graph.
for (auto microOp : metaOp0->microOps) {
metaOp->microOps.emplace_back(microOp);
}
for (auto microOp : metaOp1->microOps) {
metaOp->microOps.emplace_back(microOp);
}
for (auto microOp : metaOp->microOps) {
microOp->print();
}
// TODO: elimiate microOps.
return metaOp;
}
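// Bias over a column-major (shape[0], shape[1]) buffer: each warp covers
// 4 x 32 consecutive elements of a column per iteration and adds the
// column's bias scalar, which is staged in the fifth register (buf[4]).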
std::shared_ptr<MetaOp> MetaOp::buildBiasOp(const std::vector<size_t> &shape) {
IT_ASSERT(shape.size() == 2);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = shape[1] * ((shape[0] - 1) / (32 * 4) + 1);
metaOp->numBlocks = 80;
metaOp->numGroups = 4;
metaOp->numLanes = 32;
metaOp->numReg = 5; // buf[0..3] hold data, buf[4] holds the bias scalar
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>({32 * 4, (shape[0] - 1) / (32 * 4) + 1, shape[1]}),
std::vector<size_t>({1, 32 * 4, shape[0]}),
std::vector<size_t>({1, 2})));
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("bias"), std::vector<size_t>({shape[0], shape[1]}),
std::vector<size_t>({1})));
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "bias"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf_bias = Pointer::buildPtr(REG, "buf", "4");
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
std::vector<size_t> cond = {shape[0], (shape[0] - 1) / (32 * 4) + 1,
32 * 4};
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id"),
buf_input, 4, 32, cond));
metaOp->microOps.emplace_back(MemoryOp::build(
READ, Pointer::buildPtr(ptrs[1], "offset_bias"), buf_bias, 1, 32));
metaOp->microOps.emplace_back(std::make_shared<BinaryOp>(
ADD, buf_input, buf_bias, buf_output, 4, 32));
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf_output,
Pointer::buildPtr(ptrs[2], "offset_input + inst_idx * 32 + lane_id"), 4,
32, cond));
return metaOp;
}
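// Transpose keeps the innermost dimension in place (perm[0] == 0): when
// it fits in 4 x 32 lanes a single warp-group covers it per iteration,
// otherwise it is tiled into 128-element chunks via an extra loop dim.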
std::shared_ptr<MetaOp>
MetaOp::buildTransposeOp(const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
IT_ASSERT(perm[0] == 0 && shape[0] >= 32);
IT_ASSERT(shape.size() == 3);
auto metaOp = std::make_shared<MetaOp>();
size_t numInst, extraDim;
if (shape[0] <= 4 * 32) {
numInst = (shape[0] - 1) / 32 + 1;
extraDim = 1;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>({shape[0], shape[perm[1]], shape[perm[2]]}),
std::vector<size_t>({1, shape[0], shape[0] * shape[perm[1]]}),
std::vector<size_t>({perm[1], perm[2]})));
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("output"),
std::vector<size_t>({shape[0], shape[1], shape[2]}),
std::vector<size_t>({1, shape[0], shape[0] * shape[1]}),
std::vector<size_t>({1, 2})));
// cond: local_id < shape[0];
} else {
numInst = 4;
extraDim = (shape[0] - 1) / 128 + 1;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>(
{128, extraDim, shape[perm[1]], shape[perm[2]]}),
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[perm[1]]}),
std::vector<size_t>({1, perm[1] + 1, perm[2] + 1})));
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("output"),
std::vector<size_t>({128, extraDim, shape[1], shape[2]}),
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[1]}),
std::vector<size_t>({1, 2, 3})));
// cond loop_idx % extraDim * 128 + local_id < shape[0];
}
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = shape[1] * shape[2] * extraDim;
metaOp->numBlocks = 80;
metaOp->numGroups = 4;
metaOp->numLanes = 32;
metaOp->numReg = 4;
metaOp->numSmem = 0;
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
std::vector<size_t> cond = {shape[0], extraDim, 128};
auto inPtr =
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id");
auto opRead = MemoryOp::build(READ, inPtr, buf_input, numInst, 32, cond);
auto outPtr =
Pointer::buildPtr(ptrs[1], "offset_output + inst_idx * 32 + lane_id");
auto opWrite =
MemoryOp::build(WRITE, buf_output, outPtr, numInst, 32, cond);
metaOp->microOps = std::vector<std::shared_ptr<MicroOp>>({opRead, opWrite});
return metaOp;
}
} // namespace memb

33
src/pfusion/micro_kernel/binary.cc Normal file
View File

@ -0,0 +1,33 @@
#include "pfusion/micro_kernel/binary.h"
namespace memb {
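// Emits an unrolled per-register binary op; lanes beyond `width` are
// masked out with a guard so narrow tiles stay within bounds.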
std::string BinaryOp::generate() {
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
std::string opFunc = getDst()->generate() + " = " + getSrc0()->generate();
if (opType == ADD) {
opFunc += " + ";
} else if (opType == SUB) {
opFunc += " - ";
} else {
IT_ASSERT(false);
}
opFunc += getSrc1()->generate() + ";\n";
code += opFunc;
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

99
src/pfusion/micro_kernel/memory.cc Normal file
View File

@ -0,0 +1,99 @@
#include "pfusion/micro_kernel/memory.h"
namespace memb {
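// cond = {total, chunks, chunkWidth}: only the last of `chunks` tiles
// along the guarded dimension is partial, holding total % chunkWidth
// elements, copied as full width-lane instructions (edge_num) plus one
// masked instruction of edge_width lanes.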
std::string MemoryOp::generateWithCond() {
IT_ASSERT(cond.size() == 3);
std::string code = "";
int edge_length = cond[0] % cond[2];
int edge_num = edge_length / width;
int edge_width = edge_length % width;
if (edge_num > 0 || edge_width > 0) {
code += "if (loop_idx % " + std::to_string(cond[1]) +
" == " + std::to_string(cond[1] - 1) + ") {\n";
}
if (edge_num > 0) {
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " +
std::to_string(edge_num) + "; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
}
if (edge_width > 0) {
code += "if (lane_id < " + std::to_string(edge_width) + ") {";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generateWithInstIdx(std::to_string(edge_num)) +
" = " +
getSrc()->generateWithInstIdx(std::to_string(edge_num)) +
";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
}
if (edge_num > 0 || edge_width > 0) {
code += "} else {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (edge_num > 0 || edge_width > 0) {
code += "}\n";
}
code += "// test\n";
return code;
}
std::string MemoryOp::generate() {
if (cond.size() != 0) {
return generateWithCond();
}
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

30
src/pfusion/micro_kernel/reduce.cc Normal file
View File

@ -0,0 +1,30 @@
#include "pfusion/micro_kernel/reduce.h"
namespace memb {
std::string ReduceOp::generate() {
std::string code;
IT_ASSERT(width == 32);
code += getBuf()->generate() + " = 0;\n";
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
std::string opFunc = getBuf()->generate() + " = " + getBuf()->generate();
if (opType == REDUCEMEAN) {
opFunc += " + ";
} else {
IT_ASSERT(false);
}
opFunc += getSrc()->generate() + ";\n";
code += opFunc;
code += "}\n";
if (opType == REDUCEMEAN) {
code += getBuf()->generate() + " = " + getBuf()->generate() +
" / float(" + std::to_string(num * width) + ");\n";
}
return code;
}
} // namespace memb

29
src/pfusion/micro_kernel/unary.cc Normal file
View File

@ -0,0 +1,29 @@
#include "pfusion/micro_kernel/unary.h"
namespace memb {
std::string UnaryOp::generate() {
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if (opType == RELU) {
code += getDst()->generate() + " = (" + getSrc()->generate() +
" > 0) ? " + getSrc()->generate() + " : 0;\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

22
src/pfusion/micro_op.cc Normal file
View File

@ -0,0 +1,22 @@
#include "pfusion/micro_op.h"
#include "pfusion/micro_kernel/empty.h"
#include "pfusion/micro_kernel/memory.h"
namespace memb {
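// A WRITE immediately followed by a READ through the same src/dst pair
// round-trips a value that is already live in registers, so the pair
// collapses into an EmptyOp that the caller filters out later.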
std::shared_ptr<MicroOp> MicroOp::merge(std::shared_ptr<MicroOp> op0,
std::shared_ptr<MicroOp> op1) {
if (op0->getType() == WRITE && op1->getType() == READ) {
auto memOp0 = std::dynamic_pointer_cast<MemoryOp>(op0);
auto memOp1 = std::dynamic_pointer_cast<MemoryOp>(op1);
if (memOp0->getDst()->getHash() == memOp1->getSrc()->getHash()) {
if (memOp0->getSrc()->getHash() == memOp1->getDst()->getHash()) {
return std::make_shared<EmptyOp>();
} else {
// TODO: gen reg to reg.
// IT_ASSERT(false);
}
}
}
return nullptr;
}
} // namespace memb

15
src/pfusion/search_graph.cc Normal file
View File

@ -0,0 +1,15 @@
#include "pfusion/search_graph.h"
namespace memb {
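// Export the trivial schedule: take the first candidate meta-op of each
// node and mirror the node edges; no search over alternatives yet.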
std::shared_ptr<MetaGraph> SearchGraph::exportFirstMetaGraph() {
auto metaGraph = std::make_shared<MetaGraph>();
for (auto node : nodes) {
metaGraph->addOp(node.metaOps[0]);
}
for (auto edge : edges) {
metaGraph->addEdge(nodes[edge.first].metaOps[0],
nodes[edge.second].metaOps[0]);
}
return metaGraph;
}
} // namespace memb

View File

@ -0,0 +1,265 @@
#include "core/graph_builder.h"
#include "test.h"
namespace infini {
TEST(GraphBuilder, ops) {
Runtime runtime = CpuRuntimeObj::getInstance();
{ // conv without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto weight =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
auto conv = gf->conv(input, weight, 1, 1);
EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
}
{ // conv with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto weight =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{1, 2, 4, 4}, DataType::UInt32, runtime);
auto conv = gf->conv(input, weight, output, 1, 1);
}
{ // matmul without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
auto matmul = gf->matmul(A, B);
EXPECT_EQ(matmul->getOutput()->getDims(), (Shape{1, 3, 2}));
}
{ // matmul with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
auto C = make_ref<TensorObj>(Shape{1, 3, 2}, DataType::UInt32, runtime);
auto matmul = gf->matmul(A, B, C);
}
{ // convtrans without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
DataType::UInt32, runtime);
auto convtrans = gf->convTrans(input, weight, 0, 0);
EXPECT_EQ(convtrans->getOutput()->getDims(), (Shape{1, 448, 2, 2}));
}
{ // convtrans with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{1, 448, 2, 2}, DataType::UInt32, runtime);
auto convtrans = gf->convTrans(input, weight, output, 0, 0);
}
{ // pad without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
auto pad = gf->pad(input, pads, std::nullopt);
EXPECT_EQ(pad->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
}
{ // pad with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{3, 84, 164, 172},
DataType::UInt32, runtime);
vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
auto pad = gf->pad(input, output, pads, std::nullopt);
}
{ // slice without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
DataType::UInt32, runtime);
vector<int> starts = {2, 10, 1, 5};
vector<int> ends = {3, 10, 100, 100};
auto slice = gf->slice(input, starts, ends, std::nullopt, std::nullopt);
EXPECT_EQ(slice->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
}
{ // slice with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{2, 1, 100, 96},
DataType::UInt32, runtime);
vector<int> starts = {2, 10, 1, 5};
vector<int> ends = {3, 10, 100, 100};
auto slice =
gf->slice(input, output, starts, ends, std::nullopt, std::nullopt);
}
{ // concat without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto t1 =
make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
auto t2 =
make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
auto concat = gf->concat(TensorVec{t1, t2}, 3);
EXPECT_EQ(concat->getOutput()->getDims(), (Shape{1, 3, 2, 9}));
}
{ // concat with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto t1 =
make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
auto t2 =
make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
auto o0 =
make_ref<TensorObj>(Shape{1, 3, 2, 9}, DataType::Float32, runtime);
auto concat = gf->concat(TensorVec{t1, t2}, o0, 3);
}
{ // split without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
auto split = gf->split(input, 3, 4);
EXPECT_EQ(split->numOutputs(), 4);
EXPECT_EQ(split->getOutputs().size(), (size_t)4);
EXPECT_EQ(split->getOutput(0)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(1)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(2)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(3)->getDims(), (Shape{1, 3, 2, 6}));
}
{ // split with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
auto output0 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output1 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output2 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output3 =
make_ref<TensorObj>(Shape{1, 3, 2, 6}, DataType::Float32, runtime);
auto split = gf->split(
input, TensorVec{output0, output1, output2, output3}, 3, 4);
}
{ // extend without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto extend = gf->extend(input, 2, 1);
EXPECT_EQ(extend->getOutput()->getDims(), (Shape{2, 3, 6, 4}));
}
{ // extend with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 6, 4}, DataType::UInt32, runtime);
auto extend = gf->extend(input, output, 2, 1);
}
{ // maxpool without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
sw = 2;
auto maxpool = gf->maxpool(input, kh, kw, dh, dw, ph, pw, sh, sw);
EXPECT_EQ(maxpool->getOutput()->getDims(), (Shape{1, 64, 80, 80}));
}
{ // maxpool with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{1, 64, 80, 80},
DataType::UInt32, runtime);
const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
sw = 2;
auto maxpool =
gf->maxpool(input, output, kh, kw, dh, dw, ph, pw, sh, sw);
}
{ // add without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input0 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto input1 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto add = gf->add(input0, input1);
EXPECT_EQ(add->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
}
{ // add with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input0 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto input1 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto add = gf->add(input0, input1, output);
}
{ // gather without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto index =
make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
auto gather = gf->gather(input, index, 1);
EXPECT_EQ(gather->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
}
{ // gather with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto index =
make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{1, 2, 1, 2, 4, 4},
DataType::UInt32, runtime);
auto gather = gf->gather(input, index, output, 1);
}
{ // reshape without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
vector<int> dims = {3, 2, 4, 3};
auto reshape = gf->reshape(input, dims);
EXPECT_EQ(reshape->getOutput()->getDims(), (Shape{3, 2, 4, 3}));
}
{ // reshape with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
vector<int> dims = {3, 2, 4, 3};
auto output =
make_ref<TensorObj>(Shape{3, 2, 4, 3}, DataType::Float32, runtime);
auto reshape = gf->reshape(input, output, dims);
}
{ // flatten without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto flatten = gf->flatten(input);
EXPECT_EQ(flatten->getOutput()->getDims(), (Shape{72}));
}
{ // flatten with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto output =
make_ref<TensorObj>(Shape{72}, DataType::Float32, runtime);
auto flatten = gf->flatten(input, output);
}
{ // identity without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto identity = gf->identity(input);
EXPECT_EQ(identity->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
}
{ // identity with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto identity = gf->identity(input, output);
}
}
} // namespace infini

32
test/pfusion/test_bert.cc Normal file
View File

@ -0,0 +1,32 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, bert_layernorm) {
MemoryCodegen codegen;
codegen.exportBert_LN("bert_layernorm.cu");
}
TEST(Graph, bert_softmax) {
MemoryCodegen codegen;
codegen.exportBert_SM("bert_softmax.cu");
}
TEST(Graph, bert_gelu) {
MemoryCodegen codegen;
codegen.exportBert_GELU("bert_gelu.cu");
}
} // namespace infini

26
test/pfusion/test_bias.cc Normal file
View File

@ -0,0 +1,26 @@
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, bias_0) {
MemoryCodegen codegen;
codegen.exportBias("bias_0.cu", std::vector<size_t>({28 * 28, 24}));
}
TEST(Graph, bias_1) {
MemoryCodegen codegen;
codegen.exportBias("bias_1.cu", std::vector<size_t>({28 * 28, 58}));
}
TEST(Graph, bias_2) {
MemoryCodegen codegen;
codegen.exportBias("bias_2.cu", std::vector<size_t>({14 * 14, 116}));
}
TEST(Graph, bias_3) {
MemoryCodegen codegen;
codegen.exportBias("bias_3.cu", std::vector<size_t>({7 * 7, 232}));
}
} // namespace infini

View File

@ -0,0 +1,38 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, build_and_run) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1024, 32, 32}, DataType::Float32);
Tensor t1 = g->addTensor({1024, 32, 32}, DataType::Float32);
Tensor t2 = g->addTensor({1024, 32, 32}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<AbsObj>(t0, t1);
g->addOpWithOutputs<ReluObj>(t1, t2);
MemoryCodegen codegen;
codegen.exportGraph(g, "test.cu");
}
TEST(Graph, transpose) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({32, 31, 33, 32}, DataType::Float32);
Tensor t1 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
Tensor t2 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<TransposeObj>(t0, t1, Shape{2, 0, 3, 1});
g->addOpWithOutputs<ReluObj>(t1, t2);
MemoryCodegen codegen;
codegen.exportGraph(g, "transpose.cu");
}
} // namespace infini

View File

@ -0,0 +1,29 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, reduce_mean) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 128, 1}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReduceMeanObj>(t0, t1, Shape({2}));
MemoryCodegen codegen;
codegen.exportGraph(g, "reduce_mean.cu");
}
} // namespace infini

View File

@ -0,0 +1,41 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, SAR_DRN_0) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t2 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t3 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReluObj>(t0, t1);
g->addOpWithOutputs<AddObj>(t1, t2, t3);
MemoryCodegen codegen;
codegen.exportGraph(g, "sar_drn_0.cu");
}
TEST(Graph, SAR_DRN_1) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t2 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t3 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReluObj>(t0, t1);
g->addOpWithOutputs<SubObj>(t1, t2, t3);
MemoryCodegen codegen;
codegen.exportGraph(g, "sar_drn_1.cu");
}
} // namespace infini

View File

@ -0,0 +1,21 @@
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, transpose_0) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_0.cu", {28 * 28, 58, 2}, {0, 2, 1});
}
TEST(Graph, transpose_1) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_1.cu", {14 * 14, 116, 2}, {0, 2, 1});
}
TEST(Graph, transpose_2) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_2.cu", {7 * 7, 232, 2}, {0, 2, 1});
}
} // namespace infini

View File

@ -0,0 +1,4 @@
. /home/spack/spack/share/spack/setup-env.sh
spack load /yb2wg5g # cuda@10.2.89
spack load /3bfwma4 # cudnn@7.6.5.32-10.2
export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-haswell/gcc-7.5.0/gcc-7.5.0-sti65cu3zunc4p4kfylgweim6mqan3mk/bin/gcc