From 04ea5eed3881812aa1e78b489b179d46e0988919 Mon Sep 17 00:00:00 2001
From: zhengly123 <zhengly123@outlook.com>
Date: Mon, 22 Aug 2022 15:01:03 +0800
Subject: [PATCH] Add CUDA runtime (#6)

* Fix: add warm-up and repetition in timing

* Add: CUDA runtime and float support

* Refactor: Cuda and Cpu runtimes inherit Runtime

* Add: environment script for Lotus

* Add: Lotus build instructions

* Update README.md

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
---
 CMakeLists.txt                         |  19 ++-
 README.md                              |  10 +-
 include/core/blob.h                    |  24 +++
 include/core/common.h                  |   4 +-
 include/core/graph.h                   |   4 +-
 include/core/kernel.h                  |  18 ++-
 include/core/run_enigne.h              |  26 ----
 include/core/runtime.h                 |  62 ++++++++
 include/core/tensor.h                  |  36 ++++-
 include/core/tensor_base.h             |  19 ++-
 include/cuda/cuda_common.h             | 111 +++++++++++++
 include/cuda/cuda_runtime.h            |  49 ++++++
 include/cuda/cuda_utility.h            |  11 ++
 include/test.h                         |  34 +++-
 src/core/blob.cc                       |  10 ++
 src/core/common.cc                     |  15 +-
 src/core/graph.cc                      |   5 +-
 src/core/{run_engine.cc => runtime.cc} |  34 ++--
 src/core/tensor.cc                     |  78 ++++++++--
 src/core/tensor_base.cc                |   9 +-
 src/cuda/cuda_runtime.cc               |  32 ++++
 src/cuda/cuda_utility.cu               |  20 +++
 src/kerels/cpu/conv.cc                 |  18 ++-
 src/kerels/cpu/matmul.cc               |  20 ++-
 src/kerels/cuda/conv.cc                | 208 +++++++++++++++++++++++++
 test/core/test_graph.cc                |  31 ++--
 test/core/test_hash.cc                 |   6 +-
 test/operators/test_conv.cc            |  87 +++++++++--
 test/script/env_lotus.sh               |   2 +
 29 files changed, 875 insertions(+), 127 deletions(-)
 create mode 100644 include/core/blob.h
 delete mode 100644 include/core/run_enigne.h
 create mode 100644 include/core/runtime.h
 create mode 100644 include/cuda/cuda_common.h
 create mode 100644 include/cuda/cuda_runtime.h
 create mode 100644 include/cuda/cuda_utility.h
 create mode 100644 src/core/blob.cc
 rename src/core/{run_engine.cc => runtime.cc} (75%)
 create mode 100644 src/cuda/cuda_runtime.cc
 create mode 100644 src/cuda/cuda_utility.cu
 create mode 100644 src/kerels/cuda/conv.cc
 create mode 100644 test/script/env_lotus.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70935137..8220b3ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@ include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
+option(USE_CUDA "Support CUDA GPU" ON)
 option(BUILD_TEST "Build tests" ON)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@@ -21,7 +22,6 @@ find_package(
   Python
   COMPONENTS Interpreter Development
   REQUIRED)
-find_package(CUDA REQUIRED)
 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@@ -53,14 +53,19 @@ if(BUILD_TEST)
 endif()
 
 file(GLOB_RECURSE SRC src/*.cc src/*.cu)
-# file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc)
-# list(REMOVE_ITEM SRC ${TEST} ${FFI})
 
 add_library(InfiniTensor SHARED ${SRC})
-# Target
-# cuda_add_library(it SHARED ${SRC})
-# cuda_add_cublas_to_target(it) # cublas
-# # target_link_libraries(infini_cpp cudnn curand nlohmann_json::nlohmann_json pybind11::embed)
+
+if(USE_CUDA)
+  # set(CUDA_HOST_COMPILER /home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc)
+  enable_language(CUDA)
+  # TODO: how to set option for CUDA_HOST_COMPILER. Now env var CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc takes effect.
+  # option(CUDA_HOST_COMPILER "" ${CMAKE_C_COMPILER})
+  # TODO:  find_package seems unnecessary for CMake >= 3.8
+  find_package(CUDA REQUIRED)
+  # message("CUBLAS_LIBRARIES: ${CUDA_LIBRARIES}")
+  target_link_libraries(InfiniTensor cudnn curand cublas ${CUDA_LIBRARIES})
+endif()
 
 # # Python bindings
 # pybind11_add_module(infini MODULE ${FFI})
diff --git a/README.md b/README.md
index 6d449566..d3778328 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,9 @@
-# InfiniTensor
\ No newline at end of file
+# InfiniTensor
+
+## Compilation on Lotus
+``` bash
+# Enter the root of InfiniTensor
+source test/script/env_lotus.sh 
+mkdir build && cd build
+cmake .. && make -j 12
+```
diff --git a/include/core/blob.h b/include/core/blob.h
new file mode 100644
index 00000000..4ff05ea8
--- /dev/null
+++ b/include/core/blob.h
@@ -0,0 +1,24 @@
+#pragma once
+#include "core/common.h"
+#include "core/ref.h"
+
+namespace infini {
+
+class RuntimeObj;
+using Runtime = Ref<RuntimeObj>;
+
+class BlobObj {
+    // Runtime might be replaced with a raw pointer for optimization
+    Runtime runtime;
+    void *ptr;
+
+  public:
+    BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {}
+    BlobObj(BlobObj &other) = delete;
+    BlobObj &operator=(BlobObj const &) = delete;
+    ~BlobObj();
+
+    template <typename T> T getPtr() const { return reinterpret_cast<T>(ptr); }
+};
+
+} // namespace infini
\ No newline at end of file
diff --git a/include/core/common.h b/include/core/common.h
index 54ad8065..e296c331 100644
--- a/include/core/common.h
+++ b/include/core/common.h
@@ -69,6 +69,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
     return ret;
 }
 
-double timeit(const std::function<void()> &func);
+double timeit(const std::function<void()> &func, int warmupRounds = 200,
+              int timingRounds = 200,
+              const std::function<void(void)> &sync = {});
 
 } // namespace infini
diff --git a/include/core/graph.h b/include/core/graph.h
index 7abe5f0e..3e159792 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -4,16 +4,16 @@
 
 namespace infini {
 
-// TODO: graph should be attached to a context
 class GraphObj : public Object {
   protected:
+    Runtime runtime;
     TensorVec tensors;
     TensorVec inputs;
     TensorVec outputs;
     OpVec ops;
 
   public:
-    // Graph(OpVec oplist);
+    GraphObj(Runtime runtime) : runtime(runtime){};
     string toString() const override;
 
     Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
diff --git a/include/core/kernel.h b/include/core/kernel.h
index 4ecb2065..4d1a97d5 100644
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@@ -5,8 +5,14 @@
 
 namespace infini {
 
+class RuntimeObj; // Forward declaration for Kernel::compute
+
 struct PerfRecord {
-    double time; // in milliseconds
+    PerfRecord(){};
+    PerfRecord(double time) : time(time){};
+    virtual ~PerfRecord() {}
+
+    double time = 0; // in milliseconds
 };
 
 class Kernel {
@@ -20,14 +26,16 @@ class Kernel {
      * are required, inherit from PerfRecord and add extra parameters.
      * Otherwire, use PerfRecord directly.
      */
-    virtual void compute(const Operator &op,
-                         const PerfRecord &record) const = 0;
+    virtual void compute(const Operator &op, const PerfRecord &record,
+                         const RuntimeObj *context) const = 0;
     /**
      * @brief Executes an op with a default parameter.
      */
-    virtual void compute(const Operator &op) const = 0;
+    virtual void compute(const Operator &op,
+                         const RuntimeObj *context) const = 0;
     // Premise: op is idempotent since it is called multiple times.
-    virtual PerfRecord tune(const Operator &op) const = 0;
+    virtual PerfRecord tune(const Operator &op,
+                            const RuntimeObj *context) const = 0;
 };
 
 class KernelRegistry {
diff --git a/include/core/run_enigne.h b/include/core/run_enigne.h
deleted file mode 100644
index 1b4877a1..00000000
--- a/include/core/run_enigne.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/perf_engine.h"
-
-namespace infini {
-
-class RunEngine {
-  private:
-    Device device;
-
-  public:
-    RunEngine(Device device) : device(device) {}
-    ~RunEngine() {}
-
-    void run(const Graph &graph, bool tune = false,
-             bool profiling = false) const;
-    double getPerfTime(const Graph &graph, bool profiling = false) const;
-
-  private:
-    void printProfilingData(double totTime,
-                            const std::map<OpType, double> &opTime,
-                            const std::map<OpType, int> &opCnt) const;
-};
-
-} // namespace infini
\ No newline at end of file
diff --git a/include/core/runtime.h b/include/core/runtime.h
new file mode 100644
index 00000000..da15578e
--- /dev/null
+++ b/include/core/runtime.h
@@ -0,0 +1,62 @@
+#pragma once
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/perf_engine.h"
+namespace infini {
+
+class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
+  protected:
+    Device device;
+
+  public:
+    RuntimeObj(Device device) : device(device) {}
+    RuntimeObj(RuntimeObj &other) = delete;
+    RuntimeObj &operator=(RuntimeObj const &) = delete;
+    virtual ~RuntimeObj() {}
+
+    /**
+     * @brief Execute a graph.
+     *
+     * @param graph
+     * @param tune If there is no performance record, whether to tune it. These
+     * can be independent method.
+     * @param profiling Whether to print breakdown of time
+     */
+    virtual void run(const Graph &graph, bool tune = false,
+                     bool profiling = false) const = 0;
+    virtual void *alloc(size_t size) = 0;
+    virtual void dealloc(void *ptr) = 0;
+
+    /**
+     * @brief Get the execution time of each operator in performance record. No
+     * execution happens.
+     *
+     * @param graph
+     * @param profiling Whether to print breakdown of time
+     * @return double Return the sum of perf time for each operator
+     */
+    double getPerfTime(const Graph &graph, bool profiling = false) const;
+    Blob allocBlob(size_t size);
+
+  protected:
+    void printProfilingData(double totTime,
+                            const std::map<OpType, double> &opTime,
+                            const std::map<OpType, int> &opCnt) const;
+};
+
+// TODO: change inheritance relation
+class CpuRuntimeObj : public RuntimeObj {
+  public:
+    CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
+
+    void run(const Graph &graph, bool tune = false,
+             bool profiling = false) const override;
+    void dealloc(void *ptr) override { return free(ptr); };
+
+    void *alloc(size_t size) override {
+        return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
+                      sizeof(uint64_t));
+    };
+};
+
+} // namespace infini
\ No newline at end of file
diff --git a/include/core/tensor.h b/include/core/tensor.h
index dcdd4aef..67dddc95 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "core/tensor_base.h"
+#include <cmath>
 
 namespace infini {
 
@@ -16,21 +17,46 @@ class TensorObj : public TensorBaseObj {
     string toString() const override;
 
     size_t size() const;
-    void dataMalloc();
 
     Shape getDims() const { return shape; }
 
     size_t getOffset(const Shape &ds) const;
     using TensorBaseObj::getData;
     VType getData(const Shape &pos) const;
-    void copyData(VType *dptr);
+    void dataMalloc(const Runtime &runtime);
+    // void copyData(VType *dptr);
+    template <typename T> void copyData(const T *dptr);
     void copyData(vector<VType> dataVector);
+    void copyData(vector<float> dataVector);
     void printData() const;
+    // TODO: merge these methods
     bool equalData(const Tensor &rhs) const;
-    void
-    setData(std::function<void(void *, size_t, DataType)> generator) const {
-        generator((void *)(data.get()), size(), dtype);
+    template <typename T> bool equalData(const Tensor &rhs) const {
+        IT_ASSERT(data != nullptr);
+        IT_ASSERT(rhs->data != nullptr);
+        // TODO: deal with data type
+        auto ptr = data->getPtr<T *>();
+        auto ptrRhs = rhs->data->getPtr<T *>();
+        if (shape != rhs->getDims())
+            return false;
+        size_t sz = size();
+        for (size_t i = 0; i < sz; ++i)
+            if (fabs(ptr[i] - ptrRhs[i]) /
+                    std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
+                1e-6) {
+                printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
+                return false;
+            }
+        return true;
     }
+    void setData(
+        const std::function<void(void *, size_t, DataType)> &generator) const {
+        generator(data->getPtr<void *>(), size(), dtype);
+    }
+
+  private:
+    void printDataFloat() const;
+    void printDataUint32_t() const;
     // void setDims(const Dim &dms) { dims = dms; }
 
     //     bool dataRand(int seed = 0) {
diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h
index d746f273..d38458a7 100644
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@@ -1,19 +1,23 @@
 #pragma once
+#include "core/blob.h"
 #include "core/object.h"
 #include "core/ref.h"
 
 namespace infini {
 
-// class Tensor;
 class TensorBaseObj;
 class TensorObj;
 class OperatorObj;
 class GraphObj;
+class RuntimeObj;
+class BlobObj;
 
 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
 using Operator = Ref<OperatorObj>;
 using Graph = Ref<GraphObj>;
+using Runtime = Ref<RuntimeObj>;
+using Blob = Ref<BlobObj>;
 
 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@@ -40,8 +44,7 @@ class TensorBaseObj : public Object {
     DataType dtype;
     vector<WRef<TensorBaseObj>> inputOf;
     WRef<TensorBaseObj> outputOf;
-    // TODO: Ref<void> -> Ref<Blob>
-    Ref<VType[]> data;
+    Blob data;
     // ComputeState computed;
     // static int random_seed[256 * 16];
     // static bool random_inited;
@@ -50,7 +53,15 @@ class TensorBaseObj : public Object {
     TensorBaseObj(int dim, DataType dtype);
     virtual ~TensorBaseObj() {}
 
-    Ref<VType[]> getDataPtr() const { return data; }
+    void dataMalloc(const Blob &blob) {
+        IT_ASSERT(data == nullptr);
+        data = blob;
+    }
+    Blob getDataPtr() const { return data; }
+    template <typename T> T getDataRawPtr() const {
+        IT_ASSERT(data != nullptr);
+        return data->getPtr<T>();
+    }
     VType getData(size_t offset) const;
 
     DataType getDType() const { return dtype; }
diff --git a/include/cuda/cuda_common.h b/include/cuda/cuda_common.h
new file mode 100644
index 00000000..fdf7b1de
--- /dev/null
+++ b/include/cuda/cuda_common.h
@@ -0,0 +1,111 @@
+#pragma once
+#include "core/common.h"
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cudnn.h>
+#include <curand.h>
+
+// TODO: replace with Exception (IT_ASSERT)
+#define checkCudaError(call)                                                   \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (cudaSuccess != err) {                                              \
+            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err));                                  \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+#define checkCublasError(call)                                                 \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (CUBLAS_STATUS_SUCCESS != err) {                                    \
+            fprintf(stderr, "cuBLAS error in %s:%i : %s.\n", __FILE__,         \
+                    __LINE__, cublasGetErrorString(err));                      \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+#define checkCudnnError(call)                                                  \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (CUDNN_STATUS_SUCCESS != err) {                                     \
+            fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__,          \
+                    __LINE__, cudnnGetErrorString(err));                       \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+#define checkCurandError(call)                                                 \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (CURAND_STATUS_SUCCESS != err) {                                    \
+            fprintf(stderr, "cuRAND error in %s:%i : %s.\n", __FILE__,         \
+                    __LINE__, curandGetErrorString(err));                      \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+namespace infini {
+
+inline const char *cublasGetErrorString(cublasStatus_t error) {
+    switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+        return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+        return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+        return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:
+        return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+        return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+        return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+        return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+        return "CUBLAS_STATUS_NOT_SUPPORTED";
+    case CUBLAS_STATUS_LICENSE_ERROR:
+        return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "<unknown>";
+}
+
+inline const char *curandGetErrorString(curandStatus_t error) {
+    switch (error) {
+    case CURAND_STATUS_SUCCESS:
+        return "CURAND_STATUS_SUCCESS";
+    case CURAND_STATUS_VERSION_MISMATCH:
+        return "CURAND_STATUS_VERSION_MISMATCH";
+    case CURAND_STATUS_NOT_INITIALIZED:
+        return "CURAND_STATUS_NOT_INITIALIZED";
+    case CURAND_STATUS_ALLOCATION_FAILED:
+        return "CURAND_STATUS_ALLOCATION_FAILED";
+    case CURAND_STATUS_TYPE_ERROR:
+        return "CURAND_STATUS_TYPE_ERROR";
+    case CURAND_STATUS_OUT_OF_RANGE:
+        return "CURAND_STATUS_OUT_OF_RANGE";
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+        return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+        return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case CURAND_STATUS_LAUNCH_FAILURE:
+        return "CURAND_STATUS_LAUNCH_FAILURE";
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+        return "CURAND_STATUS_PREEXISTING_FAILURE";
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+        return "CURAND_STATUS_INITIALIZATION_FAILED";
+    case CURAND_STATUS_ARCH_MISMATCH:
+        return "CURAND_STATUS_ARCH_MISMATCH";
+    case CURAND_STATUS_INTERNAL_ERROR:
+        return "CURAND_STATUS_INTERNAL_ERROR";
+    }
+    return "<unknown>";
+}
+
+using CudaPtr = void *;
+
+} // namespace infini
\ No newline at end of file
diff --git a/include/cuda/cuda_runtime.h b/include/cuda/cuda_runtime.h
new file mode 100644
index 00000000..9ad15f0c
--- /dev/null
+++ b/include/cuda/cuda_runtime.h
@@ -0,0 +1,49 @@
+#pragma once
+#include "core/runtime.h"
+#include "cuda/cuda_common.h"
+
+namespace infini {
+
+class CudaRuntimeObj : public RuntimeObj {
+  private:
+    cudnnHandle_t cudnn;
+    cublasHandle_t cublas;
+    CudaPtr workspace;
+    size_t workspaceSize;
+
+  public:
+    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
+        checkCudnnError(cudnnCreate(&cudnn));
+        checkCublasError(cublasCreate(&cublas));
+        // 10GB for Longformer
+        // size_t longformerNum = 3lu * (1 << 30);
+        workspaceSize = 7ll << 30; // 7 GB
+        workspace = alloc(workspaceSize);
+    }
+    virtual ~CudaRuntimeObj() {
+        checkCudnnError(cudnnDestroy(cudnn));
+        checkCublasError(cublasDestroy(cublas));
+    }
+
+    void run(const Graph &graph, bool tune = false,
+             bool profiling = false) const;
+    // double runEvaluation(const Graph &graph, int nWarmups,
+    //                      int nEvaluations) const;
+    void sync() const;
+    CudaPtr alloc(size_t size) override {
+        void *ptr;
+        checkCudaError(cudaMalloc(&ptr, size));
+        return ptr;
+    }
+    void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
+    cudnnHandle_t cudnnHandle() const { return cudnn; }
+    cublasHandle_t cublasHandle() const { return cublas; }
+    CudaPtr getWorkspace(size_t size) const {
+        IT_ASSERT(size <= workspaceSize);
+        return workspace;
+    }
+
+  private:
+    void runWithoutSync(const Graph &graph) const;
+};
+} // namespace infini
\ No newline at end of file
diff --git a/include/cuda/cuda_utility.h b/include/cuda/cuda_utility.h
new file mode 100644
index 00000000..354a38ff
--- /dev/null
+++ b/include/cuda/cuda_utility.h
@@ -0,0 +1,11 @@
+#include "core/tensor.h"
+
+namespace infini {
+
+void cudaPrintFloat(float *x, int len);
+
+void cudaPrintTensor(const Tensor &tensor) {
+    cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
+}
+
+} // namespace infini
\ No newline at end of file
diff --git a/include/test.h b/include/test.h
index ad8fc8a5..35131fae 100644
--- a/include/test.h
+++ b/include/test.h
@@ -5,12 +5,14 @@
 
 namespace infini {
 
+// TODO: isolate these class
 class DataGenerator {
   private:
-    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); };
-    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); };
+    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }
+    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }
 
   public:
+    virtual ~DataGenerator() {}
     void operator()(void *data, size_t size, DataType dataType) {
         switch (dataType) {
         case DataType::UInt32:
@@ -26,10 +28,36 @@ class DataGenerator {
 };
 
 class IncrementalGenerator : public DataGenerator {
-    void fill(uint32_t *data, size_t size) override {
+  public:
+    virtual ~IncrementalGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
         for (size_t i = 0; i < size; i++) {
             data[i] = i;
         }
     }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
+};
+
+class OneGenerator : public DataGenerator {
+  public:
+    virtual ~OneGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = 1;
+        }
+    }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
 };
 } // namespace infini
\ No newline at end of file
diff --git a/src/core/blob.cc b/src/core/blob.cc
new file mode 100644
index 00000000..72b00be4
--- /dev/null
+++ b/src/core/blob.cc
@@ -0,0 +1,10 @@
+#include "core/runtime.h"
+
+namespace infini {
+
+BlobObj::~BlobObj() {
+    // Avoid cycled inclusion
+    runtime->dealloc(ptr);
+}
+
+} // namespace infini
\ No newline at end of file
diff --git a/src/core/common.cc b/src/core/common.cc
index d1c7fd40..43192441 100644
--- a/src/core/common.cc
+++ b/src/core/common.cc
@@ -4,11 +4,20 @@
 
 namespace infini {
 
-double timeit(const std::function<void()> &func) {
+double timeit(const std::function<void()> &func, int warmupRounds,
+              int timingRounds, const std::function<void(void)> &sync) {
+    for (int i = 0; i < warmupRounds; ++i)
+        func();
+    if (sync)
+        sync();
     auto start = std::chrono::high_resolution_clock::now();
-    func();
+    for (int i = 0; i < timingRounds; ++i)
+        func();
+    if (sync)
+        sync();
     auto end = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double, std::milli>(end - start).count();
+    return std::chrono::duration<double, std::milli>(end - start).count() /
+           timingRounds;
 }
 
 } // namespace infini
\ No newline at end of file
diff --git a/src/core/graph.cc b/src/core/graph.cc
index f8d85122..13e4678f 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -13,8 +13,9 @@ string GraphObj::toString() const {
 }
 
 void GraphObj::dataMalloc() {
-    for (auto &tensor : tensors)
-        tensor->dataMalloc();
+    for (auto &tensor : tensors) {
+        tensor->dataMalloc(runtime);
+    }
 }
 
 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
diff --git a/src/core/run_engine.cc b/src/core/runtime.cc
similarity index 75%
rename from src/core/run_engine.cc
rename to src/core/runtime.cc
index eace9218..38e3cc46 100644
--- a/src/core/run_engine.cc
+++ b/src/core/runtime.cc
@@ -1,9 +1,14 @@
-#include "core/run_enigne.h"
+#include "core/runtime.h"
+#include "core/blob.h"
 #include <chrono>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cudnn.h>
+#include <curand.h>
 
 namespace infini {
 
-void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
+void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
     if (!tune && profiling)
         IT_TODO_HALT();
     const auto &kernelRegistry = KernelRegistry::getInstance();
@@ -12,7 +17,6 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
     double totalTime = 0;
     std::map<OpType, double> opTime;
     std::map<OpType, int> opCnt;
-    std::chrono::system_clock::time_point begin, end;
 
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
@@ -24,7 +28,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
 
         // If no record and disable tuning, run with the default argument
         if (!perfData && !tune) {
-            kernel->compute(op);
+            kernel->compute(op, this);
             continue;
         }
 
@@ -32,16 +36,18 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
         PerfRecord record;
         // Tune the kernel if there is no record
         if (!perfData) {
-            record = kernel->tune(op);
+            // TODO: record is not used
+            record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
         } else
             record = *perfData;
 
         if (!profiling) {
-            kernel->compute(op, *perfData);
+            kernel->compute(op, *perfData, this);
             continue;
         } else {
-            double t = timeit([&]() { kernel->compute(op, *perfData); });
+            double t =
+                timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1);
             op->print();
             printf(" op_time %lf\n", t);
             totalTime += t;
@@ -53,7 +59,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
         printProfilingData(totalTime, opTime, opCnt);
 }
 
-double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
+double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto perfEngine = PerfEngine::getInstance();
     // Statistics
@@ -72,7 +78,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
         PerfRecord record;
         // Tune the kernel if there is no record
         if (!perfData) {
-            record = kernel->tune(op);
+            record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
         } else
             record = *perfData;
@@ -91,9 +97,9 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
     return totalTime;
 }
 
-void RunEngine::printProfilingData(double totalTime,
-                                   const std::map<OpType, double> &opTime,
-                                   const std::map<OpType, int> &opCnt) const {
+void RuntimeObj::printProfilingData(double totalTime,
+                                    const std::map<OpType, double> &opTime,
+                                    const std::map<OpType, int> &opCnt) const {
     printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
     for (const auto &[type, t] : opTime) {
         printf("%11s %3d %7.3f %7.1f %7.3f\n",
@@ -102,4 +108,8 @@ void RunEngine::printProfilingData(double totalTime,
     }
 }
 
+Blob RuntimeObj::allocBlob(size_t size) {
+    return make_ref<BlobObj>(shared_from_this(), alloc(size));
+}
+
 } // namespace infini
\ No newline at end of file
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index c6aac89d..ec6991ae 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -1,15 +1,12 @@
-#include <core/tensor.h>
+#include "core/tensor.h"
+#include "core/blob.h"
+#include "core/runtime.h"
+
 namespace infini {
 
 TensorObj::TensorObj(const Shape &shape, DataType dtype)
     : TensorBaseObj(shape.size(), dtype), shape(shape) {}
 
-void TensorObj::dataMalloc() {
-    IT_ASSERT(data == nullptr);
-    // initialized to zero
-    data.reset(reinterpret_cast<VType *>(calloc(size(), sizeof(VType))));
-}
-
 VType TensorObj::getData(const Shape &pos) const {
     return getData(getOffset(pos));
 }
@@ -37,24 +34,42 @@ size_t TensorObj::size() const {
     return ret;
 }
 
-void TensorObj::copyData(VType *dptr) {
+template <typename T> void TensorObj::copyData(const T *dptr) {
+    // TODO: cuda
     IT_ASSERT(data != nullptr);
+    auto ptr = data->getPtr<T *>();
     size_t sz = size();
 #pragma omp parallel for
     for (size_t i = 0; i < sz; ++i) {
-        data[i] = dptr[i];
+        ptr[i] = dptr[i];
     }
 }
+
 void TensorObj::copyData(vector<VType> dataVector) {
     IT_ASSERT(dataVector.size() >= size());
     copyData(dataVector.data());
 }
 
+void TensorObj::copyData(vector<float> dataVector) {
+    IT_ASSERT(dataVector.size() >= size());
+    copyData(dataVector.data());
+}
+
 void TensorObj::printData() const {
     IT_ASSERT(data != nullptr);
+    if (dtype == DataType::Float32)
+        printDataFloat();
+    else if (dtype == DataType::UInt32)
+        printDataUint32_t();
+    else
+        IT_TODO_HALT();
+}
+
+void TensorObj::printDataFloat() const {
     std::cout << "Tensor: " << guid << std::endl;
     auto numDims = shape.size();
     auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<float *>();
     dimSzVec[numDims - 1] = shape[numDims - 1];
     for (int i = numDims - 1; i != 0; --i)
         dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
@@ -64,7 +79,35 @@ void TensorObj::printData() const {
                 std::cout << "[";
             }
         }
-        std::cout << data[i];
+        printf("%.1f", ptr[i]);
+        for (size_t j = 0; j < numDims; ++j) {
+            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
+                std::cout << "]";
+            }
+        }
+        if (i != size() - 1)
+            std::cout << ", ";
+        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
+            std::cout << std::endl;
+    }
+}
+
+void TensorObj::printDataUint32_t() const {
+    IT_ASSERT(data != nullptr);
+    std::cout << "Tensor: " << guid << std::endl;
+    auto numDims = shape.size();
+    auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<VType *>();
+    dimSzVec[numDims - 1] = shape[numDims - 1];
+    for (int i = numDims - 1; i != 0; --i)
+        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+        for (size_t j = 0; j < numDims; ++j) {
+            if (i % dimSzVec[j] == 0) {
+                std::cout << "[";
+            }
+        }
+        std::cout << ptr[i];
         for (size_t j = 0; j < numDims; ++j) {
             if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                 std::cout << "]";
@@ -80,13 +123,26 @@ void TensorObj::printData() const {
 bool TensorObj::equalData(const Tensor &rhs) const {
     IT_ASSERT(data != nullptr);
     IT_ASSERT(rhs->data != nullptr);
+    // TODO: deal with data type
+    auto ptr = data->getPtr<VType *>();
+    auto ptrRhs = rhs->data->getPtr<VType *>();
     if (shape != rhs->getDims())
         return false;
     size_t sz = size();
     for (size_t i = 0; i < sz; ++i)
-        if (data[i] != rhs->data[i])
+        if (ptr[i] != ptrRhs[i])
             return false;
     return true;
 }
 
+void TensorObj::dataMalloc(const Runtime &runtime) {
+    IT_ASSERT(data == nullptr);
+    size_t bytesPerElement;
+    if (getDType() == DataType::Float32)
+        bytesPerElement = sizeof(float);
+    else if (getDType() == DataType::UInt32)
+        bytesPerElement = sizeof(uint32_t);
+    data = runtime->allocBlob(size() * bytesPerElement);
+}
+
 }; // namespace infini
\ No newline at end of file
diff --git a/src/core/tensor_base.cc b/src/core/tensor_base.cc
index 84cad959..9414361b 100644
--- a/src/core/tensor_base.cc
+++ b/src/core/tensor_base.cc
@@ -1,9 +1,14 @@
-#include <core/tensor_base.h>
+#include "core/tensor_base.h"
+#include "core/blob.h"
+#include "core/runtime.h"
 namespace infini {
 
 TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
     : dim(dim), dtype(dtype) {}
 
-VType TensorBaseObj::getData(size_t offset) const { return data[offset]; }
+VType TensorBaseObj::getData(size_t offset) const {
+    // TODO: check cuda array
+    return (data->getPtr<VType *>())[offset];
+}
 
 }; // namespace infini
\ No newline at end of file
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
new file mode 100644
index 00000000..59fcabe2
--- /dev/null
+++ b/src/cuda/cuda_runtime.cc
@@ -0,0 +1,32 @@
+#include "cuda/cuda_runtime.h"
+
+namespace infini {
+
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto perfEngine = PerfEngine::getInstance();
+
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
+        if (perfData)
+            kernel->compute(op, *perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+    if (tune || profiling)
+        IT_TODO_HALT();
+    runWithoutSync(graph);
+    sync();
+}
+
+void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
+
+} // namespace infini
\ No newline at end of file
diff --git a/src/cuda/cuda_utility.cu b/src/cuda/cuda_utility.cu
new file mode 100644
index 00000000..ca0a2d01
--- /dev/null
+++ b/src/cuda/cuda_utility.cu
@@ -0,0 +1,20 @@
+#include "cuda/cuda_common.h"
+#include <cstdio>
+
+__global__ void cudaPrintFloatImpl(float *x, int len) {
+    int start = threadIdx.x + blockDim.x * blockIdx.x;
+    if (start == 0) {
+        for (int i = 0; i < len; ++i) {
+            printf("%.3f ", x[i]);
+        }
+        printf("\n");
+    }
+}
+
+namespace infini {
+
+void cudaPrintFloat(float *x, int len) {
+    cudaPrintFloatImpl<<<1, 1>>>(x, len);
+    cudaDeviceSynchronize();
+}
+} // namespace infini
\ No newline at end of file
diff --git a/src/kerels/cpu/conv.cc b/src/kerels/cpu/conv.cc
index f68479d4..ac8f7b43 100644
--- a/src/kerels/cpu/conv.cc
+++ b/src/kerels/cpu/conv.cc
@@ -4,11 +4,12 @@
 namespace infini {
 
 template <typename T> class NaiveConv : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
+                 const RuntimeObj *context) const override {
         auto op = as<ConvObj>(_op);
-        T *iptr = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
-        T *wptr = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
-        T *optr = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
+        T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
+        T *optr = op->getOutput()->getDataRawPtr<T *>();
         auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
         auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
         int cpg = op->getChannelPerGroup();
@@ -45,10 +46,13 @@ template <typename T> class NaiveConv : public Kernel {
         }
     }
 
-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        compute(op, {}, context);
+    }
 
-    PerfRecord tune(const Operator &op) const override {
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+    PerfRecord tune(const Operator &op,
+                    const RuntimeObj *context) const override {
+        return PerfRecord(timeit([&]() { compute(op, context); }));
     }
 };
 
diff --git a/src/kerels/cpu/matmul.cc b/src/kerels/cpu/matmul.cc
index a1e83103..d7182954 100644
--- a/src/kerels/cpu/matmul.cc
+++ b/src/kerels/cpu/matmul.cc
@@ -4,11 +4,12 @@
 namespace infini {
 
 template <typename T> class NaiveMatmul : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
+                 const RuntimeObj *context) const override {
         auto op = as<MatmulObj>(_op);
-        T *A = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
-        T *B = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
-        T *C = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *A = op->getInputs(0)->getDataRawPtr<T *>();
+        T *B = op->getInputs(1)->getDataRawPtr<T *>();
+        T *C = op->getOutput()->getDataRawPtr<T *>();
         IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
         IT_ASSERT(op->getAct() == ActType::None);
         IT_ASSERT(op->getB() == 1);
@@ -23,10 +24,15 @@ template <typename T> class NaiveMatmul : public Kernel {
         }
     }
 
-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        compute(op, {}, context);
+    }
 
-    PerfRecord tune(const Operator &op) const override {
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+    PerfRecord tune(const Operator &op,
+                    const RuntimeObj *context) const override {
+        PerfRecord ret;
+        ret.time = timeit([&]() { compute(op, context); });
+        return ret;
     }
 };
 
diff --git a/src/kerels/cuda/conv.cc b/src/kerels/cuda/conv.cc
new file mode 100644
index 00000000..8289ea4f
--- /dev/null
+++ b/src/kerels/cuda/conv.cc
@@ -0,0 +1,208 @@
+#include "operators/conv.h"
+#include "core/kernel.h"
+#include "cuda/cuda_runtime.h"
+
+namespace infini {
+
+static constexpr int N_ALGO = 8;
+static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = {
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
+
+struct ConvCuDnnPerfRecord : public PerfRecord {
+    int algo = 0; // cudnnConvolutionFwdAlgo_t
+    size_t workspaceSize = 100000;
+    bool fuseAct = false;
+};
+
+class convCudnn : public Kernel {
+
+    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
+                      const CudaRuntimeObj *context) const {
+        cudnnStatus_t stat;
+        void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
+        void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
+        if (op->getInputs(2) != nullptr)
+            IT_TODO_HALT();
+        // void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
+        void *const outData = (op->getOutput()->getDataRawPtr<void *>());
+
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+
+        int channelsPerGrp = cpg, channels = c;
+
+        // get inputs
+        cudnnTensorDescriptor_t inDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
+
+        // get kernels
+        cudnnFilterDescriptor_t knDesc;
+        checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
+        checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
+                                                   CUDNN_TENSOR_NCHW, f,
+                                                   channelsPerGrp, r, s));
+
+        // get bias
+        cudnnTensorDescriptor_t biasDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
+
+        // get convlution descriptor
+        cudnnConvolutionDescriptor_t convDesc;
+        checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
+        // TODO: CUDNN_CONVOLUTION is a tunable argument
+        checkCudnnError(cudnnSetConvolution2dDescriptor(
+            convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION,
+            CUDNN_DATA_FLOAT));
+        if (g > 1) {
+            checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
+        }
+
+        // get activation descriptor
+        cudnnActivationDescriptor_t actDesc;
+        checkCudnnError(cudnnCreateActivationDescriptor(&actDesc));
+        // NOT_PROPAGATE_NAN is requierd by
+        // cudnnConvolotionBiasActivationForward
+        switch (op->getAct()) {
+        case ActType::Relu:
+            checkCudnnError(cudnnSetActivationDescriptor(
+                actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        case ActType::Sigmoid:
+            checkCudnnError(cudnnSetActivationDescriptor(
+                actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        case ActType::None:
+            checkCudnnError(
+                cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY,
+                                             CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        default:
+            assert(false);
+        }
+
+        // get outputs
+        int outn, outc, outh, outw;
+        checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
+            convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
+        cudnnTensorDescriptor_t outDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
+                                                   CUDNN_DATA_FLOAT, outn, outc,
+                                                   outh, outw));
+        IT_ASSERT((vector{outn, outc, outh, outw}) ==
+                      op->getOutput()->getDims(),
+                  "cuDNN output shape mismatches with OP output shape");
+
+        // get workspace
+        size_t wsSize = record.workspaceSize;
+        stat = cudnnGetConvolutionForwardWorkspaceSize(
+            context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
+            ALGOS[record.algo], &wsSize);
+        if (stat != CUDNN_STATUS_SUCCESS)
+            return false;
+        // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024);
+        // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024)
+        //     continue;
+        CudaPtr wsData = context->getWorkspace(wsSize);
+        float alpha = 1.f, beta = 0.f;
+
+        stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc,
+                                       inData, knDesc, knData, convDesc,
+                                       ALGOS[record.algo], wsData, wsSize,
+                                       &beta, outDesc, outData);
+        if (stat != CUDNN_STATUS_SUCCESS)
+            return false;
+        // TODO:
+        // // bias
+        // if (bias != nullptr) {
+        //     auto sz = op.getOutputs()[0]->size();
+        //     // TODO: element wise
+        //     t += sz * 2 / 400;
+        // }
+        // // act
+        // if (act != None) {
+        //     stat = cudnnActivationForward(cudnnHandle(), actDesc,
+        //                                   &alpha, inDesc, inData,
+        //                                   &beta, outDesc, outData);
+        //     checkCudaError(cudaDeviceSynchronize());
+        //     end = ch::high_resolution_clock::now();
+        //     if (stat != CUDNN_STATUS_SUCCESS) {
+        //         durtime = INFINITY;
+        //         break;
+        //     }
+        //     t +=
+        //         ch::duration_cast<ch::duration<double>>(end -
+        //         beg).count() * 1000; // ms
+        // }
+
+        // best = ConvResult{durtime, ALGOS[i], wsSize, false};
+
+        // // w/ bias & act
+        // for (int j = 0; j < rounds + warmupRounds; ++j) {
+        //     cudnnStatus_t stat;
+        //     if (j == warmupRounds) {
+        //         checkCudaError(cudaDeviceSynchronize());
+        //         beg = ch::high_resolution_clock::now();
+        //     }
+        //     stat = cudnnConvolutionBiasActivationForward(
+        //         cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
+        //         convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc,
+        //         outData, biasDesc, biasData, actDesc, outDesc, outData);
+        //     if (stat != CUDNN_STATUS_SUCCESS) {
+        //         // checkCudnnError(stat);
+        //         // Do not checkCudnnError since not all algorithms are
+        //         // supported
+        //         durtime_fuse = INFINITY;
+        //         break;
+        //     }
+        // }
+
+        // Destories in CUDA does not require sync. But cuDNN does not state
+        // whether sync is required before destories.
+        checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
+        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
+        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
+        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
+        return true;
+    }
+
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        ConvCuDnnPerfRecord record; // with paramters in default ctor
+        compute(op, record, context);
+    }
+
+    PerfRecord tune(const Operator &_op,
+                    const RuntimeObj *_context) const override {
+        // TODO: real tuning
+        ConvCuDnnPerfRecord ret;
+        ret.time = timeit([&]() { compute(_op, _context); });
+        return ret;
+    }
+
+    void compute(const Operator &_op, const PerfRecord &_record,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvObj>(_op);
+        auto &record = dynamic_cast<const ConvCuDnnPerfRecord &>(_record);
+        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
+        bool success = cuDNNUnfused(op, record, context);
+        IT_ASSERT(success);
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
+                "Conv_cuDNN_CUDA_Float32");
+
+} // namespace infini
\ No newline at end of file
diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc
index 7c764141..ad1752f8 100644
--- a/test/core/test_graph.cc
+++ b/test/core/test_graph.cc
@@ -1,45 +1,48 @@
+#include "core/blob.h"
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"
 
 namespace infini {
 
 TEST(Graph, build_and_run) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
     Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
     Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
     g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
     g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
-    RunEngine(Device::CPU).run(g);
+    runtime->run(g);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->dataMalloc(runtime);
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
     EXPECT_TRUE(o0->equalData(ans));
 }
 
 TEST(Graph, perf_engine) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
     Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
     auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
 
     g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    RunEngine(Device::CPU).run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    runtime->run(g, true, true);
+    double perfTime = runtime->getPerfTime(g);
     // The example matmul takes 0.0036ms with one core
     EXPECT_GT(perfTime, 0);
     EXPECT_LT(perfTime, 0.01);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->dataMalloc(runtime);
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
     EXPECT_TRUE(matmul->getOutput()->equalData(ans));
 }
 
diff --git a/test/core/test_hash.cc b/test/core/test_hash.cc
index 4f3f29d4..3c244acb 100644
--- a/test/core/test_hash.cc
+++ b/test/core/test_hash.cc
@@ -1,5 +1,5 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"
 
@@ -8,7 +8,7 @@ namespace infini {
 TEST(Hash, OperatorHash) {
     OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
     { // build with addOpWithOutputs
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
         Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
         Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
         Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
@@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) {
         EXPECT_GT(key1.attrs.size(), 5);
     }
     { // build with addOp
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
         Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
         auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
diff --git a/test/operators/test_conv.cc b/test/operators/test_conv.cc
index 4584c660..f5c6f7d9 100644
--- a/test/operators/test_conv.cc
+++ b/test/operators/test_conv.cc
@@ -1,21 +1,24 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
 #include "operators/conv.h"
 #include "test.h"
 
 namespace infini {
 
 TEST(Conv, ShapeInference) {
+    auto runtime = make_ref<CpuRuntimeObj>();
     // Padding modes
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
     }
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv =
@@ -23,7 +26,7 @@ TEST(Conv, ShapeInference) {
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
     }
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv =
@@ -31,7 +34,7 @@ TEST(Conv, ShapeInference) {
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2}));
     }
     { // dilation & stride
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@@ -40,7 +43,8 @@ TEST(Conv, ShapeInference) {
 }
 
 TEST(Conv, NaiveCPU) {
-    Graph g = make_ref<GraphObj>();
+    auto runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
     Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
     auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@@ -48,16 +52,75 @@ TEST(Conv, NaiveCPU) {
     g->dataMalloc();
     i0->setData(IncrementalGenerator());
     w0->setData(IncrementalGenerator());
-    RunEngine(Device::CPU).run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
-    // The example matmul takes 0.0036ms with one core
+    runtime->run(g, true, true);
+    double perfTime = runtime->getPerfTime(g);
+    // The example Conv takes 0.015ms with one core
     EXPECT_GT(perfTime, 0);
-    EXPECT_LT(perfTime, 5);
+    EXPECT_LT(perfTime, 0.1);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
-    ans->dataMalloc();
-    ans->copyData({4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
+    ans->dataMalloc(runtime);
+    ans->copyData(
+        vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
     EXPECT_TRUE(conv->getOutput()->equalData(ans));
 }
 
+void testConvCudnn(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    vector<float> ansVec) {
+    auto cpuRuntime = make_ref<CpuRuntimeObj>();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    // Build CUDA graph
+    Graph g = make_ref<GraphObj>(cudaRuntime);
+    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
+    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
+    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
+
+    // allocate CUDA memory
+    g->dataMalloc();
+
+    // Build input and output data on CPU
+    auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
+    cpui0->dataMalloc(cpuRuntime);
+    cpui0->setData(generator);
+
+    auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
+    cpuw0->dataMalloc(cpuRuntime);
+    cpuw0->setData(generator);
+
+    auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
+    ans->dataMalloc(cpuRuntime);
+    ans->copyData(ansVec);
+
+    // Copy inputs from CPU to CUDA
+    cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
+               cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
+               cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
+    // Execute on CUDA
+    cudaRuntime->run(g);
+    // double perfTime = cudaRuntime->getPerfTime(g);
+    // // The example Conv takes 0.015ms with one core
+    // EXPECT_GT(perfTime, 0);
+    // EXPECT_LT(perfTime, 0.1);
+
+    // copy CUDA output to CPU
+    auto o0 = conv->getOutput();
+    auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
+    cpuo0->dataMalloc(cpuRuntime);
+    cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
+               conv->getOutput()->getDataRawPtr<void *>(),
+               cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // check results on CPU
+    EXPECT_TRUE(cpuo0->equalData<float>(ans));
+}
+
+TEST(Conv, cuDNN) {
+    testConvCudnn(OneGenerator(),
+                  vector<float>{12, 12, 18, 18, 12, 12, 18, 18});
+    testConvCudnn(
+        IncrementalGenerator(),
+        vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
+}
 } // namespace infini
\ No newline at end of file
diff --git a/test/script/env_lotus.sh b/test/script/env_lotus.sh
new file mode 100644
index 00000000..a65a746d
--- /dev/null
+++ b/test/script/env_lotus.sh
@@ -0,0 +1,2 @@
+spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
+export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc