Add CUDA runtime (#6)

* Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
2022-08-22 15:01:03 +08:00 · 2022-08-22 15:01:03 +08:00 · 04ea5eed38
parent 9303ddda8e
commit 04ea5eed38
29 changed files with 875 additions and 127 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,6 +4,7 @@ include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
 option(USE_CUDA "Support CUDA GPU" ON)
 option(BUILD_TEST "Build tests" ON)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@ -21,7 +22,6 @@ find_package(
  Python
  COMPONENTS Interpreter Development
  REQUIRED)
 find_package(CUDA REQUIRED)
 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@ -53,14 +53,19 @@ if(BUILD_TEST)
 endif()
 file(GLOB_RECURSE SRC src/*.cc src/*.cu)
 # file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc)
 # list(REMOVE_ITEM SRC ${TEST} ${FFI})
 add_library(InfiniTensor SHARED ${SRC})
-# Target
+
-# cuda_add_library(it SHARED ${SRC})
+if(USE_CUDA)
-# cuda_add_cublas_to_target(it) # cublas
+  # set(CUDA_HOST_COMPILER /home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc)
-# # target_link_libraries(infini_cpp cudnn curand nlohmann_json::nlohmann_json pybind11::embed)
+  enable_language(CUDA)
  # TODO: how to set option for CUDA_HOST_COMPILER. Now env var CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc takes effect.
  # option(CUDA_HOST_COMPILER "" ${CMAKE_C_COMPILER})
  # TODO:  find_package seems unnecessary for CMake >= 3.8
  find_package(CUDA REQUIRED)
  # message("CUBLAS_LIBRARIES: ${CUDA_LIBRARIES}")
  target_link_libraries(InfiniTensor cudnn curand cublas ${CUDA_LIBRARIES})
 endif()
 # # Python bindings
 # pybind11_add_module(infini MODULE ${FFI})
--- a/README.md
+++ b/README.md
@ -1 +1,9 @@
 # InfiniTensor
 ## Compilation on Lotus
 ``` bash
 # Enter the root of InfiniTensor
 source test/script/env_lotus.sh 
 mkdir build && cd build
 cmake .. && make -j 12
 ```
--- a/include/core/blob.h
+++ b/include/core/blob.h
@ -0,0 +1,24 @@
 #pragma once
 #include "core/common.h"
 #include "core/ref.h"
 namespace infini {
 class RuntimeObj;
 using Runtime = Ref<RuntimeObj>;
 class BlobObj {
    // Runtime might be replaced with a raw pointer for optimization
    Runtime runtime;
    void *ptr;
  public:
    BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {}
    BlobObj(BlobObj &other) = delete;
    BlobObj &operator=(BlobObj const &) = delete;
    ~BlobObj();
    template <typename T> T getPtr() const { return reinterpret_cast<T>(ptr); }
 };
 } // namespace infini
--- a/include/core/common.h
+++ b/include/core/common.h
@ -69,6 +69,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
    return ret;
 }
-double timeit(const std::function<void()> &func);
+double timeit(const std::function<void()> &func, int warmupRounds = 200,
              int timingRounds = 200,
              const std::function<void(void)> &sync = {});
 } // namespace infini
--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -4,16 +4,16 @@
 namespace infini {
 // TODO: graph should be attached to a context
 class GraphObj : public Object {
  protected:
    Runtime runtime;
    TensorVec tensors;
    TensorVec inputs;
    TensorVec outputs;
    OpVec ops;
  public:
-    // Graph(OpVec oplist);
+    GraphObj(Runtime runtime) : runtime(runtime){};
    string toString() const override;
    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@ -5,8 +5,14 @@
 namespace infini {
 class RuntimeObj; // Forward declaration for Kernel::compute
 struct PerfRecord {
-    double time; // in milliseconds
+    PerfRecord(){};
    PerfRecord(double time) : time(time){};
    virtual ~PerfRecord() {}
    double time = 0; // in milliseconds
 };
 class Kernel {
@ -20,14 +26,16 @@ class Kernel {
     * are required, inherit from PerfRecord and add extra parameters.
     * Otherwire, use PerfRecord directly.
     */
-    virtual void compute(const Operator &op,
+    virtual void compute(const Operator &op, const PerfRecord &record,
-                         const PerfRecord &record) const = 0;
+                         const RuntimeObj *context) const = 0;
    /**
     * @brief Executes an op with a default parameter.
     */
-    virtual void compute(const Operator &op) const = 0;
+    virtual void compute(const Operator &op,
                         const RuntimeObj *context) const = 0;
    // Premise: op is idempotent since it is called multiple times.
-    virtual PerfRecord tune(const Operator &op) const = 0;
+    virtual PerfRecord tune(const Operator &op,
                            const RuntimeObj *context) const = 0;
 };
 class KernelRegistry {
--- a/include/core/run_enigne.h
+++ b/include/core/run_enigne.h
@ -1,26 +0,0 @@
 #pragma once
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
 namespace infini {
 class RunEngine {
  private:
    Device device;
  public:
    RunEngine(Device device) : device(device) {}
    ~RunEngine() {}
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    double getPerfTime(const Graph &graph, bool profiling = false) const;
  private:
    void printProfilingData(double totTime,
                            const std::map<OpType, double> &opTime,
                            const std::map<OpType, int> &opCnt) const;
 };
 } // namespace infini
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -0,0 +1,62 @@
 #pragma once
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
 namespace infini {
 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
  protected:
    Device device;
  public:
    RuntimeObj(Device device) : device(device) {}
    RuntimeObj(RuntimeObj &other) = delete;
    RuntimeObj &operator=(RuntimeObj const &) = delete;
    virtual ~RuntimeObj() {}
    /**
     * @brief Execute a graph.
     *
     * @param graph
     * @param tune If there is no performance record, whether to tune it. These
     * can be independent method.
     * @param profiling Whether to print breakdown of time
     */
    virtual void run(const Graph &graph, bool tune = false,
                     bool profiling = false) const = 0;
    virtual void *alloc(size_t size) = 0;
    virtual void dealloc(void *ptr) = 0;
    /**
     * @brief Get the execution time of each operator in performance record. No
     * execution happens.
     *
     * @param graph
     * @param profiling Whether to print breakdown of time
     * @return double Return the sum of perf time for each operator
     */
    double getPerfTime(const Graph &graph, bool profiling = false) const;
    Blob allocBlob(size_t size);
  protected:
    void printProfilingData(double totTime,
                            const std::map<OpType, double> &opTime,
                            const std::map<OpType, int> &opCnt) const;
 };
 // TODO: change inheritance relation
 class CpuRuntimeObj : public RuntimeObj {
  public:
    CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const override;
    void dealloc(void *ptr) override { return free(ptr); };
    void *alloc(size_t size) override {
        return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
                      sizeof(uint64_t));
    };
 };
 } // namespace infini
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -1,5 +1,6 @@
 #pragma once
 #include "core/tensor_base.h"
 #include <cmath>
 namespace infini {
@ -16,21 +17,46 @@ class TensorObj : public TensorBaseObj {
    string toString() const override;
    size_t size() const;
    void dataMalloc();
    Shape getDims() const { return shape; }
    size_t getOffset(const Shape &ds) const;
    using TensorBaseObj::getData;
    VType getData(const Shape &pos) const;
-    void copyData(VType *dptr);
+    void dataMalloc(const Runtime &runtime);
    // void copyData(VType *dptr);
    template <typename T> void copyData(const T *dptr);
    void copyData(vector<VType> dataVector);
    void copyData(vector<float> dataVector);
    void printData() const;
    // TODO: merge these methods
    bool equalData(const Tensor &rhs) const;
-    void
+    template <typename T> bool equalData(const Tensor &rhs) const {
-    setData(std::function<void(void *, size_t, DataType)> generator) const {
+        IT_ASSERT(data != nullptr);
-        generator((void *)(data.get()), size(), dtype);
+        IT_ASSERT(rhs->data != nullptr);
        // TODO: deal with data type
        auto ptr = data->getPtr<T *>();
        auto ptrRhs = rhs->data->getPtr<T *>();
        if (shape != rhs->getDims())
            return false;
        size_t sz = size();
        for (size_t i = 0; i < sz; ++i)
            if (fabs(ptr[i] - ptrRhs[i]) /
                    std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
                1e-6) {
                printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
                return false;
            }
        return true;
    }
    void setData(
        const std::function<void(void *, size_t, DataType)> &generator) const {
        generator(data->getPtr<void *>(), size(), dtype);
    }
  private:
    void printDataFloat() const;
    void printDataUint32_t() const;
    // void setDims(const Dim &dms) { dims = dms; }
    //     bool dataRand(int seed = 0) {
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@ -1,19 +1,23 @@
 #pragma once
 #include "core/blob.h"
 #include "core/object.h"
 #include "core/ref.h"
 namespace infini {
 // class Tensor;
 class TensorBaseObj;
 class TensorObj;
 class OperatorObj;
 class GraphObj;
 class RuntimeObj;
 class BlobObj;
 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
 using Operator = Ref<OperatorObj>;
 using Graph = Ref<GraphObj>;
 using Runtime = Ref<RuntimeObj>;
 using Blob = Ref<BlobObj>;
 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@ -40,8 +44,7 @@ class TensorBaseObj : public Object {
    DataType dtype;
    vector<WRef<TensorBaseObj>> inputOf;
    WRef<TensorBaseObj> outputOf;
-    // TODO: Ref<void> -> Ref<Blob>
+    Blob data;
    Ref<VType[]> data;
    // ComputeState computed;
    // static int random_seed[256 * 16];
    // static bool random_inited;
@ -50,7 +53,15 @@ class TensorBaseObj : public Object {
    TensorBaseObj(int dim, DataType dtype);
    virtual ~TensorBaseObj() {}
-    Ref<VType[]> getDataPtr() const { return data; }
+    void dataMalloc(const Blob &blob) {
        IT_ASSERT(data == nullptr);
        data = blob;
    }
    Blob getDataPtr() const { return data; }
    template <typename T> T getDataRawPtr() const {
        IT_ASSERT(data != nullptr);
        return data->getPtr<T>();
    }
    VType getData(size_t offset) const;
    DataType getDType() const { return dtype; }
--- a/include/cuda/cuda_common.h
+++ b/include/cuda/cuda_common.h
@ -0,0 +1,111 @@
 #pragma once
 #include "core/common.h"
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cudnn.h>
 #include <curand.h>
 // TODO: replace with Exception (IT_ASSERT)
 #define checkCudaError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (cudaSuccess != err) {                                              \
            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err));                                  \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 #define checkCublasError(call)                                                 \
    {                                                                          \
        auto err = call;                                                       \
        if (CUBLAS_STATUS_SUCCESS != err) {                                    \
            fprintf(stderr, "cuBLAS error in %s:%i : %s.\n", __FILE__,         \
                    __LINE__, cublasGetErrorString(err));                      \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 #define checkCudnnError(call)                                                  \
    {                                                                          \
        auto err = call;                                                       \
        if (CUDNN_STATUS_SUCCESS != err) {                                     \
            fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__,          \
                    __LINE__, cudnnGetErrorString(err));                       \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 #define checkCurandError(call)                                                 \
    {                                                                          \
        auto err = call;                                                       \
        if (CURAND_STATUS_SUCCESS != err) {                                    \
            fprintf(stderr, "cuRAND error in %s:%i : %s.\n", __FILE__,         \
                    __LINE__, curandGetErrorString(err));                      \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 namespace infini {
 inline const char *cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
        return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
 }
 inline const char *curandGetErrorString(curandStatus_t error) {
    switch (error) {
    case CURAND_STATUS_SUCCESS:
        return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
        return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
        return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
        return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
        return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
        return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
        return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
        return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
        return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
        return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
        return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
        return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
        return "CURAND_STATUS_INTERNAL_ERROR";
    }
    return "<unknown>";
 }
 using CudaPtr = void *;
 } // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -0,0 +1,49 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
 namespace infini {
 class CudaRuntimeObj : public RuntimeObj {
  private:
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
    CudaPtr workspace;
    size_t workspaceSize;
  public:
    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
        checkCudnnError(cudnnCreate(&cudnn));
        checkCublasError(cublasCreate(&cublas));
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        workspace = alloc(workspaceSize);
    }
    virtual ~CudaRuntimeObj() {
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
    }
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
    void sync() const;
    CudaPtr alloc(size_t size) override {
        void *ptr;
        checkCudaError(cudaMalloc(&ptr, size));
        return ptr;
    }
    void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
    cudnnHandle_t cudnnHandle() const { return cudnn; }
    cublasHandle_t cublasHandle() const { return cublas; }
    CudaPtr getWorkspace(size_t size) const {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }
  private:
    void runWithoutSync(const Graph &graph) const;
 };
 } // namespace infini
--- a/include/cuda/cuda_utility.h
+++ b/include/cuda/cuda_utility.h
@ -0,0 +1,11 @@
 #include "core/tensor.h"
 namespace infini {
 void cudaPrintFloat(float *x, int len);
 void cudaPrintTensor(const Tensor &tensor) {
    cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
 }
 } // namespace infini
--- a/include/test.h
+++ b/include/test.h
@ -5,12 +5,14 @@
 namespace infini {
 // TODO: isolate these class
 class DataGenerator {
  private:
-    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); };
+    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }
-    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); };
+    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }
  public:
    virtual ~DataGenerator() {}
    void operator()(void *data, size_t size, DataType dataType) {
        switch (dataType) {
        case DataType::UInt32:
@ -26,10 +28,36 @@ class DataGenerator {
 };
 class IncrementalGenerator : public DataGenerator {
-    void fill(uint32_t *data, size_t size) override {
+  public:
    virtual ~IncrementalGenerator() {}
  private:
    template <typename T> void fill(T *data, size_t size) {
        for (size_t i = 0; i < size; i++) {
            data[i] = i;
        }
    }
    void fill(uint32_t *data, size_t size) override {
        fill<uint32_t>(data, size);
    }
    void fill(float *data, size_t size) override { fill<float>(data, size); }
 };
 class OneGenerator : public DataGenerator {
  public:
    virtual ~OneGenerator() {}
  private:
    template <typename T> void fill(T *data, size_t size) {
        for (size_t i = 0; i < size; i++) {
            data[i] = 1;
        }
    }
    void fill(uint32_t *data, size_t size) override {
        fill<uint32_t>(data, size);
    }
    void fill(float *data, size_t size) override { fill<float>(data, size); }
 };
 } // namespace infini
--- a/src/core/blob.cc
+++ b/src/core/blob.cc
@ -0,0 +1,10 @@
 #include "core/runtime.h"
 namespace infini {
 BlobObj::~BlobObj() {
    // Avoid cycled inclusion
    runtime->dealloc(ptr);
 }
 } // namespace infini
--- a/src/core/common.cc
+++ b/src/core/common.cc
@ -4,11 +4,20 @@
 namespace infini {
-double timeit(const std::function<void()> &func) {
+double timeit(const std::function<void()> &func, int warmupRounds,
-    auto start = std::chrono::high_resolution_clock::now();
+              int timingRounds, const std::function<void(void)> &sync) {
    for (int i = 0; i < warmupRounds; ++i)
        func();
    if (sync)
        sync();
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < timingRounds; ++i)
        func();
    if (sync)
        sync();
    auto end = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double, std::milli>(end - start).count();
+    return std::chrono::duration<double, std::milli>(end - start).count() /
           timingRounds;
 }
 } // namespace infini
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@ -13,8 +13,9 @@ string GraphObj::toString() const {
 }
 void GraphObj::dataMalloc() {
-    for (auto &tensor : tensors)
+    for (auto &tensor : tensors) {
-        tensor->dataMalloc();
+        tensor->dataMalloc(runtime);
    }
 }
 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
--- a/src/core/run_engine.cc
+++ b/src/core/run_engine.cc
@ -1,9 +1,14 @@
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "core/blob.h"
 #include <chrono>
 #include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <cudnn.h>
 #include <curand.h>
 namespace infini {
-void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
+void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
    if (!tune && profiling)
        IT_TODO_HALT();
    const auto &kernelRegistry = KernelRegistry::getInstance();
@ -12,7 +17,6 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
    double totalTime = 0;
    std::map<OpType, double> opTime;
    std::map<OpType, int> opCnt;
    std::chrono::system_clock::time_point begin, end;
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
@ -24,7 +28,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
        // If no record and disable tuning, run with the default argument
        if (!perfData && !tune) {
-            kernel->compute(op);
+            kernel->compute(op, this);
            continue;
        }
@ -32,16 +36,18 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
        PerfRecord record;
        // Tune the kernel if there is no record
        if (!perfData) {
-            record = kernel->tune(op);
+            // TODO: record is not used
            record = kernel->tune(op, this);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = *perfData;
        if (!profiling) {
-            kernel->compute(op, *perfData);
+            kernel->compute(op, *perfData, this);
            continue;
        } else {
-            double t = timeit([&]() { kernel->compute(op, *perfData); });
+            double t =
                timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1);
            op->print();
            printf(" op_time %lf\n", t);
            totalTime += t;
@ -53,7 +59,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
        printProfilingData(totalTime, opTime, opCnt);
 }
-double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
+double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto perfEngine = PerfEngine::getInstance();
    // Statistics
@ -72,7 +78,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
        PerfRecord record;
        // Tune the kernel if there is no record
        if (!perfData) {
-            record = kernel->tune(op);
+            record = kernel->tune(op, this);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = *perfData;
@ -91,7 +97,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
    return totalTime;
 }
-void RunEngine::printProfilingData(double totalTime,
+void RuntimeObj::printProfilingData(double totalTime,
                                    const std::map<OpType, double> &opTime,
                                    const std::map<OpType, int> &opCnt) const {
    printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
@ -102,4 +108,8 @@ void RunEngine::printProfilingData(double totalTime,
    }
 }
 Blob RuntimeObj::allocBlob(size_t size) {
    return make_ref<BlobObj>(shared_from_this(), alloc(size));
 }
 } // namespace infini
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@ -1,15 +1,12 @@
-#include <core/tensor.h>
+#include "core/tensor.h"
 #include "core/blob.h"
 #include "core/runtime.h"
 namespace infini {
 TensorObj::TensorObj(const Shape &shape, DataType dtype)
    : TensorBaseObj(shape.size(), dtype), shape(shape) {}
 void TensorObj::dataMalloc() {
    IT_ASSERT(data == nullptr);
    // initialized to zero
    data.reset(reinterpret_cast<VType *>(calloc(size(), sizeof(VType))));
 }
 VType TensorObj::getData(const Shape &pos) const {
    return getData(getOffset(pos));
 }
@ -37,24 +34,42 @@ size_t TensorObj::size() const {
    return ret;
 }
-void TensorObj::copyData(VType *dptr) {
+template <typename T> void TensorObj::copyData(const T *dptr) {
    // TODO: cuda
    IT_ASSERT(data != nullptr);
    auto ptr = data->getPtr<T *>();
    size_t sz = size();
 #pragma omp parallel for
    for (size_t i = 0; i < sz; ++i) {
-        data[i] = dptr[i];
+        ptr[i] = dptr[i];
    }
 }
 void TensorObj::copyData(vector<VType> dataVector) {
    IT_ASSERT(dataVector.size() >= size());
    copyData(dataVector.data());
 }
 void TensorObj::copyData(vector<float> dataVector) {
    IT_ASSERT(dataVector.size() >= size());
    copyData(dataVector.data());
 }
 void TensorObj::printData() const {
    IT_ASSERT(data != nullptr);
    if (dtype == DataType::Float32)
        printDataFloat();
    else if (dtype == DataType::UInt32)
        printDataUint32_t();
    else
        IT_TODO_HALT();
 }
 void TensorObj::printDataFloat() const {
    std::cout << "Tensor: " << guid << std::endl;
    auto numDims = shape.size();
    auto dimSzVec = std::vector<int>(numDims, 1);
    auto ptr = data->getPtr<float *>();
    dimSzVec[numDims - 1] = shape[numDims - 1];
    for (int i = numDims - 1; i != 0; --i)
        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
@ -64,7 +79,35 @@ void TensorObj::printData() const {
                std::cout << "[";
            }
        }
-        std::cout << data[i];
+        printf("%.1f", ptr[i]);
        for (size_t j = 0; j < numDims; ++j) {
            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                std::cout << "]";
            }
        }
        if (i != size() - 1)
            std::cout << ", ";
        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
            std::cout << std::endl;
    }
 }
 void TensorObj::printDataUint32_t() const {
    IT_ASSERT(data != nullptr);
    std::cout << "Tensor: " << guid << std::endl;
    auto numDims = shape.size();
    auto dimSzVec = std::vector<int>(numDims, 1);
    auto ptr = data->getPtr<VType *>();
    dimSzVec[numDims - 1] = shape[numDims - 1];
    for (int i = numDims - 1; i != 0; --i)
        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
        for (size_t j = 0; j < numDims; ++j) {
            if (i % dimSzVec[j] == 0) {
                std::cout << "[";
            }
        }
        std::cout << ptr[i];
        for (size_t j = 0; j < numDims; ++j) {
            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                std::cout << "]";
@ -80,13 +123,26 @@ void TensorObj::printData() const {
 bool TensorObj::equalData(const Tensor &rhs) const {
    IT_ASSERT(data != nullptr);
    IT_ASSERT(rhs->data != nullptr);
    // TODO: deal with data type
    auto ptr = data->getPtr<VType *>();
    auto ptrRhs = rhs->data->getPtr<VType *>();
    if (shape != rhs->getDims())
        return false;
    size_t sz = size();
    for (size_t i = 0; i < sz; ++i)
-        if (data[i] != rhs->data[i])
+        if (ptr[i] != ptrRhs[i])
            return false;
    return true;
 }
 void TensorObj::dataMalloc(const Runtime &runtime) {
    IT_ASSERT(data == nullptr);
    size_t bytesPerElement;
    if (getDType() == DataType::Float32)
        bytesPerElement = sizeof(float);
    else if (getDType() == DataType::UInt32)
        bytesPerElement = sizeof(uint32_t);
    data = runtime->allocBlob(size() * bytesPerElement);
 }
 }; // namespace infini
--- a/src/core/tensor_base.cc
+++ b/src/core/tensor_base.cc
@ -1,9 +1,14 @@
-#include <core/tensor_base.h>
+#include "core/tensor_base.h"
 #include "core/blob.h"
 #include "core/runtime.h"
 namespace infini {
 TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
    : dim(dim), dtype(dtype) {}
-VType TensorBaseObj::getData(size_t offset) const { return data[offset]; }
+VType TensorBaseObj::getData(size_t offset) const {
    // TODO: check cuda array
    return (data->getPtr<VType *>())[offset];
 }
 }; // namespace infini
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@ -0,0 +1,32 @@
 #include "cuda/cuda_runtime.h"
 namespace infini {
 void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto perfEngine = PerfEngine::getInstance();
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
        if (perfData)
            kernel->compute(op, *perfData, this);
        else
            kernel->compute(op, this);
    }
 }
 void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
    if (tune || profiling)
        IT_TODO_HALT();
    runWithoutSync(graph);
    sync();
 }
 void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
 } // namespace infini
--- a/src/cuda/cuda_utility.cu
+++ b/src/cuda/cuda_utility.cu
@ -0,0 +1,20 @@
 #include "cuda/cuda_common.h"
 #include <cstdio>
 __global__ void cudaPrintFloatImpl(float *x, int len) {
    int start = threadIdx.x + blockDim.x * blockIdx.x;
    if (start == 0) {
        for (int i = 0; i < len; ++i) {
            printf("%.3f ", x[i]);
        }
        printf("\n");
    }
 }
 namespace infini {
 void cudaPrintFloat(float *x, int len) {
    cudaPrintFloatImpl<<<1, 1>>>(x, len);
    cudaDeviceSynchronize();
 }
 } // namespace infini
--- a/src/kerels/cpu/conv.cc
+++ b/src/kerels/cpu/conv.cc
@ -4,11 +4,12 @@
 namespace infini {
 template <typename T> class NaiveConv : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
                 const RuntimeObj *context) const override {
        auto op = as<ConvObj>(_op);
-        T *iptr = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
+        T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
-        T *wptr = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
+        T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
-        T *optr = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *optr = op->getOutput()->getDataRawPtr<T *>();
        auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        int cpg = op->getChannelPerGroup();
@ -45,10 +46,13 @@ template <typename T> class NaiveConv : public Kernel {
        }
    }
-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
        compute(op, {}, context);
    }
-    PerfRecord tune(const Operator &op) const override {
+    PerfRecord tune(const Operator &op,
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+                    const RuntimeObj *context) const override {
        return PerfRecord(timeit([&]() { compute(op, context); }));
    }
 };
--- a/src/kerels/cpu/matmul.cc
+++ b/src/kerels/cpu/matmul.cc
@ -4,11 +4,12 @@
 namespace infini {
 template <typename T> class NaiveMatmul : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
                 const RuntimeObj *context) const override {
        auto op = as<MatmulObj>(_op);
-        T *A = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
+        T *A = op->getInputs(0)->getDataRawPtr<T *>();
-        T *B = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
+        T *B = op->getInputs(1)->getDataRawPtr<T *>();
-        T *C = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *C = op->getOutput()->getDataRawPtr<T *>();
        IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
        IT_ASSERT(op->getAct() == ActType::None);
        IT_ASSERT(op->getB() == 1);
@ -23,10 +24,15 @@ template <typename T> class NaiveMatmul : public Kernel {
        }
    }
-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
        compute(op, {}, context);
    }
-    PerfRecord tune(const Operator &op) const override {
+    PerfRecord tune(const Operator &op,
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+                    const RuntimeObj *context) const override {
        PerfRecord ret;
        ret.time = timeit([&]() { compute(op, context); });
        return ret;
    }
 };
--- a/src/kerels/cuda/conv.cc
+++ b/src/kerels/cuda/conv.cc
@ -0,0 +1,208 @@
 #include "operators/conv.h"
 #include "core/kernel.h"
 #include "cuda/cuda_runtime.h"
 namespace infini {
 static constexpr int N_ALGO = 8;
 static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = {
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
    CUDNN_CONVOLUTION_FWD_ALGO_FFT,
    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
 struct ConvCuDnnPerfRecord : public PerfRecord {
    int algo = 0; // cudnnConvolutionFwdAlgo_t
    size_t workspaceSize = 100000;
    bool fuseAct = false;
 };
 class convCudnn : public Kernel {
    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
                      const CudaRuntimeObj *context) const {
        cudnnStatus_t stat;
        void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
        void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
        if (op->getInputs(2) != nullptr)
            IT_TODO_HALT();
        // void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
        void *const outData = (op->getOutput()->getDataRawPtr<void *>());
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        int channelsPerGrp = cpg, channels = c;
        // get inputs
        cudnnTensorDescriptor_t inDesc;
        checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
        // get kernels
        cudnnFilterDescriptor_t knDesc;
        checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
        checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                   CUDNN_TENSOR_NCHW, f,
                                                   channelsPerGrp, r, s));
        // get bias
        cudnnTensorDescriptor_t biasDesc;
        checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
        // get convlution descriptor
        cudnnConvolutionDescriptor_t convDesc;
        checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
        // TODO: CUDNN_CONVOLUTION is a tunable argument
        checkCudnnError(cudnnSetConvolution2dDescriptor(
            convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION,
            CUDNN_DATA_FLOAT));
        if (g > 1) {
            checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
        }
        // get activation descriptor
        cudnnActivationDescriptor_t actDesc;
        checkCudnnError(cudnnCreateActivationDescriptor(&actDesc));
        // NOT_PROPAGATE_NAN is requierd by
        // cudnnConvolotionBiasActivationForward
        switch (op->getAct()) {
        case ActType::Relu:
            checkCudnnError(cudnnSetActivationDescriptor(
                actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0));
            break;
        case ActType::Sigmoid:
            checkCudnnError(cudnnSetActivationDescriptor(
                actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0));
            break;
        case ActType::None:
            checkCudnnError(
                cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY,
                                             CUDNN_NOT_PROPAGATE_NAN, 0));
            break;
        default:
            assert(false);
        }
        // get outputs
        int outn, outc, outh, outw;
        checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
            convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
        cudnnTensorDescriptor_t outDesc;
        checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, outn, outc,
                                                   outh, outw));
        IT_ASSERT((vector{outn, outc, outh, outw}) ==
                      op->getOutput()->getDims(),
                  "cuDNN output shape mismatches with OP output shape");
        // get workspace
        size_t wsSize = record.workspaceSize;
        stat = cudnnGetConvolutionForwardWorkspaceSize(
            context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
            ALGOS[record.algo], &wsSize);
        if (stat != CUDNN_STATUS_SUCCESS)
            return false;
        // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024);
        // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024)
        //     continue;
        CudaPtr wsData = context->getWorkspace(wsSize);
        float alpha = 1.f, beta = 0.f;
        stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc,
                                       inData, knDesc, knData, convDesc,
                                       ALGOS[record.algo], wsData, wsSize,
                                       &beta, outDesc, outData);
        if (stat != CUDNN_STATUS_SUCCESS)
            return false;
        // TODO:
        // // bias
        // if (bias != nullptr) {
        //     auto sz = op.getOutputs()[0]->size();
        //     // TODO: element wise
        //     t += sz * 2 / 400;
        // }
        // // act
        // if (act != None) {
        //     stat = cudnnActivationForward(cudnnHandle(), actDesc,
        //                                   &alpha, inDesc, inData,
        //                                   &beta, outDesc, outData);
        //     checkCudaError(cudaDeviceSynchronize());
        //     end = ch::high_resolution_clock::now();
        //     if (stat != CUDNN_STATUS_SUCCESS) {
        //         durtime = INFINITY;
        //         break;
        //     }
        //     t +=
        //         ch::duration_cast<ch::duration<double>>(end -
        //         beg).count() * 1000; // ms
        // }
        // best = ConvResult{durtime, ALGOS[i], wsSize, false};
        // // w/ bias & act
        // for (int j = 0; j < rounds + warmupRounds; ++j) {
        //     cudnnStatus_t stat;
        //     if (j == warmupRounds) {
        //         checkCudaError(cudaDeviceSynchronize());
        //         beg = ch::high_resolution_clock::now();
        //     }
        //     stat = cudnnConvolutionBiasActivationForward(
        //         cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
        //         convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc,
        //         outData, biasDesc, biasData, actDesc, outDesc, outData);
        //     if (stat != CUDNN_STATUS_SUCCESS) {
        //         // checkCudnnError(stat);
        //         // Do not checkCudnnError since not all algorithms are
        //         // supported
        //         durtime_fuse = INFINITY;
        //         break;
        //     }
        // }
        // Destories in CUDA does not require sync. But cuDNN does not state
        // whether sync is required before destories.
        checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
        return true;
    }
    void compute(const Operator &op, const RuntimeObj *context) const override {
        ConvCuDnnPerfRecord record; // with paramters in default ctor
        compute(op, record, context);
    }
    PerfRecord tune(const Operator &_op,
                    const RuntimeObj *_context) const override {
        // TODO: real tuning
        ConvCuDnnPerfRecord ret;
        ret.time = timeit([&]() { compute(_op, _context); });
        return ret;
    }
    void compute(const Operator &_op, const PerfRecord &_record,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvObj>(_op);
        auto &record = dynamic_cast<const ConvCuDnnPerfRecord &>(_record);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        bool success = cuDNNUnfused(op, record, context);
        IT_ASSERT(success);
    }
 };
 REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
                "Conv_cuDNN_CUDA_Float32");
 } // namespace infini
--- a/test/core/test_graph.cc
+++ b/test/core/test_graph.cc
@ -1,45 +1,48 @@
 #include "core/blob.h"
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"
 namespace infini {
 TEST(Graph, build_and_run) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
    Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
    g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
-    RunEngine(Device::CPU).run(g);
+    runtime->run(g);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
+    ans->dataMalloc(runtime);
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(o0->equalData(ans));
 }
 TEST(Graph, perf_engine) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
    auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
    g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    RunEngine(Device::CPU).run(g, true, true);
+    runtime->run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
+    double perfTime = runtime->getPerfTime(g);
    // The example matmul takes 0.0036ms with one core
    EXPECT_GT(perfTime, 0);
    EXPECT_LT(perfTime, 0.01);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
+    ans->dataMalloc(runtime);
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(matmul->getOutput()->equalData(ans));
 }
--- a/test/core/test_hash.cc
+++ b/test/core/test_hash.cc
@ -1,5 +1,5 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"
@ -8,7 +8,7 @@ namespace infini {
 TEST(Hash, OperatorHash) {
    OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
    { // build with addOpWithOutputs
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
        Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
        Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
        Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) {
        EXPECT_GT(key1.attrs.size(), 5);
    }
    { // build with addOp
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
        Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
        auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
--- a/test/operators/test_conv.cc
+++ b/test/operators/test_conv.cc
@ -1,21 +1,24 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/conv.h"
 #include "test.h"
 namespace infini {
 TEST(Conv, ShapeInference) {
    auto runtime = make_ref<CpuRuntimeObj>();
    // Padding modes
    {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
    }
    {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv =
@ -23,7 +26,7 @@ TEST(Conv, ShapeInference) {
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
    }
    {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv =
@ -31,7 +34,7 @@ TEST(Conv, ShapeInference) {
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2}));
    }
    { // dilation & stride
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@ -40,7 +43,8 @@ TEST(Conv, ShapeInference) {
 }
 TEST(Conv, NaiveCPU) {
-    Graph g = make_ref<GraphObj>();
+    auto runtime = make_ref<CpuRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@ -48,16 +52,75 @@ TEST(Conv, NaiveCPU) {
    g->dataMalloc();
    i0->setData(IncrementalGenerator());
    w0->setData(IncrementalGenerator());
-    RunEngine(Device::CPU).run(g, true, true);
+    runtime->run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
+    double perfTime = runtime->getPerfTime(g);
-    // The example matmul takes 0.0036ms with one core
+    // The example Conv takes 0.015ms with one core
    EXPECT_GT(perfTime, 0);
-    EXPECT_LT(perfTime, 5);
+    EXPECT_LT(perfTime, 0.1);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
-    ans->dataMalloc();
+    ans->dataMalloc(runtime);
-    ans->copyData({4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
+    ans->copyData(
        vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
    EXPECT_TRUE(conv->getOutput()->equalData(ans));
 }
 void testConvCudnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {
    auto cpuRuntime = make_ref<CpuRuntimeObj>();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    // Build CUDA graph
    Graph g = make_ref<GraphObj>(cudaRuntime);
    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
    // allocate CUDA memory
    g->dataMalloc();
    // Build input and output data on CPU
    auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
    cpui0->dataMalloc(cpuRuntime);
    cpui0->setData(generator);
    auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
    cpuw0->dataMalloc(cpuRuntime);
    cpuw0->setData(generator);
    auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
    ans->dataMalloc(cpuRuntime);
    ans->copyData(ansVec);
    // Copy inputs from CPU to CUDA
    cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
               cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
               cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
    // Execute on CUDA
    cudaRuntime->run(g);
    // double perfTime = cudaRuntime->getPerfTime(g);
    // // The example Conv takes 0.015ms with one core
    // EXPECT_GT(perfTime, 0);
    // EXPECT_LT(perfTime, 0.1);
    // copy CUDA output to CPU
    auto o0 = conv->getOutput();
    auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
    cpuo0->dataMalloc(cpuRuntime);
    cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
               conv->getOutput()->getDataRawPtr<void *>(),
               cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
    // check results on CPU
    EXPECT_TRUE(cpuo0->equalData<float>(ans));
 }
 TEST(Conv, cuDNN) {
    testConvCudnn(OneGenerator(),
                  vector<float>{12, 12, 18, 18, 12, 12, 18, 18});
    testConvCudnn(
        IncrementalGenerator(),
        vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
 }
 } // namespace infini
--- a/test/script/env_lotus.sh
+++ b/test/script/env_lotus.sh
@ -0,0 +1,2 @@
 spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
 export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc
		`@ -0,0 +1,2 @@`
							`spack load cuda@11.0.2 cudnn@8.0.3.33-11.0`
							`export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc`