From e076991f2f1bc290b48c0d1da1e3e867416a0ce8 Mon Sep 17 00:00:00 2001
From: Anmuliar <1260818462@qq.com>
Date: Mon, 29 Aug 2022 16:02:48 +0800
Subject: [PATCH] Revert "Operator serialization (#14)" (#15)

This reverts commit 25f0c441d2bb9ecba86fc96154919e27b8d253c3.
---
 CMakeLists.txt              |   2 +-
 include/core/common.h       |   7 ++-
 include/cuda/cuda_runtime.h |   3 +-
 src/core/common.cc          |   5 +-
 src/core/runtime.cc         |   7 ++-
 src/cuda/cuda_runtime.cc    |  40 +++-----------
 src/kernels/cuda/conv.cc    | 103 ++++++------------------------------
 test/operators/test_conv.cc |  28 ----------
 test/script/env_lotus.sh    |   1 -
 9 files changed, 33 insertions(+), 163 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddb7f8e2..8220b3ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,7 @@ set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=sign-compare")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wno-error=deprecated-declarations")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
 
diff --git a/include/core/common.h b/include/core/common.h
index c9b8fe81..7effff81 100644
--- a/include/core/common.h
+++ b/include/core/common.h
@@ -71,9 +71,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
     return ret;
 }
 
-double timeit(
-    const std::function<void()> &func,
-    const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
-    int timingRounds = 200);
+double timeit(const std::function<void()> &func, int warmupRounds = 200,
+              int timingRounds = 200,
+              const std::function<void(void)> &sync = {});
 
 } // namespace infini
diff --git a/include/cuda/cuda_runtime.h b/include/cuda/cuda_runtime.h
index ac2c3092..ee625699 100644
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@@ -21,7 +21,6 @@ class CudaRuntimeObj : public RuntimeObj {
         workspace = alloc(workspaceSize);
     }
     virtual ~CudaRuntimeObj() {
-        dealloc(workspace);
         checkCudnnError(cudnnDestroy(cudnn));
         checkCublasError(cublasDestroy(cublas));
     }
@@ -58,6 +57,6 @@ class CudaRuntimeObj : public RuntimeObj {
     }
 
   private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void runWithoutSync(const Graph &graph) const;
 };
 } // namespace infini
\ No newline at end of file
diff --git a/src/core/common.cc b/src/core/common.cc
index 71f39244..43192441 100644
--- a/src/core/common.cc
+++ b/src/core/common.cc
@@ -4,9 +4,8 @@
 
 namespace infini {
 
-double timeit(const std::function<void()> &func,
-              const std::function<void(void)> &sync, int warmupRounds,
-              int timingRounds) {
+double timeit(const std::function<void()> &func, int warmupRounds,
+              int timingRounds, const std::function<void(void)> &sync) {
     for (int i = 0; i < warmupRounds; ++i)
         func();
     if (sync)
diff --git a/src/core/runtime.cc b/src/core/runtime.cc
index 93bd1ea7..a97ef48b 100644
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@@ -40,18 +40,17 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
         // Tune the kernel if there is no record
         if (!perfData) {
             // TODO: record is not used
-            // printf("no record data\n");
             record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
         } else
             record = *perfData;
 
         if (!profiling) {
-            kernel->compute(op, record, this);
+            kernel->compute(op, *perfData, this);
             continue;
         } else {
-            double t = timeit([&]() { kernel->compute(op, record, this); },
-                              []() {}, 1, 1);
+            double t =
+                timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1);
             op->print();
             printf(" op_time %lf\n", t);
             totalTime += t;
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
index 78249165..1e91db50 100644
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -4,13 +4,10 @@
 
 namespace infini {
 
-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto perfEngine = PerfEngine::getInstance();
-    double totalTime = 0;
-    std::map<OpType, double> opTime;
-    std::map<OpType, int> opCnt;
+
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
         auto kernelAttrs =
@@ -18,40 +15,17 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
+        if (perfData)
+            kernel->compute(op, *perfData, this);
+        else
             kernel->compute(op, this);
-            continue;
-        }
-
-        PerfRecord record;
-
-        if (!perfData) {
-            record = kernel->tune(op, this);
-            perfEngine.setPerfData(perfKey, record);
-
-        } else
-            record = *perfData;
-
-        double t = record.time;
-        totalTime += t;
-
-        if (profiling) {
-            double t = timeit([&]() { kernel->compute(op, record, this); },
-                              [&]() { sync(); }, 1, 1);
-            op->print();
-            printf(" op_time on cuda %lf\n", t);
-            totalTime += t;
-            opTime[op->getOpType()] += t;
-            opCnt[op->getOpType()]++;
-        }
     }
 }
 
 void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
-    if (profiling)
+    if (tune || profiling)
         IT_TODO_HALT();
-
-    runWithoutSync(graph, tune);
+    runWithoutSync(graph);
     sync();
 }
 
diff --git a/src/kernels/cuda/conv.cc b/src/kernels/cuda/conv.cc
index a3c9e7ad..c31868bb 100644
--- a/src/kernels/cuda/conv.cc
+++ b/src/kernels/cuda/conv.cc
@@ -1,10 +1,7 @@
 #include "operators/conv.h"
 #include "core/kernel.h"
 #include "cuda/cuda_runtime.h"
-#include <chrono>
-#include <functional>
-#include <limits>
-#include <tuple>
+
 namespace infini {
 
 static constexpr int N_ALGO = 8;
@@ -17,25 +14,18 @@ static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = {
     CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
     CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
     CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
-static constexpr int N_MODE = 2;
-static constexpr cudnnConvolutionMode_t MODES[N_MODE] = {
-    CUDNN_CONVOLUTION, CUDNN_CROSS_CORRELATION};
 
 struct ConvCuDnnPerfRecord : public PerfRecord {
     int algo = 0; // cudnnConvolutionFwdAlgo_t
-    int mode = 1;
     size_t workspaceSize = 100000;
     bool fuseAct = false;
 };
 
 class convCudnn : public Kernel {
 
-    std::tuple<void *, void *, void *, cudnnTensorDescriptor_t,
-               cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
-               cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
-               cudnnTensorDescriptor_t>
-    cuDNNDescriptorAccess(const Ref<ConvObj> &op,
-                          const ConvCuDnnPerfRecord &record) const {
+    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
+                      const CudaRuntimeObj *context) const {
+        cudnnStatus_t stat;
         void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
         if (op->getInputs().size() > 2) // Bias is not supported yet
@@ -62,6 +52,7 @@ class convCudnn : public Kernel {
         checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                    CUDNN_TENSOR_NCHW, f,
                                                    channelsPerGrp, r, s));
+
         // get bias
         cudnnTensorDescriptor_t biasDesc;
         checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
@@ -73,7 +64,7 @@ class convCudnn : public Kernel {
         checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
         // TODO: CUDNN_CONVOLUTION is a tunable argument
         checkCudnnError(cudnnSetConvolution2dDescriptor(
-            convDesc, ph, pw, sh, sw, dh, dw, MODES[record.mode],
+            convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION,
             CUDNN_DATA_FLOAT));
         if (g > 1) {
             checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
@@ -102,6 +93,7 @@ class convCudnn : public Kernel {
             assert(false);
         }
 
+        // get outputs
         int outn, outc, outh, outw;
         checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
             convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
@@ -114,15 +106,6 @@ class convCudnn : public Kernel {
                       op->getOutput()->getDims(),
                   "cuDNN output shape mismatches with OP output shape");
 
-        return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
-                     convDesc, actDesc, outDesc);
-    }
-    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
-                      const CudaRuntimeObj *context) const {
-        cudnnStatus_t stat;
-
-        auto [inData, knData, outData, inDesc, knDesc, biasDesc, convDesc,
-              actDesc, outDesc] = cuDNNDescriptorAccess(op, record);
         // get workspace
         size_t wsSize = record.workspaceSize;
         stat = cudnnGetConvolutionForwardWorkspaceSize(
@@ -130,7 +113,9 @@ class convCudnn : public Kernel {
             ALGOS[record.algo], &wsSize);
         if (stat != CUDNN_STATUS_SUCCESS)
             return false;
-
+        // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024);
+        // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024)
+        //     continue;
         CudaPtr wsData = context->getWorkspace(wsSize);
         float alpha = 1.f, beta = 0.f;
 
@@ -187,12 +172,10 @@ class convCudnn : public Kernel {
 
         // Destories in CUDA does not require sync. But cuDNN does not state
         // whether sync is required before destories.
-        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
-        checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
-        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
-        checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
-        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
         checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
+        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
+        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
+        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
         return true;
     }
 
@@ -203,63 +186,9 @@ class convCudnn : public Kernel {
 
     PerfRecord tune(const Operator &_op,
                     const RuntimeObj *_context) const override {
-        ConvCuDnnPerfRecord ret, tmp_ret;
-        ret.time = std::numeric_limits<double>::max();
-        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
-        auto op = as<ConvObj>(_op);
-        // Try every possible data input mode of convolution func
-        for (int i = 0; i < N_MODE; i++) {
-            // Try every possible algorithm of convolution func
-            for (int j = 0; j < N_ALGO; j++) {
-                tmp_ret.algo = j;
-                tmp_ret.mode = i;
-                // Check if the kernel supports the op
-                cudnnStatus_t stat;
-                auto [inData, knData, outData, inDesc, knDesc, biasDesc,
-                      convDesc, actDesc, outDesc] =
-                    cuDNNDescriptorAccess(op, tmp_ret);
-
-                // get workspace
-                size_t wsSize = tmp_ret.workspaceSize;
-                stat = cudnnGetConvolutionForwardWorkspaceSize(
-                    context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
-                    ALGOS[tmp_ret.algo], &wsSize);
-                if (stat != CUDNN_STATUS_SUCCESS)
-                    continue;
-
-                CudaPtr wsData = context->getWorkspace(wsSize);
-                float alpha = 1.f, beta = 0.f;
-
-                stat = cudnnConvolutionForward(
-                    context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
-                    knData, convDesc, ALGOS[tmp_ret.algo], wsData, wsSize,
-                    &beta, outDesc, outData);
-                if (stat != CUDNN_STATUS_SUCCESS)
-                    continue;
-                tmp_ret.time = timeit(
-                    [&]() {
-                        cudnnConvolutionForward(
-                            context->cudnnHandle(), &alpha, inDesc, inData,
-                            knDesc, knData, convDesc, ALGOS[tmp_ret.algo],
-                            wsData, wsSize, &beta, outDesc, outData);
-                    },
-                    [&]() { context->sync(); });
-                printf("mode:%d algo:%d :%.8lf\n", i, j, tmp_ret.time);
-                // Update the tune result
-                if (ret.time > tmp_ret.time)
-                    ret = tmp_ret;
-
-                checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
-                checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
-                checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
-                checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
-                checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
-                checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
-            }
-        }
-        // Test infomation output
-        printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
-               ret.mode);
+        // TODO: real tuning
+        ConvCuDnnPerfRecord ret;
+        ret.time = timeit([&]() { compute(_op, _context); });
         return ret;
     }
 
diff --git a/test/operators/test_conv.cc b/test/operators/test_conv.cc
index db23cb73..f6397c5c 100644
--- a/test/operators/test_conv.cc
+++ b/test/operators/test_conv.cc
@@ -1,10 +1,8 @@
 #include "core/graph.h"
-#include "core/kernel.h"
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/conv.h"
-
 #include "test.h"
 
 namespace infini {
@@ -109,30 +107,4 @@ TEST(Conv, cuDNN) {
         IncrementalGenerator(),
         vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
 }
-
-TEST(Conv, tune) {
-    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
-    Graph gCpu = make_ref<GraphObj>(cpu);
-    Runtime cuda = make_ref<CudaRuntimeObj>();
-    Graph gCuda = make_ref<GraphObj>(cuda);
-    // Set input data on CPU in a CPU Graph
-    Tensor i0Cpu = gCpu->addTensor({1, 3, 800, 800}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({2, 3, 5, 5}, DataType::Float32);
-    // Malloc data for all tensors in a graph. Do we need implicit allocation?
-    gCpu->dataMalloc();
-    i0Cpu->setData(IncrementalGenerator());
-    w0Cpu->setData(IncrementalGenerator());
-
-    // Copy input tensors from CPU to CUDA
-    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
-    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
-    // Build CUDA graph
-    auto conv =
-        gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
-    // allocate CUDA memory
-    gCuda->dataMalloc();
-    // Execute on CUDA
-    bool tune = true;
-    cuda->run(gCuda, tune);
-}
 } // namespace infini
\ No newline at end of file
diff --git a/test/script/env_lotus.sh b/test/script/env_lotus.sh
index d9c2e170..a65a746d 100644
--- a/test/script/env_lotus.sh
+++ b/test/script/env_lotus.sh
@@ -1,3 +1,2 @@
-. /home/spack/spack/share/spack/setup-env.sh
 spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
 export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc