Revert "Operator serialization (#14)" (#15)

This reverts commit 25f0c441d2.
2022-08-29 16:02:48 +08:00 · 2022-08-29 16:02:48 +08:00 · e076991f2f
parent 25f0c441d2
commit e076991f2f
9 changed files with 33 additions and 163 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,7 @@ set(DEFAULT_BUILD_TYPE "RelWithDebInfo")

 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=sign-compare")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wno-error=deprecated-declarations")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion

--- a/include/core/common.h
+++ b/include/core/common.h
@@ -71,9 +71,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
    return ret;
 }

-double timeit(
-    const std::function<void()> &func,
-    const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
-    int timingRounds = 200);
+double timeit(const std::function<void()> &func, int warmupRounds = 200,
+              int timingRounds = 200,
+              const std::function<void(void)> &sync = {});

 } // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@@ -21,7 +21,6 @@ class CudaRuntimeObj : public RuntimeObj {
        workspace = alloc(workspaceSize);
    }
    virtual ~CudaRuntimeObj() {
-        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
    }
@@ -58,6 +57,6 @@ class CudaRuntimeObj : public RuntimeObj {
    }

  private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void runWithoutSync(const Graph &graph) const;
 };
 } // namespace infini
--- a/src/core/common.cc
+++ b/src/core/common.cc
@@ -4,9 +4,8 @@

 namespace infini {

-double timeit(const std::function<void()> &func,
-              const std::function<void(void)> &sync, int warmupRounds,
-              int timingRounds) {
+double timeit(const std::function<void()> &func, int warmupRounds,
+              int timingRounds, const std::function<void(void)> &sync) {
    for (int i = 0; i < warmupRounds; ++i)
        func();
    if (sync)
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@@ -40,18 +40,17 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
        // Tune the kernel if there is no record
        if (!perfData) {
            // TODO: record is not used
-            // printf("no record data\n");
            record = kernel->tune(op, this);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = *perfData;

        if (!profiling) {
-            kernel->compute(op, record, this);
+            kernel->compute(op, *perfData, this);
            continue;
        } else {
-            double t = timeit([&]() { kernel->compute(op, record, this); },
-                              []() {}, 1, 1);
+            double t =
+                timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1);
            op->print();
            printf(" op_time %lf\n", t);
            totalTime += t;
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -4,13 +4,10 @@

 namespace infini {

-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto perfEngine = PerfEngine::getInstance();
-    double totalTime = 0;
-    std::map<OpType, double> opTime;
-    std::map<OpType, int> opCnt;
+
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
@@ -18,40 +15,17 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
+        if (perfData)
+            kernel->compute(op, *perfData, this);
+        else
            kernel->compute(op, this);
-            continue;
-        }
-
-        PerfRecord record;
-
-        if (!perfData) {
-            record = kernel->tune(op, this);
-            perfEngine.setPerfData(perfKey, record);
-
-        } else
-            record = *perfData;
-
-        double t = record.time;
-        totalTime += t;
-
-        if (profiling) {
-            double t = timeit([&]() { kernel->compute(op, record, this); },
-                              [&]() { sync(); }, 1, 1);
-            op->print();
-            printf(" op_time on cuda %lf\n", t);
-            totalTime += t;
-            opTime[op->getOpType()] += t;
-            opCnt[op->getOpType()]++;
-        }
    }
 }

 void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
-    if (profiling)
+    if (tune || profiling)
        IT_TODO_HALT();
-
-    runWithoutSync(graph, tune);
+    runWithoutSync(graph);
    sync();
 }

--- a/src/kernels/cuda/conv.cc
+++ b/src/kernels/cuda/conv.cc
@@ -1,10 +1,7 @@
 #include "operators/conv.h"
 #include "core/kernel.h"
 #include "cuda/cuda_runtime.h"
-#include <chrono>
-#include <functional>
-#include <limits>
-#include <tuple>
+
 namespace infini {

 static constexpr int N_ALGO = 8;
@@ -17,25 +14,18 @@ static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = {
    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
-static constexpr int N_MODE = 2;
-static constexpr cudnnConvolutionMode_t MODES[N_MODE] = {
-    CUDNN_CONVOLUTION, CUDNN_CROSS_CORRELATION};

 struct ConvCuDnnPerfRecord : public PerfRecord {
    int algo = 0; // cudnnConvolutionFwdAlgo_t
-    int mode = 1;
    size_t workspaceSize = 100000;
    bool fuseAct = false;
 };

 class convCudnn : public Kernel {

-    std::tuple<void *, void *, void *, cudnnTensorDescriptor_t,
-               cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
-               cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
-               cudnnTensorDescriptor_t>
-    cuDNNDescriptorAccess(const Ref<ConvObj> &op,
-                          const ConvCuDnnPerfRecord &record) const {
+    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
+                      const CudaRuntimeObj *context) const {
+        cudnnStatus_t stat;
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
        if (op->getInputs().size() > 2) // Bias is not supported yet
@@ -62,6 +52,7 @@ class convCudnn : public Kernel {
        checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                   CUDNN_TENSOR_NCHW, f,
                                                   channelsPerGrp, r, s));
+
        // get bias
        cudnnTensorDescriptor_t biasDesc;
        checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
@@ -73,7 +64,7 @@ class convCudnn : public Kernel {
        checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
        // TODO: CUDNN_CONVOLUTION is a tunable argument
        checkCudnnError(cudnnSetConvolution2dDescriptor(
-            convDesc, ph, pw, sh, sw, dh, dw, MODES[record.mode],
+            convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION,
            CUDNN_DATA_FLOAT));
        if (g > 1) {
            checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
@@ -102,6 +93,7 @@ class convCudnn : public Kernel {
            assert(false);
        }

+        // get outputs
        int outn, outc, outh, outw;
        checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
            convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
@@ -114,15 +106,6 @@ class convCudnn : public Kernel {
                      op->getOutput()->getDims(),
                  "cuDNN output shape mismatches with OP output shape");

-        return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
-                     convDesc, actDesc, outDesc);
-    }
-    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
-                      const CudaRuntimeObj *context) const {
-        cudnnStatus_t stat;
-
-        auto [inData, knData, outData, inDesc, knDesc, biasDesc, convDesc,
-              actDesc, outDesc] = cuDNNDescriptorAccess(op, record);
        // get workspace
        size_t wsSize = record.workspaceSize;
        stat = cudnnGetConvolutionForwardWorkspaceSize(
@@ -130,7 +113,9 @@ class convCudnn : public Kernel {
            ALGOS[record.algo], &wsSize);
        if (stat != CUDNN_STATUS_SUCCESS)
            return false;
-
+        // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024);
+        // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024)
+        //     continue;
        CudaPtr wsData = context->getWorkspace(wsSize);
        float alpha = 1.f, beta = 0.f;

@@ -187,12 +172,10 @@ class convCudnn : public Kernel {

        // Destories in CUDA does not require sync. But cuDNN does not state
        // whether sync is required before destories.
-        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
-        checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
-        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
-        checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
-        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
        checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
+        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
+        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
+        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
        return true;
    }

@@ -203,63 +186,9 @@ class convCudnn : public Kernel {

    PerfRecord tune(const Operator &_op,
                    const RuntimeObj *_context) const override {
-        ConvCuDnnPerfRecord ret, tmp_ret;
-        ret.time = std::numeric_limits<double>::max();
-        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
-        auto op = as<ConvObj>(_op);
-        // Try every possible data input mode of convolution func
-        for (int i = 0; i < N_MODE; i++) {
-            // Try every possible algorithm of convolution func
-            for (int j = 0; j < N_ALGO; j++) {
-                tmp_ret.algo = j;
-                tmp_ret.mode = i;
-                // Check if the kernel supports the op
-                cudnnStatus_t stat;
-                auto [inData, knData, outData, inDesc, knDesc, biasDesc,
-                      convDesc, actDesc, outDesc] =
-                    cuDNNDescriptorAccess(op, tmp_ret);
-
-                // get workspace
-                size_t wsSize = tmp_ret.workspaceSize;
-                stat = cudnnGetConvolutionForwardWorkspaceSize(
-                    context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
-                    ALGOS[tmp_ret.algo], &wsSize);
-                if (stat != CUDNN_STATUS_SUCCESS)
-                    continue;
-
-                CudaPtr wsData = context->getWorkspace(wsSize);
-                float alpha = 1.f, beta = 0.f;
-
-                stat = cudnnConvolutionForward(
-                    context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
-                    knData, convDesc, ALGOS[tmp_ret.algo], wsData, wsSize,
-                    &beta, outDesc, outData);
-                if (stat != CUDNN_STATUS_SUCCESS)
-                    continue;
-                tmp_ret.time = timeit(
-                    [&]() {
-                        cudnnConvolutionForward(
-                            context->cudnnHandle(), &alpha, inDesc, inData,
-                            knDesc, knData, convDesc, ALGOS[tmp_ret.algo],
-                            wsData, wsSize, &beta, outDesc, outData);
-                    },
-                    [&]() { context->sync(); });
-                printf("mode:%d algo:%d :%.8lf\n", i, j, tmp_ret.time);
-                // Update the tune result
-                if (ret.time > tmp_ret.time)
-                    ret = tmp_ret;
-
-                checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
-                checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
-                checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
-                checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
-                checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
-                checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
-            }
-        }
-        // Test infomation output
-        printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
-               ret.mode);
+        // TODO: real tuning
+        ConvCuDnnPerfRecord ret;
+        ret.time = timeit([&]() { compute(_op, _context); });
        return ret;
    }

--- a/test/operators/test_conv.cc
+++ b/test/operators/test_conv.cc
@@ -1,10 +1,8 @@
 #include "core/graph.h"
-#include "core/kernel.h"
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/conv.h"
-
 #include "test.h"

 namespace infini {
@@ -109,30 +107,4 @@ TEST(Conv, cuDNN) {
        IncrementalGenerator(),
        vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
 }
-
-TEST(Conv, tune) {
-    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
-    Graph gCpu = make_ref<GraphObj>(cpu);
-    Runtime cuda = make_ref<CudaRuntimeObj>();
-    Graph gCuda = make_ref<GraphObj>(cuda);
-    // Set input data on CPU in a CPU Graph
-    Tensor i0Cpu = gCpu->addTensor({1, 3, 800, 800}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({2, 3, 5, 5}, DataType::Float32);
-    // Malloc data for all tensors in a graph. Do we need implicit allocation?
-    gCpu->dataMalloc();
-    i0Cpu->setData(IncrementalGenerator());
-    w0Cpu->setData(IncrementalGenerator());
-
-    // Copy input tensors from CPU to CUDA
-    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
-    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
-    // Build CUDA graph
-    auto conv =
-        gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
-    // allocate CUDA memory
-    gCuda->dataMalloc();
-    // Execute on CUDA
-    bool tune = true;
-    cuda->run(gCuda, tune);
-}
 } // namespace infini
--- a/test/script/env_lotus.sh
+++ b/test/script/env_lotus.sh
@@ -1,3 +1,2 @@
-. /home/spack/spack/share/spack/setup-env.sh
 spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
 export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc