Add: python API for timing ConvTranspose (#46)

* Add: python interfaced for timing operators * Fix: CUDA Runtime run Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
2022-10-07 16:03:11 +08:00 · 2022-10-07 16:03:11 +08:00 · 1152adc94a
parent b0c2a08252
commit 1152adc94a
12 changed files with 93 additions and 21 deletions
--- a/include/core/common.h
+++ b/include/core/common.h
@ -44,7 +44,7 @@ using HashType = uint64_t; // compatible with std::hash
         ? void(0)                                                             \
         : throw ::infini::Exception(                                          \
               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
-               "] Assertion failed (" + #name + "): " + #info))
+               "] Assertion failed (" + #name + "): " + info))
 #define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)

--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -16,7 +16,7 @@ class GraphObj : public Object {
    GraphObj(Runtime runtime) : runtime(runtime){};
    string toString() const override;

-    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
    Tensor cloneTensor(const Tensor &tensor) {
        auto ret = addTensor(tensor->getDims(), tensor->getDType());
        ret->dataMalloc();
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@ -102,7 +102,11 @@ class KernelRegistry {
    }
    Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
        auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found.");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
+                      ", " + std::get<2>(kernelAttrs).toString());
        return std::get<0>(it->second);
    }
    const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -71,6 +71,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                                 size_t bytes) const = 0;
    virtual void copyBlobToCPU(void *dst, const void *src,
                               size_t bytes) const = 0;
+    virtual string toString() const = 0;

  protected:
    void printProfilingData(double totTime,
@ -102,6 +103,7 @@ class CpuRuntimeObj : public RuntimeObj {
    void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override;
+    string toString() const override;
 };

 } // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -34,6 +34,7 @@ class CudaRuntimeObj : public RuntimeObj {
        checkCublasError(cublasDestroy(cublas));
        checkCUresult(cuCtxDestroy(newContext));
    }
+    string toString() const override;

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
@ -68,7 +69,9 @@ class CudaRuntimeObj : public RuntimeObj {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
    }

+    void runWithoutSync(const Graph &graph) const;
+
  private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void tune(const Graph &graph, bool profiling) const;
 };
 } // namespace infini
--- a/include/cuda/operator_timer.h
+++ b/include/cuda/operator_timer.h
@ -6,6 +6,11 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
                        int dilationh, int dilationw, int group,
                        const char *name);

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group);
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name);
 } // namespace opTimer
 } // namespace infini
--- a/python/infinitensor/operator_timer.py
+++ b/python/infinitensor/operator_timer.py
@ -2,12 +2,14 @@ from tokenize import Double
 import pyinfinitensor  # import getPerfConv, getPerfMatmul


-def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name):
+def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
    return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
                                           strideh, stridew, dilationh, dilationw, group, name)


-def getPerfMatmul(b, m, n, k, name):
+def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
+    return pyinfinitensor.getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group)
+
+
+def getPerfMatmul(b, m, n, k, name=""):
    return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
-
-
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@ -139,4 +139,6 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
    memcpy(dst, src, bytes);
 }

+string CpuRuntimeObj::toString() const { return "CPU Runtime"; }
+
 } // namespace infini
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@ -5,8 +5,25 @@
 #include "operators/matmul.h"
 namespace infini {

-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto &perfEngine = PerfEngine::getInstance();
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        auto perfData = perfEngine.getPerfData(perfKey);
+        // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
+        if (perfData)
+            kernel->compute(op, perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    double totalTime = 0;
@ -19,11 +36,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
-            kernel->compute(op, this);
-            continue;
-        }
-
        PerfRecord record;
        if (!perfData) {
            record = kernel->tune(op, this);
@ -46,13 +58,19 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
    }
 }

-void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+void CudaRuntimeObj::run(const Graph &graph, bool runTune,
+                         bool profiling) const {
    if (profiling)
        IT_TODO_HALT();
-    runWithoutSync(graph, tune, profiling);
+    if (runTune)
+        tune(graph, profiling);
+    else
+        runWithoutSync(graph);
    sync();
 }

 void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }

+string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
+
 } // namespace infini
--- a/src/cuda/operator_timer.cc
+++ b/src/cuda/operator_timer.cc
@ -22,8 +22,9 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cuda);
    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
    Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({f, c, r, s}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(IncrementalGenerator());
@ -43,6 +44,41 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
    return cuda->getPerfTime(gCuda);
 }

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group) {
+    // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
+    // dilationh, dilationw, group] =
+    //     tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime cuda = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cuda);
+    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
+    Tensor i0Cpu = gCpu->addTensor({n, f, h, w}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to CUDA
+    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
+    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
+    // Build CUDA graph
+    auto conv = gCuda->addOp<ConvTransposed2dObj>(
+        i0Cuda, w0Cuda, nullptr, padh, padw, strideh, stridew, dilationh,
+        dilationw, oph, opw, group);
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    // Execute on CUDA
+    bool tune = true;
+    cuda->run(gCuda, tune);
+    return cuda->getPerfTime(gCuda);
+}
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name) {
    // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
    // dilationh, dilationw, group] =
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@ -13,6 +13,7 @@ void register_operator_timer(py::module &m) {
 #ifdef USE_CUDA
    using namespace opTimer;
    m.def("getPerfConvCudnn", &getPerfConvCudnn);
+    m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
    m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
 #endif
 }
--- a/src/kernels/cuda/conv_transposed.cc
+++ b/src/kernels/cuda/conv_transposed.cc
@ -250,8 +250,7 @@ class convBackwardDataCudnn : public Kernel {
                            outData);
                    },
                    [&]() { context->sync(); });
-                // printf("mode:%d algo:%d :%.8lf\n", mode, algo,
-                // record.time);
+                // printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);

                // Update the tune result
                if (ret.time > record.time)