forked from jiuyuan/InfiniTensor

Add: python API for timing ConvTranspose (#46)

* Add: python interface for timing operators
* Fix: CUDA Runtime run

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>

parent b0c2a08252
commit 1152adc94a
@@ -44,7 +44,7 @@ using HashType = uint64_t; // compatible with std::hash
         ? void(0)                                                             \
         : throw ::infini::Exception(                                          \
               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
-              "] Assertion failed (" + #name + "): " + #info))
+              "] Assertion failed (" + #name + "): " + info))
 #define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
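The only functional change in this hunk is dropping the `#` before `info`: the macro now concatenates the evaluated message string instead of its stringified token text. A minimal standalone sketch of the difference (macro names here are illustrative, not from the codebase):

#include <iostream>
#include <string>

// Stand-ins for the old and new message handling (illustrative only).
#define OLD_MSG(info) std::string(#info) // stringify: yields the token text
#define NEW_MSG(info) std::string(info)  // evaluate: yields the runtime value

int main() {
    std::string detail = "group = " + std::to_string(3);
    std::cout << OLD_MSG(detail) << "\n"; // prints: detail
    std::cout << NEW_MSG(detail) << "\n"; // prints: group = 3
    return 0;
}

With the old `#info`, a caller passing a runtime-built string would see the expression text rather than the message; the fix is what makes the richer getKernel() diagnostic below possible.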
@@ -16,7 +16,7 @@ class GraphObj : public Object {
     GraphObj(Runtime runtime) : runtime(runtime){};
     string toString() const override;

-    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
     Tensor cloneTensor(const Tensor &tensor) {
         auto ret = addTensor(tensor->getDims(), tensor->getDType());
         ret->dataMalloc();
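With the default dtype flipped from UInt32 to Float32, tensors added without an explicit type now come out as floats, matching the DataType::Float32 the timer code below always passes. A quick sketch using only calls visible in this diff (include paths are assumptions):

#include "core/graph.h"   // assumed header path
#include "core/runtime.h" // assumed header path

using namespace infini;

int main() {
    Runtime cpu = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(cpu);
    Tensor t = g->addTensor({2, 3});                   // Float32 by default now
    Tensor u = g->addTensor({2, 3}, DataType::UInt32); // explicit opt-out
    g->dataMalloc();
    return 0;
}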
@@ -102,7 +102,11 @@ class KernelRegistry {
     }
     Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
         auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found.");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
+                      ", " + std::get<2>(kernelAttrs).toString());
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@@ -71,6 +71,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                                size_t bytes) const = 0;
     virtual void copyBlobToCPU(void *dst, const void *src,
                                size_t bytes) const = 0;
+    virtual string toString() const = 0;

   protected:
     void printProfilingData(double totTime,
@@ -102,6 +103,7 @@ class CpuRuntimeObj : public RuntimeObj {
     void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
     void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override;
+    string toString() const override;
 };

 } // namespace infini
@@ -34,6 +34,7 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCublasError(cublasDestroy(cublas));
         checkCUresult(cuCtxDestroy(newContext));
     }
+    string toString() const override;

     void run(const Graph &graph, bool tune = false,
              bool profiling = false) const;
@@ -68,7 +69,9 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
     }

+    void runWithoutSync(const Graph &graph) const;
+
   private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void tune(const Graph &graph, bool profiling) const;
 };
 } // namespace infini
@@ -6,6 +6,11 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
                         int dilationh, int dilationw, int group,
                         const char *name);

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group);
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name);
 } // namespace opTimer
 } // namespace infini
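A sketch of invoking the new timer from C++; the argument values echo the commented tuple in the implementation further down, and oph = opw = 0 is an illustrative assumption (the header path is also assumed):

#include "cuda/operator_timer.h" // assumed header path
#include <cstdio>

int main() {
    // n, c, h, w, f, r, s, padh, padw, strideh, stridew,
    // dilationh, dilationw, oph, opw, group
    double t = infini::opTimer::getPerfConvTransposed2dCudnn(
        1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 0, 0, 1);
    std::printf("ConvTransposed2d perf time: %lf\n", t);
    return 0;
}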
@@ -2,12 +2,14 @@ from tokenize import Double
 import pyinfinitensor  # import getPerfConv, getPerfMatmul


-def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name):
+def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
     return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
                                            strideh, stridew, dilationh, dilationw, group, name)


-def getPerfMatmul(b, m, n, k, name):
+def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
+    return pyinfinitensor.getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group)
+
+
+def getPerfMatmul(b, m, n, k, name=""):
     return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
@@ -139,4 +139,6 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
     memcpy(dst, src, bytes);
 }

+string CpuRuntimeObj::toString() const { return "CPU Runtime"; }
+
 } // namespace infini
@@ -5,8 +5,25 @@
 #include "operators/matmul.h"
 namespace infini {

-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto &perfEngine = PerfEngine::getInstance();
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        auto perfData = perfEngine.getPerfData(perfKey);
+        // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
+        if (perfData)
+            kernel->compute(op, perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto &perfEngine = PerfEngine::getInstance();
     double totalTime = 0;
@@ -19,11 +36,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
-            kernel->compute(op, this);
-            continue;
-        }
-
         PerfRecord record;
         if (!perfData) {
             record = kernel->tune(op, this);
@@ -46,13 +58,19 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         }
     }

-void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+void CudaRuntimeObj::run(const Graph &graph, bool runTune,
+                         bool profiling) const {
     if (profiling)
         IT_TODO_HALT();
-    runWithoutSync(graph, tune, profiling);
+    if (runTune)
+        tune(graph, profiling);
+    else
+        runWithoutSync(graph);
     sync();
 }

 void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }

+string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
+
 } // namespace infini
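After the split, run() is a thin dispatcher: run(graph, true) benchmarks each kernel via the new private tune() and fills the PerfEngine cache, while run(graph, false) replays cached PerfRecords through runWithoutSync(); the old early-exit in the tuning loop (removed above) is no longer needed because untuned ops are handled in runWithoutSync(). A usage sketch mirroring the calls in getPerfConvTransposed2dCudnn below (graph construction elided, InfiniTensor headers assumed available):

#include <cstdio>

void timeGraph(const infini::Graph &g, const infini::Runtime &cuda) {
    cuda->run(g, /*tune=*/true);  // first pass: tune kernels, cache PerfRecords
    cuda->run(g, /*tune=*/false); // later passes: replay the cached choices
    std::printf("graph perf time: %lf\n", cuda->getPerfTime(g));
}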
@@ -22,8 +22,9 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     Runtime cuda = make_ref<CudaRuntimeObj>();
     Graph gCuda = make_ref<GraphObj>(cuda);
     // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
     Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({f, c, r, s}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
     // Malloc data for all tensors in a graph. Do we need implicit allocation?
     gCpu->dataMalloc();
     i0Cpu->setData(IncrementalGenerator());
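For example, with c = 512 and group = 2, each group convolves 256 input channels, so the weight tensor becomes {f, 256, r, s}; the new IT_ASSERT(c % group == 0) guards exactly that divisibility before the shape is computed.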
@@ -43,6 +44,41 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     return cuda->getPerfTime(gCuda);
 }

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group) {
+    // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
+    //             dilationh, dilationw, group] =
+    //     tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime cuda = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cuda);
+    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
+    Tensor i0Cpu = gCpu->addTensor({n, f, h, w}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to CUDA
+    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
+    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
+    // Build CUDA graph
+    auto conv = gCuda->addOp<ConvTransposed2dObj>(
+        i0Cuda, w0Cuda, nullptr, padh, padw, strideh, stridew, dilationh,
+        dilationw, oph, opw, group);
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    // Execute on CUDA
+    bool tune = true;
+    cuda->run(gCuda, tune);
+    return cuda->getPerfTime(gCuda);
+}
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name) {
     // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
     //             dilationh, dilationw, group] =
@@ -13,6 +13,7 @@ void register_operator_timer(py::module &m) {
 #ifdef USE_CUDA
     using namespace opTimer;
     m.def("getPerfConvCudnn", &getPerfConvCudnn);
+    m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
     m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
 #endif
 }
@@ -250,8 +250,7 @@ class convBackwardDataCudnn : public Kernel {
                 outData);
             },
             [&]() { context->sync(); });
-        // printf("mode:%d algo:%d :%.8lf\n", mode, algo,
-        // record.time);
+        // printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);

         // Update the tune result
         if (ret.time > record.time)