Add: warmup and repeat args in timeNonCtcOperators

This commit is contained in:
Liyan Zheng 2023-04-19 16:22:59 +08:00
parent 537b3b4ea4
commit e4c20a9ae2
4 changed files with 19 additions and 13 deletions

View File

@@ -64,7 +64,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false) const;
bool allowEstimation = false,
bool ignoreMemboundOp = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::INTELCPU;
@@ -82,7 +83,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
int repeat = 1000) const;
protected:
void printProfilingData(double totTime,

View File

@@ -78,7 +78,8 @@ RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation) const {
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
@@ -97,10 +98,14 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::Reshape) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Tune the kernel if there is no record
time = perfData->time;
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else {
// TODO: should tensors automatically allocate when accessing data?
// allocate memory for empty tensors and release it after
@@ -189,7 +194,8 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
@@ -209,14 +215,13 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
if (op->getOpType() == OpType::Matmul)
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
cudaProfilerStart(); // HACK: Debug
// cudaProfilerStart();
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
@@ -226,8 +231,8 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
kernel->compute(op, this);
}
},
[&]() { sync(); });
cudaProfilerStop(); // HACK: Debug
[&]() { sync(); }, warmup, repeat);
// cudaProfilerStop();
return ret;
}

View File

@@ -450,7 +450,7 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
return runtimeExec->getPerfTime(graph, false, true);
return runtimeExec->getPerfTime(graph, false, true, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {

View File

@@ -41,7 +41,6 @@ class MemboundTVMPackedFunction : public Kernel {
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// IT_ASSERT(packedFunc != nullptr);
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;