From bc31219bdedb36f9724dbc3c0be8fc409468ec5a Mon Sep 17 00:00:00 2001
From: Liyan Zheng
Date: Mon, 17 Apr 2023 13:24:14 +0800
Subject: [PATCH] Add: exclude compile-time computable operator time

---
 include/core/graph.h      |  3 ++-
 include/core/tensor.h     |  7 +++++--
 src/core/graph.cc         |  5 +++--
 src/core/runtime.cc       | 22 +++++++++++++++++++---
 src/core/tensor.cc        |  6 ++++--
 test/nnet/test_mutator.cc | 10 ++++++----
 6 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/include/core/graph.h b/include/core/graph.h
index dab31d79..00fe2017 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -16,7 +16,8 @@ class GraphObj : public Object {
     string toString() const override;
     Runtime getRuntime() const { return runtime; }
 
-    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
+                     TensorType tensorType = TensorType::Other);
     Tensor addTensor(const Tensor &tensor);
     TensorVec addTensor(const TensorVec &tensors);
     /**
diff --git a/include/core/tensor.h b/include/core/tensor.h
index a1081e15..bcc99a20 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -12,13 +12,14 @@ namespace infini {
 // TODO: how to deal with this
 using ShapeElem = int;
 using Shape = vector<ShapeElem>;
+enum class TensorType { Input, Initialized, Other };
 class TensorObj : public TensorBaseObj {
   private:
     Shape shape;
     size_t _size; // Cache of Π(shape).
     Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                   // scratch have a new id.
-
+    TensorType tensorType;
     void copyin(const void *ptr, size_t size) {
         runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
     }
@@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
     }
 
   public:
-    TensorObj(Shape shape, DataType dtype, Runtime runtime);
+    TensorObj(Shape shape, DataType dtype, Runtime runtime,
+              TensorType tensorType = TensorType::Other);
     virtual ~TensorObj() {}
     string toString() const override;
 
@@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
     size_t getOffset(const vector<int> &ds) const;
     void dataMalloc();
     UidBaseType getFuid() const { return fuid; }
+    TensorType getTensorType() const { return tensorType; }
 
     void load(std::string file_path);
     void save(std::string file_path);
diff --git a/src/core/graph.cc b/src/core/graph.cc
index f52f8af7..17bcae78 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -129,8 +129,9 @@ void GraphObj::dataMalloc() {
     }
 }
 
-Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
-    return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
+Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
+    return tensors.emplace_back(
+        make_ref<TensorObj>(dim, dtype, runtime, tensorType));
 }
 
 Tensor GraphObj::addTensor(const Tensor &tensor) {
diff --git a/src/core/runtime.cc b/src/core/runtime.cc
index 8151a6f0..3ea0112c 100644
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@@ -65,6 +65,21 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
     double totalTime = 0;
     std::map<OpType, double> opTime;
     std::map<OpType, int> opCnt;
+    map<UidBaseType, bool> ctcMap; // compile-time computable
+
+    // Skip static computation
+    bool status = graph->topo_sort();
+    IT_ASSERT(status, "Topological sort failed");
+    for (auto &op : graph->getOperators()) {
+        bool compileTimeComputable = true;
+        for (auto input : op->getInputs()) {
+            // FIXME: propagate the tensor type. Currently only the first
+            // operator after weights is compile-time computable.
+            if (input->getTensorType() != TensorType::Initialized)
+                compileTimeComputable = false;
+        }
+        ctcMap[op->getGuid()] = compileTimeComputable;
+    }
 
     for (auto &op : graph->getOperators()) {
         auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@@ -73,8 +88,9 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
         auto perfData = perfEngine.getPerfData(perfKey);
 
         double time = -1e9;
-        // Tune the kernel if there is no record
-        if (perfData) {
+        if (ctcMap[op->getGuid()]) { // Compile-time computable operators
+            time = 0;
+        } else if (perfData) { // Tune the kernel if there is no record
             time = perfData->time;
         } else if (allowEstimation && op->getOpType() == OpType::MemBound) {
             time = as<MemBoundObj>(op)->getEstimatedTime();
@@ -107,7 +123,7 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
         totalTime += time;
         if (profiling) {
            op->print();
-            printf(" op_time %lf\n", time);
+            printf(" op_time %lf\n", time);
             opTime[op->getOpType()] += time;
             opCnt[op->getOpType()]++;
         }
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index 609b1720..23f56d64 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -8,12 +8,14 @@
 
 namespace infini {
 
-TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
+TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
+                     TensorType tensorType)
     : TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
       _size(shape.empty()
                 ? 0
                 : std::accumulate(shape.begin(), shape.end(), 1,
-                                  [](auto acc, auto x) { return acc * x; })) {}
+                                  [](auto acc, auto x) { return acc * x; })),
+      tensorType(tensorType) {}
 
 string TensorObj::toString() const {
     // Convert data pointer to string
diff --git a/test/nnet/test_mutator.cc b/test/nnet/test_mutator.cc
index 07374554..1e2b0623 100644
--- a/test/nnet/test_mutator.cc
+++ b/test/nnet/test_mutator.cc
@@ -5,12 +5,13 @@
 #include "core/search_engine.h"
 #include "cuda/cuda_runtime.h"
 #include "nnet/nmutator.h"
+#include "nnet/test.h"
 #include "operators/conv.h"
 #include "test.h"
 
 namespace infini {
 
-TEST(Mutator, NaiveConvWithInterpreter) {
+TEST(NMutator, NaiveConvWithInterpreter) {
     // verifyNaiveMembound True: subgraph after transformation
     // verifyNaiveMembound False: subgraph of one single membound (eOP)
     Runtime runtime = NativeCpuRuntimeObj::getInstance();
@@ -55,7 +56,7 @@
 }
 
 // FIXME: failed since implicit transpose for DLT
-TEST(Mutator, InfoGAN_TConv_3_correctness) {
+TEST(NMutator, InfoGAN_TConv_3_correctness) {
     const bool useMutatorDirectly = false;
     Runtime runtime = make_ref<CudaRuntimeObj>();
     Graph g = make_ref<GraphObj>(runtime);
@@ -67,8 +68,9 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
     // const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
     // const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
 
-    auto i0 = g->addTensor({n, h, w, f});
-    auto w0 = g->addTensor({f, r, s, c});
+    auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
+    auto w0 =
+        g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
     g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);
 
     auto mutator =