Add: exclude compile-time computable operator time

This commit is contained in:
Liyan Zheng 2023-04-17 13:24:14 +08:00
parent edf4e33353
commit bc31219bde
6 changed files with 39 additions and 14 deletions

View File

@ -16,7 +16,8 @@ class GraphObj : public Object {
string toString() const override;
Runtime getRuntime() const { return runtime; }
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
TensorType tensorType = TensorType::Other);
Tensor addTensor(const Tensor &tensor);
TensorVec addTensor(const TensorVec &tensors);
/**

View File

@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Input, Initialized, Other };
class TensorObj : public TensorBaseObj {
private:
Shape shape;
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
TensorObj(Shape shape, DataType dtype, Runtime runtime,
TensorType tensorType = TensorType::Other);
virtual ~TensorObj() {}
string toString() const override;
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
TensorType getTensorType() const { return tensorType; }
void load(std::string file_path);
void save(std::string file_path);

View File

@ -129,8 +129,9 @@ void GraphObj::dataMalloc() {
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}
Tensor GraphObj::addTensor(const Tensor &tensor) {

View File

@ -65,6 +65,21 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
map<UidBaseType, bool> ctcMap; // compile-time computable
// Skip static computation
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
bool compileTimeComputable = true;
for (auto input : op->getInputs()) {
// FIXME: propagate the tensor type. Currently only the first operator
// after weights is compile-time computable.
if (input->getTensorType() != TensorType::Initialized)
compileTimeComputable = false;
}
ctcMap[op->getGuid()] = compileTimeComputable;
}
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@ -73,8 +88,9 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
auto perfData = perfEngine.getPerfData(perfKey);
double time = -1e9;
// Tune the kernel if there is no record
if (perfData) {
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (perfData) { // Tune the kernel if there is no record
time = perfData->time;
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
time = as<MemBoundObj>(op)->getEstimatedTime();
@ -107,7 +123,7 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", time);
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
}

View File

@ -8,12 +8,14 @@
namespace infini {
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}
string TensorObj::toString() const {
// Convert data pointer to string

View File

@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
// verifyNaiveMembound True: subgraph after transformation
// verifyNaiveMembound False: subgraph of one single membound (eOP)
Runtime runtime = NativeCpuRuntimeObj::getInstance();
@ -55,7 +56,7 @@ TEST(Mutator, NaiveConvWithInterpreter) {
}
// FIXME: failed since implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
TEST(NMutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = false;
Runtime runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
@ -67,8 +68,9 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
// const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
// const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
auto i0 = g->addTensor({n, h, w, f});
auto w0 = g->addTensor({f, r, s, c});
auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);
auto mutator =