From b981951a47217ad623f0c7690fca8fd6deec74ef Mon Sep 17 00:00:00 2001
From: Liyan Zheng
Date: Tue, 18 Apr 2023 09:56:14 +0800
Subject: [PATCH] Add: NMutator::memboundToJson to export memboundOp

---
 include/core/tensor.h             |   2 +-
 include/nnet/Visitor/Serializer.h |   1 +
 include/nnet/expr.h               |   2 +-
 include/nnet/nmutator.h           |   6 +-
 include/operators/membound.h      |   1 +
 src/core/tensor.cc                |   3 +-
 src/ffi/ffi_infinitensor.cc       |   3 +-
 src/nnet/App/test_models.cc       | 110 ++++++++++++++++++++++++-
 src/nnet/nmutator.cc              | 128 ++++++++++++++++++++----------
 src/operators/membound.cc         |   6 ++
 10 files changed, 212 insertions(+), 50 deletions(-)

diff --git a/include/core/tensor.h b/include/core/tensor.h
index bcc99a20..8417a2b2 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -12,7 +12,7 @@ namespace infini {
 // TODO: how to deal with this
 using ShapeElem = int;
 using Shape = vector<ShapeElem>;
-enum class TensorType { Input, Initialized, Other };
+enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
 class TensorObj : public TensorBaseObj {
   private:
     Shape shape;
diff --git a/include/nnet/Visitor/Serializer.h b/include/nnet/Visitor/Serializer.h
index 3568b152..b2c3ff8a 100644
--- a/include/nnet/Visitor/Serializer.h
+++ b/include/nnet/Visitor/Serializer.h
@@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
     string visit_(const Subscript &c) override;
     string visit_(const Var &c) override;
     string visit_(const Tensor &c) override;
+    string visit_(const Func &c) override;
     string dispatchRoutine(const Routine &c);
 
     Expr buildExprTree(string key);
diff --git a/include/nnet/expr.h b/include/nnet/expr.h
index c8d5a0c8..b84a9eec 100644
--- a/include/nnet/expr.h
+++ b/include/nnet/expr.h
@@ -104,7 +104,7 @@ enum class NodeType {
     FuncNodeType
 };
 
-enum class FuncType { Relu, Tanh, PRelu };
+enum class FuncType { Relu = 1000, Tanh, PRelu };
 
 #define DEFINE_GETTYPE(CLASS, isScalar_v)                                      \
     NodeType getType() const override { return NodeType::CLASS##Type; }       \
diff --git a/include/nnet/nmutator.h b/include/nnet/nmutator.h
index 57d24714..3c366ddd 100644
--- a/include/nnet/nmutator.h
+++ b/include/nnet/nmutator.h
@@ -32,6 +32,8 @@ class NMutator : public Mutator {
     long long cntStates = 0;
     long long cntCandidates = 0;
 
+    static void memboundToJson(const Graph &g, const string path);
+
   private:
     int maxDepth = 8;
     nnet::Expr opToExpression(Operator op);
@@ -57,8 +59,8 @@ class NMutator : public Mutator {
 
     // TODO: recover these rules
     // Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
-    // Graph transformTConv1x1(Operator op);
-    // Graph transformTConv3x3(Operator op);
+    Graph transformConvtransposed1x1(Operator _op);
+    // Graph transformConvtransposed(Operator op);
     // Graph transformDialtedConv(Operator op);
     // Graph transformConv1x1(Operator op);
     // Graph transformConv1xk(Operator op);
diff --git a/include/operators/membound.h b/include/operators/membound.h
index 902e5d93..df42e5b2 100644
--- a/include/operators/membound.h
+++ b/include/operators/membound.h
@@ -33,6 +33,7 @@ class MemBoundObj : public OperatorObj {
         return {expr, hash};
     }
     double getEstimatedTime() const { return exec_time; }
+    void saveAsJson(string path) const;
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index 23f56d64..00ee1b7d 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -26,7 +26,8 @@ string TensorObj::toString() const {
         ss << "nullptr data";
     string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
                  std::to_string(fuid) + ", shape " + vecToString(shape) +
-                 ", dtype " + dtype.toString();
+                 ", dtype " + dtype.toString() + ", tensorType " +
+                 std::to_string(enum_to_underlying(tensorType));
     vector<UidBaseType> targetGuids;
     for (const auto &op : targets)
         targetGuids.emplace_back(op.lock()->getGuid());
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 7461f260..e66dd00a 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -341,7 +341,8 @@ void init_graph_builder(py::module &m) {
     py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
        .def(py::init<NMutator::Mode>())
        .def(py::init<NMutator::Mode, vector<int>>())
-        .def("run", &NMutator::run);
+        .def("run", &NMutator::run)
+        .def_static("memboundToJson", &NMutator::memboundToJson);
     py::class_<SearchEngine>(m, "SearchEngine")
        .def(py::init<Runtime, Ref<Mutator>>())
        .def("run", &SearchEngine::run);
diff --git a/src/nnet/App/test_models.cc b/src/nnet/App/test_models.cc
index 339021eb..5274b2de 100644
--- a/src/nnet/App/test_models.cc
+++ b/src/nnet/App/test_models.cc
@@ -4,6 +4,7 @@
 #include "core/runtime.h"
 #include "core/search_engine.h"
 #include "cuda/cuda_runtime.h"
+#include "ffi/ffi_callback.h"
 #include "nnet/nmutator.h"
 #include "operators/conv.h"
 #include "operators/unary.h"
@@ -23,12 +24,43 @@ Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
         {64, 4, 1, 2, false},  {32, 4, 1, 2, true},
     };
 
-    Tensor input = g->addTensor({batch, 1, 1, 228});
+    Tensor input =
+        g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
     for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
         auto [channel, kernelSize, pad, stride, tanh] = cs[i];
         int f = input->getDims()[3]; // n, h, w, f
-        auto weight =
-            g->addTensor({f, kernelSize, kernelSize, channel}); // f, r, s, c
+        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
+                                   DataType::Float32,
+                                   TensorType::Initialized); // f, r, s, c
+        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
+                                                  pad, stride, stride, 1, 1)
+                    ->getOutput();
+        if (tanh) {
+            input = g->addOp<TanhObj>(input, nullptr)->getOutput();
+        } else {
+            input = g->addOp<ReluObj>(input, nullptr)->getOutput();
+        }
+    }
+    return g;
+}
+
+Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
+    IT_ASSERT(0 <= layerId && layerId < 5);
+    Graph g = make_ref<GraphObj>(runtime);
+    vector<Tensor> weights;
+    vector<tuple<int, int, int, int, bool>> cs{
+        // Channel, kernelSize, pad, stride, isTanh
+        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
+        {64, 4, 1, 2, false},  {32, 4, 1, 2, true},
+    };
+
+    Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
+    for (int i = layerId; i < layerId + 1; ++i) {
+        auto [channel, kernelSize, pad, stride, tanh] = cs[i];
+        int f = input->getDims()[3]; // n, h, w, f
+        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
+                                   DataType::Float32,
+                                   TensorType::Initialized); // f, r, s, c
         input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
                                                   pad, stride, stride, 1, 1)
                     ->getOutput();
@@ -50,6 +82,77 @@ void printGraph(Graph g) {
     }
 }
 
+Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
+    Runtime cpu = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(cpu);
+
+    auto mutator =
+        make_ref<NMutator>(NMutator::Mode::RuleBased,
+                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
+    vector<Graph> bestGraphs;
+    SearchEngine searchEngine(runtime, mutator);
+    bestGraphs.emplace_back(searchEngine.run(g));
+    g->topo_sort();
+    dbg(g, bestGraphs[0], bestGraphs.size());
+    g->print();
+
+    g->dataMalloc();
+    map<UidBaseType, Tensor> fuidToInputTensor;
+    for (auto t : g->getInputs()) {
+        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
+        fuidToInputTensor[t->getFuid()] = t;
+    }
+
+    auto gen = RandomGenerator(-0.1, 0.1, 0);
+    for (auto t : g->getInputs()) {
+        t->setData(gen);
+    }
+    for (auto t : g->getOutputs()) {
+        t->setData(ZeroGenerator());
+    }
+    runtime->run(g);
+    dbg("Baseline graph");
+    printGraph(g);
+    dbg(runtime->getPerfTime(g, true));
+
+    for (size_t i = 0; i < bestGraphs.size(); i++) {
+        auto bestGraphCpu = bestGraphs[i];
+        auto bestGraph =
+            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
+        bestGraph->topo_sort();
+
+        bestGraph->dataMalloc();
+        // Initialize inputs with random data
+        for (auto t : bestGraph->getInputs()) {
+            t->copyData(fuidToInputTensor[t->getFuid()]);
+        }
+
+        // Initialize outputs with zeros
+        for (auto t : bestGraph->getOutputs()) {
+            t->setData(ZeroGenerator());
+        }
+
+        dbg(bestGraph);
+        dbg(bestGraph->getOutputs());
+
+        if (tuning) {
+            runtime->run(bestGraph, true);  // Tune kernels
+            runtime->run(bestGraph, false); // Execute transformed graph
+
+            auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
+            auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+            // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
+            dbg(go0->equalData(bgo0, 1e-3));
+            dbg(runtime->getPerfTime(bestGraph, true));
+        }
+
+        dbg("Best graph");
+        printGraph(bestGraph);
+        return bestGraph;
+    }
+    return nullptr;
+}
+
 vector<Tensor> runInfoGAN(int nLayers) {
     Runtime cuda = make_ref<CudaRuntimeObj>();
     Runtime cpu = NativeCpuRuntimeObj::getInstance();
@@ -122,6 +225,7 @@ vector<Tensor> runInfoGAN(int nLayers) {
 
         dbg("Best graph");
         printGraph(bestGraph);
+        callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
         return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
     }
     return {};
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index 7b40cab2..e27e3a03 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -78,16 +78,14 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
 
-    /* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
+    // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
     //     out_graphs.emplace_back(g);
     //     return;
     // }
-    // // Commented for debug, not implemented yet
-    // // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
-    // //     Graph graph = new Graph(g->getOperators());
-    // //     out_graphs.emplace_back(graph);
-    // //     return;
-    // // }
+    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
+        out_graphs.emplace_back(g);
+        return;
+    }
     // if (infini::Graph g = transformDialtedConv(computeOps[0])) {
     //     out_graphs.emplace_back(g);
     //     return;
@@ -519,43 +517,82 @@ double NMutator::memboundTime(const Shape &dims) {
 //     return nullptr;
 // }
 
-// Graph NMutator::transformTConv3x3(Operator op) {
-//     if (auto tconvOp = dynamic_cast(op)) {
-//         dbg(tconvOp->getInputs()[1]->getDims());
-//         if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
-//             tconvOp->getInputs()[1]->getDims()[0] == 3 &&
-//             tconvOp->getInputs()[1]->getDims()[1] == 3) {
-//             auto g = new infini::Graph();
-//             auto inputDims = tconvOp->getInputs(0)->getDims();
-//             auto weightDims = tconvOp->getInputs(1)->getDims();
-//             auto outputDims = tconvOp->getOutput()->getDims();
-//             // NHWF
-//             auto newA = g->tensor(
-//                 {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
-//             // RSFC
-//             auto newW = g->tensor(
-//                 {weightDims[0] * weightDims[1] * weightDims[3],
-//                  weightDims[2]});
-//             auto newO =
-//                 g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
+Graph NMutator::transformConvtransposed1x1(Operator _op) {
+    auto op = as<ConvTransposed2dNHWCObj>(_op);
+    if (!op)
+        return nullptr;
+    const auto &A = op->getInputs()[0];
+    const auto &W = op->getInputs()[1];
+    const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
+    const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+    const Shape inputDims = op->getInputs(0)->getDims();
+    const Shape weightDims = op->getInputs(1)->getDims();
+    const Shape outputDims = op->getOutput()->getDims();
+    const DataType dtype = A->getDType();
+    IT_ASSERT_TODO(op->getNumGroups() == 1);
+    if (h != 1 || w != 1)
+        return {};
+    IT_ASSERT_TODO(ph == pw);
+    IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
+    IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
+    auto g = make_ref<GraphObj>(runtime);
+    // NHWF
+    auto newA = g->addTensor(
+        {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
+    // FRSC
+    auto newW = g->addTensor(
+        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
+    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
+    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
+    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
+    g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
+                                    op->getOutput()->getDims());
+    return g;
+}
+
+// Graph NMutator::transformConvtransposed(Operator _op) {
+//     auto op = as<ConvTransposed2dNHWCObj>(_op);
+//     if (!op)
+//         return nullptr;
+//     const auto &AT = op->getInputs()[0];
+//     const auto &KT = op->getInputs()[1];
+//     const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
+//     const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+//     IT_ASSERT_TODO(op->getNumGroups() == 1);
+//     if (r != 4)
+//         return {};
+//     IT_ASSERT_TODO(ph == pw);
+//     IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
+//     IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
+
+//     auto g = make_ref<GraphObj>();
+//     // TODO: implement transformation rules
+//     // How to efficiently write an expression...
+//     auto inputDims = op->getInputs(0)->getDims();
+//     auto weightDims = op->getInputs(1)->getDims();
+//     auto outputDims = op->getOutput()->getDims();
+//     // NHWF
+//     auto newA =
+//         g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
+//                    inputDims[3]});
+//     // RSFC
+//     auto newW = g->tensor(
+//         {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
+//     auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
 //                            weightDims[0] * weightDims[1] * weightDims[3]});
-//             g->reshape(tconvOp->getInputs(0), newA);
-//             g->reshape(tconvOp->getInputs(1), newW);
-//             g->matmul(newA, newW, newO, 0, 1);
-//             // g->reshape(newO, tconvOp->getOutput());
-//             tconvOp->print();
-//             dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
-//             assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
-//             g->membound(
-//                 {newO}, {tconvOp->getOutput()}, {}, nullptr,
+//     g->reshape(op->getInputs(0), newA);
+//     g->reshape(op->getInputs(1), newW);
+//     g->matmul(newA, newW, newO, 0, 1);
+//     // g->reshape(newO, tconvOp->getOutput());
+//     tconvOp->print();
+//     dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
+//     assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
+//     g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
 //                 memboundTime(newO->size() + tconvOp->getOutput()->size()),
 //                 "TConv3x3 reduce");
-//             g->updateConnection();
-//             Graph graph = new Graph(g->getOperators());
-//             return graph;
-//         }
-//     }
-//     return nullptr;
+//     g->updateConnection();
+//     Graph graph = new Graph(g->getOperators());
+//     return graph;
 // }
 
 // Graph NMutator::transformTConv1x1(Operator op) {
@@ -711,4 +748,13 @@ NMutator::generateUnaryExpr(const Operator &op) {
             NameNToTensorT{{"T", op->getInputs()[0]}}};
 }
 
+void NMutator::memboundToJson(const Graph &g, const string path) {
+    for (auto &_op : g->getOperators()) {
+        if (auto op = as<MemBoundObj>(_op)) {
+            op->saveAsJson(path + "/" + "membound_" +
+                           std::to_string(op->getGuid()) + ".json");
+        }
+    }
+}
+
 } // namespace infini
diff --git a/src/operators/membound.cc b/src/operators/membound.cc
index 9e9e62ad..3afa9134 100644
--- a/src/operators/membound.cc
+++ b/src/operators/membound.cc
@@ -2,6 +2,7 @@
 #include "nnet/Visitor/CheckOOBVisitor.h"
 #include "nnet/Visitor/HashVisitor.h"
 #include "nnet/Visitor/MergeMemboundMutator.h"
+#include "nnet/Visitor/Serializer.h"
 
 namespace infini {
 
@@ -83,4 +84,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
                                  nnet::as<nnet::RangeOpNode>(expr));
 }
 
+void MemBoundObj::saveAsJson(string path) const {
+    bool status = nnet::Serializer().serialize(expr, path);
+    IT_ASSERT(status);
+}
+
 } // namespace infini
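
Usage sketch (illustrative only, not part of the patch): one way the new export path can be driven from C++ with the helpers added above. The output directory name is an assumption and is expected to exist already, since memboundToJson only concatenates "<path>/membound_<guid>.json" per MemBoundObj; whether the optimized graph actually contains membound ops depends on the mutation rules the search applies.

    // Build a single transposed-conv layer, let the mutator/search rewrite it,
    // then dump every MemBoundObj in the chosen graph as JSON.
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph g = getConvtransposedNHWC(cuda, {1, 1, 1, 228}, 0); // layer 0 input shape
    Graph best = optimizeGraph(g, cuda, /*tuning=*/false);
    if (best)
        NMutator::memboundToJson(best, "./membound_json"); // hypothetical directory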