forked from jiuyuan/InfiniTensor
Add conv2bgemm and fix mutator::runtime
parent 079985bc8c
commit 1e46750159
@@ -20,8 +20,10 @@ class NMutator : public Mutator {
    const std::vector<int> derivationRules;

  public:
    NMutator(Mode mode = Mode::Normal);
    NMutator(Mode mode, const std::vector<int> &derivationRules);
    NMutator(Mode mode = Mode::Normal,
             Runtime runtime = NativeCpuRuntimeObj::getInstance());
    NMutator(Mode mode, const std::vector<int> &derivationRules,
             Runtime runtime = NativeCpuRuntimeObj::getInstance());
    ~NMutator();

    vector<Graph> run(const Graph &in_graph) override;
@@ -102,8 +102,6 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
        double time = -1e9;
        if (ctcMap[op->getGuid()]) { // Compile-time computable operators
            time = 0;
        } else if (op->getOpType() == OpType::Reshape) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && allowEstimation) {
@@ -136,8 +134,10 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
                t->freeData();
        }

        if (op->getOpType() != OpType::Transpose &&
            op->getOpType() != OpType::ReduceMean)
        // FIXME: ignore trnapose when necessary
        // op->getOpType() != OpType::Transpose &&
        // op->getOpType() != OpType::ReduceMean
        if (op->getOpType() != OpType::Reshape)
            totalTime += time;
        if (profiling) {
            op->print();
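A rough sketch of what the accumulation rule above amounts to after this change (illustrative Python, not InfiniTensor's API): compile-time-computable operators cost zero, and Reshape is now skipped when summing the total rather than having its per-op time forced to zero earlier in the chain.

    # Hypothetical sketch only; op records and field names are made up for illustration.
    def estimate_total_time(ops):
        total = 0.0
        for op in ops:
            if op["ctc"]:              # compile-time computable
                time = 0.0
            else:
                time = op["time"]      # measured or estimated per-op cost
            if op["type"] != "Reshape":  # Reshape no longer counts toward the total
                total += time
        return total

    print(estimate_total_time([
        {"type": "Conv", "ctc": False, "time": 1.5},
        {"type": "Reshape", "ctc": False, "time": 0.2},  # excluded from the sum
    ]))  # -> 1.5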
@@ -351,6 +351,12 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
        // mutatedGraphs.resize(2);
        // if (mutatedGraphs.size() >= 2)
        //     mutatedGraphs = {mutatedGraphs[1]};
        constexpr bool chooseBestMutation = false;
        if (chooseBestMutation && mutatedGraphs.size() >= 2) {
            std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
                      graphTimeComparer);
            mutatedGraphs = {mutatedGraphs[0]};
        }
        for (auto graph : graphs) {
            for (auto mutatedGraph : mutatedGraphs) {
                std::vector<Operator> ops;
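The chooseBestMutation flag above gates an optional filter: rank the candidate graphs by their (presumably measured or estimated) time and keep only the fastest one. A hypothetical sketch of that selection, with made-up names:

    def pick_mutations(mutated_graphs, graph_time, choose_best_mutation=False):
        # When enabled and there are at least two candidates, keep only the fastest.
        if choose_best_mutation and len(mutated_graphs) >= 2:
            mutated_graphs = [min(mutated_graphs, key=graph_time)]
        return mutated_graphs

    times = {"gemm_variant": 0.8, "conv_variant": 1.2}
    print(pick_mutations(list(times), times.get, choose_best_mutation=True))  # ['gemm_variant']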
@@ -155,7 +155,6 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
    auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
    tvm_device->SetStream(tvm_device_id, getStream());
#endif

    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
@@ -174,8 +173,8 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
        [&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
            checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
        },
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
        1000);
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, rounds,
        rounds);
}

} // namespace infini
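The hard-coded 1000s are replaced by the function's rounds parameter. Assuming the timing helper's two trailing integers are warm-up and measurement repetition counts (an assumption, not confirmed by this diff), the pattern is roughly the following illustrative Python sketch, where func stands in for replaying the captured CUDA graph and sync for synchronizing its stream:

    import time

    # Hypothetical timeit-style helper: (func, sync, warmup_rounds, timing_rounds).
    def timeit(func, sync, warmup_rounds, timing_rounds):
        for _ in range(warmup_rounds):   # warm-up launches, excluded from the average
            func()
        sync()
        start = time.perf_counter()
        for _ in range(timing_rounds):
            func()                       # e.g. launch the captured CUDA graph
        sync()                           # e.g. wait on the stream before stopping the clock
        return (time.perf_counter() - start) * 1e3 / timing_rounds  # ms per round

    print(timeit(lambda: sum(range(1000)), lambda: None, 10, 100))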
@@ -16,6 +16,7 @@
#include "operators/reshape.h"
#include "operators/softmax.h"
#include "operators/transpose.h"
#include "operators/pooling.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
@@ -366,11 +367,11 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
    Ref<NMutator> mutator;
    if (mode == NMutator::Mode::Normal) {
        dbg(mode);
        mutator = make_ref<NMutator>(mode);
        mutator = make_ref<NMutator>(mode, runtime);
    } else if (mode == NMutator::Mode::RuleBased) {
        dbg(mode, rules);
        IT_ASSERT_TODO(rules.size() > 0);
        mutator = make_ref<NMutator>(mode, rules);
        mutator = make_ref<NMutator>(mode, rules, runtime);
    } else
        IT_TODO_HALT();
    vector<Graph> bestGraphs;
@@ -22,12 +22,15 @@

namespace infini {

NMutator::NMutator(Mode mode) : Mutator(10), mode{mode} {
NMutator::NMutator(Mode mode, Runtime runtime)
    : Mutator(10, runtime), mode{mode} {
    IT_ASSERT(mode != Mode::RuleBased, "Specify rules for the RuleBased mode.");
}

NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules)
    : Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {
NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules,
                   Runtime runtime)
    : Mutator(10, runtime), mode{Mode::RuleBased}, derivationRules{
                                                       derivationRules} {
    IT_ASSERT(mode == Mode::RuleBased);
}
@@ -93,37 +96,29 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
    IT_ASSERT(computeOps.size() == 1);
    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformConv1x1(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformConv1xk(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformG2bmm(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformGbmm(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformDialtedConv(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformConvToGEMMReduce(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }

    if (infini::Graph g = transformConvTranposeToGEMMReduce(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    return;

    const set<OpType> opSet{OpType::Conv, OpType::ConvTransNHWC, OpType::G2BMM,
                            OpType::GBMM};
@@ -735,25 +730,45 @@ Graph NMutator::transformConv1x1(Operator _op) {
            g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, N*H*W]
        g->addOpWithOutputs<ReshapeObj>(O, g->cloneTensor(op->getOutput()),
                                        op->getOutput()->getDims());
    } else {
        auto A = g->addOp<TransposeObj>(g->cloneTensor(op->getInputs(0)),
                                        nullptr, vector{1, 0, 2, 3})
                     ->getOutput(); // [C,N,H,W]
        A = g->addOp<ReshapeObj>(A, nullptr,
                                 vector{shapeA[1], shapeA[0] * shapeA[2] *
                                            shapeA[3]}) // [C, N*H*W]
    // } else { // Tranpose + Matmul + Transpose
    //     auto A = g->addOp<TransposeObj>(g->cloneTensor(op->getInputs(0)),
    //     nullptr, vector{1, 0, 2, 3})
    //     ->getOutput(); // [C,N,H,W]
    //     A = g->addOp<ReshapeObj>(A, nullptr,
    //     vector{shapeA[1], shapeA[0] * shapeA[2]
    //     *
    //     shapeA[3]}) // [C,
    //     N*H*W]
    //     ->getOutput();
    //     auto B = g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)),
    //     nullptr,
    //     vector{shapeW[0], shapeW[1]}) //
    //     [F, C]
    //     ->getOutput();
    //     auto O =
    //     g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F,
    //     NHW]
    //     O = g->addOp<ReshapeObj>(
    //     O, nullptr, Shape{shapeO[1], shapeO[0], shapeO[2],
    //     shapeO[3]})
    //     ->getOutput(); // [F, NHW]
    //     O = g->addOpWithOutputs<TransposeObj>(
    //     O, g->cloneTensor(op->getOutput()), vector{1, 0, 2, 3})
    //     ->getOutput(); // [F, N*H*W]
    } else { // BGemm
        auto A =
            g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(0)), nullptr,
                                 vector{shapeA[0], shapeA[1],
                                        shapeA[2] * shapeA[3]}) // [N, C, H*W]
                ->getOutput();
        auto B =
            g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)), nullptr,
                                 vector{1, shapeW[0], shapeW[1]}) // [1, F, C]
                ->getOutput();
        auto B = g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)), nullptr,
                                      vector{shapeW[0], shapeW[1]}) // [F, C]
                     ->getOutput();
        auto O =
            g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, NHW]
        O = g->addOp<ReshapeObj>(
                O, nullptr, Shape{shapeO[1], shapeO[0], shapeO[2], shapeO[3]})
                ->getOutput(); // [F, NHW]
        O = g->addOpWithOutputs<TransposeObj>(
                O, g->cloneTensor(op->getOutput()), vector{1, 0, 2, 3})
                ->getOutput(); // [F, N*H*W]
        g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, N*H*W]
        g->addOpWithOutputs<ReshapeObj>(O, g->cloneTensor(op->getOutput()),
                                        op->getOutput()->getDims());
    }
    return g;
}
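The new "} else { // BGemm" branch appears to be the conv2bgemm path named in the commit title: the convolution input [N, C, H, W] is reshaped to [N, C, H*W], the 1x1 weight [F, C, 1, 1] to [1, F, C], and a single batched matmul produces [N, F, H*W], which is then reshaped to the convolution's output dims. Assuming stride 1 and no padding (the case where a 1x1 convolution is pure channel mixing), a small numpy check of that equivalence, independent of InfiniTensor's tensor classes:

    import numpy as np

    # Illustrative check of the conv2bgemm idea: a 1x1 conv (stride 1, no padding)
    # equals one batched matmul over reshaped input and weight tensors.
    N, C, F, H, W = 2, 3, 5, 4, 4
    x = np.random.rand(N, C, H, W).astype(np.float32)
    w = np.random.rand(F, C, 1, 1).astype(np.float32)

    # Reference 1x1 convolution: out[n, f, h, w] = sum_c w[f, c] * x[n, c, h, w]
    ref = np.einsum("fc,nchw->nfhw", w[:, :, 0, 0], x)

    # BGemm formulation used by the new branch: [1, F, C] x [N, C, H*W] -> [N, F, H*W]
    a = x.reshape(N, C, H * W)
    b = w.reshape(1, F, C)
    out = np.matmul(b, a).reshape(N, F, H, W)

    print(np.allclose(ref, out))  # True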
@@ -167,14 +167,14 @@ if __name__ == "__main__":

    for original_g, name in graphs:
        print(f"=== {name}")
        # save_onnx(original_g, f"hkz_orig_{name}.onnx")
        # save_onnx(original_g, f"orig_{name}.onnx")
        # original_g = ft.convertNCHWtoNHWCModel(runtime, original_g)
        # save_onnx(original_g, f"hkz_dlt_{name}.onnx")
        # save_onnx(dlt_g, f"dlt_{name}.onnx")
        # exit()

        # run_and_evaluate(runtime, original_g)
        g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
                             [1, 7, 7, 2, 8, 6, 6]) # G2BMM/GBMM

        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
        #     [3, 2, 2, 5, 8, 8, 6, 90]) # Conv2conv
        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.Normal)