From b981951a47217ad623f0c7690fca8fd6deec74ef Mon Sep 17 00:00:00 2001
From: Liyan Zheng
Date: Tue, 18 Apr 2023 09:56:14 +0800
Subject: [PATCH] Add: NMutator::memboundToJson to export memboundOp

---
 include/core/tensor.h             |   2 +-
 include/nnet/Visitor/Serializer.h |   1 +
 include/nnet/expr.h               |   2 +-
 include/nnet/nmutator.h           |   6 +-
 include/operators/membound.h      |   1 +
 src/core/tensor.cc                |   3 +-
 src/ffi/ffi_infinitensor.cc       |   3 +-
 src/nnet/App/test_models.cc       | 110 ++++++++++++++++++++++++-
 src/nnet/nmutator.cc              | 128 ++++++++++++++++++++----------
 src/operators/membound.cc         |   6 ++
 10 files changed, 212 insertions(+), 50 deletions(-)

diff --git a/include/core/tensor.h b/include/core/tensor.h
index bcc99a20..8417a2b2 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -12,7 +12,7 @@ namespace infini {
 // TODO: how to deal with this
 using ShapeElem = int;
 using Shape = vector<ShapeElem>;
-enum class TensorType { Input, Initialized, Other };
+enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
 class TensorObj : public TensorBaseObj {
   private:
     Shape shape;
diff --git a/include/nnet/Visitor/Serializer.h b/include/nnet/Visitor/Serializer.h
index 3568b152..b2c3ff8a 100644
--- a/include/nnet/Visitor/Serializer.h
+++ b/include/nnet/Visitor/Serializer.h
@@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
     string visit_(const Subscript &c) override;
     string visit_(const Var &c) override;
     string visit_(const Tensor &c) override;
+    string visit_(const Func &c) override;
     string dispatchRoutine(const Routine &c);
 
     Expr buildExprTree(string key);
diff --git a/include/nnet/expr.h b/include/nnet/expr.h
index c8d5a0c8..b84a9eec 100644
--- a/include/nnet/expr.h
+++ b/include/nnet/expr.h
@@ -104,7 +104,7 @@ enum class NodeType {
     FuncNodeType
 };
 
-enum class FuncType { Relu, Tanh, PRelu };
+enum class FuncType { Relu = 1000, Tanh, PRelu };
 
 #define DEFINE_GETTYPE(CLASS, isScalar_v)                                      \
     NodeType getType() const override { return NodeType::CLASS##Type; }       \
diff --git a/include/nnet/nmutator.h b/include/nnet/nmutator.h
index 57d24714..3c366ddd 100644
--- a/include/nnet/nmutator.h
+++ b/include/nnet/nmutator.h
@@ -32,6 +32,8 @@ class NMutator : public Mutator {
     long long cntStates = 0;
     long long cntCandidates = 0;
 
+    static void memboundToJson(const Graph &g, const string path);
+
   private:
     int maxDepth = 8;
     nnet::Expr opToExpression(Operator op);
@@ -57,8 +59,8 @@ class NMutator : public Mutator {
 
     // TODO: recover these rules
     // Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
-    // Graph transformTConv1x1(Operator op);
-    // Graph transformTConv3x3(Operator op);
+    Graph transformConvtransposed1x1(Operator _op);
+    // Graph transformConvtransposed(Operator op);
     // Graph transformDialtedConv(Operator op);
     // Graph transformConv1x1(Operator op);
     // Graph transformConv1xk(Operator op);
diff --git a/include/operators/membound.h b/include/operators/membound.h
index 902e5d93..df42e5b2 100644
--- a/include/operators/membound.h
+++ b/include/operators/membound.h
@@ -33,6 +33,7 @@ class MemBoundObj : public OperatorObj {
         return {expr, hash};
     }
     double getEstimatedTime() const { return exec_time; }
+    void saveAsJson(string path) const;
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index 23f56d64..00ee1b7d 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -26,7 +26,8 @@ string TensorObj::toString() const {
         ss << "nullptr data";
     string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
                  std::to_string(fuid) + ", shape " + vecToString(shape) +
-                 ", dtype " + dtype.toString();
+                 ", dtype " + dtype.toString() + ", tensorType " +
+                 std::to_string(enum_to_underlying(tensorType));
     vector<UidBaseType> targetGuids;
     for (const auto &op : targets)
         targetGuids.emplace_back(op.lock()->getGuid());
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 7461f260..e66dd00a 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -341,7 +341,8 @@ void init_graph_builder(py::module &m) {
     py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
        .def(py::init<NMutator::Mode>())
        .def(py::init<NMutator::Mode, vector<int>>())
-        .def("run", &NMutator::run);
+        .def("run", &NMutator::run)
+        .def_static("memboundToJson", &NMutator::memboundToJson);
     py::class_<SearchEngine>(m, "SearchEngine")
        .def(py::init<Runtime, Ref<Mutator>>())
        .def("run", &SearchEngine::run);
diff --git a/src/nnet/App/test_models.cc b/src/nnet/App/test_models.cc
index 339021eb..5274b2de 100644
--- a/src/nnet/App/test_models.cc
+++ b/src/nnet/App/test_models.cc
@@ -4,6 +4,7 @@
 #include "core/runtime.h"
 #include "core/search_engine.h"
 #include "cuda/cuda_runtime.h"
+#include "ffi/ffi_callback.h"
 #include "nnet/nmutator.h"
 #include "operators/conv.h"
 #include "operators/unary.h"
@@ -23,12 +24,43 @@ Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
         {64, 4, 1, 2, false},  {32, 4, 1, 2, true},
     };
 
-    Tensor input = g->addTensor({batch, 1, 1, 228});
+    Tensor input =
+        g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
     for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
         auto [channel, kernelSize, pad, stride, tanh] = cs[i];
         int f = input->getDims()[3]; // n, h, w, f
-        auto weight =
-            g->addTensor({f, kernelSize, kernelSize, channel}); // f, r, s, c
+        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
+                                   DataType::Float32,
+                                   TensorType::Initialized); // f, r, s, c
+        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
+                                                  pad, stride, stride, 1, 1)
+                    ->getOutput();
+        if (tanh) {
+            input = g->addOp<TanhObj>(input, nullptr)->getOutput();
+        } else {
+            input = g->addOp<ReluObj>(input, nullptr)->getOutput();
+        }
+    }
+    return g;
+}
+
+Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
+    IT_ASSERT(0 <= layerId && layerId < 5);
+    Graph g = make_ref<GraphObj>(runtime);
+    vector<Tensor> weights;
+    vector<tuple<int, int, int, int, bool>> cs{
+        // Channel, kernelSize, pad, stride, isTanh
+        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
+        {64, 4, 1, 2, false},  {32, 4, 1, 2, true},
+    };
+
+    Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
+    for (int i = layerId; i < layerId + 1; ++i) {
+        auto [channel, kernelSize, pad, stride, tanh] = cs[i];
+        int f = input->getDims()[3]; // n, h, w, f
+        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
+                                   DataType::Float32,
+                                   TensorType::Initialized); // f, r, s, c
         input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
                                                   pad, stride, stride, 1, 1)
                     ->getOutput();
@@ -50,6 +82,77 @@ void printGraph(Graph g) {
     }
 }
 
+Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
+    Runtime cpu = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(cpu);
+
+    auto mutator =
+        make_ref<NMutator>(NMutator::Mode::RuleBased,
+                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
+    vector<Graph> bestGraphs;
+    SearchEngine searchEngine(runtime, mutator);
+    bestGraphs.emplace_back(searchEngine.run(g));
+    g->topo_sort();
+    dbg(g, bestGraphs[0], bestGraphs.size());
+    g->print();
+
+    g->dataMalloc();
+    map<UidBaseType, Tensor> fuidToInputTensor;
+    for (auto t : g->getInputs()) {
+        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
+        fuidToInputTensor[t->getFuid()] = t;
+    }
+
+    auto gen = RandomGenerator(-0.1, 0.1, 0);
+    for (auto t : g->getInputs()) {
+        t->setData(gen);
+    }
+    for (auto t : g->getOutputs()) {
+        t->setData(ZeroGenerator());
+    }
+    runtime->run(g);
+    dbg("Baseline graph");
+    printGraph(g);
+    dbg(runtime->getPerfTime(g, true));
+
+    for (size_t i = 0; i < bestGraphs.size(); i++) {
+        auto bestGraphCpu = bestGraphs[i];
+        auto bestGraph =
+            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
+        bestGraph->topo_sort();
+
+        bestGraph->dataMalloc();
+        // Initialize inputs with random data
+        for (auto t : bestGraph->getInputs()) {
+            t->copyData(fuidToInputTensor[t->getFuid()]);
+        }
+
+        // Initialize outputs with zeros
+        for (auto t : bestGraph->getOutputs()) {
+            t->setData(ZeroGenerator());
+        }
+
+        dbg(bestGraph);
+        dbg(bestGraph->getOutputs());
+
+        if (tuning) {
+            runtime->run(bestGraph, true);  // Tune kernels
+            runtime->run(bestGraph, false); // Execute transformed graph
+
+            auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
+            auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+            // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
+            dbg(go0->equalData(bgo0, 1e-3));
+            dbg(runtime->getPerfTime(bestGraph, true));
+        }
+
+        dbg("Best graph");
+        printGraph(bestGraph);
+        return bestGraph;
+    }
+    return nullptr;
+}
+
 vector<Tensor> runInfoGAN(int nLayers) {
     Runtime cuda = make_ref<CudaRuntimeObj>();
     Runtime cpu = NativeCpuRuntimeObj::getInstance();
@@ -122,6 +225,7 @@ vector<Tensor> runInfoGAN(int nLayers) {
 
         dbg("Best graph");
         printGraph(bestGraph);
+        callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
         return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
     }
     return {};
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index 7b40cab2..e27e3a03 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -78,16 +78,14 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
 
-    /* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
+    // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
     //     out_graphs.emplace_back(g);
     //     return;
     // }
-    // // Commented for debug, not implemented yet
-    // // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
-    // //     Graph graph = new Graph(g->getOperators());
-    // //     out_graphs.emplace_back(graph);
-    // //     return;
-    // // }
+    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
+        out_graphs.emplace_back(g);
+        return;
+    }
     // if (infini::Graph g = transformDialtedConv(computeOps[0])) {
     //     out_graphs.emplace_back(g);
     //     return;
@@ -519,43 +517,82 @@ double NMutator::memboundTime(const Shape &dims) {
 //     return nullptr;
 // }
 
-// Graph NMutator::transformTConv3x3(Operator op) {
-//     if (auto tconvOp = dynamic_cast(op)) {
-//         dbg(tconvOp->getInputs()[1]->getDims());
-//         if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
-//             tconvOp->getInputs()[1]->getDims()[0] == 3 &&
-//             tconvOp->getInputs()[1]->getDims()[1] == 3) {
-//             auto g = new infini::Graph();
-//             auto inputDims = tconvOp->getInputs(0)->getDims();
-//             auto weightDims = tconvOp->getInputs(1)->getDims();
-//             auto outputDims = tconvOp->getOutput()->getDims();
-//             // NHWF
-//             auto newA = g->tensor(
-//                 {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
-//             // RSFC
-//             auto newW = g->tensor(
-//                 {weightDims[0] * weightDims[1] * weightDims[3],
-//                  weightDims[2]});
-//             auto newO =
-//                 g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
+Graph NMutator::transformConvtransposed1x1(Operator _op) {
+    auto op = as<ConvTransposed2dNHWCObj>(_op);
+    if (!op)
+        return nullptr;
+    const auto &A = op->getInputs()[0];
+    const auto &W = op->getInputs()[1];
+    const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
+    const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+    const Shape inputDims = op->getInputs(0)->getDims();
+    const Shape weightDims = op->getInputs(1)->getDims();
+    const Shape outputDims = op->getOutput()->getDims();
+    const DataType dtype = A->getDType();
+    IT_ASSERT_TODO(op->getNumGroups() == 1);
+    if (h != 1 || w != 1)
+        return {};
+    IT_ASSERT_TODO(ph == pw);
+    IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
+    IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
+    auto g = make_ref<GraphObj>(runtime);
+    // NHWF
+    auto newA = g->addTensor(
+        {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
+    // FRSC
+    auto newW = g->addTensor(
+        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
+    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
+    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
+    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
+    g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
+                                    op->getOutput()->getDims());
+    return g;
+}
+
+// Graph NMutator::transformConvtransposed(Operator _op) {
+//     auto op = as<ConvTransposed2dNHWCObj>(_op);
+//     if (!op)
+//         return nullptr;
+//     const auto &AT = op->getInputs()[0];
+//     const auto &KT = op->getInputs()[1];
+//     const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
+//     const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+//     IT_ASSERT_TODO(op->getNumGroups() == 1);
+//     if (r != 4)
+//         return {};
+//     IT_ASSERT_TODO(ph == pw);
+//     IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
+//     IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
+
+//     auto g = make_ref<GraphObj>();
+//     // TODO: implement transformation rules
+//     // How to efficiently write an expression...
+//     auto inputDims = op->getInputs(0)->getDims();
+//     auto weightDims = op->getInputs(1)->getDims();
+//     auto outputDims = op->getOutput()->getDims();
+//     // NHWF
+//     auto newA =
+//         g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
+//                    inputDims[3]});
+//     // RSFC
+//     auto newW = g->tensor(
+//         {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
+//     auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
 //                            weightDims[0] * weightDims[1] * weightDims[3]});
-//             g->reshape(tconvOp->getInputs(0), newA);
-//             g->reshape(tconvOp->getInputs(1), newW);
-//             g->matmul(newA, newW, newO, 0, 1);
-//             // g->reshape(newO, tconvOp->getOutput());
-//             tconvOp->print();
-//             dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
-//             assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
-//             g->membound(
-//                 {newO}, {tconvOp->getOutput()}, {}, nullptr,
+//     g->reshape(op->getInputs(0), newA);
+//     g->reshape(op->getInputs(1), newW);
+//     g->matmul(newA, newW, newO, 0, 1);
+//     // g->reshape(newO, tconvOp->getOutput());
+//     tconvOp->print();
+//     dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
+//     assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
+//     g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
 //                 memboundTime(newO->size() + tconvOp->getOutput()->size()),
 //                 "TConv3x3 reduce");
-//             g->updateConnection();
-//             Graph graph = new Graph(g->getOperators());
-//             return graph;
-//         }
-//     }
-//     return nullptr;
+//     g->updateConnection();
+//     Graph graph = new Graph(g->getOperators());
+//     return graph;
 // }
 
 // Graph NMutator::transformTConv1x1(Operator op) {
@@ -711,4 +748,13 @@ NMutator::generateUnaryExpr(const Operator &op) {
             NameNToTensorT{{"T", op->getInputs()[0]}}};
 }
 
+void NMutator::memboundToJson(const Graph &g, const string path) {
+    for (auto &_op : g->getOperators()) {
+        if (auto op = as<MemBoundObj>(_op)) {
+            op->saveAsJson(path + "/" + "membound_" +
+                           std::to_string(op->getGuid()) + ".json");
+        }
+    }
+}
+
 } // namespace infini
diff --git a/src/operators/membound.cc b/src/operators/membound.cc
index 9e9e62ad..3afa9134 100644
--- a/src/operators/membound.cc
+++ b/src/operators/membound.cc
@@ -2,6 +2,7 @@
 #include "nnet/Visitor/CheckOOBVisitor.h"
 #include "nnet/Visitor/HashVisitor.h"
 #include "nnet/Visitor/MergeMemboundMutator.h"
+#include "nnet/Visitor/Serializer.h"
 
 namespace infini {
 
@@ -83,4 +84,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
                                  nnet::as<nnet::RangeOpNode>(expr));
 }
 
+void MemBoundObj::saveAsJson(string path) const {
+    bool status = nnet::Serializer().serialize(expr, path);
+    IT_ASSERT(status);
+}
+
 } // namespace infini
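
Usage sketch (illustrative only, not part of the patch): one way the new export path can be driven from C++ with the helpers added above. The output directory name is an assumption and is expected to exist already, since memboundToJson only concatenates "<path>/membound_<guid>.json" per MemBoundObj; whether the optimized graph actually contains membound ops depends on the mutation rules the search applies.

    // Build a single transposed-conv layer, let the mutator/search rewrite it,
    // then dump every MemBoundObj in the chosen graph as JSON.
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph g = getConvtransposedNHWC(cuda, {1, 1, 1, 228}, 0); // layer 0 input shape
    Graph best = optimizeGraph(g, cuda, /*tuning=*/false);
    if (best)
        NMutator::memboundToJson(best, "./membound_json"); // hypothetical directory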