forked from jiuyuan/InfiniTensor
Add: NMutator::memboundToJson to export memboundOp
This commit is contained in:
parent
99b5c95455
commit
b981951a47
|
@ -12,7 +12,7 @@ namespace infini {
|
|||
// TODO: how to deal with this
|
||||
using ShapeElem = int;
|
||||
using Shape = vector<ShapeElem>;
|
||||
enum class TensorType { Input, Initialized, Other };
|
||||
// Classification of a tensor, with an explicit integer value for every
// enumerator (Error = 0, Input = 1, Initialized = 2, Other = 3).
// NOTE(review): TensorObj::toString() prints `tensorType` as its underlying
// integer, so these values presumably need to stay stable — confirm.
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
|
||||
class TensorObj : public TensorBaseObj {
|
||||
private:
|
||||
Shape shape;
|
||||
|
|
|
@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
|
|||
string visit_(const Subscript &c) override;
|
||||
string visit_(const Var &c) override;
|
||||
string visit_(const Tensor &c) override;
|
||||
string visit_(const Func &c) override;
|
||||
string dispatchRoutine(const Routine &c);
|
||||
|
||||
Expr buildExprTree(string key);
|
||||
|
|
|
@ -104,7 +104,7 @@ enum class NodeType {
|
|||
FuncNodeType
|
||||
};
|
||||
|
||||
enum class FuncType { Relu, Tanh, PRelu };
|
||||
// Function kinds. Numbering starts at 1000, so Relu = 1000, Tanh = 1001 and
// PRelu = 1002.
// NOTE(review): the reason for the 1000 offset is not visible here —
// presumably it keeps these codes distinct from other serialized type codes;
// confirm before renumbering.
enum class FuncType { Relu = 1000, Tanh, PRelu };
|
||||
|
||||
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
|
||||
NodeType getType() const override { return NodeType::CLASS##Type; } \
|
||||
|
|
|
@ -32,6 +32,8 @@ class NMutator : public Mutator {
|
|||
long long cntStates = 0;
|
||||
long long cntCandidates = 0;
|
||||
|
||||
static void memboundToJson(const Graph &g, const string path);
|
||||
|
||||
private:
|
||||
int maxDepth = 8;
|
||||
nnet::Expr opToExpression(Operator op);
|
||||
|
@ -57,8 +59,8 @@ class NMutator : public Mutator {
|
|||
|
||||
// TODO: recover these rules
|
||||
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
|
||||
// Graph transformTConv1x1(Operator op);
|
||||
// Graph transformTConv3x3(Operator op);
|
||||
Graph transformConvtransposed1x1(Operator _op);
|
||||
// Graph transformConvtransposed(Operator op);
|
||||
// Graph transformDialtedConv(Operator op);
|
||||
// Graph transformConv1x1(Operator op);
|
||||
// Graph transformConv1xk(Operator op);
|
||||
|
|
|
@ -33,6 +33,7 @@ class MemBoundObj : public OperatorObj {
|
|||
return {expr, hash};
|
||||
}
|
||||
double getEstimatedTime() const { return exec_time; }
|
||||
void saveAsJson(string path) const;
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
|
|
|
@ -26,7 +26,8 @@ string TensorObj::toString() const {
|
|||
ss << "nullptr data";
|
||||
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
|
||||
std::to_string(fuid) + ", shape " + vecToString(shape) +
|
||||
", dtype " + dtype.toString();
|
||||
", dtype " + dtype.toString() + ", tensorType " +
|
||||
std::to_string(enum_to_underlying(tensorType));
|
||||
vector<UidBaseType> targetGuids;
|
||||
for (const auto &op : targets)
|
||||
targetGuids.emplace_back(op.lock()->getGuid());
|
||||
|
|
|
@ -341,7 +341,8 @@ void init_graph_builder(py::module &m) {
|
|||
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
|
||||
.def(py::init<NMutator::Mode>())
|
||||
.def(py::init<NMutator::Mode, vector<int>>())
|
||||
.def("run", &NMutator::run);
|
||||
.def("run", &NMutator::run)
|
||||
.def_static("memboundToJson", &NMutator::memboundToJson);
|
||||
py::class_<SearchEngine>(m, "SearchEngine")
|
||||
.def(py::init<Runtime, Ref<Mutator>>())
|
||||
.def("run", &SearchEngine::run);
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "core/runtime.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "ffi/ffi_callback.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/unary.h"
|
||||
|
@ -23,12 +24,43 @@ Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
|
|||
{64, 4, 1, 2, false}, {32, 4, 1, 2, true},
|
||||
};
|
||||
|
||||
Tensor input = g->addTensor({batch, 1, 1, 228});
|
||||
Tensor input =
|
||||
g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
|
||||
for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
|
||||
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
|
||||
int f = input->getDims()[3]; // n, h, w, f
|
||||
auto weight =
|
||||
g->addTensor({f, kernelSize, kernelSize, channel}); // f, r, s, c
|
||||
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
|
||||
DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
if (tanh) {
|
||||
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
|
||||
} else {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
|
||||
IT_ASSERT(0 <= layerId && layerId < 5);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
vector<Tensor> weights;
|
||||
vector<tuple<int, int, int, int, bool>> cs{
|
||||
// Channel, kernelSize, pad, stride, isTanh
|
||||
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
|
||||
{64, 4, 1, 2, false}, {32, 4, 1, 2, true},
|
||||
};
|
||||
|
||||
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
|
||||
for (int i = layerId; i < layerId + 1; ++i) {
|
||||
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
|
||||
int f = input->getDims()[3]; // n, h, w, f
|
||||
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
|
||||
DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
|
@ -50,6 +82,77 @@ void printGraph(Graph g) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the rule-based NMutator search on `g`, executes the baseline graph on
 * `runtime`, then rebuilds the searched graph on `runtime` and returns it.
 *
 * @param g       Baseline graph. It is topologically sorted, allocated,
 *                filled with data and executed as a side effect.
 * @param runtime Runtime used for the search, for executing the baseline and
 *                for hosting the returned graph.
 * @param tuning  When true, the optimized graph is run once in tuning mode
 *                and once normally, and its first output is compared with
 *                the baseline's first output (result printed through dbg).
 * @return The optimized graph built on `runtime`. The trailing
 *         `return nullptr` is only reached if there is no candidate graph.
 */
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);

    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    vector<Graph> bestGraphs;
    SearchEngine searchEngine(runtime, mutator);
    bestGraphs.emplace_back(searchEngine.run(g));
    g->topo_sort();
    dbg(g, bestGraphs[0], bestGraphs.size());
    g->print();

    g->dataMalloc();
    // Index the baseline inputs by Fuid so that the optimized graph's inputs
    // can later be filled with exactly the same data.
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (const auto &t : g->getInputs()) {
        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
        fuidToInputTensor[t->getFuid()] = t;
    }

    auto gen = RandomGenerator(-0.1, 0.1, 0);
    for (const auto &t : g->getInputs()) {
        t->setData(gen);
    }
    for (const auto &t : g->getOutputs()) {
        t->setData(ZeroGenerator());
    }
    runtime->run(g);
    dbg("Baseline graph");
    printGraph(g);
    dbg(runtime->getPerfTime(g, true));

    // Only the first candidate is processed: the loop body ends in a return.
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph =
            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
        bestGraph->topo_sort();

        bestGraph->dataMalloc();
        // Initialize inputs with the baseline's random data. Look the source
        // tensor up with find(): operator[] would silently insert a null
        // Tensor for an unknown Fuid and hand it to copyData().
        for (const auto &t : bestGraph->getInputs()) {
            const auto it = fuidToInputTensor.find(t->getFuid());
            IT_ASSERT(it != fuidToInputTensor.end());
            t->copyData(it->second);
        }

        // Initialize outputs with zeros
        for (const auto &t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }

        dbg(bestGraph);
        dbg(bestGraph->getOutputs());

        if (tuning) {
            runtime->run(bestGraph, true);  // Tune kernels
            runtime->run(bestGraph, false); // Execute transformed graph

            // Clone both first outputs into the CPU graph before comparing.
            auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
            auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
            // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
            dbg(go0->equalData(bgo0, 1e-3));
            dbg(runtime->getPerfTime(bestGraph, true));
        }

        dbg("Best graph");
        printGraph(bestGraph);
        return bestGraph;
    }
    return nullptr;
}
|
||||
|
||||
vector<Tensor> runInfoGAN(int nLayers) {
|
||||
Runtime cuda = make_ref<CudaRuntimeObj>();
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
|
@ -122,6 +225,7 @@ vector<Tensor> runInfoGAN(int nLayers) {
|
|||
|
||||
dbg("Best graph");
|
||||
printGraph(bestGraph);
|
||||
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
|
||||
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
|
||||
}
|
||||
return {};
|
||||
|
|
|
@ -78,16 +78,14 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
|
|||
OpVec computeOps = in_graph->getComputeOps();
|
||||
IT_ASSERT(computeOps.size() == 1);
|
||||
|
||||
/* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
|
||||
// if (infini::Graph g = transformTConv1x1(computeOps[0])) {
|
||||
// out_graphs.emplace_back(g);
|
||||
// return;
|
||||
// }
|
||||
// // Commented for debug, not implemented yet
|
||||
// // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
|
||||
// // Graph graph = new Graph(g->getOperators());
|
||||
// // out_graphs.emplace_back(graph);
|
||||
// // return;
|
||||
// // }
|
||||
if (Graph g = transformConvtransposed1x1(computeOps[0])) {
|
||||
out_graphs.emplace_back(g);
|
||||
return;
|
||||
}
|
||||
// if (infini::Graph g = transformDialtedConv(computeOps[0])) {
|
||||
// out_graphs.emplace_back(g);
|
||||
// return;
|
||||
|
@ -519,43 +517,82 @@ double NMutator::memboundTime(const Shape &dims) {
|
|||
// return nullptr;
|
||||
// }
|
||||
|
||||
// Graph NMutator::transformTConv3x3(Operator op) {
|
||||
// if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
|
||||
// dbg(tconvOp->getInputs()[1]->getDims());
|
||||
// if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
|
||||
// tconvOp->getInputs()[1]->getDims()[0] == 3 &&
|
||||
// tconvOp->getInputs()[1]->getDims()[1] == 3) {
|
||||
// auto g = new infini::Graph();
|
||||
// auto inputDims = tconvOp->getInputs(0)->getDims();
|
||||
// auto weightDims = tconvOp->getInputs(1)->getDims();
|
||||
// auto outputDims = tconvOp->getOutput()->getDims();
|
||||
// // NHWF
|
||||
// auto newA = g->tensor(
|
||||
// {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
|
||||
// // RSFC
|
||||
// auto newW = g->tensor(
|
||||
// {weightDims[0] * weightDims[1] * weightDims[3],
|
||||
// weightDims[2]});
|
||||
// auto newO =
|
||||
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
/**
 * Rewrites a ConvTransposed2dNHWC operator as Reshape -> Matmul -> Reshape.
 *
 * The rule only applies when the `h` and `w` values reported by
 * getNCHWFRS() are both 1 (presumably the input's spatial size — confirm).
 *
 * @param _op Candidate operator.
 * @return A new graph holding the rewritten computation, or nullptr when
 *         `_op` is not a ConvTransposed2dNHWCObj or h/w are not both 1.
 *
 * Group count != 1, ph != pw, and stride or dilation other than (1, 1) are
 * not supported yet and trip IT_ASSERT_TODO.
 */
Graph NMutator::transformConvtransposed1x1(Operator _op) {
    auto op = as<ConvTransposed2dNHWCObj>(_op);
    if (!op)
        return nullptr;

    // Hold the tensors by value so they do not depend on the lifetime of
    // whatever the accessor returns.
    const auto A = op->getInputs(0);
    const auto W = op->getInputs(1);
    const auto O = op->getOutput();
    const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
    const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
    const Shape inputDims = A->getDims();
    const Shape weightDims = W->getDims();
    const DataType dtype = A->getDType();

    IT_ASSERT_TODO(op->getNumGroups() == 1);
    if (h != 1 || w != 1)
        return nullptr;
    IT_ASSERT_TODO(ph == pw);
    IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
    IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));

    auto g = make_ref<GraphObj>(runtime);
    // NHWF: collapse the first three input dims into the matrix rows.
    auto newA = g->addTensor(
        {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
    // FRSC: keep dim 0 as rows, collapse the other three into the columns.
    auto newW = g->addTensor(
        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
    // Reshape the matmul result back to the original output's dims.
    g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(O), O->getDims());
    return g;
}
|
||||
|
||||
// Graph NMutator::transformConvtransposed(Operator _op) {
|
||||
// auto op = as<ConvTransposed2dNHWCObj>(_op);
|
||||
// if (!op)
|
||||
// return nullptr;
|
||||
// const auto &AT = op->getInputs()[0];
|
||||
// const auto &KT = op->getInputs()[1];
|
||||
// const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
// const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
// IT_ASSERT_TODO(op->getNumGroups() == 1);
|
||||
// if (r != 4)
|
||||
// return {};
|
||||
// IT_ASSERT_TODO(ph == pw);
|
||||
// IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
|
||||
// IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
|
||||
|
||||
// auto g = make_ref<Graph>();
|
||||
// // TODO: implement transformation rules
|
||||
// // How to efficiently write an expression...
|
||||
// auto inputDims = op->getInputs(0)->getDims();
|
||||
// auto weightDims = op->getInputs(1)->getDims();
|
||||
// auto outputDims = op->getOutput()->getDims();
|
||||
// // NHWF
|
||||
// auto newA =
|
||||
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
// inputDims[3]});
|
||||
// // RSFC
|
||||
// auto newW = g->tensor(
|
||||
// {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
|
||||
// auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
// weightDims[0] * weightDims[1] * weightDims[3]});
|
||||
// g->reshape(tconvOp->getInputs(0), newA);
|
||||
// g->reshape(tconvOp->getInputs(1), newW);
|
||||
// g->matmul(newA, newW, newO, 0, 1);
|
||||
// // g->reshape(newO, tconvOp->getOutput());
|
||||
// tconvOp->print();
|
||||
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
|
||||
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
|
||||
// g->membound(
|
||||
// {newO}, {tconvOp->getOutput()}, {}, nullptr,
|
||||
// g->reshape(op->getInputs(0), newA);
|
||||
// g->reshape(op->getInputs(1), newW);
|
||||
// g->matmul(newA, newW, newO, 0, 1);
|
||||
// // g->reshape(newO, tconvOp->getOutput());
|
||||
// tconvOp->print();
|
||||
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
|
||||
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
|
||||
// g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
|
||||
// memboundTime(newO->size() + tconvOp->getOutput()->size()),
|
||||
// "TConv3x3 reduce");
|
||||
// g->updateConnection();
|
||||
// Graph graph = new Graph(g->getOperators());
|
||||
// return graph;
|
||||
// }
|
||||
// }
|
||||
// return nullptr;
|
||||
// g->updateConnection();
|
||||
// Graph graph = new Graph(g->getOperators());
|
||||
// return graph;
|
||||
// }
|
||||
|
||||
// Graph NMutator::transformTConv1x1(Operator op) {
|
||||
|
@ -711,4 +748,13 @@ NMutator::generateUnaryExpr(const Operator &op) {
|
|||
NameNToTensorT{{"T", op->getInputs()[0]}}};
|
||||
}
|
||||
|
||||
void NMutator::memboundToJson(const Graph &g, const string path) {
|
||||
for (auto &_op : g->getOperators()) {
|
||||
if (auto op = as<MemBoundObj>(_op)) {
|
||||
op->saveAsJson(path + "/" + "membound_" +
|
||||
std::to_string(op->getGuid()) + ".json");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#include "nnet/Visitor/CheckOOBVisitor.h"
|
||||
#include "nnet/Visitor/HashVisitor.h"
|
||||
#include "nnet/Visitor/MergeMemboundMutator.h"
|
||||
#include "nnet/Visitor/Serializer.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -83,4 +84,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
|
|||
nnet::as<nnet::RangeOpNode>(expr));
|
||||
}
|
||||
|
||||
/**
 * Serializes this operator's expression to `path` with nnet::Serializer.
 *
 * @param path Destination passed to Serializer::serialize.
 *
 * Trips IT_ASSERT if the serializer reports failure.
 */
void MemBoundObj::saveAsJson(string path) const {
    nnet::Serializer serializer{};
    const bool status = serializer.serialize(expr, path);
    IT_ASSERT(status);
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
Loading…
Reference in New Issue