forked from jiuyuan/InfiniTensor
Add conv2bgemm and fix mutator::runtime
parent 079985bc8c
commit 1e46750159
@@ -20,8 +20,10 @@ class NMutator : public Mutator {
    const std::vector<int> derivationRules;

  public:
    NMutator(Mode mode = Mode::Normal);
    NMutator(Mode mode, const std::vector<int> &derivationRules);
    NMutator(Mode mode = Mode::Normal,
             Runtime runtime = NativeCpuRuntimeObj::getInstance());
    NMutator(Mode mode, const std::vector<int> &derivationRules,
             Runtime runtime = NativeCpuRuntimeObj::getInstance());
    ~NMutator();

    vector<Graph> run(const Graph &in_graph) override;
@@ -102,8 +102,6 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
        double time = -1e9;
        if (ctcMap[op->getGuid()]) { // Compile-time computable operators
            time = 0;
        } else if (op->getOpType() == OpType::Reshape) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && allowEstimation) {
@@ -136,8 +134,10 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
                t->freeData();
        }

        if (op->getOpType() != OpType::Transpose &&
            op->getOpType() != OpType::ReduceMean)
        // FIXME: ignore trnapose when necessary
        // op->getOpType() != OpType::Transpose &&
        // op->getOpType() != OpType::ReduceMean
        if (op->getOpType() != OpType::Reshape)
            totalTime += time;
        if (profiling) {
            op->print();
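A rough sketch of what the accumulation rule above amounts to after this change (illustrative Python, not InfiniTensor's API): compile-time-computable operators cost zero, and Reshape is now skipped when summing the total rather than having its per-op time forced to zero earlier in the chain.

    # Hypothetical sketch only; op records and field names are made up for illustration.
    def estimate_total_time(ops):
        total = 0.0
        for op in ops:
            if op["ctc"]:              # compile-time computable
                time = 0.0
            else:
                time = op["time"]      # measured or estimated per-op cost
            if op["type"] != "Reshape":  # Reshape no longer counts toward the total
                total += time
        return total

    print(estimate_total_time([
        {"type": "Conv", "ctc": False, "time": 1.5},
        {"type": "Reshape", "ctc": False, "time": 0.2},  # excluded from the sum
    ]))  # -> 1.5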
@@ -351,6 +351,12 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
        // mutatedGraphs.resize(2);
        // if (mutatedGraphs.size() >= 2)
        //     mutatedGraphs = {mutatedGraphs[1]};
        constexpr bool chooseBestMutation = false;
        if (chooseBestMutation && mutatedGraphs.size() >= 2) {
            std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
                      graphTimeComparer);
            mutatedGraphs = {mutatedGraphs[0]};
        }
        for (auto graph : graphs) {
            for (auto mutatedGraph : mutatedGraphs) {
                std::vector<Operator> ops;
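The chooseBestMutation flag above gates an optional filter: rank the candidate graphs by their (presumably measured or estimated) time and keep only the fastest one. A hypothetical sketch of that selection, with made-up names:

    def pick_mutations(mutated_graphs, graph_time, choose_best_mutation=False):
        # When enabled and there are at least two candidates, keep only the fastest.
        if choose_best_mutation and len(mutated_graphs) >= 2:
            mutated_graphs = [min(mutated_graphs, key=graph_time)]
        return mutated_graphs

    times = {"gemm_variant": 0.8, "conv_variant": 1.2}
    print(pick_mutations(list(times), times.get, choose_best_mutation=True))  # ['gemm_variant']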
@@ -155,7 +155,6 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
    auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
    tvm_device->SetStream(tvm_device_id, getStream());
#endif

    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
@@ -174,8 +173,8 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
        [&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
            checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
        },
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
        1000);
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, rounds,
        rounds);
}

} // namespace infini
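The hard-coded 1000s are replaced by the function's rounds parameter. Assuming the timing helper's two trailing integers are warm-up and measurement repetition counts (an assumption, not confirmed by this diff), the pattern is roughly the following illustrative Python sketch, where func stands in for replaying the captured CUDA graph and sync for synchronizing its stream:

    import time

    # Hypothetical timeit-style helper: (func, sync, warmup_rounds, timing_rounds).
    def timeit(func, sync, warmup_rounds, timing_rounds):
        for _ in range(warmup_rounds):   # warm-up launches, excluded from the average
            func()
        sync()
        start = time.perf_counter()
        for _ in range(timing_rounds):
            func()                       # e.g. launch the captured CUDA graph
        sync()                           # e.g. wait on the stream before stopping the clock
        return (time.perf_counter() - start) * 1e3 / timing_rounds  # ms per round

    print(timeit(lambda: sum(range(1000)), lambda: None, 10, 100))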
@@ -16,6 +16,7 @@
#include "operators/reshape.h"
#include "operators/softmax.h"
#include "operators/transpose.h"
#include "operators/pooling.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
@@ -366,11 +367,11 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
    Ref<NMutator> mutator;
    if (mode == NMutator::Mode::Normal) {
        dbg(mode);
        mutator = make_ref<NMutator>(mode);
        mutator = make_ref<NMutator>(mode, runtime);
    } else if (mode == NMutator::Mode::RuleBased) {
        dbg(mode, rules);
        IT_ASSERT_TODO(rules.size() > 0);
        mutator = make_ref<NMutator>(mode, rules);
        mutator = make_ref<NMutator>(mode, rules, runtime);
    } else
        IT_TODO_HALT();
    vector<Graph> bestGraphs;
@@ -22,12 +22,15 @@

namespace infini {

NMutator::NMutator(Mode mode) : Mutator(10), mode{mode} {
NMutator::NMutator(Mode mode, Runtime runtime)
    : Mutator(10, runtime), mode{mode} {
    IT_ASSERT(mode != Mode::RuleBased, "Specify rules for the RuleBased mode.");
}

NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules)
    : Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {
NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules,
                   Runtime runtime)
    : Mutator(10, runtime), mode{Mode::RuleBased}, derivationRules{
                                                       derivationRules} {
    IT_ASSERT(mode == Mode::RuleBased);
}
@@ -93,37 +96,29 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
    IT_ASSERT(computeOps.size() == 1);
    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformConv1x1(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformConv1xk(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformG2bmm(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (Graph g = transformGbmm(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformDialtedConv(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    if (infini::Graph g = transformConvToGEMMReduce(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }

    if (infini::Graph g = transformConvTranposeToGEMMReduce(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    return;

    const set<OpType> opSet{OpType::Conv, OpType::ConvTransNHWC, OpType::G2BMM,
                            OpType::GBMM};
@@ -735,25 +730,45 @@ Graph NMutator::transformConv1x1(Operator _op) {
            g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, N*H*W]
        g->addOpWithOutputs<ReshapeObj>(O, g->cloneTensor(op->getOutput()),
                                        op->getOutput()->getDims());
    } else {
        auto A = g->addOp<TransposeObj>(g->cloneTensor(op->getInputs(0)),
                                        nullptr, vector{1, 0, 2, 3})
                     ->getOutput(); // [C,N,H,W]
        A = g->addOp<ReshapeObj>(A, nullptr,
                                 vector{shapeA[1], shapeA[0] * shapeA[2] *
                                            shapeA[3]}) // [C, N*H*W]
    // } else { // Tranpose + Matmul + Transpose
    //     auto A = g->addOp<TransposeObj>(g->cloneTensor(op->getInputs(0)),
    //     nullptr, vector{1, 0, 2, 3})
    //     ->getOutput(); // [C,N,H,W]
    //     A = g->addOp<ReshapeObj>(A, nullptr,
    //     vector{shapeA[1], shapeA[0] * shapeA[2]
    //     *
    //     shapeA[3]}) // [C,
    //     N*H*W]
    //     ->getOutput();
    //     auto B = g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)),
    //     nullptr,
    //     vector{shapeW[0], shapeW[1]}) //
    //     [F, C]
    //     ->getOutput();
    //     auto O =
    //     g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F,
    //     NHW]
    //     O = g->addOp<ReshapeObj>(
    //     O, nullptr, Shape{shapeO[1], shapeO[0], shapeO[2],
    //     shapeO[3]})
    //     ->getOutput(); // [F, NHW]
    //     O = g->addOpWithOutputs<TransposeObj>(
    //     O, g->cloneTensor(op->getOutput()), vector{1, 0, 2, 3})
    //     ->getOutput(); // [F, N*H*W]
    } else { // BGemm
        auto A =
            g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(0)), nullptr,
                                 vector{shapeA[0], shapeA[1],
                                        shapeA[2] * shapeA[3]}) // [N, C, H*W]
                ->getOutput();
        auto B =
            g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)), nullptr,
                                 vector{1, shapeW[0], shapeW[1]}) // [1, F, C]
                ->getOutput();
        auto B = g->addOp<ReshapeObj>(g->cloneTensor(op->getInputs(1)), nullptr,
                                      vector{shapeW[0], shapeW[1]}) // [F, C]
                     ->getOutput();
        auto O =
            g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, NHW]
        O = g->addOp<ReshapeObj>(
                O, nullptr, Shape{shapeO[1], shapeO[0], shapeO[2], shapeO[3]})
                ->getOutput(); // [F, NHW]
        O = g->addOpWithOutputs<TransposeObj>(
                O, g->cloneTensor(op->getOutput()), vector{1, 0, 2, 3})
                ->getOutput(); // [F, N*H*W]
        g->addOp<MatmulObj>(B, A, nullptr, 0, 0)->getOutput(); // [F, N*H*W]
        g->addOpWithOutputs<ReshapeObj>(O, g->cloneTensor(op->getOutput()),
                                        op->getOutput()->getDims());
    }
    return g;
}
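The new "} else { // BGemm" branch appears to be the conv2bgemm path named in the commit title: the convolution input [N, C, H, W] is reshaped to [N, C, H*W], the 1x1 weight [F, C, 1, 1] to [1, F, C], and a single batched matmul produces [N, F, H*W], which is then reshaped to the convolution's output dims. Assuming stride 1 and no padding (the case where a 1x1 convolution is pure channel mixing), a small numpy check of that equivalence, independent of InfiniTensor's tensor classes:

    import numpy as np

    # Illustrative check of the conv2bgemm idea: a 1x1 conv (stride 1, no padding)
    # equals one batched matmul over reshaped input and weight tensors.
    N, C, F, H, W = 2, 3, 5, 4, 4
    x = np.random.rand(N, C, H, W).astype(np.float32)
    w = np.random.rand(F, C, 1, 1).astype(np.float32)

    # Reference 1x1 convolution: out[n, f, h, w] = sum_c w[f, c] * x[n, c, h, w]
    ref = np.einsum("fc,nchw->nfhw", w[:, :, 0, 0], x)

    # BGemm formulation used by the new branch: [1, F, C] x [N, C, H*W] -> [N, F, H*W]
    a = x.reshape(N, C, H * W)
    b = w.reshape(1, F, C)
    out = np.matmul(b, a).reshape(N, F, H, W)

    print(np.allclose(ref, out))  # True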
@@ -167,14 +167,14 @@ if __name__ == "__main__":

    for original_g, name in graphs:
        print(f"=== {name}")
        # save_onnx(original_g, f"hkz_orig_{name}.onnx")
        # save_onnx(original_g, f"orig_{name}.onnx")
        # original_g = ft.convertNCHWtoNHWCModel(runtime, original_g)
        # save_onnx(original_g, f"hkz_dlt_{name}.onnx")
        # save_onnx(dlt_g, f"dlt_{name}.onnx")
        # exit()

        # run_and_evaluate(runtime, original_g)
        g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
                             [1, 7, 7, 2, 8, 6, 6]) # G2BMM/GBMM

        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
        #     [3, 2, 2, 5, 8, 8, 6, 90]) # Conv2conv
        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.Normal)