forked from jiuyuan/InfiniTensor
Add: enable mutator search in Python
parent d25b606e12
commit c6c445991a
@@ -19,12 +19,15 @@ class NMutator : public Mutator {
     // If in RuleBased mode, use derivationRules in derivator
     const std::vector<int> derivationRules;
    bool searchFilter = false;
+    bool enableRules = false; // Enable operator-level transformation rules

  public:
    NMutator(Mode mode = Mode::Normal,
-             Runtime runtime = NativeCpuRuntimeObj::getInstance());
+             Runtime runtime = NativeCpuRuntimeObj::getInstance(),
+             bool enableRules = false);
    NMutator(Mode mode, const std::vector<int> &derivationRules,
-             Runtime runtime = NativeCpuRuntimeObj::getInstance());
+             Runtime runtime = NativeCpuRuntimeObj::getInstance(),
+             bool enableRules = false);
    ~NMutator();

    vector<Graph> run(const Graph &in_graph) override;
@@ -16,6 +16,7 @@ void initializeGraphTensors(Graph g, double l, double r, bool useInt);
 Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG);
 Graph optimizeWithDepthConstraint(Graph g, Runtime _runtime, int maxDepth);
 Graph optimizeModel(Graph g, Runtime _runtime, string name);
+Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules);

 } // namespace infini
@@ -82,10 +82,11 @@ Tensor GraphHandlerObj::convTransposed2dNHWC(Tensor input, Tensor weight,
                                              oph, opw);
         return output;
     } else {
-        return g->addOp<ConvTransposed2dNHWCObj>(std::move(input),
-                                                 std::move(weight), output, ph,
-                                                 pw, sh, sw, dh, dw, oph, opw)
-            ->getOutput();
+        return g
+            ->addOp<ConvTransposed2dNHWCObj>(std::move(input),
+                                             std::move(weight), output, ph, pw,
+                                             sh, sw, dh, dw, oph, opw)
+            ->getOutput();
     }
 }

@@ -134,7 +134,7 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
             t->freeData();
         }

         // FIXME: ignore trnapose when necessary
         // op->getOpType() != OpType::Transpose &&
         // op->getOpType() != OpType::ReduceMean
         if (op->getOpType() != OpType::Reshape)
@@ -357,25 +357,30 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         auto mutatedGraphs = mutator->run(node.graph);
         if (mutator->hasTunedKernel)
             chooseBestMutation = false;
-        if (searchFilter == 1) {
-            std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
-                      graphTimeComparer);
-            if (mutatedGraphs.size() >= 10)
-                mutatedGraphs.resize(10);
-            mutatedGraphs = {mutatedGraphs[0]};
-        } else if (chooseBestMutation && mutatedGraphs.size() >= 2) {
-            std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
-                      graphTimeComparer);
-            if (mutatedGraphs.size() >= 10)
-                mutatedGraphs.resize(10);
-            mutatedGraphs = {mutatedGraphs[0]};
-        } else { // avoid repeated kernel genreation
-            if (mutatedGraphs.size() >= 2) // INFOGAN
-                mutatedGraphs = {mutatedGraphs[1]};
-            // if (mutatedGraphs.size() > 2) {
-            //     mutatedGraphs.resize(2);
-            // }
-        }
+        std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
+                  graphTimeComparer);
+        if (mutatedGraphs.size() >= 10)
+            mutatedGraphs.resize(10);
+        mutatedGraphs = {mutatedGraphs[0]};
+        // if (searchFilter == 1) {
+        //     std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
+        //               graphTimeComparer);
+        //     if (mutatedGraphs.size() >= 10)
+        //         mutatedGraphs.resize(10);
+        //     mutatedGraphs = {mutatedGraphs[0]};
+        // } else if (chooseBestMutation && mutatedGraphs.size() >= 2) {
+        //     std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
+        //               graphTimeComparer);
+        //     if (mutatedGraphs.size() >= 10)
+        //         mutatedGraphs.resize(10);
+        //     mutatedGraphs = {mutatedGraphs[0]};
+        // } else { // avoid repeated kernel genreation
+        //     if (mutatedGraphs.size() >= 2) // INFOGAN
+        //         mutatedGraphs = {mutatedGraphs[1]};
+        //     // if (mutatedGraphs.size() > 2) {
+        //     //     mutatedGraphs.resize(2);
+        //     // }
+        // }

         for (auto graph : graphs) {
             for (auto mutatedGraph : mutatedGraphs) {
@@ -432,7 +432,8 @@ void export_test_model(py::module &m) {
              "l"_a = -0.1, "r"_a = 0.1, "useInt"_a = false)
         .def("convertNCHWtoNHWCModel", &convertNCHWtoNHWCModel)
         .def("optimizeWithDepthConstraint", &optimizeWithDepthConstraint)
-        .def("optimizeModel", &optimizeModel);
+        .def("optimizeModel", &optimizeModel)
+        .def("optimizeModelWithRules", &optimizeModelWithRules);
 #endif
 }

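With optimizeModelWithRules exported, the rule-based mutator search becomes callable from Python, which is the point of this commit. A minimal sketch, assuming a CUDA build of pyinfinitensor and the load_onnx helper defined in the test script further down; the model file name is a placeholder, and the rule list shown is the Conv2Gemm sequence that the test script references in a comment:

    from pyinfinitensor import backend as ft

    runtime = ft.cuda_runtime()
    g = load_onnx(runtime, 'model.bs1.onnx')  # placeholder model file
    # Conv2Gemm rule sequence, as commented in the test script
    opt = ft.optimizeModelWithRules(g, runtime, [3, 2, 2, 5, 8, 8, 6, 90])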
@@ -1,7 +1,6 @@
 #include "operators/matmul.h"
 #include "core/kernel.h"
 #include "cuda/cuda_runtime.h"
-#include "nnet/dbg.h"

 namespace infini {

@@ -358,12 +358,20 @@ Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG) {
     return g;
 }

+Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules) {
+    auto runtime = as<CudaRuntimeObj>(_runtime);
+    // make_ref<NMutator>(NMutator::Mode::RuleBased, metaRules, runtime);
+    Ref<NMutator> mutator =
+        make_ref<NMutator>(NMutator::Mode::RuleBased, rules, runtime);
+    vector<Graph> bestGraphs;
+    SearchEngine searchEngine(runtime, mutator);
+    g->dataFree();
+    return searchEngine.run(g);
+}
+
 Graph optimizeModel(Graph g, Runtime _runtime, string name) {
     auto runtime = as<CudaRuntimeObj>(_runtime);
-    Runtime cpu = NativeCpuRuntimeObj::getInstance();
-    Graph gCpu = make_ref<GraphObj>(cpu);
-    Ref<NMutator> mutator =
-        make_ref<NMutator>(NMutator::Mode::RuleBased, metaRules, runtime);
+    Ref<NMutator> mutator = make_ref<NMutator>(NMutator::Mode::Normal, runtime);
     vector<Graph> bestGraphs;
     SearchEngine searchEngine(runtime, mutator);
     g->dataFree();
@@ -549,7 +549,9 @@ void Derivator::printDerivationRules() {
         if (!startGuided && ruleId != 4)
             ++cntNonGuideRules;
     }
-    printf("#Steps w/o converging derivation %d, #Steps w/ converging derivation %d\n", cntRules, cntNonGuideRules);
+    printf("#Steps w/o converging derivation %d, #Steps w/ converging "
+           "derivation %d\n",
+           cntRules, cntNonGuideRules);
     exit(0);
 }

@@ -22,15 +22,15 @@

 namespace infini {

-NMutator::NMutator(Mode mode, Runtime runtime)
-    : Mutator(10, runtime), mode{mode} {
+NMutator::NMutator(Mode mode, Runtime runtime, bool enableRules)
+    : Mutator(10, runtime), mode{mode}, enableRules{enableRules} {
     IT_ASSERT(mode != Mode::RuleBased, "Specify rules for the RuleBased mode.");
 }

 NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules,
-                   Runtime runtime)
-    : Mutator(10, runtime), mode{Mode::RuleBased}, derivationRules{
-                                                       derivationRules} {
+                   Runtime runtime, bool enableRules)
+    : Mutator(10, runtime), mode{Mode::RuleBased},
+      derivationRules{derivationRules}, enableRules{enableRules} {
     IT_ASSERT(mode == Mode::RuleBased);
 }

@@ -94,32 +94,38 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
 void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
-    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
-        out_graphs.emplace_back(g);
+    printf("Mutator states enableRules = %d, mode = %d\n", int(enableRules),
+           int(mode));
+    if (enableRules) {
+        // TODO: unify rules
+        if (Graph g = transformConvtransposed1x1(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        for (auto g : transformConv1x1(computeOps[0]))
+            out_graphs.emplace_back(g);
+        for (auto g : transformConv1xk(computeOps[0]))
+            out_graphs.emplace_back(g);
+        for (auto g : transformConv3x3ONNX(computeOps[0]))
+            out_graphs.emplace_back(g);
+        if (Graph g = transformG2bmm(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        if (Graph g = transformGbmm(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        if (infini::Graph g = transformDialtedConv(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        if (infini::Graph g = transformConvToGEMMReduce(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        if (infini::Graph g =
+                transformConvTranposeToGEMMReduce(computeOps[0])) {
+            out_graphs.emplace_back(g);
+        }
+        if (out_graphs.size() > 1)
+            return;
     }
-    for (auto g : transformConv1x1(computeOps[0]))
-        out_graphs.emplace_back(g);
-    for (auto g : transformConv1xk(computeOps[0]))
-        out_graphs.emplace_back(g);
-    for (auto g : transformConv3x3ONNX(computeOps[0]))
-        out_graphs.emplace_back(g);
-    if (Graph g = transformG2bmm(computeOps[0])) {
-        out_graphs.emplace_back(g);
-    }
-    if (Graph g = transformGbmm(computeOps[0])) {
-        out_graphs.emplace_back(g);
-    }
-    if (infini::Graph g = transformDialtedConv(computeOps[0])) {
-        out_graphs.emplace_back(g);
-    }
-    if (infini::Graph g = transformConvToGEMMReduce(computeOps[0])) {
-        out_graphs.emplace_back(g);
-    }
-    if (infini::Graph g = transformConvTranposeToGEMMReduce(computeOps[0])) {
-        out_graphs.emplace_back(g);
-    }
-    if (out_graphs.size() > 1)
-        return;

     const set<OpType> opSet{OpType::Conv, OpType::ConvTransNHWC, OpType::G2BMM,
                             OpType::GBMM};
@@ -140,7 +146,7 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     } else
         IT_TODO_HALT_MSG("Unknown NMutator search mode.");
     const auto &candidates = derivator.getCandidates();
-    // dbg(candidates.size());
+    dbg(candidates.size());
     // derivator.print();
     for (const auto &candidate : candidates) {
         // dbg(nnet::FullPrinterVisitor().print(candidate.root));
@@ -89,10 +89,10 @@ TEST(cuDNN_Conv, run) {

 TEST(cuDNN_Conv, runNHWC) {
     testConvNHWCCudnn(OneGenerator(),
                       vector<float>{12., 12., 12., 12., 18., 18., 18., 18.});
     testConvNHWCCudnn(
         IncrementalGenerator(),
         vector<float>{3350, 7562, 2306, 5546, 9480, 24546, 7185, 20793});
 }

 TEST(cuDNN_Conv, tune) {
@@ -6,6 +6,7 @@ import pandas as pd
 import pyinfinitensor as pit
 from pyinfinitensor import backend as ft
 from pyinfinitensor.onnx import OnnxStub
+from pyinfinitensor.tensorrt_backend import get_trt_time


 def to_pytorch_tensor(tensor) -> torch.Tensor:
@@ -28,9 +29,11 @@ def load_onnx(runtime, filename: str) -> ft.Graph:
 def run_and_evaluate(runtime, g):
     ft.initializeGraphTensors(g)
     runtime.run(g, True)
-    # print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}')
-    # print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 10, 10)}')
-    print(f'Cuda graph time = {runtime.timeWithCudaGraph(g, 10)}')
+    print(f'Op perf time = {runtime.getPerfTime(g, True, False, False)}')
+    print(f'Graph perf time = {runtime.timeNonCtcOperators(g, 10, 10)}')
+    t = runtime.timeWithCudaGraph(g, 100)
+    print(f'Cuda graph time = {t}')
+    return t


 def run_graph_get_output_as_torch_tensor(runtime, g):
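run_and_evaluate now returns the CUDA-graph time instead of only printing it, so callers can compare graphs; model_e2e_exp below uses this to report speedups. A minimal usage sketch, assuming this script's helpers and a CUDA device:

    runtime = ft.cuda_runtime()
    g = ft.getLongformer(runtime, 1)   # any graph constructor from this script
    t = run_and_evaluate(runtime, g)   # prints op/graph/CUDA-graph times, returns the last
    print(f'longformer.bs1: {t}')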
@@ -101,16 +104,32 @@ def construct_convTranspose2d(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
     return handler.getGraph()


-def construct_conv(runtime, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw):
+def construct_gemm(runtime, b, m, n, k, transA, transB):
+    handler = ft.GraphHandler(runtime)
+    input = handler.tensor([b, k, m] if transA else [b, m, k],
+                           tensor_type=ft.TensorType.Input)
+    w = handler.tensor([b, n, k] if transB else [b, k, n],
+                       tensor_type=ft.TensorType.Initialized)
+    handler.matmul(input, w, None, transA, transB, None, ft.Linear)
+    return handler.getGraph()
+
+
+def construct_conv(runtime, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, bias=False, relu=False):
     handler = ft.GraphHandler(runtime)
     # input = handler.tensor([1, 56, 32, 32], tensor_type=ft.TensorType.Input)
     # w = handler.tensor([12, 56, 1, 1], tensor_type=ft.TensorType.Initialized)
     # handler.conv(input, w, None, 0, 0, 1, 1, 1, 1)
     input = handler.tensor([n, c, h, w], tensor_type=ft.TensorType.Input)
     w = handler.tensor([f, c, r, s], tensor_type=ft.TensorType.Initialized)
-    handler.conv(input, w, None, ph, pw, sh, sw, dh, dw)
+    x = handler.conv(input, w, None, ph, pw, sh, sw, dh, dw)
+    if bias:
+        bias = handler.tensor([f, 1, 1], tensor_type=ft.TensorType.Initialized)
+        x = handler.add(x, bias, None)
+    if relu:
+        x = handler.relu(x, None)
     return handler.getGraph()


 def construct_conv_nhwc(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
     handler = ft.GraphHandler(runtime)
     # input = handler.tensor([1, 56, 32, 32], tensor_type=ft.TensorType.Input)
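The new construct_gemm helper builds a one-operator batched MatMul graph, laying the operands out as [b, k, m] and [b, n, k] when the corresponding transpose flag is set. A usage sketch, assuming this script's context; the shape is the first entry of the configs list in test_gemm_tf32 below:

    runtime = ft.cuda_runtime()
    # b=1, m=1024, n=196, k=85, with the weight stored transposed (transB=True)
    g = construct_gemm(runtime, 1, 1024, 196, 85, False, True)
    run_and_evaluate(runtime, g)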
@@ -118,14 +137,17 @@ def construct_conv_nhwc(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
     # handler.conv(input, w, None, 0, 0, 1, 1, 1, 1)
     input = handler.tensor([n, h, w, c], tensor_type=ft.TensorType.Input)
     w = handler.tensor([f, r, s, c], tensor_type=ft.TensorType.Initialized)
-    handler.convNHWC(input, w, None, pad, pad, stride, stride, dilation, dilation)
+    handler.convNHWC(input, w, None, pad, pad, stride,
+                     stride, dilation, dilation)
     return handler.getGraph()


 def construct_convtranposed_nhwc(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
     handler = ft.GraphHandler(runtime)
     input = handler.tensor([n, h, w, c], tensor_type=ft.TensorType.Input)
     w = handler.tensor([f, r, s, c], tensor_type=ft.TensorType.Initialized)
-    handler.convtransposed2dNHWC(input, w, None, pad, pad, stride, stride, dilation, dilation)
+    handler.convtransposed2dNHWC(
+        input, w, None, pad, pad, stride, stride, dilation, dilation)
     return handler.getGraph()

@@ -160,33 +182,112 @@ def search_depth_exp():
         # print(f'getPerfTime = {runtime.getPerfTime(g, True, True, False)}')
         # print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 10, 10)}')
         # save_onnx(g, f"opt_{name}_depth{i}.onnx")
-        print(f'{name} Depth = {i}: {runtime.getPerfTime(g, True, True, False)} ms')
+        print(
+            f'{name} Depth = {i}: {runtime.getPerfTime(g, True, True, False)} ms')


-def model_e2e_exp():
+def get_e2e_time(runtime, g, name: str):
+    if name.startswith('resnet'):
+        return get_trt_time(g)
+    else:
+        return run_and_evaluate(runtime, g)
+
+
+def model_e2e_exp(allow_tf32: bool):
     runtime = ft.cuda_runtime()
-    model_evaluation =[
-        (lambda : ft.getGANGraph(1, runtime, 5, 0), 'InfoGAN.bs1'),
-        (lambda : ft.getGANGraph(16, runtime, 5, 0), 'InfoGAN.bs16'),
-        (lambda : ft.getGANGraph(1, runtime, 5, 1), 'DCGAN.bs1'),
-        (lambda : ft.getGANGraph(16, runtime, 5, 1), 'DCGAN.bs16'),
-        (lambda : ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
-        (lambda : ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
-        (lambda : load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs1.onnx'), 'gcn.bs1'),
-        (lambda : load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs16.onnx'), 'gcn.bs16'),
-        (lambda : load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs1.onnx'), 'csrnet.bs1'),
-        (lambda : load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs16.onnx'), 'csrnet.bs16'),
-        (lambda : ft.getLongformer(runtime, 1), 'longformer.bs1'),
-        (lambda : ft.getLongformer(runtime, 16), 'longformer.bs16'),
-    ]
+    runtime.setEnableTF32(allow_tf32)
+    model_evaluation = [
+        # (lambda: construct_conv(runtime, 1, 512, 7,
+        #                         7, 512, 3, 3, 1, 1, 1, 1, 1, 1), 'ResNet-conv3x3'),
+        # (lambda: construct_conv(runtime, 1, 512, 7,
+        #                         7, 512, 3, 3, 1, 1, 1, 1, 1, 1, True, True), 'ResNet-conv3x3-BiasRelu'),
+        # (lambda: construct_conv(runtime, 1, 1, 7,
+        #                         7, 1, 3, 3, 1, 1, 1, 1, 1, 1), 'ResNet-conv3x3-c1'),
+        # (lambda: construct_conv(runtime, 1, 3, 7,
+        #                         7, 3, 3, 3, 1, 1, 1, 1, 1, 1), 'ResNet-conv3x3-c3'),
+        # (lambda: construct_conv(runtime, 1, 32, 7,
+        #                         7, 32, 3, 3, 1, 1, 1, 1, 1, 1), 'ResNet-conv3x3-c32'),
+        # (lambda: construct_conv(runtime, 1, 128, 7,
+        #                         7, 128, 3, 3, 1, 1, 1, 1, 1, 1), 'ResNet-conv3x3-c128'),
+        # (lambda: ft.getGANGraph(1, runtime, 5, 0), 'InfoGAN.bs1'),
+        # (lambda: ft.getGANGraph(16, runtime, 5, 0), 'InfoGAN.bs16'),
+        # (lambda: ft.getGANGraph(1, runtime, 5, 1), 'DCGAN.bs1'),
+        # (lambda: ft.getGANGraph(16, runtime, 5, 1), 'DCGAN.bs16'),
+        # (lambda: ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
+        # (lambda: ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
+        # (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs1.onnx'), 'gcn.bs1'),
+        # (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs16.onnx'), 'gcn.bs16'),
+        (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/resnet18.bs1.onnx'), 'resnet.bs1'),
+        # (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/resnet18.bs16.onnx'), 'resnet.bs16'),
+        # (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs1.onnx'), 'csrnet.bs1'),
+        # (lambda: load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs16.onnx'), 'csrnet.bs16'),
+        # (lambda : ft.getLongformer(runtime, 1), 'longformer.bs1'),
+        # (lambda : ft.getLongformer(runtime, 16), 'longformer.bs16'),
+        # (lambda : load_onnx(runtime, '/home/whj/workspace/InfiniTensor/cuda-build/efficientnet-b1_bs1.onnx'), 'efficientnet.b1'),
+        # (lambda : load_onnx(runtime, '/home/whj/workspace/InfiniTensor/cuda-build/mobilenet_v2_bs1.onnx'), 'mobilenet_v2.bs1'),
+    ]
     print("Figure 12")
     for graph_ctor, name in model_evaluation:
+        t_orig, t_opt = 99999999, 99999999
         print(f"=== {name}")
         original_g = graph_ctor()
+        # original_g = ft.convertNCHWtoNHWCModel(runtime, original_g)
+        # save_onnx(original_g, f"orig_{name}.onnx")
+        # print('Time:', get_e2e_time(runtime, original_g, name))
+        t_orig = run_and_evaluate(runtime, original_g)
         g = ft.optimizeModel(original_g, runtime, name)
         # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
         #                      [3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90]) # Convtranspose2gemm
-        # save_onnx(g, f"opt_{name}.onnx")
-        run_and_evaluate(runtime, g)
+        # g = ft.optimizeModelWithRules(original_g, runtime,
+        #                               [3, 2, 2, 5, 8, 8, 6, 90]) # Conv2Gemm
+        save_onnx(g, f"opt_{name}.onnx")
+        # run_and_evaluate(runtime, g)
+        # print(get_e2e_time(runtime, g, name))
+        t_opt = run_and_evaluate(runtime, g)
+        print(
+            f'=== {name} orig/opt=speedup {t_orig:.3f} {t_opt:.3f} {t_orig/t_opt:.2f}')
+        verify_graphs(runtime, original_g, g)
+
+
+def test_gemm_tf32(allow_tf32: bool):
+    configs = [
+        [1, 1024, 196, 85],
+        [1, 128, 3136, 256],
+        [1, 128, 784, 512],
+        [1, 196, 231, 1024],
+        [1, 196, 231, 21],
+        [1, 196, 425, 1024],
+        [1, 196, 896, 1024],
+        [1, 196, 896, 128],
+        [1, 2048, 49, 128],
+        [1, 21, 50176, 21],
+        [1, 231, 3136, 21],
+        [1, 231, 3136, 256],
+        [1, 256, 3136, 64],
+        [1, 425, 196, 1024],
+        [1, 425, 196, 85],
+        [1, 425, 784, 512],
+        [1, 49, 231, 2048],
+        [1, 49, 231, 21],
+        [1, 49, 896, 128],
+        [1, 512, 784, 128],
+        [1, 64, 3136, 256],
+        [1, 784, 231, 21],
+        [1, 784, 231, 512],
+        [1, 896, 196, 128],
+        [1, 896, 49, 2048],
+    ]
+    runtime = ft.cuda_runtime()
+    runtime.setEnableTF32(allow_tf32)
+    for config in configs:
+        for transA, transB in ((False, False), (False, True), (True, False), (True, True)):
+            s = 16
+            align_config = [config[0], config[1]*16, config[2], config[3]]
+            align_config = [config[0]]+[(v+s-1)//s*s for v in align_config[1:]]
+            # align_config = config
+            g = construct_gemm(runtime, *align_config, transA, transB)
+            print(
+                f"{allow_tf32} {transA} {transB} {align_config} {run_and_evaluate(runtime, g)}")


 def perf_test():
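The alignment step in test_gemm_tf32 first scales m by 16 and then rounds every dimension except the batch up to the next multiple of s = 16 via (v + s - 1) // s * s. A worked example of just the round-up, using one entry from configs:

    s = 16
    config = [1, 196, 231, 21]  # [b, m, n, k]
    aligned = [config[0]] + [(v + s - 1) // s * s for v in config[1:]]
    print(aligned)  # -> [1, 208, 240, 32]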
@@ -196,56 +297,9 @@ def perf_test():
     g = ft.getLongformer(runtime, 1)
     run_and_evaluate(runtime, g)


 if __name__ == "__main__":
     # perf_test()
-    model_e2e_exp()
-    exit()
-    runtime = ft.cuda_runtime()
-    graphs = [
-        # (construct_conv(runtime, 16, 56, 32, 32, 12, 1, 1, 0, 1, 1), 'conv1x1'), # FSRCNN Conv_2 1x1
-        # (construct_conv(runtime, 1, 12, 32, 32, 12, 3, 3, 1, 1, 1), 'conv3x3'), # FSRCNN Conv_4 3x3
-        # (construct_conv(runtime, 1, 12, 32, 32, 12, 3, 1, 1, 0, 1, 1, 1, 1), 'conv3x1'), #
-        # (construct_conv(runtime, 1, 12, 32, 32, 12, 1, 11, 0, 5, 1, 1, 1, 1), 'conv1x11'), #
-        # (construct_conv(runtime, 16, 12, 32, 32, 12, 1, 11, 0, 5, 1, 1, 1, 1), 'conv1x11_bs16'), #
-        # (construct_conv(runtime, 16,32,224,224, 1, 5, 5, 2, 2, 1, 1, 1, 1), 'conv5x5'), #
-        # (ft.getLongformer(runtime, 1), 'longformer.bs1'),
-        # (ft.getLongformer(runtime, 16), 'longformer.bs16'),
-        # construct_convTranspose2d(runtime)
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/fsrcnn.bs1.onnx'), 'fsrcnn.bs1'),
-        # (ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
-        # (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
-        # (construct_conv_nhwc(runtime, 1, 56, 32, 32, 12, 1, 1, 0, 1, 1), 'conv1x1')
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs1.onnx'), 'gcn.bs1'),
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs16.onnx'), 'gcn.bs16'),
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs1.onnx'), 'csrnet.bs1'),
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/csrnet.bs16.onnx'), 'csrnet.bs16'),
-        (ft.getLongformer(runtime, 1), 'longformer.bs1'),
-        # (ft.getLongformer(runtime, 16), 'longformer.bs16'),
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/resnet18.bs1.onnx'), 'resnet18.bs1'),
-        # (load_onnx(runtime, '/mnt/auxHome/models/einnet/resnet18.bs16.onnx'), 'resnet18.bs16'),
-        # (ft.getGANGraph(1, runtime, 5, 0), 'InfoGAN.bs1'),
-    ]
-
-    for original_g, name in graphs:
-        print(f"=== {name}")
-        # save_onnx(original_g, f"orig_{name}.onnx")
-        # original_g = ft.convertNCHWtoNHWCModel(runtime, original_g)
-        # save_onnx(dlt_g, f"dlt_{name}.onnx")
-        # exit()
-
-        # run_and_evaluate(runtime, original_g)
-        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
-        #                      [1, 7, 7, 2, 8, 6, 6]) # G2BMM/GBMM
-        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
-        #                      [3, 2, 2, 5, 8, 8, 6, 90]) # Conv2conv
-        g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.RuleBased,
-                             [3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90]) # Convtranspose2gemm
-        # g = ft.optimizeGraph(original_g, runtime, False, ft.NMutatorMode.Normal)
-        # g = ft.convertNCHWtoNHWCModel(original_g, runtime, i)
-
-        # run_and_evaluate(runtime, original_g)
-        run_and_evaluate(runtime, g)
-        save_onnx(g, f"opt_{name}.onnx")
-        # verify_graphs(runtime, original_g, g)
-        # run_and_evaluate(runtime, g)
+    for b in [False]:
+        model_e2e_exp(b)
+        # test_gemm_tf32(b)