Merge branch 'NNET_e2e' into NNET_gcn

This commit is contained in:
Liyan Zheng 2023-04-23 23:07:42 +08:00
commit 18d6ba4022
27 changed files with 726 additions and 43 deletions

@ -1 +1 @@
Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294 Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283

@ -1 +1 @@
Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1 Subproject commit e2239ee6043f73722e7aa812a459f54a28552929

@ -1 +1 @@
Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9 Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756

2
3rd-party/pybind11 vendored

@ -1 +1 @@
Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406 Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae

View File

@ -57,6 +57,11 @@ class GraphHandlerObj {
Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph, Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
int pw, int sh, int sw, int dh, int dw, int oph, int pw, int sh, int sw, int dh, int dw, int oph,
int opw); int opw);
Tensor convNHWC(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw);
Tensor convTransposed2dNHWC(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw,
int oph, int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB, Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act); Tensor bias, ActType act);
Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var, Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,

View File

@ -11,6 +11,7 @@ enum class OpType {
Matmul, Matmul,
ConvTrans, ConvTrans,
ConvTransNHWC, ConvTransNHWC,
ConvNHWC,
G2BMM, G2BMM,
GBMM, GBMM,
Pad, Pad,
@ -102,7 +103,10 @@ enum class OpType {
Dropout, Dropout,
// //
MemBound = 300, MemBound = 300,
Any, //
Conv2dReduce = 400,
Conv2dReduceTranspose,
Any
}; };
using KernelAttrs = std::tuple<Device, OpType, DataType>; using KernelAttrs = std::tuple<Device, OpType, DataType>;
@ -123,6 +127,7 @@ class OpRegistry {
FOP(Matmul); FOP(Matmul);
FOP(ConvTrans); FOP(ConvTrans);
FOP(ConvTransNHWC); FOP(ConvTransNHWC);
FOP(ConvNHWC);
FOP(G2BMM); FOP(G2BMM);
FOP(GBMM); FOP(GBMM);
FOP(Pad); FOP(Pad);
@ -210,6 +215,9 @@ class OpRegistry {
FOP(BitRightShift); FOP(BitRightShift);
// //
FOP(MemBound); FOP(MemBound);
//
FOP(Conv2dReduce);
FOP(Conv2dReduceTranspose);
FOP(Any); FOP(Any);
default: default:
IT_ASSERT(false, "Unknown OpType " + IT_ASSERT(false, "Unknown OpType " +

View File

@ -1,7 +1,6 @@
#pragma once #pragma once
#include "core/runtime.h" #include "core/runtime.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "nnet/dbg.h"
namespace infini { namespace infini {

View File

@ -66,6 +66,9 @@ class NMutator : public Mutator {
Graph transformGbmm(Operator op); Graph transformGbmm(Operator op);
Graph transformDialtedConv(Operator _op); Graph transformDialtedConv(Operator _op);
Graph transformConv1xk(Operator op); Graph transformConv1xk(Operator op);
// Graph transformConv1xk(Operator op);
Graph transformConvToGEMMReduce(Operator _op);
Graph transformConvTranposeToGEMMReduce(Operator _op);
Tensor splitTransposeMerge(Graph g, Tensor A, int dim, int chunkSize, Tensor splitTransposeMerge(Graph g, Tensor A, int dim, int chunkSize,
Tensor output = nullptr); Tensor output = nullptr);

View File

@ -6,6 +6,7 @@
namespace infini { namespace infini {
Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId); Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId);
Graph getFSRCNNGraph(int batch, Runtime runtime);
Graph getLongformer(Runtime runtime, int bs); Graph getLongformer(Runtime runtime, int bs);
vector<Tensor> runInfoGAN(int nLayers); vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId); Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);

View File

@ -111,7 +111,7 @@ class ConvBaseObj : public OperatorObj {
auto getNCHWFRS() const { return tuple(n, c, h, w, f, r, s); } auto getNCHWFRS() const { return tuple(n, c, h, w, f, r, s); }
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); } auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
int getChannelPerGroup() const { int getChannelPerGroup() const {
if (type == OpType::ConvTransNHWC) { if (type == OpType::ConvTransNHWC || type == OpType::ConvNHWC) {
return inputs[1]->getDims()[3]; return inputs[1]->getDims()[3];
} else { } else {
return inputs[1]->getDims()[1]; return inputs[1]->getDims()[1];
@ -149,6 +149,25 @@ class ConvObj : public ConvBaseObj {
void setAuxilaryAttributes(PaddingMode mode) override; void setAuxilaryAttributes(PaddingMode mode) override;
}; };
class ConvNHWCObj : public ConvBaseObj {
public:
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph,
int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr, ActType act = ActType::None);
// Constructors for setting padding mode
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode = PaddingMode::Same, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, Tensor bias = nullptr,
ActType act = ActType::None);
OP_CLONE(ConvNHWCObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return c / getChannelPerGroup(); }
private:
void setAuxilaryAttributes(PaddingMode mode) override;
};
class ConvBackwardFilterObj : public ConvBaseObj { class ConvBackwardFilterObj : public ConvBaseObj {
private: private:
ActType act; ActType act;
@ -220,6 +239,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; } int getNumGroups() const override { return group; }
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
private: private:
void setAuxilaryAttributes(PaddingMode mode) override; void setAuxilaryAttributes(PaddingMode mode) override;

View File

@ -0,0 +1,62 @@
#pragma once
#include "core/operator.h"
namespace infini {
class Conv2dReduceBase : public OperatorObj {
protected:
Tensor bias;
int ph, pw;
int sh, sw;
int dh, dw;
int n, h, w, f, r, s; // c has been reduced
bool PReLU;
float paramReLU;
public:
Conv2dReduceBase(OpType opType, Tensor input, Tensor bias, Tensor output,
bool PReLU_, float paramReLU_, int ph_, int pw_,
int sh_ = 1, int sw_ = 1, int dh_ = 1, int dw_ = 1);
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
int getDh() const { return dh; }
int getDw() const { return dw; }
int getPh() const { return ph; }
int getPw() const { return pw; }
int getSh() const { return sh; }
int getSw() const { return sw; }
bool getPReLU() const { return PReLU; }
float getParamReLU() const { return paramReLU; }
Tensor getBias() const { return bias; }
// optional<vector<Shape>> inferShape(const TensorVec &inputs) const
// override;
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
class Conv2dReduce : public Conv2dReduceBase {
public:
Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias, Tensor output,
bool PReLU_, float paramReLU_, int ph_, int pw_, int sh_ = 1,
int sw_ = 1, int dh_ = 1, int dw_ = 1);
OP_CLONE(Conv2dReduce);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
};
class Conv2dReduceTranspose : public Conv2dReduceBase {
public:
Conv2dReduceTranspose(GraphObj *graph, Tensor input, Tensor bias,
Tensor output, bool PReLU_, float paramReLU_, int ph_,
int pw_, int sh_ = 1, int sw_ = 1, int dh_ = 1,
int dw_ = 1);
OP_CLONE(Conv2dReduceTranspose);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
};
} // namespace infini

View File

@ -709,7 +709,7 @@ class OnnxStub:
ctx.push_output(f"{name}_{i}_{it.guid()}", it) ctx.push_output(f"{name}_{i}_{it.guid()}", it)
for (i, it) in enumerate(op.outputs()) for (i, it) in enumerate(op.outputs())
] ]
if ty == backend.OpType.Conv: if ty == backend.OpType.Conv or ty == backend.OpType.ConvNHWC:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op) ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
ctx.push_node( ctx.push_node(
make_node( make_node(
@ -723,7 +723,7 @@ class OnnxStub:
group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1], group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1],
) )
) )
elif ty == backend.OpType.ConvTrans: elif ty == backend.OpType.ConvTrans or ty == backend.OpType.ConvTransNHWC:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op) ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node( ctx.push_node(
make_node( make_node(
@ -895,6 +895,26 @@ class OnnxStub:
domain="nnet", domain="nnet",
) )
) )
elif ty == backend.OpType.Conv2dReduce:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
)
)
elif ty == backend.OpType.Conv2dReduceTranspose:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
)
)
elif ty == backend.OpType.MemBound: elif ty == backend.OpType.MemBound:
ctx.push_node( ctx.push_node(
make_node( make_node(

View File

@ -57,6 +57,38 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
} }
} }
Tensor GraphHandlerObj::convNHWC(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh,
int dw) {
if (output) {
g->addOpWithOutputs<ConvNHWCObj>(std::move(input), std::move(weight),
output, ph, pw, sh, sw, dh, dw);
return output;
} else {
return g
->addOp<ConvNHWCObj>(std::move(input), std::move(weight), output,
ph, pw, sh, sw, dh, dw)
->getOutput();
}
}
Tensor GraphHandlerObj::convTransposed2dNHWC(Tensor input, Tensor weight,
Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw,
int oph, int opw) {
if (output) {
g->addOpWithOutputs<ConvTransposed2dNHWCObj>(
std::move(input), std::move(weight), output, ph, pw, sh, sw, dh, dw,
oph, opw);
return output;
} else {
return g->addOp<ConvTransposed2dNHWCObj>(std::move(input),
std::move(weight), output, ph,
pw, sh, sw, dh, dw, oph, opw)
->getOutput();
}
}
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA, Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act) { bool transB, Tensor bias, ActType act) {
if (y) { if (y) {

View File

@ -26,7 +26,8 @@ bool OperatorObj::isConcatOp() const { return type == OpType::Concat; }
bool OperatorObj::isComputeOp() const { bool OperatorObj::isComputeOp() const {
return type == OpType::Conv || type == OpType::Matmul || return type == OpType::Conv || type == OpType::Matmul ||
type == OpType::ConvTrans || type == OpType::ConvTransNHWC || type == OpType::ConvTrans || type == OpType::ConvTransNHWC ||
type == OpType::G2BMM || type == OpType::GBMM; type == OpType::G2BMM || type == OpType::GBMM ||
type == OpType::ConvNHWC;
} }
bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; } bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }

View File

@ -1,6 +1,7 @@
#include "core/search_engine.h" #include "core/search_engine.h"
#include "core/hash.h" #include "core/hash.h"
#include "core/runtime.h" #include "core/runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/dbg.h" #include "nnet/dbg.h"
#include <algorithm> #include <algorithm>
@ -348,8 +349,8 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
// // HACK: only try the first one for debug // // HACK: only try the first one for debug
// if (mutatedGraphs.size() > 2) // if (mutatedGraphs.size() > 2)
// mutatedGraphs.resize(2); // mutatedGraphs.resize(2);
if (mutatedGraphs.size() >= 2) // if (mutatedGraphs.size() >= 2)
mutatedGraphs = {mutatedGraphs[1]}; // mutatedGraphs = {mutatedGraphs[1]};
for (auto graph : graphs) { for (auto graph : graphs) {
for (auto mutatedGraph : mutatedGraphs) { for (auto mutatedGraph : mutatedGraphs) {
std::vector<Operator> ops; std::vector<Operator> ops;
@ -455,6 +456,9 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
} }
double SearchEngine::getEstimatedGraphPerf(Graph graph) { double SearchEngine::getEstimatedGraphPerf(Graph graph) {
// dbg(graph);
// // hkz
// callback::exportONNX(graph, "a.onnx");
return runtimeExec->getPerfTime(graph, false, true, true); return runtimeExec->getPerfTime(graph, false, true, true);
} }
@ -502,6 +506,7 @@ Graph SearchEngine::fuseVertically(const Graph &graph) {
auto bestGraph = make_ref<GraphObj>(runtimeExec, chainOps); auto bestGraph = make_ref<GraphObj>(runtimeExec, chainOps);
// Eliminate transpose and reshape operators // Eliminate transpose and reshape operators
// FIXME: current Relu only support 3D and 4D tensors
if (auto eliminatedGraph = mutator->eliminateVertically( if (auto eliminatedGraph = mutator->eliminateVertically(
make_ref<GraphObj>(runtimeExec, chainOps))) make_ref<GraphObj>(runtimeExec, chainOps)))
bestGraph = eliminatedGraph; bestGraph = eliminatedGraph;

View File

@ -3,6 +3,7 @@
#include "core/perf_engine.h" #include "core/perf_engine.h"
#include "core/runtime.h" #include "core/runtime.h"
#include "cuda_profiler_api.h" #include "cuda_profiler_api.h"
#include "nnet/dbg.h"
#include "operators/conv.h" #include "operators/conv.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#ifdef INFINI_USE_TVM #ifdef INFINI_USE_TVM

View File

@ -69,6 +69,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Matmul) .VALUE(OpType, Matmul)
.VALUE(OpType, ConvTrans) .VALUE(OpType, ConvTrans)
.VALUE(OpType, ConvTransNHWC) .VALUE(OpType, ConvTransNHWC)
.VALUE(OpType, ConvNHWC)
.VALUE(OpType, G2BMM) .VALUE(OpType, G2BMM)
.VALUE(OpType, GBMM) .VALUE(OpType, GBMM)
.VALUE(OpType, Pad) .VALUE(OpType, Pad)
@ -100,6 +101,8 @@ void export_values(py::module &m) {
.VALUE(OpType, Abs) .VALUE(OpType, Abs)
.VALUE(OpType, Resize) .VALUE(OpType, Resize)
.VALUE(OpType, Dropout) .VALUE(OpType, Dropout)
.VALUE(OpType, Conv2dReduce)
.VALUE(OpType, Conv2dReduceTranspose)
.VALUE(OpType, MemBound) .VALUE(OpType, MemBound)
.VALUE(OpType, Any) .VALUE(OpType, Any)
.export_values(); .export_values();
@ -144,17 +147,32 @@ static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
#endif #endif
static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) { static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Conv); IT_ASSERT(op->getOpType() == OpType::Conv ||
auto conv = dynamic_cast<const ConvObj *>(op.get()); op->getOpType() == OpType::ConvNHWC);
auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(), return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
conv->getDw(), conv->getSh(), conv->getSw()); conv->getDw(), conv->getSh(), conv->getSw());
} }
static std::tuple<int, int, int, int, int, int, int, int> static std::tuple<int, int, int, int, int, int, int, int>
conv_trans_attrs_of(Operator op) { conv_trans_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::ConvTrans); IT_ASSERT(op->getOpType() == OpType::ConvTrans ||
auto conv = dynamic_cast<const ConvTransposed2dObj *>(op.get()); op->getOpType() == OpType::ConvTransNHWC);
auto [oph, opw] = conv->getOutputPadding(); auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
int oph, opw;
if (op->getOpType() == OpType::ConvTrans) {
auto _conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
auto output_pad = _conv->getOutputPadding();
oph = output_pad.first;
opw = output_pad.second;
} else {
auto _conv = dynamic_cast<const ConvTransposed2dNHWCObj *>(op.get());
auto output_pad = _conv->getOutputPadding();
oph = output_pad.first;
opw = output_pad.second;
}
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(), return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
conv->getDw(), conv->getSh(), conv->getSw(), oph, conv->getDw(), conv->getSh(), conv->getSw(), oph,
opw); opw);
@ -328,6 +346,9 @@ void init_graph_builder(py::module &m) {
"tensor_type"_a = TensorType::Other) "tensor_type"_a = TensorType::Other)
.def("conv", &Handler::conv, policy::move) .def("conv", &Handler::conv, policy::move)
.def("convTransposed2d", &Handler::convTransposed2d, policy::move) .def("convTransposed2d", &Handler::convTransposed2d, policy::move)
.def("convNHWC", &Handler::convNHWC, policy::move)
.def("convtransposed2dNHWC", &Handler::convTransposed2dNHWC,
policy::move)
.def("matmul", &Handler::matmul, policy::move) .def("matmul", &Handler::matmul, policy::move)
.def("batchNorm", &Handler::batchNorm, policy::move) .def("batchNorm", &Handler::batchNorm, policy::move)
.def("maxPool", &Handler::maxPool, policy::move) .def("maxPool", &Handler::maxPool, policy::move)
@ -386,6 +407,7 @@ void export_test_model(py::module &m) {
#ifdef USE_CUDA #ifdef USE_CUDA
m.def("runInfoGAN", &runInfoGAN) m.def("runInfoGAN", &runInfoGAN)
.def("getGANGraph", &getGANGraph) .def("getGANGraph", &getGANGraph)
.def("getFSRCNNGraph", &getFSRCNNGraph)
.def("getLongformer", &getLongformer) .def("getLongformer", &getLongformer)
.def("getConvtransposedNHWC", &getConvtransposedNHWC) .def("getConvtransposedNHWC", &getConvtransposedNHWC)
.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a, .def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,

View File

@ -52,7 +52,7 @@ class convCudnn : public Kernel {
cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t, cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
cudnnTensorDescriptor_t> cudnnTensorDescriptor_t>
createCuDNNDescriptor(const Ref<ConvObj> &op, createCuDNNDescriptor(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record) const { const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>()); void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>()); void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -68,15 +68,23 @@ class convCudnn : public Kernel {
int channelsPerGrp = cpg, channels = c; int channelsPerGrp = cpg, channels = c;
// set input format
cudnnTensorFormat_t tensorFormat = (op->getOpType() == OpType::ConvNHWC)
? CUDNN_TENSOR_NHWC
: CUDNN_TENSOR_NCHW;
// get inputs // get inputs
cudnnTensorDescriptor_t inDesc; cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor( checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w)); inDesc, tensorFormat, CUDNN_DATA_FLOAT, n, channels, h, w));
// get kernels // get kernels
cudnnFilterDescriptor_t knDesc; cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc)); checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
// FIXME: filter data layout is not changed with input data layout
// since FCRS shows better performance for NHWC inputs in some cases.
// This should be tunable.
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT, checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, f, CUDNN_TENSOR_NCHW, f,
channelsPerGrp, r, s)); channelsPerGrp, r, s));
@ -84,7 +92,7 @@ class convCudnn : public Kernel {
cudnnTensorDescriptor_t biasDesc; cudnnTensorDescriptor_t biasDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
checkCudnnError(cudnnSetTensor4dDescriptor( checkCudnnError(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1)); biasDesc, tensorFormat, CUDNN_DATA_FLOAT, 1, f, 1, 1));
// get convlution descriptor // get convlution descriptor
cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionDescriptor_t convDesc;
@ -125,18 +133,25 @@ class convCudnn : public Kernel {
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw)); convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc; cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, checkCudnnError(cudnnSetTensor4dDescriptor(
CUDNN_DATA_FLOAT, outn, outc, outDesc, tensorFormat, CUDNN_DATA_FLOAT, outn, outc, outh, outw));
outh, outw));
if (op->getOpType() == OpType::ConvNHWC) {
IT_ASSERT((vector{outn, outh, outw, outc}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
} else {
IT_ASSERT((vector{outn, outc, outh, outw}) == IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(), op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape"); "cuDNN output shape mismatches with OP output shape");
}
return tuple(inData, knData, outData, inDesc, knDesc, biasDesc, return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc); convDesc, actDesc, outDesc);
} }
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record, bool cuDNNUnfused(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const { const CudaRuntimeObj *context) const {
cudnnStatus_t stat; cudnnStatus_t stat;
@ -220,11 +235,12 @@ class convCudnn : public Kernel {
ConvCuDnnPerfRecordObj ret; ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max(); ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context); auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op); auto op = as<ConvBaseObj>(_op);
int try_algo = op->getOpType() == OpType::ConvNHWC ? 2 : N_ALGO;
// Both modes have the same performance. Only run cross-correlation. // Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) { for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution // Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) { for (int algo = 0; algo < try_algo; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>(); auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef; auto &record = *recordRef;
record.mode = mode; record.mode = mode;
@ -283,7 +299,7 @@ class convCudnn : public Kernel {
void compute(const Operator &_op, const PerfRecord &_record, void compute(const Operator &_op, const PerfRecord &_record,
const RuntimeObj *_context) const override { const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op); auto op = as<ConvBaseObj>(_op);
auto record = as<ConvCuDnnPerfRecordObj>(_record); auto record = as<ConvCuDnnPerfRecordObj>(_record);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context); auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
bool success = cuDNNUnfused(op, record, context); bool success = cuDNNUnfused(op, record, context);
@ -294,5 +310,8 @@ class convCudnn : public Kernel {
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn, REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
"Conv_cuDNN_CUDA_Float32"); "Conv_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::ConvNHWC, DataType::Float32, convCudnn,
"ConvNHWC_cuDNN_CUDA_Float32");
REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json); REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json);
} // namespace infini } // namespace infini

View File

@ -0,0 +1,44 @@
#include "operators/conv2dreduce.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class Conv2dReduceCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op, const RuntimeObj *_context) const {
auto op = as<Conv2dReduceBase>(_op);
float *const input = (op->getInputs(0)->getRawDataPtr<float *>());
float *const bias =
op->getBias() ? (op->getBias()->getRawDataPtr<float *>()) : nullptr;
float *const output = (op->getOutput()->getRawDataPtr<float *>());
auto dim = op->getInputs(0)->getDims();
int n = dim[0], h = dim[1], w = dim[2], f = dim[3], r = dim[4],
s = dim[5];
int dh = op->getDh(), dw = op->getDw();
int sh = op->getSh(), sw = op->getSw();
int ph = op->getPh(), pw = op->getPw();
auto odim = op->getOutput()->getDims();
int oh = odim[1], ow = odim[2];
bool PReLU = op->getPReLU();
// float paramReLU = op->getParamReLU();
auto opType = op->getOpType();
if (opType == OpType::Conv2dReduce) {
conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
oh, ow, ph, pw, sh, sw, dh, dw);
} else {
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduce, DataType::Float32,
Conv2dReduceCuda, "Conv2dReduce_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduceTranspose, DataType::Float32,
Conv2dReduceCuda, "Conv2dReduceTranspose_CUDA_Float32");
} // namespace infini

View File

@ -1,4 +1,5 @@
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "nnet/dbg.h"
using dtype = float; using dtype = float;
@ -40,18 +41,71 @@ __global__ void conv2dreduce_kernel_(float *__restrict__ input,
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm; output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
} }
} }
__global__ void convTranspose2dreduce_kernel2_(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw) {
int warp_id = (blockDim.x / 32) * blockIdx.x + threadIdx.x / 32;
int lane = threadIdx.x % 32;
int nid = warp_id / (f * oh * ow);
int fid = (warp_id - nid * (f * oh * ow)) / (oh * ow);
int hid = (warp_id - nid * (f * oh * ow) - fid * (oh * ow)) / ow;
int wid = warp_id % ow;
if (hid >= oh || wid >= ow || nid > n || fid > f)
return;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = h * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
// view as conv, the true ph and pw
int tph = r - ph - 1, tpw = s - pw - 1;
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
float imm = 0.0;
int ihst = hid - tph;
int iwst = wid - tpw;
for (int idx = lane; idx < r * s; idx += 32) {
int ri = idx / s;
int si = idx % s;
int ihid = ihst + r - ri - 1;
int iwid = iwst + s - si - 1;
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
(ihid % sh == 0) && (iwid % sw == 0)) {
imm += *(nfinput + (ihid / sh) * hchunk + (iwid / sw) * wchunk +
ri * s + si);
}
}
for (int k = 16; k > 0; k >>= 1) {
imm += __shfl_down_sync(0xffffffff, imm, k); // sum
}
if (lane == 0) {
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
__global__ void convTranspose2dreduce_kernel_( __global__ void convTranspose2dreduce_kernel_(
float *__restrict__ input, float *__restrict__ bias, float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f, float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r, const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw, const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw) { const int sh, const int sw, const int block_x_num, const int block_y_num) {
// assert dh = dw = 1 // assert dh = dw = 1
int nid = blockIdx.x, fid = blockIdx.y; int nid = blockIdx.x / block_x_num, fid = blockIdx.y / block_y_num;
int hid = threadIdx.x, wid = threadIdx.y; int hid = (blockIdx.x % block_x_num) * blockDim.x + threadIdx.x,
wid = (blockIdx.y % block_y_num) * blockDim.y + threadIdx.y;
if (hid >= oh || wid >= ow)
return;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk, const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = n * hchunk; nchunck = h * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck; float *nfinput = input + nid * nchunck + fid * fchunck;
// view as conv, the true ph and pw // view as conv, the true ph and pw
int tph = r - ph - 1, tpw = s - pw - 1; int tph = r - ph - 1, tpw = s - pw - 1;
@ -162,8 +216,22 @@ void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh, reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
ow, h, w); ow, h, w);
} else { } else {
puts("why use this conv2dreduce"); // puts("why use this conv2dreduce");
convTranspose2dreduce_kernel_<<<grid, block, 0>>>( // block.x = 32;
// block.y = 32;
// int block_x_num = (oh + block.x - 1) / block.x;
// int block_y_num = (ow + block.y - 1) / block.y;
// grid.x = n * (block_x_num);
// grid.y = f * (block_y_num);
// convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
// input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
// dh, dw, sh, sw, block_x_num, block_y_num);
block.x = 128;
block.y = 1;
grid.x = (n * f * ow * oh + block.x / 32 - 1) / (block.x / 32);
grid.y = 1;
convTranspose2dreduce_kernel2_<<<grid, block, 0>>>(
input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw, input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
dh, dw, sh, sw); dh, dw, sh, sw);
} }

View File

@ -31,8 +31,9 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
n = dim[0], c = dim[1], h = dim[2], w = dim[3]; n = dim[0], c = dim[1], h = dim[2], w = dim[3];
} else if (dim.size() == 3) { } else if (dim.size() == 3) {
n = 1, c = dim[0], h = dim[1], w = dim[2]; n = 1, c = dim[0], h = dim[1], w = dim[2];
} else } else {
IT_TODO_HALT(); IT_TODO_HALT();
}
// get inputs // get inputs
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));

View File

@ -86,6 +86,60 @@ Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId) {
return g; return g;
} }
// NHWC
Graph getFSRCNNGraph(int batch, Runtime runtime) {
// n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU
const DetailedConfigs fsrcnn_config = {
{batch, 1, 32, 32, 56, 5, 5, 1, 2, 1, true},
{batch, 56, 32, 32, 12, 1, 1, 1, 0, 1, true},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, true},
{batch, 12, 32, 32, 56, 1, 1, 1, 0, 1, true},
{batch, 56, 32, 32, 1, 9, 9, 1, 3, 4, false} // ConvTransNHWC
// n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU
};
Graph g = make_ref<GraphObj>(runtime);
Tensor input;
{
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[0];
input = g->addTensor({batch, h, w, c}, DataType::Float32,
TensorType::Input);
}
for (int i = 0; i < (int)fsrcnn_config.size() - 1; ++i) {
// auto [channel, kernelSize, pad, stride, tanh] = configs[i];
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[i];
IT_ASSERT(input->getDims()[3] == c);
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvNHWCObj>(input, weight, nullptr, pad, pad, stride,
stride, 1, 1)
->getOutput();
if (has_pReLU) {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
// last operator is a ConvTransNHWC
{
auto &[n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[fsrcnn_config.size() - 1];
IT_ASSERT(input->getDims()[3] == f);
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
}
return g;
}
Graph getLongformer(Runtime runtime, int bs) { Graph getLongformer(Runtime runtime, int bs) {
const int seqlen = 10000, w = 1000, featlen = 512, heads = 8, d = 4; const int seqlen = 10000, w = 1000, featlen = 512, heads = 8, d = 4;
const int hidden = featlen, hiddenPerHead = hidden / heads; const int hidden = featlen, hiddenPerHead = hidden / heads;
@ -165,8 +219,8 @@ Graph getLongformer(Runtime runtime, int bs) {
g->addOpWithOutputs<ReshapeObj>(q0, q1); g->addOpWithOutputs<ReshapeObj>(q0, q1);
g->addOpWithOutputs<ReshapeObj>(k0, k1); g->addOpWithOutputs<ReshapeObj>(k0, k1);
g->addOpWithOutputs<ReshapeObj>(v0, v1); g->addOpWithOutputs<ReshapeObj>(v0, v1);
// For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, // For example, when perm=(1, 0, 2), given an input tensor of shape (1,
// 3), the output shape will be (2, 1, 3). // 2, 3), the output shape will be (2, 1, 3).
g->addOpWithOutputs<TransposeObj>(q1, q2, vector{0, 2, 1, 3}); g->addOpWithOutputs<TransposeObj>(q1, q2, vector{0, 2, 1, 3});
g->addOpWithOutputs<TransposeObj>(k1, k2, vector{0, 2, 1, 3}); g->addOpWithOutputs<TransposeObj>(k1, k2, vector{0, 2, 1, 3});
g->addOpWithOutputs<TransposeObj>(v1, v2, vector{0, 2, 1, 3}); g->addOpWithOutputs<TransposeObj>(v1, v2, vector{0, 2, 1, 3});

View File

@ -11,6 +11,7 @@
#include "operators/GBMM.h" #include "operators/GBMM.h"
#include "operators/any.h" #include "operators/any.h"
#include "operators/conv.h" #include "operators/conv.h"
#include "operators/conv2dreduce.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#include "operators/membound.h" #include "operators/membound.h"
#include "operators/reduce_mean.h" #include "operators/reduce_mean.h"
@ -98,6 +99,10 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
out_graphs.emplace_back(g); out_graphs.emplace_back(g);
return; return;
} }
if (infini::Graph g = transformConv1xk(computeOps[0])) {
out_graphs.emplace_back(g);
return;
}
if (Graph g = transformG2bmm(computeOps[0])) { if (Graph g = transformG2bmm(computeOps[0])) {
out_graphs.emplace_back(g); out_graphs.emplace_back(g);
return; return;
@ -110,7 +115,12 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
out_graphs.emplace_back(g); out_graphs.emplace_back(g);
return; return;
} }
if (infini::Graph g = transformConv1xk(computeOps[0])) { if (infini::Graph g = transformConvToGEMMReduce(computeOps[0])) {
out_graphs.emplace_back(g);
return;
}
if (infini::Graph g = transformConvTranposeToGEMMReduce(computeOps[0])) {
out_graphs.emplace_back(g); out_graphs.emplace_back(g);
return; return;
} }
@ -522,7 +532,9 @@ Graph NMutator::transformConvtransposed1x1(Operator _op) {
if (h != 1 || w != 1) if (h != 1 || w != 1)
return {}; return {};
IT_ASSERT_TODO(ph == pw); IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1)); if (tie(sh, sw) != tuple(1, 1)) {
return nullptr;
}
IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1)); IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
auto g = make_ref<GraphObj>(runtime); auto g = make_ref<GraphObj>(runtime);
// NHWF // NHWF
@ -543,6 +555,80 @@ Graph NMutator::transformConvtransposed1x1(Operator _op) {
return g; return g;
} }
Graph NMutator::transformConvToGEMMReduce(Operator _op) {
auto op = as<ConvNHWCObj>(_op);
if (!op)
return nullptr;
const auto &A = op->getInputs()[0];
const auto &W = op->getInputs()[1];
const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const Shape inputDims = op->getInputs(0)->getDims();
const Shape weightDims = op->getInputs(1)->getDims();
const Shape outputDims = op->getOutput()->getDims();
IT_ASSERT(weightDims[0] == f);
IT_ASSERT(weightDims[1] == r);
IT_ASSERT(weightDims[2] == s);
IT_ASSERT(weightDims[3] == c);
IT_ASSERT(inputDims[0] == n);
IT_ASSERT(inputDims[1] == h);
IT_ASSERT(inputDims[2] == w);
IT_ASSERT(inputDims[3] == c);
const DataType dtype = A->getDType();
auto g = make_ref<GraphObj>(runtime);
auto newA = g->addTensor(
{inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
// // If use Matmul with transpose 0,0
// auto newW = g->addTensor(
// {weightDims[3], weightDims[0] * weightDims[1] * weightDims[2]}, dtype);
// If use Matmul with transpose 0, 1
auto newW = g->addTensor(
{weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
dtype);
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
auto new1 = g->addTensor({n, h, w, f, r, s}, dtype);
g->addOpWithOutputs<ReshapeObj>(newO, new1, new1->getDims());
g->addOpWithOutputs<Conv2dReduce>(
new1, nullptr, g->cloneTensor(op->getOutput()), false, 0.f, ph, pw);
return g;
}
Graph NMutator::transformConvTranposeToGEMMReduce(Operator _op) {
auto op = as<ConvTransposed2dNHWCObj>(_op);
if (!op)
return nullptr;
const auto &A = op->getInputs()[0];
const auto &W = op->getInputs()[1];
// f is the de-facto input channel for ConvTranspose
const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const Shape inputDims = op->getInputs(0)->getDims();
const Shape weightDims = op->getInputs(1)->getDims();
const Shape outputDims = op->getOutput()->getDims();
const DataType dtype = A->getDType();
auto g = make_ref<GraphObj>(runtime);
auto newA = g->addTensor( // [N,H,W,F]
{inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
auto newW = g->addTensor( // [F, CRS]
{weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
dtype); // HACK: this should be a transpose
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
// newO [NHW, CRS]
Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
auto new1 = g->addTensor({n, h, w, c, r, s}, dtype);
g->addOpWithOutputs<ReshapeObj>(newO, new1, new1->getDims());
// [NHW, CRS] -> [N,H,W,C]
g->addOpWithOutputs<Conv2dReduceTranspose>(
new1, nullptr, g->cloneTensor(op->getOutput()), false, 0.f, ph, pw);
return g;
}
// Graph NMutator::transformConvtransposed(Operator _op) { // Graph NMutator::transformConvtransposed(Operator _op) {
// auto op = as<ConvTransposed2dNHWCObj>(_op); // auto op = as<ConvTransposed2dNHWCObj>(_op);
// if (!op) // if (!op)

View File

@ -114,6 +114,75 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
return {{{on, oc, oh, ow}}}; return {{{on, oc, oh, ow}}};
} }
void ConvNHWCObj::setAuxilaryAttributes(PaddingMode mode) {
const Tensor &input = inputs[0];
const Tensor &weight = inputs[1];
n = input->getDims()[0], c = input->getDims()[3], h = input->getDims()[1],
w = input->getDims()[2], f = weight->getDims()[0], r = weight->getDims()[1],
s = weight->getDims()[2];
if (mode == PaddingMode::Same) {
int oh = h / sh;
int ow = w / sw;
ph = (h - oh * sh + (r - sh) * dh) / 2;
pw = (w - ow * sw + (s - sw) * dw) / 2;
} else if (mode == PaddingMode::Valid) {
ph = pw = 0;
}
}
ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, ph, pw, sh, sw, dh, dw,
input, weight, act) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph));
}
ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, mode, sh, sw, dh, dw,
input, weight, act) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> ConvNHWCObj::inferShape(const TensorVec &inputs) const {
const auto &input = inputs[0], &weight = inputs[1];
auto n = input->getDims()[0];
auto h = input->getDims()[1];
auto w = input->getDims()[2];
auto f = weight->getDims()[0];
auto r = weight->getDims()[1];
auto s = weight->getDims()[2];
int on = n, oc = f;
int oh = 0, ow = 0;
// For NCHW+FCRS layout, C of input is divisable by C of weight
if (input->getDims()[3] % weight->getDims()[3] != 0)
return {};
// Set padding size
if (padding == PaddingMode::Other) {
oh = (h - (r - sh) * dh + ph * 2) / sh;
ow = (w - (s - sw) * dw + pw * 2) / sw;
} else if (padding == PaddingMode::Same) {
oh = h / sh;
ow = w / sw;
// ph = (h - oh * sh + (r - sh) * dh) / 2;
// pw = (w - ow * sw + (s - sw) * dw) / 2;
} else if (padding == PaddingMode::Valid) {
int ph = 0;
int pw = 0;
oh = (h - (r - sh) * dh + ph * 2) / sh;
ow = (w - (s - sw) * dw + pw * 2) / sw;
}
return {{{on, oh, ow, oc}}};
}
ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input, ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
Tensor weight, Tensor output, int ph, Tensor weight, Tensor output, int ph,
int pw, int sh, int sw, int dh, int dw, int pw, int sh, int sw, int dh, int dw,

View File

@ -0,0 +1,98 @@
#include "operators/conv2dreduce.h"
namespace infini {
Conv2dReduceBase::Conv2dReduceBase(OpType opType, Tensor input, Tensor bias_,
Tensor output, bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_, int dh_,
int dw_)
: OperatorObj(opType, {input}, {output}), bias(bias_), ph(ph_), pw(pw_),
sh(sh_), sw(sw_), dh(dh_), dw(dw_), PReLU(PReLU_), paramReLU(paramReLU_) {
// expect input shape is (n, h, w, f, r, s)
auto inputShape = input->getDims();
IT_ASSERT(inputShape.size() == 6);
n = inputShape[0];
h = inputShape[1];
w = inputShape[2];
f = inputShape[3];
r = inputShape[4];
s = inputShape[5];
if (bias) {
auto biasShape = bias->getDims();
IT_ASSERT(biasShape.size() == 1);
IT_ASSERT(biasShape[0] == f);
}
}
std::string Conv2dReduceBase::toString() const {
std::ostringstream os;
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
os << "(";
if (inputs.size() == 2) {
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
} else {
os << vecToString(inputs[0]->getDims()) << ",";
}
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
os << "d=[" << dh << "," << dw << "],";
os << "PReLU=" << (PReLU ? "true" : "false") << ",";
// os << "act=" << enum_to_underlying(act) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
if (bias != nullptr) {
os << "bias=" << bias->getGuid() << ",";
}
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
std::vector<int> Conv2dReduceBase::getWorkloadVector() const {
return {enum_to_underlying(type), n, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
}
std::vector<int> Conv2dReduceBase::getOpAttrVector() const {
return {enum_to_underlying(type), ph, pw, sh, sw, dh, dw};
}
Conv2dReduce::Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias,
Tensor output, bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_, int dh_, int dw_)
: Conv2dReduceBase(OpType::Conv2dReduce, input, bias, output, PReLU_,
paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
Conv2dReduce::inferShape(const TensorVec &inputs) const {
// const auto &input = inputs[0], &bias = inputs[1];
int on = n, of = f;
int oh = (h + ph * 2 - dh * (r - 1) - 1) / sh + 1;
int ow = (w + pw * 2 - dw * (s - 1) - 1) / sw + 1;
return {{{on, oh, ow, of}}};
}
Conv2dReduceTranspose::Conv2dReduceTranspose(GraphObj *graph, Tensor input,
Tensor bias, Tensor output,
bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_,
int dh_, int dw_)
: Conv2dReduceBase(OpType::Conv2dReduceTranspose, input, bias, output,
PReLU_, paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
IT_ASSERT(dh_ == 1);
IT_ASSERT(dw_ == 1);
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
Conv2dReduceTranspose::inferShape(const TensorVec &inputs) const {
// const auto &input = inputs[0], &bias = inputs[1];
int on = n, of = f;
int oh = (h - 1) * sh - 2 * ph + dh * (r - 1) + 1;
int ow = (w - 1) * sw - 2 * pw + dw * (s - 1) + 1;
return {{{on, oh, ow, of}}};
}
} // namespace infini

View File

@ -43,6 +43,42 @@ void testConvCudnn(
gCuda->print(); gCuda->print();
} }
void testConvNHWCCudnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
// Construct Runtime and graph for CPU and CUDA
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 4, 4, 3}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
// Build CUDA graph
auto conv =
gCuda->addOp<ConvNHWCObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory
gCuda->dataMalloc();
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
o0Cpu->print();
o0Cpu->printData();
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
// print a tensor/operator/graph by print()
gCuda->print();
}
TEST(cuDNN_Conv, run) { TEST(cuDNN_Conv, run) {
testConvCudnn(OneGenerator(), testConvCudnn(OneGenerator(),
vector<float>{12, 12, 18, 18, 12, 12, 18, 18}); vector<float>{12, 12, 18, 18, 12, 12, 18, 18});
@ -51,6 +87,14 @@ TEST(cuDNN_Conv, run) {
vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
} }
TEST(cuDNN_Conv, runNHWC) {
testConvNHWCCudnn(OneGenerator(),
vector<float>{12., 12., 12., 12., 18., 18., 18., 18.});
testConvNHWCCudnn(
IncrementalGenerator(),
vector<float>{3350, 7562, 2306, 5546, 9480, 24546, 7185, 20793});
}
TEST(cuDNN_Conv, tune) { TEST(cuDNN_Conv, tune) {
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu); Graph gCpu = make_ref<GraphObj>(cpu);

View File

@ -30,7 +30,7 @@ def run_and_evaluate(runtime, g):
runtime.run(g, True) runtime.run(g, True)
print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}') print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}')
print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 1000, 1000)}') print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 1000, 1000)}')
print(f'Cuda graph time = {runtime.timeWithCudaGraph(g)}') print(f'Cuda graph time = {runtime.timeWithCudaGraph(g, 100)}')
def run_graph_get_output_as_torch_tensor(runtime, g): def run_graph_get_output_as_torch_tensor(runtime, g):
@ -111,6 +111,23 @@ def construct_conv(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
handler.conv(input, w, None, pad, pad, stride, stride, dilation, dilation) handler.conv(input, w, None, pad, pad, stride, stride, dilation, dilation)
return handler.getGraph() return handler.getGraph()
def construct_conv_nhwc(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
handler = ft.GraphHandler(runtime)
# input = handler.tensor([1, 56, 32, 32], tensor_type=ft.TensorType.Input)
# w = handler.tensor([12, 56, 1, 1], tensor_type=ft.TensorType.Initialized)
# handler.conv(input, w, None, 0, 0, 1, 1, 1, 1)
input = handler.tensor([n, h, w, c], tensor_type=ft.TensorType.Input)
w = handler.tensor([f, r, s, c], tensor_type=ft.TensorType.Initialized)
handler.convNHWC(input, w, None, pad, pad, stride, stride, dilation, dilation)
return handler.getGraph()
def construct_convtranposed_nhwc(runtime, n, c, h, w, f, r, s, pad, stride, dilation):
handler = ft.GraphHandler(runtime)
input = handler.tensor([n, h, w, c], tensor_type=ft.TensorType.Input)
w = handler.tensor([f, r, s, c], tensor_type=ft.TensorType.Initialized)
handler.convtransposed2dNHWC(input, w, None, pad, pad, stride, stride, dilation, dilation)
return handler.getGraph()
def export_op_level_onnx(runtime): def export_op_level_onnx(runtime):
graphs = [ graphs = [
@ -134,9 +151,13 @@ if __name__ == "__main__":
# (construct_conv(runtime, 1, 12, 32, 32, 12, 3, 3, 1, 1, 1), 'conv3x3'), # FSRCNN Conv_4 3x3 # (construct_conv(runtime, 1, 12, 32, 32, 12, 3, 3, 1, 1, 1), 'conv3x3'), # FSRCNN Conv_4 3x3
# ft.getGANGraph(batch, runtime, 5, 1) # ft.getGANGraph(batch, runtime, 5, 1)
# (ft.getLongformer(runtime, 1), 'longformer.bs1'), # (ft.getLongformer(runtime, 1), 'longformer.bs1'),
(ft.getLongformer(runtime, 16), 'longformer.bs16'), # (ft.getLongformer(runtime, 16), 'longformer.bs16'),
# construct_convTranspose2d(runtime) # construct_convTranspose2d(runtime)
# (load_onnx(runtime, '/mnt/auxHome/models/einnet/fsrcnn.bs1.onnx'), 'fsrcnn.bs1'), # (load_onnx(runtime, '/mnt/auxHome/models/einnet/fsrcnn.bs1.onnx'), 'fsrcnn.bs1'),
# (ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
# (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
# (construct_conv_nhwc(runtime, 1, 56, 32, 32, 12, 1, 1, 0, 1, 1), 'conv1x1')
(load_onnx(runtime, '/mnt/auxHome/models/einnet/gcn.bs1.onnx'), 'gcn.bs1'),
] ]
for original_g, name in graphs: for original_g, name in graphs: