Support perf bang 1115 (#57)

* support matmul

* add matmul

* add matmul

* add code for cnnl matmul operation and test

* add conv

* add code for conv test on mlu

* add code for test cnnl conv on mlu

* add code for perf conv and matmul on mlu

* clang format

* fix convolution operation

* fxi cmaklist

* code format

* fix code

* code format

---------

Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
This commit is contained in:
Hardy 2023-03-29 13:52:56 +08:00 committed by GitHub
parent 86ec4036ce
commit 823e66a9ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 419 additions and 0 deletions

View File

@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
dealloc(workspace);
checkCnnlError(cnnlDestroy(cnnl));
}
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;

View File

@ -0,0 +1,10 @@
#pragma once
namespace infini {
namespace opTimer {
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name);
} // namespace opTimer
} // namespace infini

View File

@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
} // namespace infini

View File

@ -0,0 +1,71 @@
#include "bang/operator_timer.h"
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "utils/data_generator.h"
namespace infini {
namespace opTimer {
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime bang = make_ref<BangRuntimeObj>();
Graph gBang = make_ref<GraphObj>(bang);
// Set input data on CPU in a CPU Graph
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Bang
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
// Build Bang graph
auto conv = gBang->addOp<ConvObj>(i0Bang, w0Bang, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
// allocate Bang memory
gBang->dataMalloc();
// Execute on Bang
bool tune = true;
bang->run(gBang, tune);
return bang->getPerfTime(gBang);
}
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) {
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime bang = make_ref<BangRuntimeObj>();
Graph gBang = make_ref<GraphObj>(bang);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to Bang
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
// Build Bang graph
auto conv = gBang->addOp<MatmulObj>(i0Bang, w0Bang, nullptr);
// allocate Bang memory
gBang->dataMalloc();
// Execute on Bang
bool tune = true;
bang->run(gBang, tune);
return bang->getPerfTime(gBang);
}
} // namespace opTimer
} // namespace infini

156
src/kernels/bang/conv.cc Normal file
View File

@ -0,0 +1,156 @@
#include "operators/conv.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
namespace infini {
class ConvCnnl : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
int pad[4] = {ph, ph, pw, pw};
int stride[2] = {sh, sw};
int dilation[2] = {dh, dw};
cnnlConvolutionDescriptor_t convDesc;
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
checkCnnlError(cnnlSetConvolutionDescriptor(
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc;
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 4)
IT_TODO_HALT();
if (dimInputs1.size() != 4)
IT_TODO_HALT();
if (dimOutput.size() != 4)
IT_TODO_HALT();
int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
dimInputs0[3]};
int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
dimInputs0[1]};
int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
dimInputs1[3]};
int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
dimInputs1[1]};
int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
dimOutput[3]};
int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
dimOutput[1]};
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs0));
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, inputs1));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
int permute[4] = {0, 2, 3, 1};
cnnlTransposeDescriptor_t opDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
size_t wsSize;
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc,
&wsSize);
BangPtr wsData = context->getWorkspace(wsSize);
BangPtr aDataOut = context->getWorkspace(
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
cnnlStatus_t stat =
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
aDesc, aDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc,
&wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr bDataOut = context->getWorkspace(
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
bDesc, bDataOut, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
CNNL_DTYPE_FLOAT, 4, output));
cnnlConvolutionForwardAlgo_t algo;
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
aDesc, bDesc, cInDesc,
CNNL_CONVOLUTION_FWD_FASTEST, &algo);
cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc,
bDesc, cInDesc, NULL, convDesc,
algo, &wsSize);
wsData = context->getWorkspace(wsSize);
BangPtr cDataIn = context->getWorkspace(
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
stat = cnnlConvolutionForward(
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc,
bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
if (stat != CNNL_STATUS_SUCCESS)
return;
int cPermute[4] = {0, 3, 1, 2};
cnnlTransposeDescriptor_t opOutDesc;
checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc));
checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute));
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
&wsSize);
wsData = context->getWorkspace(wsSize);
stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
cDataIn, cDesc, cData, wsData, wsSize);
if (stat != CNNL_STATUS_SUCCESS)
return;
// Destories in BANG does not require sync. But cnnl does not state
// whether sync is required before destories.
checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
"Conv_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -0,0 +1,65 @@
#include "operators/matmul.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
namespace infini {
class MatmulCnnl : public BangKernelWithoutConfig {
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 3)
IT_TODO_HALT();
if (dimInputs1.size() != 3)
IT_TODO_HALT();
if (dimOutput.size() != 3)
IT_TODO_HALT();
bool transA = op->getTransA();
bool transB = op->getTransB();
int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
// get inputs
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array));
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
checkCnnlError(cnnlSetTensorDescriptor(
bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array));
// get outputs
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray));
cnnlStatus_t stat =
cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData,
bDesc, bData, cDesc, cData);
if (stat != CNNL_STATUS_SUCCESS)
return;
// Destories in BANG does not require sync. But cnnl does not state
// whether sync is required before destories.
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
}
};
REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl,
"Matmul_cnnl_BANG_Float32");
}; // namespace infini

View File

@ -0,0 +1,58 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
template <class T>
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
const Shape &shapeA, const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
auto mluOp =
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
cpuGraph->dataMalloc();
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputMlu2Cpu->print();
// Check
// EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
EXPECT_TRUE(true);
}
TEST(cnnl_Conv, run) {
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3});
}
} // namespace infini

View File

@ -0,0 +1,56 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
template <class T>
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
const std::function<void(void *, size_t, DataType)> &generatorB,
bool transA, bool transB, const Shape &shapeA,
const Shape &shapeB) {
// Runtime
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->dataMalloc();
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();
outputMlu2Cpu->print();
// Check
EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
}
TEST(cnnl_Matmul, run) {
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
false, Shape{1, 2, 3}, Shape{1, 3, 4});
}
} // namespace infini