forked from jiuyuan/InfiniTensor
Support perf bang 1115 (#57)
* support matmul * add matmul * add matmul * add code for cnnl matmul operation and test * add conv * add code for conv test on mlu * add code for test cnnl conv on mlu * add code for perf conv and matmul on mlu * clang format * fix convolution operation * fxi cmaklist * code format * fix code * code format --------- Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: wanghailu <wanghailu0717@163.com>
This commit is contained in:
parent
86ec4036ce
commit
823e66a9ff
|
@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
dealloc(workspace);
|
dealloc(workspace);
|
||||||
checkCnnlError(cnnlDestroy(cnnl));
|
checkCnnlError(cnnlDestroy(cnnl));
|
||||||
}
|
}
|
||||||
|
string toString() const override;
|
||||||
|
|
||||||
void run(const Graph &graph, bool tune = false,
|
void run(const Graph &graph, bool tune = false,
|
||||||
bool profiling = false) const;
|
bool profiling = false) const;
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
#pragma once
|
||||||
|
namespace infini {
|
||||||
|
namespace opTimer {
|
||||||
|
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
|
||||||
|
int padh, int padw, int strideh, int stridew,
|
||||||
|
int dilationh, int dilationw, int group,
|
||||||
|
const char *name);
|
||||||
|
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name);
|
||||||
|
} // namespace opTimer
|
||||||
|
} // namespace infini
|
|
@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
|
||||||
|
|
||||||
void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
|
void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
|
||||||
|
|
||||||
|
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
#include "bang/operator_timer.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/conv.h"
|
||||||
|
#include "operators/matmul.h"
|
||||||
|
#include "utils/data_generator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
namespace opTimer {
|
||||||
|
|
||||||
|
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
|
||||||
|
int padh, int padw, int strideh, int stridew,
|
||||||
|
int dilationh, int dilationw, int group,
|
||||||
|
const char *name) {
|
||||||
|
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||||
|
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||||
|
Runtime bang = make_ref<BangRuntimeObj>();
|
||||||
|
Graph gBang = make_ref<GraphObj>(bang);
|
||||||
|
// Set input data on CPU in a CPU Graph
|
||||||
|
IT_ASSERT(c % group == 0);
|
||||||
|
Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
|
||||||
|
Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
|
||||||
|
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
i0Cpu->setData(IncrementalGenerator());
|
||||||
|
w0Cpu->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
// Copy input tensors from CPU to Bang
|
||||||
|
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
|
||||||
|
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
|
||||||
|
// Build Bang graph
|
||||||
|
auto conv = gBang->addOp<ConvObj>(i0Bang, w0Bang, nullptr, padh, padw,
|
||||||
|
strideh, stridew, dilationh, dilationw);
|
||||||
|
// allocate Bang memory
|
||||||
|
gBang->dataMalloc();
|
||||||
|
// Execute on Bang
|
||||||
|
bool tune = true;
|
||||||
|
bang->run(gBang, tune);
|
||||||
|
return bang->getPerfTime(gBang);
|
||||||
|
}
|
||||||
|
|
||||||
|
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) {
|
||||||
|
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||||
|
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||||
|
Runtime bang = make_ref<BangRuntimeObj>();
|
||||||
|
Graph gBang = make_ref<GraphObj>(bang);
|
||||||
|
// Set input data on CPU in a CPU Graph
|
||||||
|
Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
|
||||||
|
Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
|
||||||
|
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
i0Cpu->setData(IncrementalGenerator());
|
||||||
|
w0Cpu->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
// Copy input tensors from CPU to Bang
|
||||||
|
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
|
||||||
|
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
|
||||||
|
// Build Bang graph
|
||||||
|
auto conv = gBang->addOp<MatmulObj>(i0Bang, w0Bang, nullptr);
|
||||||
|
// allocate Bang memory
|
||||||
|
gBang->dataMalloc();
|
||||||
|
// Execute on Bang
|
||||||
|
bool tune = true;
|
||||||
|
bang->run(gBang, tune);
|
||||||
|
return bang->getPerfTime(gBang);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace opTimer
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,156 @@
|
||||||
|
#include "operators/conv.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class ConvCnnl : public BangKernelWithoutConfig {
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<ConvObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||||
|
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||||
|
const int cpg = op->getChannelPerGroup();
|
||||||
|
const int g = c / cpg;
|
||||||
|
|
||||||
|
int pad[4] = {ph, ph, pw, pw};
|
||||||
|
int stride[2] = {sh, sw};
|
||||||
|
int dilation[2] = {dh, dw};
|
||||||
|
|
||||||
|
cnnlConvolutionDescriptor_t convDesc;
|
||||||
|
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||||
|
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||||
|
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||||
|
|
||||||
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc;
|
||||||
|
auto dimInputs0 = op->getInputs(0)->getDims();
|
||||||
|
auto dimInputs1 = op->getInputs(1)->getDims();
|
||||||
|
auto dimOutput = op->getOutput()->getDims();
|
||||||
|
|
||||||
|
if (dimInputs0.size() != 4)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
if (dimInputs1.size() != 4)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
if (dimOutput.size() != 4)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
|
||||||
|
int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
|
||||||
|
dimInputs0[3]};
|
||||||
|
int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
|
||||||
|
dimInputs0[1]};
|
||||||
|
int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
|
||||||
|
dimInputs1[3]};
|
||||||
|
int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
|
||||||
|
dimInputs1[1]};
|
||||||
|
int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
|
||||||
|
dimOutput[3]};
|
||||||
|
int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
|
||||||
|
dimOutput[1]};
|
||||||
|
|
||||||
|
// get inputs
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, 4, inputs0));
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, 4, inputs1));
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||||
|
|
||||||
|
int permute[4] = {0, 2, 3, 1};
|
||||||
|
cnnlTransposeDescriptor_t opDesc;
|
||||||
|
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||||
|
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
|
||||||
|
|
||||||
|
size_t wsSize;
|
||||||
|
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc,
|
||||||
|
&wsSize);
|
||||||
|
BangPtr wsData = context->getWorkspace(wsSize);
|
||||||
|
BangPtr aDataOut = context->getWorkspace(
|
||||||
|
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
|
||||||
|
cnnlStatus_t stat =
|
||||||
|
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
|
||||||
|
aDesc, aDataOut, wsData, wsSize);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc,
|
||||||
|
&wsSize);
|
||||||
|
wsData = context->getWorkspace(wsSize);
|
||||||
|
BangPtr bDataOut = context->getWorkspace(
|
||||||
|
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
|
||||||
|
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
|
||||||
|
bDesc, bDataOut, wsData, wsSize);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// get outputs
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, 4, output));
|
||||||
|
|
||||||
|
cnnlConvolutionForwardAlgo_t algo;
|
||||||
|
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
|
||||||
|
aDesc, bDesc, cInDesc,
|
||||||
|
CNNL_CONVOLUTION_FWD_FASTEST, &algo);
|
||||||
|
|
||||||
|
cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc,
|
||||||
|
bDesc, cInDesc, NULL, convDesc,
|
||||||
|
algo, &wsSize);
|
||||||
|
wsData = context->getWorkspace(wsSize);
|
||||||
|
BangPtr cDataIn = context->getWorkspace(
|
||||||
|
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
|
||||||
|
|
||||||
|
stat = cnnlConvolutionForward(
|
||||||
|
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc,
|
||||||
|
bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int cPermute[4] = {0, 3, 1, 2};
|
||||||
|
cnnlTransposeDescriptor_t opOutDesc;
|
||||||
|
checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc));
|
||||||
|
checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute));
|
||||||
|
|
||||||
|
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
|
||||||
|
&wsSize);
|
||||||
|
wsData = context->getWorkspace(wsSize);
|
||||||
|
|
||||||
|
stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
|
||||||
|
cDataIn, cDesc, cData, wsData, wsSize);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Destories in BANG does not require sync. But cnnl does not state
|
||||||
|
// whether sync is required before destories.
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||||
|
checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
|
||||||
|
"Conv_cnnl_BANG_Float32");
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,65 @@
|
||||||
|
#include "operators/matmul.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class MatmulCnnl : public BangKernelWithoutConfig {
|
||||||
|
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<MatmulObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
|
||||||
|
auto dimInputs0 = op->getInputs(0)->getDims();
|
||||||
|
auto dimInputs1 = op->getInputs(1)->getDims();
|
||||||
|
auto dimOutput = op->getOutput()->getDims();
|
||||||
|
if (dimInputs0.size() != 3)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
if (dimInputs1.size() != 3)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
if (dimOutput.size() != 3)
|
||||||
|
IT_TODO_HALT();
|
||||||
|
|
||||||
|
bool transA = op->getTransA();
|
||||||
|
bool transB = op->getTransB();
|
||||||
|
|
||||||
|
int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
|
||||||
|
int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
|
||||||
|
int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
|
||||||
|
|
||||||
|
// get inputs
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array));
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array));
|
||||||
|
|
||||||
|
// get outputs
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray));
|
||||||
|
|
||||||
|
cnnlStatus_t stat =
|
||||||
|
cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData,
|
||||||
|
bDesc, bData, cDesc, cData);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Destories in BANG does not require sync. But cnnl does not state
|
||||||
|
// whether sync is required before destories.
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl,
|
||||||
|
"Matmul_cnnl_BANG_Float32");
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,58 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/conv.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||||
|
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||||
|
const Shape &shapeA, const Shape &shapeB) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu1 =
|
||||||
|
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu1->dataMalloc();
|
||||||
|
inputCpu1->setData(generatorA);
|
||||||
|
Tensor inputCpu2 =
|
||||||
|
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu2->dataMalloc();
|
||||||
|
inputCpu2->setData(generatorB);
|
||||||
|
|
||||||
|
// MLU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
|
||||||
|
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
|
||||||
|
auto mluOp =
|
||||||
|
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputMlu = mluOp->getOutput();
|
||||||
|
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||||
|
// CPU
|
||||||
|
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||||
|
auto cpuOp =
|
||||||
|
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||||
|
cpuGraph->dataMalloc();
|
||||||
|
cpuRuntime->run(cpuGraph);
|
||||||
|
auto outputCpu = cpuOp->getOutput();
|
||||||
|
outputCpu->print();
|
||||||
|
outputMlu2Cpu->print();
|
||||||
|
// Check
|
||||||
|
// EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
|
||||||
|
EXPECT_TRUE(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cnnl_Conv, run) {
|
||||||
|
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
|
||||||
|
Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,56 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/matmul.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||||
|
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||||
|
bool transA, bool transB, const Shape &shapeA,
|
||||||
|
const Shape &shapeB) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu1 =
|
||||||
|
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu1->dataMalloc();
|
||||||
|
inputCpu1->setData(generatorA);
|
||||||
|
Tensor inputCpu2 =
|
||||||
|
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||||
|
inputCpu2->dataMalloc();
|
||||||
|
inputCpu2->setData(generatorB);
|
||||||
|
|
||||||
|
// MLU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
|
||||||
|
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
|
||||||
|
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputMlu = mluOp->getOutput();
|
||||||
|
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||||
|
// CPU
|
||||||
|
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||||
|
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||||
|
cpuGraph->dataMalloc();
|
||||||
|
cpuRuntime->run(cpuGraph);
|
||||||
|
auto outputCpu = cpuOp->getOutput();
|
||||||
|
outputCpu->print();
|
||||||
|
outputMlu2Cpu->print();
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cnnl_Matmul, run) {
|
||||||
|
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
|
||||||
|
false, Shape{1, 2, 3}, Shape{1, 3, 4});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
Loading…
Reference in New Issue