forked from jiuyuan/InfiniTensor
Support perf bang 1115 (#57)
* support matmul * add matmul * add matmul * add code for cnnl matmul operation and test * add conv * add code for conv test on mlu * add code for test cnnl conv on mlu * add code for perf conv and matmul on mlu * clang format * fix convolution operation * fxi cmaklist * code format * fix code * code format --------- Co-authored-by: wanghailu <wanghailu@qiyuanlab.com> Co-authored-by: wanghailu <wanghailu0717@163.com>
This commit is contained in:
parent
86ec4036ce
commit
823e66a9ff
|
@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
|
|||
dealloc(workspace);
|
||||
checkCnnlError(cnnlDestroy(cnnl));
|
||||
}
|
||||
string toString() const override;
|
||||
|
||||
void run(const Graph &graph, bool tune = false,
|
||||
bool profiling = false) const;
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#pragma once
|
||||
namespace infini {
|
||||
namespace opTimer {
|
||||
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
|
||||
int padh, int padw, int strideh, int stridew,
|
||||
int dilationh, int dilationw, int group,
|
||||
const char *name);
|
||||
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name);
|
||||
} // namespace opTimer
|
||||
} // namespace infini
|
|
@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
|
|||
|
||||
void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
|
||||
|
||||
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
#include "bang/operator_timer.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "utils/data_generator.h"
|
||||
|
||||
namespace infini {
|
||||
namespace opTimer {
|
||||
|
||||
double getPerfConvCnnl(int n, int c, int h, int w, int f, int r, int s,
|
||||
int padh, int padw, int strideh, int stridew,
|
||||
int dilationh, int dilationw, int group,
|
||||
const char *name) {
|
||||
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
Runtime bang = make_ref<BangRuntimeObj>();
|
||||
Graph gBang = make_ref<GraphObj>(bang);
|
||||
// Set input data on CPU in a CPU Graph
|
||||
IT_ASSERT(c % group == 0);
|
||||
Tensor i0Cpu = gCpu->addTensor({n, h, w, c}, DataType::Float32);
|
||||
Tensor w0Cpu = gCpu->addTensor({f, r, s, c / group}, DataType::Float32);
|
||||
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||
gCpu->dataMalloc();
|
||||
i0Cpu->setData(IncrementalGenerator());
|
||||
w0Cpu->setData(IncrementalGenerator());
|
||||
|
||||
// Copy input tensors from CPU to Bang
|
||||
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
|
||||
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
|
||||
// Build Bang graph
|
||||
auto conv = gBang->addOp<ConvObj>(i0Bang, w0Bang, nullptr, padh, padw,
|
||||
strideh, stridew, dilationh, dilationw);
|
||||
// allocate Bang memory
|
||||
gBang->dataMalloc();
|
||||
// Execute on Bang
|
||||
bool tune = true;
|
||||
bang->run(gBang, tune);
|
||||
return bang->getPerfTime(gBang);
|
||||
}
|
||||
|
||||
double getPerfMatmulCnnl(int b, int m, int n, int k, const char *name) {
|
||||
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
Runtime bang = make_ref<BangRuntimeObj>();
|
||||
Graph gBang = make_ref<GraphObj>(bang);
|
||||
// Set input data on CPU in a CPU Graph
|
||||
Tensor i0Cpu = gCpu->addTensor({b, m, k}, DataType::Float32);
|
||||
Tensor w0Cpu = gCpu->addTensor({b, k, n}, DataType::Float32);
|
||||
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||
gCpu->dataMalloc();
|
||||
i0Cpu->setData(IncrementalGenerator());
|
||||
w0Cpu->setData(IncrementalGenerator());
|
||||
|
||||
// Copy input tensors from CPU to Bang
|
||||
Tensor i0Bang = gBang->cloneTensor(i0Cpu);
|
||||
Tensor w0Bang = gBang->cloneTensor(w0Cpu);
|
||||
// Build Bang graph
|
||||
auto conv = gBang->addOp<MatmulObj>(i0Bang, w0Bang, nullptr);
|
||||
// allocate Bang memory
|
||||
gBang->dataMalloc();
|
||||
// Execute on Bang
|
||||
bool tune = true;
|
||||
bang->run(gBang, tune);
|
||||
return bang->getPerfTime(gBang);
|
||||
}
|
||||
|
||||
} // namespace opTimer
|
||||
} // namespace infini
|
|
@ -0,0 +1,156 @@
|
|||
#include "operators/conv.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
class ConvCnnl : public BangKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
const int cpg = op->getChannelPerGroup();
|
||||
const int g = c / cpg;
|
||||
|
||||
int pad[4] = {ph, ph, pw, pw};
|
||||
int stride[2] = {sh, sw};
|
||||
int dilation[2] = {dh, dw};
|
||||
|
||||
cnnlConvolutionDescriptor_t convDesc;
|
||||
checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
|
||||
checkCnnlError(cnnlSetConvolutionDescriptor(
|
||||
convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
cnnlTensorDescriptor_t aInDesc, aDesc, bInDesc, bDesc, cInDesc, cDesc;
|
||||
auto dimInputs0 = op->getInputs(0)->getDims();
|
||||
auto dimInputs1 = op->getInputs(1)->getDims();
|
||||
auto dimOutput = op->getOutput()->getDims();
|
||||
|
||||
if (dimInputs0.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
if (dimInputs1.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
if (dimOutput.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
|
||||
int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
|
||||
dimInputs0[3]};
|
||||
int inputs0Array[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
|
||||
dimInputs0[1]};
|
||||
int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
|
||||
dimInputs1[3]};
|
||||
int inputs1Array[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
|
||||
dimInputs1[1]};
|
||||
int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
|
||||
dimOutput[3]};
|
||||
int outputArray[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
|
||||
dimOutput[1]};
|
||||
|
||||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs0));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bInDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, inputs1));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, inputs1Array));
|
||||
|
||||
int permute[4] = {0, 2, 3, 1};
|
||||
cnnlTransposeDescriptor_t opDesc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aInDesc, opDesc,
|
||||
&wsSize);
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
BangPtr aDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(aInDesc) * sizeof(float));
|
||||
cnnlStatus_t stat =
|
||||
cnnlTranspose_v2(context->cnnlHandle(), opDesc, aInDesc, aData,
|
||||
aDesc, aDataOut, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), bInDesc, opDesc,
|
||||
&wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr bDataOut = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(bInDesc) * sizeof(float));
|
||||
stat = cnnlTranspose_v2(context->cnnlHandle(), opDesc, bInDesc, bData,
|
||||
bDesc, bDataOut, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cInDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cInDesc, CNNL_LAYOUT_NHWC, CNNL_DTYPE_FLOAT, 4, outputArray));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, output));
|
||||
|
||||
cnnlConvolutionForwardAlgo_t algo;
|
||||
cnnlGetConvolutionForwardAlgorithm(context->cnnlHandle(), convDesc,
|
||||
aDesc, bDesc, cInDesc,
|
||||
CNNL_CONVOLUTION_FWD_FASTEST, &algo);
|
||||
|
||||
cnnlGetConvolutionForwardWorkspaceSize(context->cnnlHandle(), aDesc,
|
||||
bDesc, cInDesc, NULL, convDesc,
|
||||
algo, &wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
BangPtr cDataIn = context->getWorkspace(
|
||||
cnnlGetTensorElementNum(cInDesc) * sizeof(float));
|
||||
|
||||
stat = cnnlConvolutionForward(
|
||||
context->cnnlHandle(), convDesc, algo, NULL, aDesc, aData, bDesc,
|
||||
bData, NULL, NULL, wsData, wsSize, NULL, cInDesc, cDataIn);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
int cPermute[4] = {0, 3, 1, 2};
|
||||
cnnlTransposeDescriptor_t opOutDesc;
|
||||
checkCnnlError(cnnlCreateTransposeDescriptor(&opOutDesc));
|
||||
checkCnnlError(cnnlSetTransposeDescriptor(opOutDesc, 4, cPermute));
|
||||
|
||||
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), cInDesc, opOutDesc,
|
||||
&wsSize);
|
||||
wsData = context->getWorkspace(wsSize);
|
||||
|
||||
stat = cnnlTranspose_v2(context->cnnlHandle(), opOutDesc, cInDesc,
|
||||
cDataIn, cDesc, cData, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// Destories in BANG does not require sync. But cnnl does not state
|
||||
// whether sync is required before destories.
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(aInDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(bInDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(cInDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||
checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
|
||||
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
|
||||
checkCnnlError(cnnlDestroyTransposeDescriptor(opOutDesc));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Conv, DataType::Float32, ConvCnnl,
|
||||
"Conv_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
|
@ -0,0 +1,65 @@
|
|||
#include "operators/matmul.h"
|
||||
#include "bang/bang_kernel_without_config.h"
|
||||
#include "bang/bang_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
class MatmulCnnl : public BangKernelWithoutConfig {
|
||||
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<MatmulObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
|
||||
auto dimInputs0 = op->getInputs(0)->getDims();
|
||||
auto dimInputs1 = op->getInputs(1)->getDims();
|
||||
auto dimOutput = op->getOutput()->getDims();
|
||||
if (dimInputs0.size() != 3)
|
||||
IT_TODO_HALT();
|
||||
if (dimInputs1.size() != 3)
|
||||
IT_TODO_HALT();
|
||||
if (dimOutput.size() != 3)
|
||||
IT_TODO_HALT();
|
||||
|
||||
bool transA = op->getTransA();
|
||||
bool transB = op->getTransB();
|
||||
|
||||
int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]};
|
||||
int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]};
|
||||
int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]};
|
||||
|
||||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs0Array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, inputs1Array));
|
||||
|
||||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 3, outputArray));
|
||||
|
||||
cnnlStatus_t stat =
|
||||
cnnlBatchMatMul(context->cnnlHandle(), transA, transB, aDesc, aData,
|
||||
bDesc, bData, cDesc, cData);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// Destories in BANG does not require sync. But cnnl does not state
|
||||
// whether sync is required before destories.
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::BANG, OpType::Matmul, DataType::Float32, MatmulCnnl,
|
||||
"Matmul_cnnl_BANG_Float32");
|
||||
}; // namespace infini
|
|
@ -0,0 +1,58 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/conv.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||
const Shape &shapeA, const Shape &shapeB) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generatorB);
|
||||
|
||||
// MLU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
|
||||
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto mluOp =
|
||||
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
bangGraph->dataMalloc();
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputMlu = mluOp->getOutput();
|
||||
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp =
|
||||
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
cpuGraph->dataMalloc();
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
outputCpu->print();
|
||||
outputMlu2Cpu->print();
|
||||
// Check
|
||||
// EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
TEST(cnnl_Conv, run) {
|
||||
testConv<ConvObj>(IncrementalGenerator(), IncrementalGenerator(),
|
||||
Shape{1, 3, 224, 224}, Shape{2, 3, 3, 3});
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,56 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/matmul.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorB,
|
||||
bool transA, bool transB, const Shape &shapeA,
|
||||
const Shape &shapeB) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generatorB);
|
||||
|
||||
// MLU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
auto inputMlu1 = bangGraph->cloneTensor(inputCpu1);
|
||||
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputMlu = mluOp->getOutput();
|
||||
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->dataMalloc();
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
outputCpu->print();
|
||||
outputMlu2Cpu->print();
|
||||
// Check
|
||||
EXPECT_TRUE(outputCpu->equalData(outputMlu2Cpu));
|
||||
}
|
||||
|
||||
TEST(cnnl_Matmul, run) {
|
||||
testMatmul<MatmulObj>(IncrementalGenerator(), IncrementalGenerator(), false,
|
||||
false, Shape{1, 2, 3}, Shape{1, 3, 4});
|
||||
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue