diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc
index 90ac173a..825f61db 100644
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@@ -468,6 +468,120 @@ class FloorDivTruncCnnl : public BangKernelWithoutConfig {
     }
 };
 
+/**
+ * Element-wise FloorMod on BANG, computed by cnnlFloorMod.
+ *
+ * Only 4-D inputs are handled; any other rank reaches IT_TODO_HALT(). The
+ * two inputs and the output are all described as CNNL_DTYPE_FLOAT tensors in
+ * NCHW layout, using the dims of input 0.
+ */
+class FloorModCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as(_op);
+        auto context = dynamic_cast(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr());
+        void *const bData = (op->getInputs(1)->getRawDataPtr());
+        void *const cData = (op->getOutput()->getRawDataPtr());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // NOTE(review): input 1 is described with the dims of input 0, so
+        // both operands are presumably expected to share a shape -- confirm.
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // get outputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+
+        // Check the size query like the other CNNL calls, and start from 0 so
+        // wsSize is never read uninitialized.
+        size_t wsSize = 0;
+        checkCnnlError(cnnlGetFloorModWorkspaceSize(
+            context->cnnlHandle(), aDesc, bDesc, cDesc, &wsSize));
+
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        const cnnlStatus_t stat =
+            cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData,
+                         cDesc, cData, wsData, wsSize);
+
+        // Destroy the descriptors on both the success and the failure path, so
+        // a failed cnnlFloorMod does not leak them.
+        // Destroys in BANG do not require sync. But cnnl does not state
+        // whether sync is required before destroys.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+
+        // A failed launch is still not reported to the caller.
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+    }
+};
+
+// class FloorModTruncCnnl : public BangKernelWithoutConfig {
+//     void compute(const Operator &_op,
+//                  const RuntimeObj *_context) const override {
+//         auto op = as(_op);
+//         auto context = dynamic_cast(_context);
+//
+//         void *const aData = (op->getInputs(0)->getRawDataPtr());
+//         void *const bData = (op->getInputs(1)->getRawDataPtr());
+//         void *const cData = (op->getOutput()->getRawDataPtr());
+//
+//         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+//         auto dim = op->getInputs(0)->getDims();
+//         if (dim.size() != 4)
+//             IT_TODO_HALT();
+//
+//         int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+//         // get inputs
+//         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+//         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+//                                                CNNL_DTYPE_FLOAT, 4, dim_array));
+//
+//         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+//         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+//                                                CNNL_DTYPE_FLOAT, 4, dim_array));
+//
+//         // get outputs
+//         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+//         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+//                                                CNNL_DTYPE_FLOAT, 4, dim_array));
+//
+//         size_t wsSize;
+//         cnnlGetFloorModTruncWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
+//                                           &wsSize);
+//
+//         BangPtr wsData = context->getWorkspace(wsSize);
+//
+//         cnnlStatus_t stat = cnnlFloorModTrunc(context->cnnlHandle(),
+//             aDesc, aData, bDesc, bData, cDesc, cData, wsData, wsSize);
+//         if (stat != CNNL_STATUS_SUCCESS)
+//             return;
+//
+//         // Destories in BANG does not require sync. But cnnl does not state
+//         // whether sync is required before destories.
+//         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+//         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+//         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+//     }
+// };
+
 class AddCnnl : public ElementWiseCnnl {
     cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
 };
@@ -515,6 +613,10 @@ REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
                 "FloorDiv_cnnl_BANG_Float32");
 REGISTER_KERNEL(Device::BANG, OpType::FloorDivTrunc, DataType::Float32,
                 FloorDivTruncCnnl, "FloorDivTrunc_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
+                "FloorMod_cnnl_BANG_Float32");
+// REGISTER_KERNEL(Device::BANG, OpType::FloorModTrunc, DataType::Float32, FloorModTruncCnnl,
+//                 "FloorModTrunc_cnnl_BANG_Float32");
 // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
 //                 ElementWiseBang,
 //                 "Pow_Bang_Float32");
diff --git a/test/kernels/bang/test_bang_floormod.cc b/test/kernels/bang/test_bang_floormod.cc
new file mode 100644
index 00000000..e720ec9b
--- /dev/null
+++ b/test/kernels/bang/test_bang_floormod.cc
@@ -0,0 +1,58 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+// Helper: fills two Float32 CPU tensors of `shape` with `generator`, clones
+// them into a graph built on a BANG runtime, adds one op over both tensors,
+// runs the graph and prints the inputs plus the output cloned back to the CPU.
+// NOTE(review): the result is only printed, never compared with expected
+// values.
+template
+void testFloorMod(
+    const std::function &generator,
+    const Shape &shape) {
+    // CPU runtime for the input tensors, BANG runtime for the graph
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref();
+
+    // Build input data on CPU; both inputs are filled by the same generator
+    Tensor inputCpu1 =
+        make_ref(shape, DataType::Float32, cpuRuntime);
+    inputCpu1->dataMalloc();
+    inputCpu1->setData(generator);
+    Tensor inputCpu2 =
+        make_ref(shape, DataType::Float32, cpuRuntime);
+    inputCpu2->dataMalloc();
+    inputCpu2->setData(generator);
+
+    // Clone the inputs into a BANG graph, add the op and run it
+    Graph bangGraph = make_ref(bangRuntime);
+    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
+    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
+    auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr);
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // Print inputs and output for manual inspection
+    inputCpu1->printData();
+    inputCpu2->printData();
+    outputGpu2Cpu->printData();
+    // NOTE(review): always passes; this only shows that the graph ran.
+    EXPECT_TRUE(1);
+}
+
+// Runs the helper on a {1, 2, 2, 3} tensor filled by IncrementalGenerator.
+// NOTE(review): both operands use the same generator, so any 0 it produces
+// is also used as a divisor -- confirm this is intended.
+TEST(cnnl_FloorMod, run) {
+    testFloorMod(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini