forked from jiuyuan/InfiniTensor
add floormod operation
This commit is contained in:
parent
dbb606f158
commit
5ae96ce060
|
@ -468,6 +468,104 @@ class FloorDivTruncCnnl : public BangKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
class FloorModCnnl : public BangKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
|
||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
|
||||
auto dim = op->getInputs(0)->getDims();
|
||||
if (dim.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
|
||||
int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
|
||||
// get inputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
|
||||
// get outputs
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
|
||||
size_t wsSize;
|
||||
cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
&wsSize);
|
||||
|
||||
BangPtr wsData = context->getWorkspace(wsSize);
|
||||
|
||||
cnnlStatus_t stat = cnnlFloorMod(context->cnnlHandle(),
|
||||
aDesc, aData, bDesc, bData, cDesc, cData, wsData, wsSize);
|
||||
if (stat != CNNL_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// Destories in BANG does not require sync. But cnnl does not state
|
||||
// whether sync is required before destories.
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||
}
|
||||
};
|
||||
|
||||
// class FloorModTruncCnnl : public BangKernelWithoutConfig {
|
||||
// void compute(const Operator &_op,
|
||||
// const RuntimeObj *_context) const override {
|
||||
// auto op = as<ElementWiseObj>(_op);
|
||||
// auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||
//
|
||||
// void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
// void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
// void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
//
|
||||
// cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
|
||||
// auto dim = op->getInputs(0)->getDims();
|
||||
// if (dim.size() != 4)
|
||||
// IT_TODO_HALT();
|
||||
//
|
||||
// int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
|
||||
// // get inputs
|
||||
// checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
// checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||
// CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
//
|
||||
// checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||
// checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
|
||||
// CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
//
|
||||
// // get outputs
|
||||
// checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
// checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||
// CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
//
|
||||
// size_t wsSize;
|
||||
// cnnlGetFloorModTruncWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
|
||||
// &wsSize);
|
||||
//
|
||||
// BangPtr wsData = context->getWorkspace(wsSize);
|
||||
//
|
||||
// cnnlStatus_t stat = cnnlFloorModTrunc(context->cnnlHandle(),
|
||||
// aDesc, aData, bDesc, bData, cDesc, cData, wsData, wsSize);
|
||||
// if (stat != CNNL_STATUS_SUCCESS)
|
||||
// return;
|
||||
//
|
||||
// // Destroys in BANG do not require sync. But cnnl does not state
|
||||
// // whether sync is required before destroys.
|
||||
// checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||
// checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||
// checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||
// }
|
||||
// };
|
||||
|
||||
class AddCnnl : public ElementWiseCnnl {
|
||||
cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
|
||||
};
|
||||
|
@ -515,6 +613,10 @@ REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
|
|||
"FloorDiv_cnnl_BANG_Float32");
|
||||
// Registers FloorDivTruncCnnl as the kernel for OpType::FloorDivTrunc with
// Float32 data on Device::BANG, under the name given in the last argument.
REGISTER_KERNEL(Device::BANG, OpType::FloorDivTrunc, DataType::Float32, FloorDivTruncCnnl,
|
||||
"FloorDivTrunc_cnnl_BANG_Float32");
|
||||
// Registers FloorModCnnl as the kernel for OpType::FloorMod with Float32
// data on Device::BANG, under the name given in the last argument.
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
|
||||
"FloorMod_cnnl_BANG_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::FloorModTrunc, DataType::Float32, FloorModTruncCnnl,
|
||||
// "FloorModTrunc_cnnl_BANG_Float32");
|
||||
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
|
||||
// ElementWiseBang,
|
||||
// "Pow_Bang_Float32");
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
#include "bang/bang_runtime.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/element_wise.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <class T>
|
||||
void testFloorMod(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape) {
|
||||
// Runtime
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
|
||||
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// Check
|
||||
inputCpu1->printData();
|
||||
inputCpu2->printData();
|
||||
outputGpu2Cpu->printData();
|
||||
EXPECT_TRUE(1);
|
||||
}
|
||||
|
||||
// Runs the FloorMod smoke test on a 1x2x2x3 tensor filled by
// IncrementalGenerator.
TEST(cnnl_FloorMod, run) {
    const Shape inputShape{1, 2, 2, 3};
    testFloorMod<FloorModObj>(IncrementalGenerator(), inputShape);
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue