diff --git a/src/kernels/kunlun/softmax.cc b/src/kernels/kunlun/softmax.cc
new file mode 100644
index 00000000..56374766
--- /dev/null
+++ b/src/kernels/kunlun/softmax.cc
@@ -0,0 +1,26 @@
+#include "operators/softmax.h"
+#include "kunlun/kunlun_kernel_without_config.h"
+#include "kunlun/kunlun_runtime.h"
+
+namespace infini {
+// Softmax kernel backed by Baidu Kunlun XDNN's api::softmax.
+class SoftmaxXdnn : public KUNLUNKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SoftmaxObj>(_op);
+        auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
+        auto dim = op->getInputs(0)->getDims();
+        auto axis = op->getAxis();
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        // xdnn softmax(ctx, x, y, xshape, axis); returns 0 on success.
+        auto ret = baidu::xpu::api::softmax<float>(
+            context->KUNLUNHandle(), (float *)aData, (float *)cData, dim,
+            axis);
+        assert(ret == 0);
+        return;
+    }
+};
+
+REGISTER_KERNEL(Device::KUNLUN, OpType::Softmax, DataType::Float32, SoftmaxXdnn,
+                "Softmax_xdnn_KUNLUN_Float32");
+}; // namespace infini
diff --git a/test/kernels/kunlun/test_kunlun_softmax.cc b/test/kernels/kunlun/test_kunlun_softmax.cc
new file mode 100644
index 00000000..77d6dbd8
--- /dev/null
+++ b/test/kernels/kunlun/test_kunlun_softmax.cc
@@ -0,0 +1,136 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "kunlun/kunlun_runtime.h"
+#include "operators/softmax.h"
+#include "test.h"
+#include <cmath>
+namespace infini {
+
+TEST(XDNN_Softmax, run_axis1) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto kunlunRuntime = make_ref<KUNLUNRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
+
+    // KUNLUN XPU
+    Graph kunlunGraph = make_ref<GraphObj>(kunlunRuntime);
+    auto inputKunlun = kunlunGraph->cloneTensor(inputCpu);
+    auto kunlunOp = kunlunGraph->addOp<SoftmaxObj>(inputKunlun, nullptr, 1);
+    kunlunGraph->dataMalloc();
+    inputKunlun->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    kunlunRuntime->run(kunlunGraph);
+    auto outputKunlun = kunlunOp->getOutput();
+    auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime);
+
+    // Check
+    EXPECT_TRUE(outputKunlun2Cpu->equalData(
+        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
+                      0.032058604, 0.08714432, 0.23688284, 0.6439143}));
+}
+
+TEST(XDNN_Softmax, run_axis0) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto kunlunRuntime = make_ref<KUNLUNRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
+
+    // KUNLUN XPU
+    Graph kunlunGraph = make_ref<GraphObj>(kunlunRuntime);
+    auto inputKunlun = kunlunGraph->cloneTensor(inputCpu);
+    auto kunlunOp = kunlunGraph->addOp<SoftmaxObj>(inputKunlun, nullptr, 0);
+    kunlunGraph->dataMalloc();
+    inputKunlun->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    kunlunRuntime->run(kunlunGraph);
+    auto outputKunlun = kunlunOp->getOutput();
+    auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime);
+
+    // Check
+    EXPECT_TRUE(
+        outputKunlun2Cpu->equalData(vector<float>{0., 0., 0., 0., 1, 1, 1, 1}));
+}
+
+TEST(XDNN_Softmax2, run_axis1) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto kunlunRuntime = make_ref<KUNLUNRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
+
+    // KUNLUN XPU
+    Graph kunlunGraph = make_ref<GraphObj>(kunlunRuntime);
+    auto inputKunlun = kunlunGraph->cloneTensor(inputCpu);
+    auto kunlunOp = kunlunGraph->addOp<SoftmaxObj>(inputKunlun, nullptr, 1);
+    kunlunGraph->dataMalloc();
+    inputKunlun->setData(IncrementalGenerator());
+    kunlunRuntime->run(kunlunGraph);
+    auto outputKunlun = kunlunOp->getOutput();
+    auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime);
+
+    // Check
+    EXPECT_TRUE(outputKunlun2Cpu->equalData(vector<float>{
+        0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
+        0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
+        0.9820138, 0.9820138, 0.9820138, 0.9820138}));
+}
+
+TEST(XDNN_Softmax2, run_axis2) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto kunlunRuntime = make_ref<KUNLUNRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
+
+    // KUNLUN XPU
+    Graph kunlunGraph = make_ref<GraphObj>(kunlunRuntime);
+    auto inputKunlun = kunlunGraph->cloneTensor(inputCpu);
+    auto kunlunOp = kunlunGraph->addOp<SoftmaxObj>(inputKunlun, nullptr, 2);
+    kunlunGraph->dataMalloc();
+    inputKunlun->setData(IncrementalGenerator());
+    kunlunRuntime->run(kunlunGraph);
+    auto outputKunlun = kunlunOp->getOutput();
+    auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime);
+
+    // Check
+    EXPECT_TRUE(outputKunlun2Cpu->equalData(vector<float>{
+        0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
+        0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
+        0.1192029, 0.1192029, 0.8807971, 0.8807971}));
+}
+
+TEST(XDNN_Softmax2, run_axis3) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto kunlunRuntime = make_ref<KUNLUNRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
+
+    // KUNLUN XPU
+    Graph kunlunGraph = make_ref<GraphObj>(kunlunRuntime);
+    auto inputKunlun = kunlunGraph->cloneTensor(inputCpu);
+    auto kunlunOp = kunlunGraph->addOp<SoftmaxObj>(inputKunlun, nullptr, 3);
+    kunlunGraph->dataMalloc();
+    inputKunlun->setData(IncrementalGenerator());
+    kunlunRuntime->run(kunlunGraph);
+    auto outputKunlun = kunlunOp->getOutput();
+    auto outputKunlun2Cpu = outputKunlun->clone(cpuRuntime);
+
+    // Check
+    EXPECT_TRUE(outputKunlun2Cpu->equalData(vector<float>{
+        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
+        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
+        0.2689414, 0.7310586, 0.2689414, 0.7310586}));
+}
+} // namespace infini