diff --git a/include/bang/bang_softmax.h b/include/bang/bang_softmax.h
index ae260d4e..9fde8419 100644
--- a/include/bang/bang_softmax.h
+++ b/include/bang/bang_softmax.h
@@ -1,11 +1,11 @@
 #pragma once
 #include "bang/bang_runtime.h"
-#include "bang_highSoftmax.h"
+#include "bang_bangSoftmax.h"
 #include "operators/softmax.h"
 
 namespace infini {
 void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
-    auto op = as<SoftmaxObj>(_op);
+    auto op = as<BangSoftmaxObj>(_op);
     void *const mlu_src = (op->getInputs(0)->getRawDataPtr<void *>());
     void *const mlu_destination = (op->getOutput()->getRawDataPtr<void *>());
@@ -31,7 +31,7 @@ void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
             othersize *= shape[s];
         }
     }
-    if (op->getOpType() == OpType::Softmax)
+    if (op->getOpType() == OpType::BangSoftmax)
         softmaxKernel(context->cnnlHandle(), (float *)mlu_destination,
                       (float *)mlu_src, nDim, axis, othersize, frontsize,
                       dimsize, stride);
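For reference, softmax_kernel's othersize/frontsize/dimsize/stride arguments decompose the input tensor for a softmax along `axis`. A minimal CPU sketch of the same decomposition, assuming the usual row-major contiguous layout (the function and variable names below are illustrative, not part of this patch):

#include <cmath>
#include <vector>

// Computes softmax over `axis` of a contiguous tensor, mirroring the
// parameters the wrapper hands to softmaxKernel.
void softmaxRef(float *dst, const float *src, const std::vector<int> &shape,
                int axis) {
    int dimsize = shape[axis]; // length of the reduced axis
    int stride = 1;            // distance between neighbors along `axis`
    for (int s = axis + 1; s < (int)shape.size(); ++s)
        stride *= shape[s];
    int frontsize = 1; // product of the dimensions before `axis`
    for (int s = 0; s < axis; ++s)
        frontsize *= shape[s];
    // othersize == frontsize * stride: the number of independent rows.
    for (int f = 0; f < frontsize; ++f) {
        for (int t = 0; t < stride; ++t) {
            const int base = f * dimsize * stride + t;
            float maxv = src[base];
            for (int i = 1; i < dimsize; ++i)
                maxv = std::fmax(maxv, src[base + i * stride]);
            float sum = 0.f;
            for (int i = 0; i < dimsize; ++i)
                sum += std::exp(src[base + i * stride] - maxv);
            for (int i = 0; i < dimsize; ++i)
                dst[base + i * stride] =
                    std::exp(src[base + i * stride] - maxv) / sum;
        }
    }
}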
diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h
index ce455d62..edbacfb7 100644
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@@ -59,6 +59,7 @@ class GraphHandlerObj {
     Tensor tanh(Tensor x, Tensor y);
     Tensor erf(Tensor x, Tensor y);
     Tensor softmax(Tensor x, Tensor y, int axis);
+    Tensor bangSoftmax(Tensor x, Tensor y, int axis);
     Tensor abs(Tensor x, Tensor y);
     Tensor sqrt(Tensor x, Tensor y);
     Tensor neg(Tensor x, Tensor y);
diff --git a/include/core/op_type.h b/include/core/op_type.h
index dbcfbdb9..acfaa160 100644
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@@ -180,6 +180,7 @@ struct OpType {
         Size,
         Slice,
         Softmax,
+        BangSoftmax,
         SoftmaxCrossEntropyLoss,
         Softplus,
         Softsign,
diff --git a/include/operators/softmax.h b/include/operators/softmax.h
index b24c0ffb..88134a47 100644
--- a/include/operators/softmax.h
+++ b/include/operators/softmax.h
@@ -24,4 +24,26 @@ class SoftmaxObj : public OperatorObj {
     vector<int> getWorkloadVector() const override;
     vector<int> getOpAttrVector() const override;
 };
+class BangSoftmaxObj : public OperatorObj {
+    int axis;
+
+  public:
+    BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int axis);
+
+    OP_CLONE(BangSoftmaxObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
+        return {{inputs[0]->getDims()}};
+    };
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+    int getAxis() const { return axis; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
 } // namespace infini
diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc
index 0821121d..4b0b8dac 100644
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@@ -227,6 +227,15 @@ Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
             ->getOutput();
     }
 }
+Tensor GraphHandlerObj::bangSoftmax(Tensor input, Tensor output, int axis) {
+    if (output) {
+        g->addOpWithOutputs<BangSoftmaxObj>(std::move(input), output, axis);
+        return output;
+    } else {
+        return g->addOp<BangSoftmaxObj>(std::move(input), output, axis)
+            ->getOutput();
+    }
+}
 
 Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
     if (output) {
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 9dc43510..7a39e547 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -522,6 +522,7 @@ void init_graph_builder(py::module &m) {
         .def("hardSigmoid", &Handler::hardSigmoid, policy::move)
         .def("hardSwish", &Handler::hardSwish, policy::move)
         .def("softmax", &Handler::softmax, policy::move)
+        .def("bangSoftmax", &Handler::bangSoftmax, policy::move)
         .def("abs", &Handler::abs, policy::move)
         .def("sqrt", &Handler::sqrt, policy::move)
         .def("neg", &Handler::neg, policy::move)
diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc
index a6b98db6..4105b168 100644
--- a/src/kernels/bang/activation.cc
+++ b/src/kernels/bang/activation.cc
@@ -246,7 +246,8 @@ REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
                 "Sigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
-
+REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
+                "Softmax_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
                 "HardSigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
diff --git a/src/kernels/bang/softmax.cc b/src/kernels/bang/softmax.cc
index 985d8c46..2ac09934 100644
--- a/src/kernels/bang/softmax.cc
+++ b/src/kernels/bang/softmax.cc
@@ -10,5 +10,5 @@ class SoftmaxBang : public BangKernelWithoutConfig {
     }
 };
 
-REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxBang, "Softmax_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::BangSoftmax, SoftmaxBang, "Softmax_BANG");
 }; // namespace infini
diff --git a/src/kernels/mlu/include/highSoftmax.h b/src/kernels/mlu/include/bangSoftmax.h
similarity index 100%
rename from src/kernels/mlu/include/highSoftmax.h
rename to src/kernels/mlu/include/bangSoftmax.h
diff --git a/src/kernels/mlu/include/bang_highSoftmax.h b/src/kernels/mlu/include/bang_bangSoftmax.h
similarity index 100%
rename from src/kernels/mlu/include/bang_highSoftmax.h
rename to src/kernels/mlu/include/bang_bangSoftmax.h
diff --git a/src/kernels/mlu/src/highSoftmax.mlu b/src/kernels/mlu/src/bangSoftmax.mlu
similarity index 90%
rename from src/kernels/mlu/src/highSoftmax.mlu
rename to src/kernels/mlu/src/bangSoftmax.mlu
index 4771f243..f384d0ae 100644
--- a/src/kernels/mlu/src/highSoftmax.mlu
+++ b/src/kernels/mlu/src/bangSoftmax.mlu
@@ -1,5 +1,5 @@
-#include "bang_highSoftmax.h"
-#include "highSoftmax.h"
+#include "bang_bangSoftmax.h"
+#include "bangSoftmax.h"
 namespace infini{
 void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src,
                    int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){
diff --git a/src/kernels/mlu/src/highSoftmax_device.mlu b/src/kernels/mlu/src/bangSoftmax_device.mlu
similarity index 100%
rename from src/kernels/mlu/src/highSoftmax_device.mlu
rename to src/kernels/mlu/src/bangSoftmax_device.mlu
diff --git a/src/operators/softmax.cc b/src/operators/softmax.cc
index f9dde777..dcac0766 100644
--- a/src/operators/softmax.cc
+++ b/src/operators/softmax.cc
@@ -31,4 +31,33 @@ vector<int> SoftmaxObj::getWorkloadVector() const {
 vector<int> SoftmaxObj::getOpAttrVector() const {
     return {type.underlying(), axis};
 }
+BangSoftmaxObj::BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output,
+                               int _axis)
+    : OperatorObj(OpType::BangSoftmax, {input}, {output}) {
+    int rank = input->getRank();
+    axis = get_real_axis(_axis, rank);
+    IT_ASSERT(checkValid(graph));
+}
+
+std::string BangSoftmaxObj::toString() const {
+    std::ostringstream os;
+    os << type.toString() << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ",";
+    os << "axis=" << axis << ")";
+    return os.str();
+}
+
+vector<int> BangSoftmaxObj::getWorkloadVector() const {
+    vector<int> ret{type.underlying(), axis};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> BangSoftmaxObj::getOpAttrVector() const {
+    return {type.underlying(), axis};
+}
 } // namespace infini
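BangSoftmaxObj's constructor normalizes the axis through get_real_axis, whose body is not part of this diff. Presumably it applies the usual ONNX convention of mapping negative axes into [0, rank); a sketch of that convention (the helper name below is hypothetical, not the repo's implementation):

// Hypothetical stand-in for get_real_axis: maps a possibly negative axis
// into [0, rank), e.g. axis = -1 with rank = 4 selects dimension 3.
int normalizeAxis(int axis, int rank) {
    int real = axis < 0 ? axis + rank : axis;
    // A real implementation would also assert 0 <= real && real < rank.
    return real;
}

Note that getWorkloadVector appends the output shape after the op type and axis, so kernel performance records are keyed per shape as well as per attribute.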
diff --git a/test/kernels/bang/test_bang_softmax.cc b/test/kernels/bang/test_bang_softmax.cc
index 83fcd0e8..0da12779 100644
--- a/test/kernels/bang/test_bang_softmax.cc
+++ b/test/kernels/bang/test_bang_softmax.cc
@@ -5,134 +5,121 @@
 #include "operators/softmax.h"
 #include "test.h"
 #include <cmath>
+#include <sys/time.h>
 
 namespace infini {
 
 double eps = 3e-3;
 
-TEST(cuDNN_Softmax, run_axis1) {
-    // Runtime
+void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
+                      int axis, const vector<float> &expectData) {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto bangRuntime = make_ref<BangRuntimeObj>();
 
     // Build input data on CPU
     Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
 
     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
     bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    inputGpu->copyin(inputData);
     bangRuntime->run(bangGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->copyin(inputData);
+    bangRuntime->run(bangGraph);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
-                      0.032058604, 0.08714432, 0.23688284, 0.6439143},
-        eps));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(expectData, eps));     // cnnlSoftmax
+    EXPECT_TRUE(bangOutputGpu2Cpu->equalData(expectData, eps)); // bangSoftmax
+}
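The Shape{2, 4} case exercised further below feeds the rows {0, 1, 2, 3} and {1000, 1001, 1002, 1003} and expects the same four probabilities for both rows: softmax is shift invariant, and a numerically stable kernel subtracts the row maximum before exponentiating, so the large row cannot overflow. A standalone check of the expected numbers (illustrative, not part of the test):

#include <cmath>
#include <cstdio>

int main() {
    // Naive exp(1000.f) would overflow float to inf; after subtracting the
    // row max the largest argument is exp(0).
    float row[4] = {1000.f, 1001.f, 1002.f, 1003.f};
    float e[4], sum = 0.f;
    for (int i = 0; i < 4; ++i)
        sum += e[i] = std::exp(row[i] - 1003.f);
    for (int i = 0; i < 4; ++i)
        std::printf("%.9f ", e[i] / sum); // 0.032058604 0.087144320 ...
    return 0;
}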
+double get_walltime() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    return (double)(tp.tv_sec + tp.tv_usec * 1e-6);
+}
+float err(float *x, float *y, const Shape &inputShape, int nDim) {
+    int size = 1;
+    for (int i = 0; i < nDim; i++) {
+        size *= inputShape[i];
+    }
+    float error = 0;
+    for (int i = 0; i < size; i++) {
+        if (fabs(x[i] - y[i]) > error) {
+            error = fabs(x[i] - y[i]);
+        }
+    }
+    return error;
+}
+void test_compareSoftmaxFp32(
+    int axis, const Shape &inputShape, int nDim,
+    const std::function<void(void *, size_t, DataType)> &generator) {
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double cnnlst, cnnlela;
+    cnnlst = get_walltime();
+    bangRuntime->run(bangGraph);
+    cnnlela = 1000 * (get_walltime() - cnnlst);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double bangst, bangela;
+    bangst = get_walltime();
+    bangRuntime->run(bangGraph);
+    bangela = 1000 * (get_walltime() - bangst);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
+    // Check
+    float error =
+        err(outputGpu2Cpu->getRawDataPtr<float *>(),
+            bangOutputGpu2Cpu->getRawDataPtr<float *>(), inputShape, nDim);
+    printf("axis:%d. bang time:%.2f ms, cnnl time:%.2f ms, err:%.4e\n", axis,
+           bangela, cnnlela, error);
+}
+TEST(BANG_SoftmaxFp32, run) {
+    test_softmaxFp32(
+        Shape{2, 3, 2, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
+    test_softmaxFp32(
+        Shape{2, 4}, vector<float>{0., 1., 2., 3., 1000, 1001, 1002, 1003}, 1,
+        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
+                      0.032058604, 0.08714432, 0.23688284, 0.6439143});
+}
+TEST(BANG_CompareSoftmaxFp32, run) {
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, IncrementalGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4,
+                            IncrementalGenerator());
+}
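The expected values in the axis-0 case above can be checked by hand: in the incremental data for Shape{2, 3, 2, 2}, the two entries that share a softmax row sit 3 * 2 * 2 = 12 elements apart and differ by exactly 12, so every row reduces to a two-element softmax of {x, x + 12}, which is independent of x. A quick verification (illustrative; the printed values match the test constants up to float rounding):

#include <cmath>
#include <cstdio>

int main() {
    // Two-element softmax of {x, x + 12} for any x.
    double lo = 1.0 / (1.0 + std::exp(12.0)); // matches 6.14417422e-06
    double hi = 1.0 - lo;                     // matches 9.99993801e-01
    std::printf("%.8e %.8e\n", lo, hi);
    return 0;
}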
-
-TEST(cuDNN_Softmax, run_axis0) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
-    bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0., 0., 0., 0., 1, 1, 1, 1}, eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis1) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
-                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
-                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
-                      0.9820138},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis2) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029,
-                      0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
-                      0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971,
-                      0.8807971},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis3) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
-                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586},
-        eps));
-}
 } // namespace infini