diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc
index 23b86cc4..87b8396f 100644
--- a/src/kernels/bang/activation.cc
+++ b/src/kernels/bang/activation.cc
@@ -1,5 +1,6 @@
 #include "bang/bang_kernel_without_config.h"
 #include "bang/bang_runtime.h"
+#include "operators/softmax.h"
 #include "operators/unary.h"

 namespace infini {
@@ -113,6 +114,72 @@ class PReluCnnl : public BangKernelWithoutConfig {
     }
 };

+class SoftmaxCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SoftmaxObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto aDim = op->getInputs(0)->getDims();
+
+        cnnlSoftmaxMode_t mode;
+        size_t axis = op->getAxis();
+        std::vector<int> inDim = {1, 1, 1};
+        std::vector<int> outDim = inDim;
+
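+        // cnnlSoftmaxForward_v2 appears to work on a rank-3 view of the
+        // tensor: the shape is folded to three dimensions below, and the
+        // mode tells CNNL which of the three holds the softmax axis. For
+        // example, shape [2, 3, 4, 5] with axis = 2 takes the
+        // MEDIUM_DIMENSION branch and folds to inDim = {2 * 3, 4, 5}
+        // = {6, 4, 5}, with softmax applied over the middle dimension.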
+        if (axis == 0) {
+            mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
+            inDim[0] = aDim[0];
+            inDim[1] = aDim[1];
+            for (size_t i = 2; i < aDim.size(); ++i) {
+                inDim[2] *= aDim[i];
+            }
+            outDim = inDim;
+        } else if (axis == aDim.size() - 1) {
+            mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
+            inDim[0] = aDim[0];
+            for (size_t i = 1; i < axis; ++i) {
+                inDim[1] *= aDim[i];
+            }
+            inDim[2] = aDim[axis];
+            outDim = inDim;
+        } else {
+            mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
+            for (size_t i = 0; i < axis; ++i) {
+                inDim[0] *= aDim[i];
+            }
+            inDim[1] = aDim[axis];
+            for (size_t i = axis + 1; i < aDim.size(); ++i) {
+                inDim[2] *= aDim[i];
+            }
+            outDim = inDim;
+        }
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, inDim.size(),
+                                               inDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, outDim.size(),
+                                               outDim.data()));
+        float alpha = 1.0;
+        float beta = 0.0;
+        cnnlStatus_t stat =
+            cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
+                                  mode, CNNL_COMPUTATION_HIGH_PRECISION, &alpha,
+                                  aDesc, aData, &beta, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
 class ReluCnnl : public UnaryCnnl {
     cnnlActivationMode_t getOpType() const override {
         return CNNL_ACTIVATION_RELU;
@@ -135,5 +202,7 @@ REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
                 "Sigmoid_cnnl_BANG_Float32");
 REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
                 "Round_cnnl_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::Softmax, DataType::Float32, SoftmaxCnnl,
+                "Softmax_cnnl_BANG_Float32");

 }; // namespace infini
diff --git a/src/kernels/bang/conv_trans.cc b/src/kernels/bang/conv_trans.cc
index 05ec04fb..baa84c8a 100644
--- a/src/kernels/bang/conv_trans.cc
+++ b/src/kernels/bang/conv_trans.cc
@@ -39,24 +39,17 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
         if (dimOutput.size() != 4)
             IT_TODO_HALT();

-        int inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
-                          dimInputs0[3]};
-        int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
-                          dimInputs1[3]};
-        int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
-                         dimOutput[3]};
-
         // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, inputs0));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs0.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, inputs1));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimInputs1.data()));
         // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, output));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimOutput.data()));

         cnnlConvolutionBwdDataAlgo_t algo;
         cnnlGetConvolutionBackwardDataAlgorithm(
@@ -69,7 +62,7 @@ class ConvTransCnnl : public BangKernelWithoutConfig {
         BangPtr wsData = context->getWorkspace(wsSize);

         cnnlStatus_t stat = cnnlConvolutionBackwardData(
-            context->cnnlHandle(), NULL, aDesc, aData, bDesc, bData, convDesc,
+            context->cnnlHandle(), NULL, bDesc, bData, aDesc, aData, convDesc,
             algo, wsData, wsSize, NULL, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc
index 9255e3da..9c1d95b4 100644
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@@ -21,6 +21,13 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
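+        // CNNL tensor descriptors apparently cannot describe rank-0
+        // (scalar) tensors, so scalars are widened to shape {1} before the
+        // descriptors are built; the same fixup recurs in every binary
+        // kernel in this file.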
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -77,6 +84,13 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -123,6 +137,13 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -168,6 +189,13 @@ class DivCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -213,6 +241,13 @@ class MaximumCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -257,6 +292,13 @@ class MinimumCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -301,6 +343,13 @@ class MSELossCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -351,6 +400,14 @@ class PowerCnnl : public BangKernelWithoutConfig {
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();

+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }
+
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                                CNNL_DTYPE_FLOAT, a_dim.size(),
@@ -395,6 +452,13 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -440,6 +504,13 @@ class FloorModCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
@@ -485,6 +556,13 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
         auto a_dim = op->getInputs(0)->getDims();
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
+        if (a_dim.size() == 0) {
+            a_dim.push_back(1);
+        }
+
+        if (b_dim.size() == 0) {
+            b_dim.push_back(1);
+        }

         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
diff --git a/src/kernels/bang/gather.cc b/src/kernels/bang/gather.cc
new file mode 100644
index 00000000..b5a326fc
--- /dev/null
+++ b/src/kernels/bang/gather.cc
@@ -0,0 +1,53 @@
+#include "operators/gather.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class GatherCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<GatherObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        auto cDim = op->getOutput()->getDims();
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_INT32, bDim.size(),
+                                               bDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
+
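+        // cnnlGatherV2 seems to expect the input shape as an int32 array in
+        // device memory, so aDim is staged through the workspace first; the
+        // literal 4 below is presumably sizeof(int32_t).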
+        BangPtr wsData = context->getWorkspace(aDim.size() * 4);
+        context->copyBlobFromCPU(wsData, aDim.data(), aDim.size() * 4);
+
+        auto axis = op->getAxis();
+        cnnlStatus_t stat =
+            cnnlGatherV2(context->cnnlHandle(), axis, aDesc, aData,
+                         (int *)wsData, bDesc, (int *)bData, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Gather, DataType::Float32, GatherCnnl,
+                "Gather_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/reduce_mean.cc b/src/kernels/bang/reduce_mean.cc
new file mode 100644
index 00000000..2dd77e10
--- /dev/null
+++ b/src/kernels/bang/reduce_mean.cc
@@ -0,0 +1,69 @@
+#include "operators/reduce_mean.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class ReduceMeanCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ReduceMeanObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto aDim = op->getInputs(0)->getDims();
+        auto axes_set = op->getAxes();
+        std::vector<int> axes;
+        axes.assign(axes_set.begin(), axes_set.end());
+        auto bDim = aDim;
+        for (auto it : axes) {
+            bDim[it] = 1;
+        }
+
+        cnnlTensorDescriptor_t inDesc, outDesc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
+        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, bDim.size(),
+                                               bDim.data()));
+
+        // get reduce descriptor
+        cnnlReduceDescriptor_t reduceDesc;
+        checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
+        checkCnnlError(cnnlSetReduceDescriptor_v2(
+            reduceDesc, axes.data(), axes.size(), CNNL_REDUCE_AVG,
+            CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
+            CNNL_32BIT_INDICES, 0.0));
+
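+        // A single allocation serves two purposes below: the first
+        // workspaceSize bytes are cnnlReduce's scratch space, and the tail
+        // holds the reduction axes, which cnnlReduce apparently reads from
+        // device memory.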
+        // get workspace
+        size_t workspaceSize = 0;
+        checkCnnlError(cnnlGetReduceOpWorkspaceSize(context->cnnlHandle(),
+                                                    inDesc, outDesc, reduceDesc,
+                                                    &workspaceSize));
+        int indicesSize = axes.size() * sizeof(int);
+        BangPtr wsData = context->getWorkspace(workspaceSize + indicesSize);
+
+        BangPtr indicesData = (char *)wsData + workspaceSize;
+        context->copyBlobFromCPU(indicesData, axes.data(), indicesSize);
+
+        // reduce
+        float alpha = 1.f, beta = 0.f;
+        checkCnnlError(cnnlReduce(
+            context->cnnlHandle(), reduceDesc, wsData, workspaceSize, &alpha,
+            inDesc, aData, indicesSize, indicesData, &beta, outDesc, cData));
+
+        // Destroy calls in CUDA do not require a sync, but CNNL does not
+        // state whether a sync is required before destroying descriptors.
+        checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
+        checkCnnlError(cnnlDestroyReduceDescriptor(reduceDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::ReduceMean, DataType::Float32,
+                ReduceMeanCnnl, "ReduceMean_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/kernels/bang/where.cc b/src/kernels/bang/where.cc
new file mode 100644
index 00000000..725b63e0
--- /dev/null
+++ b/src/kernels/bang/where.cc
@@ -0,0 +1,73 @@
+#include "operators/where.h"
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+
+namespace infini {
+class WhereCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<WhereObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
+        void *const dData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, dDesc;
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        auto cDim = op->getInputs(2)->getDims();
+        auto dDim = op->getOutput()->getDims();
+
+        if (aDim.size() == 0) {
+            aDim.push_back(1);
+        }
+        if (bDim.size() == 0) {
+            bDim.push_back(1);
+        }
+        if (cDim.size() == 0) {
+            cDim.push_back(1);
+        }
+        if (dDim.size() == 0) {
+            dDim.push_back(1);
+        }
+
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, bDim.size(),
+                                               bDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_BOOL, cDim.size(),
+                                               cDim.data()));
+        checkCnnlError(cnnlCreateTensorDescriptor(&dDesc));
+        checkCnnlError(cnnlSetTensorDescriptor(dDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dDim.size(),
+                                               dDim.data()));
+        size_t wsSize;
+        cnnlGetSelectV2WorkspaceSize(context->cnnlHandle(), cDesc, aDesc,
+                                     bDesc, &wsSize);
+        BangPtr wsData = context->getWorkspace(wsSize);
+
+        cnnlStatus_t stat =
+            cnnlSelectV2(context->cnnlHandle(), cDesc, cData, aDesc, aData,
+                         bDesc, bData, wsData, wsSize, dDesc, dData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(dDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Where, DataType::Float32, WhereCnnl,
+                "Where_cnnl_BANG_Float32");
+
+}; // namespace infini