diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc
index cd19906b..23b86cc4 100644
--- a/src/kernels/bang/activation.cc
+++ b/src/kernels/bang/activation.cc
@@ -16,25 +16,17 @@ class UnaryCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        int len = dim.size();
-        int size = 1;
-        for (int i = 0; i < len; ++i) {
-            size *= dim[i];
-        }
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[1] = {size};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
-                                               CNNL_DTYPE_FLOAT, 1, dim_array));
-
-        // get outputs
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
-                                               CNNL_DTYPE_FLOAT, 1, dim_array));
-
-        // get op descriptor
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlActivationDescriptor_t opDesc;
         checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
         checkCnnlError(cnnlSetActivationDescriptor(
@@ -46,9 +38,6 @@ class UnaryCnnl : public BangKernelWithoutConfig {
                            aData, &beta, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
         checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
@@ -65,28 +54,21 @@ class RoundCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
@@ -103,31 +85,28 @@ class PReluCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        int alpha_array[4] = {1, 1, 1, 1};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, alpha_array));
-        // get outputs
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, bDim.size(),
+                                               bDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData,
                                       bDesc, bData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
diff --git a/src/kernels/bang/activation_backward.cc b/src/kernels/bang/activation_backward.cc
index 78be8c0d..cc70afce 100644
--- a/src/kernels/bang/activation_backward.cc
+++ b/src/kernels/bang/activation_backward.cc
@@ -18,28 +18,27 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
         void *const diffXData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto yDim = op->getInputs(0)->getDims();
+        auto diffyDim = op->getInputs(1)->getDims();
+        auto xDim = op->getInputs(2)->getDims();
+        auto diffxDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
         checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-        // get inputs
+                                               CNNL_DTYPE_FLOAT, yDim.size(),
+                                               yDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-        // get inputs
+        checkCnnlError(cnnlSetTensorDescriptor(
+            diffYDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffyDim.size(),
+            diffyDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
         checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, xDim.size(),
+                                               xDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+        checkCnnlError(cnnlSetTensorDescriptor(
+            diffXDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, diffxDim.size(),
+            diffxDim.data()));
         // get op descriptor
         cnnlActivationDescriptor_t opDesc;
         checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
@@ -53,8 +52,6 @@ class ActivationBackwardCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(yDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(xDesc));
diff --git a/src/kernels/bang/cast.cc b/src/kernels/bang/cast.cc
index a3d56654..267e4c2d 100644
--- a/src/kernels/bang/cast.cc
+++ b/src/kernels/bang/cast.cc
@@ -14,154 +14,189 @@ class CastCnnl : public BangKernelWithoutConfig {
         cnnlTensorDescriptor_t aDesc, cDesc;
         auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
-
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         cnnlCastDataType_t NlCastType;
         CastType type = op->getType();
         switch (type) {
         case CastType::Float2Int64:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_FLOAT_TO_INT64;
             break;
         case CastType::Float2Int32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_FLOAT_TO_INT32;
             break;
         case CastType::Float2Int16:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT16, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_FLOAT_TO_INT16;
             break;
         case CastType::Float2Int8:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT8, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_FLOAT_TO_INT8;
             break;
         case CastType::Int322Float:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT32_TO_FLOAT;
             break;
         case CastType::Int322Int8:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT8, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT32_TO_INT8;
             break;
         case CastType::Int322Int16:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT16, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT32_TO_INT16;
             break;
         case CastType::Int162Float:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT16, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT16_TO_FLOAT;
             break;
         case CastType::Int162Int32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT16, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT16_TO_INT32;
             break;
         case CastType::Int82Float:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT8_TO_FLOAT;
             break;
         case CastType::Int82Int16:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT16, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT8_TO_INT16;
             break;
         case CastType::Int82Int32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT8_TO_INT32;
             break;
         case CastType::Uint82Float:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_UINT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_UINT8_TO_FLOAT;
             break;
         case CastType::Uint82Int32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_UINT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_UINT8_TO_INT32;
             break;
         case CastType::Uint82Int64:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_UINT8, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_UINT8_TO_INT64;
             break;
         case CastType::Int322Int64:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT32_TO_INT64;
             break;
         case CastType::Int642Int32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT32, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT64_TO_INT32;
             break;
         case CastType::Int642Uint32:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_UINT32,
+                                                   dim.size(), dim.data()));
             NlCastType = CNNL_CAST_INT64_TO_UINT32;
             break;
         case CastType::Int642Float:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_FLOAT, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_INT64_TO_FLOAT;
             break;
         case CastType::Uint322Int64:
-            checkCnnlError(cnnlSetTensorDescriptor(
-                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_UINT32,
+                                                   dim.size(), dim.data()));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                                   CNNL_DTYPE_INT64, dim.size(),
+                                                   dim.data()));
             NlCastType = CNNL_CAST_UINT32_TO_INT64;
             break;
         default:
@@ -172,8 +207,6 @@ class CastCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/ceil.cc b/src/kernels/bang/ceil.cc
index 5770f412..c3d0f3d0 100644
--- a/src/kernels/bang/ceil.cc
+++ b/src/kernels/bang/ceil.cc
@@ -13,28 +13,23 @@ class CeilCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/clip.cc b/src/kernels/bang/clip.cc
index bdfb473b..12b71fdc 100644
--- a/src/kernels/bang/clip.cc
+++ b/src/kernels/bang/clip.cc
@@ -15,23 +15,17 @@ class ClipCnnl : public BangKernelWithoutConfig {
         float max = op->getMax().value();
         cnnlTensorDescriptor_t aDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         cnnlStatus_t stat =
             cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
     }
 };
diff --git a/src/kernels/bang/concat.cc b/src/kernels/bang/concat.cc
index 1bfc1d33..ab535879 100644
--- a/src/kernels/bang/concat.cc
+++ b/src/kernels/bang/concat.cc
@@ -10,40 +10,29 @@ class ConcatCnnl : public BangKernelWithoutConfig {
         auto context = dynamic_cast<BangRuntimeObj *>(_context);
         int num = op->numInputs();
         int axis = op->getDim();
+
+        auto cDim = op->getOutput()->getDims();
+        cnnlTensorDescriptor_t desc;
+        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
+        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
+
+        cnnlTensorDescriptor_t descArray[num];
+        for (int i = 0; i < num; ++i) {
+            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
+                op->getInputs(i)->getDims().size(),
+                op->getInputs(i)->getDims().data()));
+        }
+
         void *argv[num];
         for (int i = 0; i < num; ++i) {
             argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
         }
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
-        cnnlTensorDescriptor_t desc;
-
-        int dim_array[num][4];
-        for (int i = 0; i < num; ++i) {
-            auto dim = op->getInputs(i)->getDims();
-            if (dim.size() != 4) {
-                IT_TODO_HALT();
-            }
-            dim_array[i][0] = dim[0];
-            dim_array[i][1] = dim[1];
-            dim_array[i][2] = dim[2];
-            dim_array[i][3] = dim[3];
-        }
-
-        auto dim = op->getOutput()->getDims();
-        int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-
-        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array));
-        cnnlTensorDescriptor_t descArray[num];
-        for (int i = 0; i < num; ++i) {
-            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
-            checkCnnlError(
-                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
-                                        CNNL_DTYPE_FLOAT, 4, dim_array[i]));
-        }
-
         size_t wsSize;
         cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
@@ -54,8 +43,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         for (int i = 0; i < num; ++i) {
             checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
         }
diff --git a/src/kernels/bang/det.cc b/src/kernels/bang/det.cc
index 02726e69..182baaa7 100644
--- a/src/kernels/bang/det.cc
+++ b/src/kernels/bang/det.cc
@@ -21,28 +21,22 @@ class DetCnnl : public BangKernelWithoutConfig {
         cnnlTensorDescriptor_t aDesc, cDesc;
         auto dimin = op->getInputs(0)->getDims();
         auto dimout = op->getOutput()->getDims();
-        if (dimin.size() != 4 || dimout.size() != 2)
-            IT_TODO_HALT();
-        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
-        int dimout_array[2] = {dimout[0], dimout[1]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dimin.size(),
+                                               dimin.data()));
-        // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dimout.size(),
+                                               dimout.data()));
         cnnlStatus_t stat = cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData,
                                     cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc
index 7130b6a7..9255e3da 100644
--- a/src/kernels/bang/element_wise.cc
+++ b/src/kernels/bang/element_wise.cc
@@ -22,24 +22,21 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
         auto b_dim = op->getInputs(1)->getDims();
         auto c_dim = op->getOutput()->getDims();
-        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4)
-            IT_TODO_HALT();
-
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, a_dim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, b_dim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
-        // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, c_dim.data()));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
-        // get op descriptor
         cnnlOpTensorDescriptor_t opDesc;
         checkCnnlError(cnnlCreateOpTensorDescriptor(&opDesc));
         checkCnnlError(cnnlSetOpTensorDescriptor(
@@ -58,8 +55,6 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -79,29 +74,26 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                     &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat =
@@ -110,8 +102,6 @@ class LogicOpCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -130,29 +120,26 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_INT32, 4, dim_array));
-
+                                               CNNL_DTYPE_INT32, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_INT32, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_INT32, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_INT32, 4, dim_array));
+                                               CNNL_DTYPE_INT32, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                        cDesc, &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat =
@@ -161,8 +148,6 @@ class BitComputeCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -180,29 +165,26 @@ class DivCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                 &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat = cnnlDiv_v2(
@@ -211,8 +193,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -230,26 +210,23 @@ class MaximumCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
-        // get op descriptor
         size_t wsSize;
         cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
@@ -260,8 +237,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -279,26 +254,23 @@ class MinimumCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
-        // get op descriptor
         size_t wsSize;
         cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
@@ -309,8 +281,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -328,30 +298,23 @@ class MSELossCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         MSELossObj::Reduction reduction = op->getReduction();
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        int dim_out[4] = {1, 1, 1, 1};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
-        // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        if (reduction == MSELossObj::None) {
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
-        } else {
-            checkCnnlError(cnnlSetTensorDescriptor(
-                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out));
-        }
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         cnnlStatus_t stat;
         if (reduction == MSELossObj::None) {
             stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
@@ -367,8 +330,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -386,26 +347,23 @@ class PowerCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
-        // get op descriptor
         size_t wsSize;
         cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                 &wsSize);
@@ -417,8 +375,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -436,29 +392,26 @@ class FloorDivCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                      &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat = cnnlFloorDiv_v2(
@@ -467,8 +420,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -486,29 +437,26 @@ class FloorModCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                      &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat =
@@ -517,8 +465,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
@@ -536,29 +482,26 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
+                                               CNNL_DTYPE_FLOAT, a_dim.size(),
+                                               a_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, b_dim.size(),
+                                               b_dim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, c_dim.size(),
+                                               c_dim.data()));
         size_t wsSize;
         cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
                                               bDesc, cDesc, &wsSize);
-
         BangPtr wsData = context->getWorkspace(wsSize);
         cnnlStatus_t stat =
@@ -567,8 +510,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
diff --git a/src/kernels/bang/erf.cc b/src/kernels/bang/erf.cc
index 86c1e3a3..5f1c0985 100644
--- a/src/kernels/bang/erf.cc
+++ b/src/kernels/bang/erf.cc
@@ -13,20 +13,17 @@ class ErfCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -34,8 +31,6 @@
                        aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/exp.cc b/src/kernels/bang/exp.cc
index 9d7d31f4..fa71be72 100644
--- a/src/kernels/bang/exp.cc
+++ b/src/kernels/bang/exp.cc
@@ -13,20 +13,17 @@ class ExpCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -34,8 +31,6 @@
                        aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/fill.cc b/src/kernels/bang/fill.cc
index 0f8fb846..c3f75311 100644
--- a/src/kernels/bang/fill.cc
+++ b/src/kernels/bang/fill.cc
@@ -13,23 +13,18 @@ class FillCnnl : public BangKernelWithoutConfig {
         float value = op->getValue();
         cnnlTensorDescriptor_t cDesc;
-        auto dim = op->getOutput()->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlFill(context->cnnlHandle(), value, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
 };
diff --git a/src/kernels/bang/floor.cc b/src/kernels/bang/floor.cc
index a0f2a082..dd049d1d 100644
--- a/src/kernels/bang/floor.cc
+++ b/src/kernels/bang/floor.cc
@@ -13,28 +13,23 @@ class FloorCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/hardtanh.cc b/src/kernels/bang/hardtanh.cc
index 1c4ad697..2cdb89fe 100644
--- a/src/kernels/bang/hardtanh.cc
+++ b/src/kernels/bang/hardtanh.cc
@@ -16,22 +16,16 @@ class HardtanhCnnl : public BangKernelWithoutConfig {
         cnnlTensorDescriptor_t aDesc;
         auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
         cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
                                          max, min, aDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
     }
 };
diff --git a/src/kernels/bang/l2loss.cc b/src/kernels/bang/l2loss.cc
index d7c66859..7fb5d3a8 100644
--- a/src/kernels/bang/l2loss.cc
+++ b/src/kernels/bang/l2loss.cc
@@ -14,22 +14,16 @@ class L2LossCnnl : public BangKernelWithoutConfig {
         cnnlTensorDescriptor_t aDesc;
         auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
         cnnlStatus_t stat =
             cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
     }
 };
diff --git a/src/kernels/bang/log.cc b/src/kernels/bang/log.cc
index 4976b1ca..6237992e 100644
--- a/src/kernels/bang/log.cc
+++ b/src/kernels/bang/log.cc
@@ -28,20 +28,17 @@ class LogCnnl : public BangKernelWithoutConfig {
         }
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -49,8 +46,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/matmul.cc b/src/kernels/bang/matmul.cc
index 56d9cf0f..39888e71 100644
--- a/src/kernels/bang/matmul.cc
+++ b/src/kernels/bang/matmul.cc
@@ -22,7 +22,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
         int32_t transA = op->getTransA();
         int32_t transB = op->getTransB();
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(
             cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
                                     dimInputs0.size(), dimInputs0.data()));
@@ -33,7 +32,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
             cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
                                     dimInputs1.size(), dimInputs1.data()));
-        // get outputs
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(
             cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
                                     dimOutput.size(), dimOutput.data()));
@@ -68,8 +66,6 @@
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
diff --git a/src/kernels/bang/negtensor.cc b/src/kernels/bang/negtensor.cc
index b0171120..02c5c37c 100644
--- a/src/kernels/bang/negtensor.cc
+++ b/src/kernels/bang/negtensor.cc
@@ -13,28 +13,23 @@ class NegTensorCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/reciprocal.cc b/src/kernels/bang/reciprocal.cc
index 38a22fd6..6ac3f334 100644
--- a/src/kernels/bang/reciprocal.cc
+++ b/src/kernels/bang/reciprocal.cc
@@ -13,28 +13,23 @@ class ReciprocalCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/reshape.cc b/src/kernels/bang/reshape.cc
index 883cd9ba..564ed1d7 100644
--- a/src/kernels/bang/reshape.cc
+++ b/src/kernels/bang/reshape.cc
@@ -11,17 +11,11 @@ class CopyBang : public BangKernelWithoutConfig {
         auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
         cnnlTensorDescriptor_t aDesc;
         auto dim = op->getInputs(0)->getDims();
-        int len = dim.size();
-        int size = 1;
-        for (int i = 0; i < len; ++i) {
-            size *= dim[i];
-        }
-        int dim_array[1] = {size};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
-                                               CNNL_DTYPE_FLOAT, 1, dim_array));
+                                               CNNL_DTYPE_FLOAT, dim.size(),
+                                               dim.data()));
         cnnlStatus_t stat =
             cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData);
         if (stat != CNNL_STATUS_SUCCESS)
diff --git a/src/kernels/bang/rsqrt.cc b/src/kernels/bang/rsqrt.cc
index fea06e13..0da3c74d 100644
--- a/src/kernels/bang/rsqrt.cc
+++ b/src/kernels/bang/rsqrt.cc
@@ -13,20 +13,17 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));
         cnnlStatus_t stat =
             cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -34,8 +31,6 @@
                          aDesc, aData, cDesc, cData);
         if (stat != CNNL_STATUS_SUCCESS)
             return;
-
-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
diff --git a/src/kernels/bang/rsqrt.cc b/src/kernels/bang/rsqrt.cc
index fea06e13..0da3c74d 100644
--- a/src/kernels/bang/rsqrt.cc
+++ b/src/kernels/bang/rsqrt.cc
@@ -13,20 +13,17 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());

         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();

-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));

         cnnlStatus_t stat =
             cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -34,8 +31,6 @@ class RsqrtCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;

-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/split.cc b/src/kernels/bang/split.cc
index bfa842bc..bf3f8123 100644
--- a/src/kernels/bang/split.cc
+++ b/src/kernels/bang/split.cc
@@ -10,39 +10,26 @@ class SplitCnnl : public BangKernelWithoutConfig {
         auto context = dynamic_cast<const BangRuntimeObj *>(_context);

         int num = op->numOutputs();
         int axis = op->getDim();
-        void *argv[num];
-        for (int i = 0; i < num; ++i) {
-            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
-        }
-        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
-        cnnlTensorDescriptor_t desc;
-
-        int dimout_array[num][4];
-        for (int i = 0; i < num; ++i) {
-            auto dim = op->getOutput(i)->getDims();
-            if (dim.size() != 4) {
-                IT_TODO_HALT();
-            }
-            dimout_array[i][0] = dim[0];
-            dimout_array[i][1] = dim[1];
-            dimout_array[i][2] = dim[2];
-            dimout_array[i][3] = dim[3];
-        }
         auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4) {
-            IT_TODO_HALT();
-        }
-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        cnnlTensorDescriptor_t desc;
         checkCnnlError(cnnlCreateTensorDescriptor(&desc));
-        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+        checkCnnlError(cnnlSetTensorDescriptor(
+            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, dim.size(), dim.data()));
+
         cnnlTensorDescriptor_t descArray[num];
         for (int i = 0; i < num; ++i) {
             checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
-            checkCnnlError(
-                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
-                                        CNNL_DTYPE_FLOAT, 4, dimout_array[i]));
+            checkCnnlError(cnnlSetTensorDescriptor(
+                descArray[i], CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT,
+                op->getOutput(i)->getDims().size(),
+                op->getOutput(i)->getDims().data()));
+        }
+
+        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *argv[num];
+        for (int i = 0; i < num; ++i) {
+            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
         }

         size_t wsSize;
@@ -55,8 +42,6 @@ class SplitCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;

-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         for (int i = 0; i < num; ++i) {
             checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
         }
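Note: split.cc keeps two C-style variable-length arrays (cnnlTensorDescriptor_t descArray[num] and void *argv[num]). VLAs are a GCC/Clang extension rather than standard C++, and the loop calls getOutput(i)->getDims() twice, materializing two temporaries per iteration. If portability ever matters, std::vector is a drop-in replacement; a minimal sketch under that assumption, reusing the surrounding kernel's names:

    #include <vector>

    std::vector<cnnlTensorDescriptor_t> descArray(num);
    std::vector<void *> argv(num);
    for (int i = 0; i < num; ++i) {
        auto oDim = op->getOutput(i)->getDims(); // cache: one temporary, not two
        checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
        checkCnnlError(cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, oDim.size(),
                                               oDim.data()));
        argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
    }
    // cnnlSplit then takes descArray.data() and argv.data() where the VLAs were used.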
diff --git a/src/kernels/bang/sqrt.cc b/src/kernels/bang/sqrt.cc
index 68715912..52fea02a 100644
--- a/src/kernels/bang/sqrt.cc
+++ b/src/kernels/bang/sqrt.cc
@@ -13,20 +13,17 @@ class SqrtCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());

         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();

-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));

         cnnlStatus_t stat =
             cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
@@ -34,8 +31,6 @@ class SqrtCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;

-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
     }
diff --git a/src/kernels/bang/transpose.cc b/src/kernels/bang/transpose.cc
index c484824c..c87c4c28 100644
--- a/src/kernels/bang/transpose.cc
+++ b/src/kernels/bang/transpose.cc
@@ -15,26 +15,21 @@ class TransposeCnnl : public BangKernelWithoutConfig {
         cnnlTensorDescriptor_t aDesc, cDesc;
         auto dimin = op->getInputs(0)->getDims();
         auto dimout = op->getOutput()->getDims();
-        if (dimin.size() != 4 || dimout.size() != 4)
-            IT_TODO_HALT();

-        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
-        int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));
-
-        // get outputs
+        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dimin.size(),
+                                               dimin.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array));
+        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
+                                               CNNL_DTYPE_FLOAT, dimout.size(),
+                                               dimout.data()));

-        // get op descriptor
         auto permute = op->getPermute();
         cnnlTransposeDescriptor_t opDesc;
         checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
-        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute.data()));
+        checkCnnlError(
+            cnnlSetTransposeDescriptor(opDesc, permute.size(), permute.data()));

         size_t wsSize;
         cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc,
@@ -47,8 +42,6 @@ class TransposeCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;

-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
         checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
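Note: with cnnlSetTransposeDescriptor now fed permute.size(), the transpose kernel follows whatever rank the operator carries instead of being pinned to 4-D. For instance, a rank-3 axis swap would flow through unchanged; a sketch (the shape and permutation here are illustrative only, not from this PR):

    #include <vector>

    // Swap the last two axes of a rank-3 tensor, e.g. {N, H, W} -> {N, W, H}.
    std::vector<int> permute = {0, 2, 1};
    cnnlTransposeDescriptor_t opDesc;
    checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
    checkCnnlError(
        cnnlSetTransposeDescriptor(opDesc, permute.size(), permute.data()));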
diff --git a/src/kernels/bang/trigon.cc b/src/kernels/bang/trigon.cc
index fcf56472..b4842b95 100644
--- a/src/kernels/bang/trigon.cc
+++ b/src/kernels/bang/trigon.cc
@@ -15,22 +15,18 @@ class TrigonCnnl : public BangKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());

         cnnlTensorDescriptor_t aDesc, cDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto aDim = op->getInputs(0)->getDims();
+        auto cDim = op->getOutput()->getDims();

-        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
-        // get inputs
         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
-
-        // get outputs
+                                               CNNL_DTYPE_FLOAT, aDim.size(),
+                                               aDim.data()));
         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                               CNNL_DTYPE_FLOAT, 4, dim_array));
+                                               CNNL_DTYPE_FLOAT, cDim.size(),
+                                               cDim.data()));

-        // get op descriptor
         cnnlTrigonDescriptor_t opDesc;
         checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
         checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));
@@ -40,8 +36,6 @@ class TrigonCnnl : public BangKernelWithoutConfig {
         if (stat != CNNL_STATUS_SUCCESS)
             return;

-        // Destories in BANG does not require sync. But cnnl does not state
-        // whether sync is required before destories.
         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
         checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc));
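Note: taken together, removing the dim.size() != 4 halts means these kernels now accept whatever rank cnnlSetTensorDescriptor itself accepts. A 2-D input, previously rejected by IT_TODO_HALT(), now builds a valid descriptor; a sketch (the {32, 128} shape is illustrative, and this assumes CNNL tolerates non-4-D extents under the NCHW layout tag, which is the premise of this patch):

    #include <vector>

    // A rank-2 tensor passes straight through the new descriptor setup.
    std::vector<int> aDim = {32, 128};
    cnnlTensorDescriptor_t aDesc;
    checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
    checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                           CNNL_DTYPE_FLOAT, aDim.size(),
                                           aDim.data()));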