From 86133c8d0af4021d68f4d45ba7a78e6819dbade7 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 10 Apr 2024 11:16:54 +0800 Subject: [PATCH] modified expand --- include/cuda/cuda_expand.h | 5 ++- src/kernels/cuda/expand.cc | 25 ++++++-------- src/kernels/cuda/expand.cu | 68 ++++++++++++++++++-------------------- src/kernels/cuda/matmul.cc | 29 ++++++++-------- src/kernels/cuda/where.cu | 2 +- 5 files changed, 59 insertions(+), 70 deletions(-) diff --git a/include/cuda/cuda_expand.h b/include/cuda/cuda_expand.h index 4001df41..049abfd9 100644 --- a/include/cuda/cuda_expand.h +++ b/include/cuda/cuda_expand.h @@ -3,9 +3,8 @@ #include "operators/unary.h" #include "utils/small_array.h" namespace infini { -void expandKernel(int dType, void *input, void *output, int nDims, - int outputsize, SmallArray inputShape, - SmallArray outputShape); +void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2, + int a3, int b0, int b1, int b2, int b3); void expandRowKernel(int dType, void *input, void *output, int n_rows, int row_len); diff --git a/src/kernels/cuda/expand.cc b/src/kernels/cuda/expand.cc index 35b14f85..230893c9 100644 --- a/src/kernels/cuda/expand.cc +++ b/src/kernels/cuda/expand.cc @@ -12,22 +12,19 @@ class ExpandCuda : public CudaKernelWithoutConfig { void *const inputData = (op->getInputs(0)->getRawDataPtr()); void *const outputData = (op->getOutput()->getRawDataPtr()); - const auto &in_Shape = op->getInputs(0)->getDims(); // input shape - const auto &out_Shape = op->getShape(); // output shape + auto a_dim = op->getInputs(0)->getDims(); + auto b_dim = op->getOutput()->getDims(); // output shape - SmallArray inputShape, outputShape; - int nDims = op->getInputs(0)->getDims().size(); - - IT_ASSERT(nDims <= SMALL_ARRAY_SIZE); - int outputsize = 1; // the length of the output vector after flatten - for (int i = 0; i < nDims; ++i) { - outputShape.data[i] = out_Shape[i]; - inputShape.data[i] = in_Shape[i]; - outputsize *= out_Shape[i]; - } const int dType = op->getDType().getIndex(); - expandKernel(dType, inputData, outputData, nDims, outputsize, - inputShape, outputShape); + if (a_dim.size() > 4 || b_dim.size() > 4) + IT_TODO_HALT(); + + int a[4] = {1, 1, 1, 1}; + int b[4] = {1, 1, 1, 1}; + std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size())); + std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size())); + expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3], b[0], + b[1], b[2], b[3]); } }; diff --git a/src/kernels/cuda/expand.cu b/src/kernels/cuda/expand.cu index 3fbf929e..3a934ae7 100644 --- a/src/kernels/cuda/expand.cu +++ b/src/kernels/cuda/expand.cu @@ -6,36 +6,29 @@ constexpr unsigned int num_threads() { return 32 * 4; } constexpr int thread_work_size() { return 4; } constexpr int block_work_size() { return thread_work_size() * num_threads(); } - +const int repeat = 1; template -__global__ void _expandKernel(void *input, void *output, int nDims, - int outputsize, infini::SmallArray inputShape, - infini::SmallArray outputShape) { +__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2, + int a3, int b0, int b1, int b2, int b3) { - int outputIdx = - blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s - if (outputIdx < outputsize) { - int inputIdx = 0; // record input index - int temp = 1; // stored S, KS, JKS, in order - int tmp = 1; // stored s,k,j,i in order - int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s - for (int i = nDims - 1; i >= 0; --i) { - if (i == 0) { - tmp = v; // i = outputIdx/(JKS) - } else { - tmp = v % outputShape.data[i]; // store s,k,j in order - } - if (inputShape.data[i] == - 1) { // if input shape = 1, the index only equal 0 - inputIdx += 0; - } else { - inputIdx += - tmp * temp; // otherwise +i(JKS) or j(KS) or k(S) or s - } - temp *= inputShape.data[i]; - v = v / outputShape.data[i]; + int index = threadIdx.x + blockIdx.x * blockDim.x; + + int stride1 = b2 * b3; + int stride0 = b1 * stride1; + int n = b0 * stride0; + int end = (repeat * index + repeat < n ? repeat * index + repeat : n); + for (int i = repeat * index; i < end; i++) { + int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0); + bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1); + if (aIdx) { + int b0_index = i / stride0; + int b1_index = (i % stride0) / stride1; + int b2_index = (i % stride1) / b3; + int b3_index = i % b3; + xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 + + (b2_index % a2) * a3 + b3_index % a3; } - ((T *)output)[outputIdx] = ((T *)input)[inputIdx]; + ((T *)output)[i] = ((T *)input)[xIdx]; } } @@ -50,9 +43,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst, namespace infini { #define CASE(T) \ - _expandKernel::t><<>>( \ - input, output, nDims, outputsize, inputShape, outputShape); + _expandKernel::t> \ + <<>>( \ + input, output, a0, a1, a2, a3, b0, b1, b2, b3); #define SWITCH_DTYPE(DTYPE) \ switch (DTYPE) { \ @@ -96,11 +89,12 @@ namespace infini { IT_TODO_HALT(); \ } -void expandKernel(int dType, void *input, void *output, int nDims, - int outputsize, SmallArray inputShape, - SmallArray outputShape) { +void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2, + int a3, int b0, int b1, int b2, int b3) { int blocksize = block_work_size(); - int gridsize = (outputsize + block_work_size() - 1) / block_work_size(); + int outputsize = b0 * b1 * b2 * b3; + int gridsize = (outputsize + repeat * block_work_size() - 1) / + (repeat * block_work_size()); SWITCH_DTYPE(dType) } @@ -150,7 +144,8 @@ void expandKernel(int dType, void *input, void *output, int nDims, IT_TODO_HALT(); \ } -// Optimization for expanding a row vector. The row length must be a multiple of 32 +// Optimization for expanding a row vector. The row length must be a multiple of +// 32 void expandRowKernel(int dType, void *input, void *output, int n_rows, int row_len) { // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32 @@ -160,7 +155,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows, // block: b x 32 auto c = row_len / 32, b = c; if (b > 32) { - for (b = 32; c % b != 0; --b); + for (b = 32; c % b != 0; --b) + ; } auto a = c / b; dim3 grid(a, n_rows), block(32, b); diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc index de2c646e..02a4497c 100644 --- a/src/kernels/cuda/matmul.cc +++ b/src/kernels/cuda/matmul.cc @@ -87,20 +87,7 @@ class matmulCublas : public Kernel { beta_naive = 1.f; auto inC = op->getInputs(2); auto out = op->getOutput(); - SmallArray inputShape, outputShape; - int nDims = out->getRank(); - IT_ASSERT(nDims <= SMALL_ARRAY_SIZE); - // FIXME(constroy): use size_t for outputsize. - int outputsize = 1; // the length of the output vector after flatten - int offset = nDims - inC->getRank(); - for (int i = 0; i < offset; ++i) - inputShape.data[i] = 1; - for (int i = 0; i < nDims; ++i) { - outputShape.data[i] = out->getDims()[i]; - outputsize *= outputShape.data[i]; - if (i >= offset) - inputShape.data[i] = inC->getDims()[i - offset]; - } + const int dType = dataType.getIndex(); // Bias in linear layer is row vector of (1,n), n is the number of @@ -111,9 +98,19 @@ class matmulCublas : public Kernel { out->size() / inC->getDims()[0], inC->getDims()[0]); } else { + auto a_dim = out->getDims(); + auto b_dim = inC->getDims(); // output shape + + if (a_dim.size() > 4 || b_dim.size() > 4) + IT_TODO_HALT(); + + int a[4] = {1, 1, 1, 1}; + int b[4] = {1, 1, 1, 1}; + std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size())); + std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size())); expandKernel(dType, inC->getRawDataPtr(), - out->getRawDataPtr(), nDims, outputsize, - inputShape, outputShape); + out->getRawDataPtr(), a[0], a[1], a[2], + a[3], b[0], b[1], b[2], b[3]); } } // TODO:use compute type diff --git a/src/kernels/cuda/where.cu b/src/kernels/cuda/where.cu index c60dcbcb..206a1463 100644 --- a/src/kernels/cuda/where.cu +++ b/src/kernels/cuda/where.cu @@ -1,6 +1,6 @@ #include "cuda/cuda_common.h" #include "cuda/cuda_utility.h" -const int repeat = 3; +const int repeat = 1; template __global__ void