From 86133c8d0af4021d68f4d45ba7a78e6819dbade7 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 10 Apr 2024 11:16:54 +0800
Subject: [PATCH] modified expand

---
 include/cuda/cuda_expand.h |  5 ++-
 src/kernels/cuda/expand.cc | 25 ++++++--------
 src/kernels/cuda/expand.cu | 68 ++++++++++++++++++--------------------
 src/kernels/cuda/matmul.cc | 29 ++++++++--------
 src/kernels/cuda/where.cu  |  2 +-
 5 files changed, 59 insertions(+), 70 deletions(-)
diff --git a/include/cuda/cuda_expand.h b/include/cuda/cuda_expand.h
index 4001df41..049abfd9 100644
--- a/include/cuda/cuda_expand.h
+++ b/include/cuda/cuda_expand.h
@@ -3,9 +3,8 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape);
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3);
 
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len);
diff --git a/src/kernels/cuda/expand.cc b/src/kernels/cuda/expand.cc
index 35b14f85..230893c9 100644
--- a/src/kernels/cuda/expand.cc
+++ b/src/kernels/cuda/expand.cc
@@ -12,22 +12,19 @@ class ExpandCuda : public CudaKernelWithoutConfig {
 
         void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
-        const auto &out_Shape = op->getShape();             // output shape
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getOutput()->getDims(); // output shape
 
-        SmallArray inputShape, outputShape;
-        int nDims = op->getInputs(0)->getDims().size();
-
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1; // the length of the output vector after flatten
-        for (int i = 0; i < nDims; ++i) {
-            outputShape.data[i] = out_Shape[i];
-            inputShape.data[i] = in_Shape[i];
-            outputsize *= out_Shape[i];
-        }
         const int dType = op->getDType().getIndex();
-        expandKernel(dType, inputData, outputData, nDims, outputsize,
-                     inputShape, outputShape);
+        if (a_dim.size() > 4 || b_dim.size() > 4)
+            IT_TODO_HALT();
+
+        int a[4] = {1, 1, 1, 1};
+        int b[4] = {1, 1, 1, 1};
+        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+        expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3], b[0],
+                     b[1], b[2], b[3]);
     }
 };
 
diff --git a/src/kernels/cuda/expand.cu b/src/kernels/cuda/expand.cu
index 3fbf929e..3a934ae7 100644
--- a/src/kernels/cuda/expand.cu
+++ b/src/kernels/cuda/expand.cu
@@ -6,36 +6,29 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
-
+const int repeat = 1;
 template <class T>
-__global__ void _expandKernel(void *input, void *output, int nDims,
-                              int outputsize, infini::SmallArray inputShape,
-                              infini::SmallArray outputShape) {
+__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
+                              int a3, int b0, int b1, int b2, int b3) {
 
-    int outputIdx =
-        blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s
-    if (outputIdx < outputsize) {
-        int inputIdx = 0;  // record input index
-        int temp = 1;      // stored S, KS, JKS, in order
-        int tmp = 1;       // stored s,k,j,i in order
-        int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
-        for (int i = nDims - 1; i >= 0; --i) {
-            if (i == 0) {
-                tmp = v; // i = outputIdx/(JKS)
-            } else {
-                tmp = v % outputShape.data[i]; // store s,k,j in order
-            }
-            if (inputShape.data[i] ==
-                1) { // if input shape = 1, the index only equal 0
-                inputIdx += 0;
-            } else {
-                inputIdx +=
-                    tmp * temp; // otherwise +i(JKS) or j(KS) or k(S) or s
-            }
-            temp *= inputShape.data[i];
-            v = v / outputShape.data[i];
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+    int stride1 = b2 * b3;
+    int stride0 = b1 * stride1;
+    int n = b0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        if (aIdx) {
+            int b0_index = i / stride0;
+            int b1_index = (i % stride0) / stride1;
+            int b2_index = (i % stride1) / b3;
+            int b3_index = i % b3;
+            xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
+                   (b2_index % a2) * a3 + b3_index % a3;
         }
-        ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
+        ((T *)output)[i] = ((T *)input)[xIdx];
     }
 }
 
@@ -50,9 +43,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
 namespace infini {
 
 #define CASE(T)                                                                \
-    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize,                        \
-        0, CUDAStream::getCurrentStream()>>>(                                  \
-        input, output, nDims, outputsize, inputShape, outputShape);
+    _expandKernel<DT_CUDA<T>::t>                                               \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            input, output, a0, a1, a2, a3, b0, b1, b2, b3);
 
 #define SWITCH_DTYPE(DTYPE)                                                    \
     switch (DTYPE) {                                                           \
@@ -96,11 +89,12 @@ namespace infini {
         IT_TODO_HALT();                                                        \
     }
 
-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape) {
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3) {
     int blocksize = block_work_size();
-    int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
+    int outputsize = b0 * b1 * b2 * b3;
+    int gridsize = (outputsize + repeat * block_work_size() - 1) /
+                   (repeat * block_work_size());
     SWITCH_DTYPE(dType)
 }
 
@@ -150,7 +144,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
         IT_TODO_HALT();                                                        \
     }
 
-// Optimization for expanding a row vector. The row length must be a multiple of 32
+// Optimization for expanding a row vector. The row length must be a multiple of
+// 32
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len) {
     // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
@@ -160,7 +155,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
     // block: b x 32
     auto c = row_len / 32, b = c;
     if (b > 32) {
-        for (b = 32; c % b != 0; --b);
+        for (b = 32; c % b != 0; --b)
+            ;
     }
     auto a = c / b;
     dim3 grid(a, n_rows), block(32, b);
diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc
index de2c646e..02a4497c 100644
--- a/src/kernels/cuda/matmul.cc
+++ b/src/kernels/cuda/matmul.cc
@@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
             beta_naive = 1.f;
             auto inC = op->getInputs(2);
             auto out = op->getOutput();
-            SmallArray inputShape, outputShape;
-            int nDims = out->getRank();
-            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-            // FIXME(constroy): use size_t for outputsize.
-            int outputsize = 1; // the length of the output vector after flatten
-            int offset = nDims - inC->getRank();
-            for (int i = 0; i < offset; ++i)
-                inputShape.data[i] = 1;
-            for (int i = 0; i < nDims; ++i) {
-                outputShape.data[i] = out->getDims()[i];
-                outputsize *= outputShape.data[i];
-                if (i >= offset)
-                    inputShape.data[i] = inC->getDims()[i - offset];
-            }
+
             const int dType = dataType.getIndex();
 
             // Bias in linear layer is row vector of (1,n), n is the number of
@@ -111,9 +98,19 @@ class matmulCublas : public Kernel {
                                 out->size() / inC->getDims()[0],
                                 inC->getDims()[0]);
             } else {
+                auto a_dim = out->getDims();
+                auto b_dim = inC->getDims(); // output shape
+
+                if (a_dim.size() > 4 || b_dim.size() > 4)
+                    IT_TODO_HALT();
+
+                int a[4] = {1, 1, 1, 1};
+                int b[4] = {1, 1, 1, 1};
+                std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+                std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
                 expandKernel(dType, inC->getRawDataPtr<void *>(),
-                             out->getRawDataPtr<void *>(), nDims, outputsize,
-                             inputShape, outputShape);
+                             out->getRawDataPtr<void *>(), a[0], a[1], a[2],
+                             a[3], b[0], b[1], b[2], b[3]);
             }
         }
         // TODO:use compute type
diff --git a/src/kernels/cuda/where.cu b/src/kernels/cuda/where.cu
index c60dcbcb..206a1463 100644
--- a/src/kernels/cuda/where.cu
+++ b/src/kernels/cuda/where.cu
@@ -1,6 +1,6 @@
 #include "cuda/cuda_common.h"
 #include "cuda/cuda_utility.h"
-const int repeat = 3;
+const int repeat = 1;
 
 template <typename T>
 __global__ void