modified expand

This commit is contained in:
xgqdut2016 2024-04-10 11:16:54 +08:00
parent 2761d46737
commit 86133c8d0a
5 changed files with 59 additions and 70 deletions

View File

@@ -3,9 +3,8 @@
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape);
void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
int a3, int b0, int b1, int b2, int b3);
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len);

View File

@@ -12,22 +12,19 @@ class ExpandCuda : public CudaKernelWithoutConfig {
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
const auto &out_Shape = op->getShape(); // output shape
auto a_dim = op->getInputs(0)->getDims();
auto b_dim = op->getOutput()->getDims(); // output shape
SmallArray inputShape, outputShape;
int nDims = op->getInputs(0)->getDims().size();
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
int outputsize = 1; // the length of the output vector after flatten
for (int i = 0; i < nDims; ++i) {
outputShape.data[i] = out_Shape[i];
inputShape.data[i] = in_Shape[i];
outputsize *= out_Shape[i];
}
const int dType = op->getDType().getIndex();
expandKernel(dType, inputData, outputData, nDims, outputsize,
inputShape, outputShape);
if (a_dim.size() > 4 || b_dim.size() > 4)
IT_TODO_HALT();
int a[4] = {1, 1, 1, 1};
int b[4] = {1, 1, 1, 1};
std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3], b[0],
b[1], b[2], b[3]);
}
};

View File

@@ -6,36 +6,29 @@
// Threads per block used by the expand kernels: 4 warps of 32 threads.
constexpr unsigned int num_threads() { return 32 * 4; }
// Elements each thread is sized to handle when computing the grid.
constexpr int thread_work_size() { return 4; }
// Elements covered by one block = thread_work_size() * num_threads().
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
// Output elements processed per thread in _expandKernel's loop; also folded
// into the gridsize computation in the host wrapper expandKernel below.
const int repeat = 1;
template <class T>
__global__ void _expandKernel(void *input, void *output, int nDims,
int outputsize, infini::SmallArray inputShape,
infini::SmallArray outputShape) {
__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
int a3, int b0, int b1, int b2, int b3) {
int outputIdx =
blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s
if (outputIdx < outputsize) {
int inputIdx = 0; // record input index
int temp = 1; // stored S, KS, JKS, in order
int tmp = 1; // stored s,k,j,i in order
int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
for (int i = nDims - 1; i >= 0; --i) {
if (i == 0) {
tmp = v; // i = outputIdx/(JKS)
} else {
tmp = v % outputShape.data[i]; // store s,k,j in order
}
if (inputShape.data[i] ==
1) { // if input shape = 1, the index only equal 0
inputIdx += 0;
} else {
inputIdx +=
tmp * temp; // otherwise +i(JKS) or j(KS) or k(S) or s
}
temp *= inputShape.data[i];
v = v / outputShape.data[i];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride1 = b2 * b3;
int stride0 = b1 * stride1;
int n = b0 * stride0;
int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
for (int i = repeat * index; i < end; i++) {
int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
if (aIdx) {
int b0_index = i / stride0;
int b1_index = (i % stride0) / stride1;
int b2_index = (i % stride1) / b3;
int b3_index = i % b3;
xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
(b2_index % a2) * a3 + b3_index % a3;
}
((T *)output)[outputIdx] = ((T *)input)[inputIdx];
((T *)output)[i] = ((T *)input)[xIdx];
}
}
@@ -50,9 +43,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
namespace infini {
#define CASE(T) \
_expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
0, CUDAStream::getCurrentStream()>>>( \
input, output, nDims, outputsize, inputShape, outputShape);
_expandKernel<DT_CUDA<T>::t> \
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
input, output, a0, a1, a2, a3, b0, b1, b2, b3);
#define SWITCH_DTYPE(DTYPE) \
switch (DTYPE) { \
@@ -96,11 +89,12 @@ namespace infini {
IT_TODO_HALT(); \
}
void expandKernel(int dType, void *input, void *output, int nDims,
int outputsize, SmallArray inputShape,
SmallArray outputShape) {
void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
int a3, int b0, int b1, int b2, int b3) {
int blocksize = block_work_size();
int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
int outputsize = b0 * b1 * b2 * b3;
int gridsize = (outputsize + repeat * block_work_size() - 1) /
(repeat * block_work_size());
SWITCH_DTYPE(dType)
}
@@ -150,7 +144,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
IT_TODO_HALT(); \
}
// Optimization for expanding a row vector. The row length must be a multiple of 32
// Optimization for expanding a row vector. The row length must be a multiple of
// 32
void expandRowKernel(int dType, void *input, void *output, int n_rows,
int row_len) {
// Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
@@ -160,7 +155,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
// block: b x 32
auto c = row_len / 32, b = c;
if (b > 32) {
for (b = 32; c % b != 0; --b);
for (b = 32; c % b != 0; --b)
;
}
auto a = c / b;
dim3 grid(a, n_rows), block(32, b);

View File

@@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
beta_naive = 1.f;
auto inC = op->getInputs(2);
auto out = op->getOutput();
SmallArray inputShape, outputShape;
int nDims = out->getRank();
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
// FIXME(constroy): use size_t for outputsize.
int outputsize = 1; // the length of the output vector after flatten
int offset = nDims - inC->getRank();
for (int i = 0; i < offset; ++i)
inputShape.data[i] = 1;
for (int i = 0; i < nDims; ++i) {
outputShape.data[i] = out->getDims()[i];
outputsize *= outputShape.data[i];
if (i >= offset)
inputShape.data[i] = inC->getDims()[i - offset];
}
const int dType = dataType.getIndex();
// Bias in linear layer is row vector of (1,n), n is the number of
@@ -111,9 +98,19 @@ class matmulCublas : public Kernel {
out->size() / inC->getDims()[0],
inC->getDims()[0]);
} else {
auto a_dim = out->getDims();
auto b_dim = inC->getDims(); // output shape
if (a_dim.size() > 4 || b_dim.size() > 4)
IT_TODO_HALT();
int a[4] = {1, 1, 1, 1};
int b[4] = {1, 1, 1, 1};
std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
expandKernel(dType, inC->getRawDataPtr<void *>(),
out->getRawDataPtr<void *>(), nDims, outputsize,
inputShape, outputShape);
out->getRawDataPtr<void *>(), a[0], a[1], a[2],
a[3], b[0], b[1], b[2], b[3]);
}
}
// TODO:use compute type

View File

@@ -1,6 +1,6 @@
#include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
const int repeat = 3;
const int repeat = 1;
template <typename T>
__global__ void