forked from jiuyuan/InfiniTensor
modified expand
This commit is contained in:
parent 2761d46737
commit 86133c8d0a

@@ -3,9 +3,8 @@
 #include "operators/unary.h"
-#include "utils/small_array.h"
 namespace infini {
-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape);
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3);

 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len);

@@ -12,22 +12,19 @@ class ExpandCuda : public CudaKernelWithoutConfig {

         void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
-        const auto &out_Shape = op->getShape();             // output shape
+        auto a_dim = op->getInputs(0)->getDims(); // input shape
+        auto b_dim = op->getOutput()->getDims();  // output shape

-        SmallArray inputShape, outputShape;
-        int nDims = op->getInputs(0)->getDims().size();
-
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1; // the length of the output vector after flatten
-        for (int i = 0; i < nDims; ++i) {
-            outputShape.data[i] = out_Shape[i];
-            inputShape.data[i] = in_Shape[i];
-            outputsize *= out_Shape[i];
-        }
         const int dType = op->getDType().getIndex();
-        expandKernel(dType, inputData, outputData, nDims, outputsize,
-                     inputShape, outputShape);
+        if (a_dim.size() > 4 || b_dim.size() > 4)
+            IT_TODO_HALT();
+
+        int a[4] = {1, 1, 1, 1};
+        int b[4] = {1, 1, 1, 1};
+        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+        expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3], b[0],
+                     b[1], b[2], b[3]);
     }
};
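
The rewritten host code right-aligns both shapes into fixed 4-element arrays, padding the leading dimensions with 1 so the kernel can always treat its operands as 4-D. A minimal standalone sketch of that padding, with made-up example shapes (not taken from the commit):

#include <algorithm>
#include <vector>

int main() {
    std::vector<int> a_dim = {3, 1};    // example input shape
    std::vector<int> b_dim = {2, 3, 4}; // example output shape
    int a[4] = {1, 1, 1, 1};
    int b[4] = {1, 1, 1, 1};
    // Right-align: copy each shape into the tail of its 4-element array.
    std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
    std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
    // Now a == {1, 1, 3, 1} and b == {1, 2, 3, 4}: each size-1 axis of a
    // is broadcast up to the matching axis of b.
    return 0;
}
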
@@ -6,36 +6,29 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }

+const int repeat = 1;
+
 template <class T>
-__global__ void _expandKernel(void *input, void *output, int nDims,
-                              int outputsize, infini::SmallArray inputShape,
-                              infini::SmallArray outputShape) {
-    int outputIdx =
-        blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s
-    if (outputIdx < outputsize) {
-        int inputIdx = 0;  // record input index
-        int temp = 1;      // stores S, KS, JKS, in order
-        int tmp = 1;       // stores s, k, j, i in order
-        int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
-        for (int i = nDims - 1; i >= 0; --i) {
-            if (i == 0) {
-                tmp = v; // i = outputIdx / (JKS)
-            } else {
-                tmp = v % outputShape.data[i]; // store s, k, j in order
-            }
-            if (inputShape.data[i] ==
-                1) { // if the input dim is 1, the index can only be 0
-                inputIdx += 0;
-            } else {
-                inputIdx +=
-                    tmp * temp; // otherwise add i(JKS), j(KS), k(S), or s
-            }
-            temp *= inputShape.data[i];
-            v = v / outputShape.data[i];
-        }
-        ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
-    }
-}
+__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
+                              int a3, int b0, int b1, int b2, int b3) {
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+    int stride1 = b2 * b3;
+    int stride0 = b1 * stride1;
+    int n = b0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        if (aIdx) {
+            int b0_index = i / stride0;
+            int b1_index = (i % stride0) / stride1;
+            int b2_index = (i % stride1) / b3;
+            int b3_index = i % b3;
+            xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
+                   (b2_index % a2) * a3 + b3_index % a3;
+        }
+        ((T *)output)[i] = ((T *)input)[xIdx];
+    }
+}
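
In the new kernel, n is the flattened output size and stride0/stride1 are the row-major strides of the padded output shape; each flat output index i is decomposed into four coordinates, which are wrapped into input coordinates by taking each one modulo the corresponding input dimension. A host-side sketch of the same arithmetic (an illustrative helper, not part of the commit), checking one index by hand:

#include <cassert>

// Same index mapping as _expandKernel, executed on the host.
int inputIndex(int i, const int a[4], const int b[4]) {
    int stride1 = b[2] * b[3];
    int stride0 = b[1] * stride1;
    int b0 = i / stride0;
    int b1 = (i % stride0) / stride1;
    int b2 = (i % stride1) / b[3];
    int b3 = i % b[3];
    return (b0 % a[0]) * a[1] * a[2] * a[3] + (b1 % a[1]) * a[2] * a[3] +
           (b2 % a[2]) * a[3] + b3 % a[3];
}

int main() {
    int a[4] = {1, 1, 3, 1}; // input {3,1}, right-aligned to 4-D
    int b[4] = {1, 2, 3, 4}; // output {2,3,4}, right-aligned to 4-D
    // Output element 17 has coordinates (0,1,1,1); the size-1 input axes
    // collapse to 0, so it reads input element (1,0), i.e. flat index 1.
    assert(inputIndex(17, a, b) == 1);
    return 0;
}
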
@@ -50,9 +43,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
 namespace infini {

 #define CASE(T)                                                                \
-    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize,                        \
-                  0, CUDAStream::getCurrentStream()>>>(                        \
-        input, output, nDims, outputsize, inputShape, outputShape);
+    _expandKernel<DT_CUDA<T>::t>                                               \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            input, output, a0, a1, a2, a3, b0, b1, b2, b3);

 #define SWITCH_DTYPE(DTYPE)                                                    \
     switch (DTYPE) {                                                           \

@@ -96,11 +89,12 @@ namespace infini {
         IT_TODO_HALT();                                                        \
     }

-void expandKernel(int dType, void *input, void *output, int nDims,
-                  int outputsize, SmallArray inputShape,
-                  SmallArray outputShape) {
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3) {
     int blocksize = block_work_size();
-    int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
+    int outputsize = b0 * b1 * b2 * b3;
+    int gridsize = (outputsize + repeat * block_work_size() - 1) /
+                   (repeat * block_work_size());
     SWITCH_DTYPE(dType)
 }
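
The launch configuration is a plain ceiling division: gridsize * repeat * block_work_size() >= outputsize, so every output element is covered even when outputsize is not a multiple of the per-grid work. A quick sketch of the arithmetic with an assumed output size:

#include <cstdio>

int main() {
    const int repeat = 1;
    const int block_work_size = 4 * (32 * 4); // thread_work_size() * num_threads() = 512
    int outputsize = 1000;                    // assumed flattened output size
    int gridsize = (outputsize + repeat * block_work_size - 1) /
                   (repeat * block_work_size);
    std::printf("gridsize = %d\n", gridsize); // 2: two 512-thread blocks cover 1000 elements
    return 0;
}
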
@@ -150,7 +144,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
         IT_TODO_HALT();                                                        \
     }

-// Optimization for expanding a row vector. The row length must be a multiple of 32
+// Optimization for expanding a row vector. The row length must be a multiple of
+// 32
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len) {
     // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32

@@ -160,7 +155,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
     // block: b x 32
     auto c = row_len / 32, b = c;
     if (b > 32) {
-        for (b = 32; c % b != 0; --b);
+        for (b = 32; c % b != 0; --b)
+            ;
     }
     auto a = c / b;
     dim3 grid(a, n_rows), block(32, b);
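
The change here is purely clang-format moving the empty loop body onto its own line. The loop itself searches downward from 32 for the largest divisor b of c = row_len / 32 (worst case b == 1 when c is prime), which fixes the 32 x b block shape. A host-side sketch with an assumed row length:

#include <cstdio>

int main() {
    int row_len = 3072;          // assumed; must be a multiple of 32
    int c = row_len / 32, b = c; // factorize row_len = a * b * 32, b <= 32
    if (b > 32) {
        for (b = 32; c % b != 0; --b)
            ; // first divisor of c at or below 32
    }
    int a = c / b;
    std::printf("a = %d, b = %d\n", a, b); // a = 3, b = 32 -> grid(a, n_rows), block(32, b)
    return 0;
}
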
@@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
             beta_naive = 1.f;
             auto inC = op->getInputs(2);
             auto out = op->getOutput();
-            SmallArray inputShape, outputShape;
-            int nDims = out->getRank();
-            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-            // FIXME(constroy): use size_t for outputsize.
-            int outputsize = 1; // the length of the output vector after flatten
-            int offset = nDims - inC->getRank();
-            for (int i = 0; i < offset; ++i)
-                inputShape.data[i] = 1;
-            for (int i = 0; i < nDims; ++i) {
-                outputShape.data[i] = out->getDims()[i];
-                outputsize *= outputShape.data[i];
-                if (i >= offset)
-                    inputShape.data[i] = inC->getDims()[i - offset];
-            }
-
             const int dType = dataType.getIndex();

             // Bias in linear layer is row vector of (1,n), n is the number of

@@ -111,9 +98,19 @@ class matmulCublas : public Kernel {
                              out->size() / inC->getDims()[0],
                              inC->getDims()[0]);
             } else {
+                auto a_dim = inC->getDims(); // input (bias) shape
+                auto b_dim = out->getDims(); // output shape
+
+                if (a_dim.size() > 4 || b_dim.size() > 4)
+                    IT_TODO_HALT();
+
+                int a[4] = {1, 1, 1, 1};
+                int b[4] = {1, 1, 1, 1};
+                std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+                std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
                 expandKernel(dType, inC->getRawDataPtr<void *>(),
-                             out->getRawDataPtr<void *>(), nDims, outputsize,
-                             inputShape, outputShape);
+                             out->getRawDataPtr<void *>(), a[0], a[1], a[2],
+                             a[3], b[0], b[1], b[2], b[3]);
             }
         }
         // TODO: use compute type
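
Note the argument order in the call above: _expandKernel takes the source (bias) dimensions as a0..a3 and the destination dimensions as b0..b3, the same convention as in ExpandCuda, so inC's dims must be packed into a and out's dims into b (the kernel writes exactly b0*b1*b2*b3 output elements). A shape-level sketch with assumed sizes:

#include <algorithm>
#include <vector>

int main() {
    std::vector<int> a_dim = {1, 64};  // bias inC (example shape)
    std::vector<int> b_dim = {32, 64}; // matmul output (example shape)
    int a[4] = {1, 1, 1, 1}, b[4] = {1, 1, 1, 1};
    std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
    std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
    // a == {1, 1, 1, 64}, b == {1, 1, 32, 64}: every output row receives the
    // same 64 bias values; the GEMM then accumulates the matmul result on top
    // (beta_naive = 1.f in the surrounding code).
    return 0;
}
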
@@ -1,6 +1,6 @@
 #include "cuda/cuda_common.h"
 #include "cuda/cuda_utility.h"
-const int repeat = 3;
+const int repeat = 1;

 template <typename T>
 __global__ void