forked from jiuyuan/InfiniTensor
Cuda softmax (#129)
* "add softmax.cu,.cc,.h" * Modify cuda softmax * "modified the introduction of softmax.cu" * "add format of cuda_softmax.h" * "modified where.cc(.cu,.h) and softmax.cu" * "modified format" * Fix cpu softmax kernel * "modified the // introduction of softmax.cu" * "modified softmax.cu and use 1D block" * "modified softmax.cu,format, and use 1D block" * "introduce share mem to speed softmax" * "reduce the input of function" * modified the format * remodify 2D block softmax * remodify 1D block softmax * modified the share memory * add warp reduce * conflict solve two * remove extra space line * solve comment --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
This commit is contained in:
parent
1a6fccccbe
commit
d3e7543291
|
@ -0,0 +1,6 @@
|
|||
#pragma once
|
||||
#include "utils/small_array.h"
|
||||
namespace infini {
|
||||
void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
||||
int dimsize, int stride);
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
namespace infini {
|
||||
void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
|
||||
float *y, int dim, int stride);
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
#include "operators/unary.h"
|
||||
#include "core/constants.h"
|
||||
#include "core/kernel.h"
|
||||
#include "operators/softmax.h"
|
||||
|
||||
namespace infini {
|
||||
template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
|
||||
|
@ -22,7 +23,7 @@ template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
|
|||
template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
auto op = as<SoftmaxObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
||||
|
|
|
@ -1,30 +1,30 @@
|
|||
#include "operators/softmax.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/softmax.h"
|
||||
#include "cuda/cuda_softmax.h"
|
||||
|
||||
namespace infini {
|
||||
class SoftmaxCudnn : public CudaKernelWithoutConfig {
|
||||
class SoftmaxCuda : public CudaKernelWithoutConfig {
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<SoftmaxObj>(_op);
|
||||
auto x = op->getInputs(0)->getRawDataPtr<float *>();
|
||||
auto y = op->getOutput(0)->getRawDataPtr<float *>();
|
||||
auto input = op->getInputs(0)->getRawDataPtr<float *>();
|
||||
auto output = op->getOutput(0)->getRawDataPtr<float *>();
|
||||
const auto &inShape = op->getInputs(0)->getDims(); // input shape
|
||||
auto dims = op->getInputs(0)->getDims();
|
||||
|
||||
int batch_size = 1;
|
||||
for (size_t i = 0; i < dims.size(); ++i)
|
||||
batch_size *= dims[i];
|
||||
int dim = dims[op->getAxis()];
|
||||
int size; // size = i(JKS) + j(KS) + k(S) + s
|
||||
size = op->getOutput(0)->size();
|
||||
int dimsize = dims[op->getAxis()];
|
||||
int stride = op->getInputs(0)->getStride().at(op->getAxis());
|
||||
|
||||
int block_num = batch_size / dim;
|
||||
int max_threadblock_size = batch_size / block_num;
|
||||
softmax_kernel(max_threadblock_size, block_num, x, y, dim,
|
||||
op->getInputs(0)->getStride().at(op->getAxis()));
|
||||
int num_blocks = size / dimsize;
|
||||
softmax_kernel(num_blocks, (float *)input, (float *)output, size,
|
||||
dimsize, stride);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCudnn,
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCuda,
|
||||
"Softmax_CUDA_Float32");
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,77 +1,183 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/softmax.h"
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
struct __align__(8) MD {
|
||||
float data;
|
||||
float d;
|
||||
struct __align__(8) DataMaxSum { // update the global max and sum, store the
|
||||
// output at max_tmp and sum_tmp
|
||||
float max_tmp; // store max
|
||||
float sum_tmp; // store sum
|
||||
};
|
||||
__device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a,
|
||||
DataMaxSum b) {
|
||||
bool a_bigger = (a.max_tmp > b.max_tmp);
|
||||
DataMaxSum bigger = a_bigger ? a : b;
|
||||
DataMaxSum smaller = a_bigger ? b : a;
|
||||
bigger.sum_tmp = bigger.sum_tmp +
|
||||
smaller.sum_tmp * __expf(smaller.max_tmp - bigger.max_tmp);
|
||||
|
||||
return bigger;
|
||||
}
|
||||
template <int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
|
||||
float *__restrict input, float *__restrict output, int size, int dimsize,
|
||||
int stride) { // if set axis = 1, inputShape=[I,J,K,S]
|
||||
// tid = i(JKS) + j(KS) + k(S) + s
|
||||
|
||||
// blockDim.x = size/dimsize = IKS
|
||||
// blockIdx.x = i(KS) + k(S) + s,blockIdx.x%stride = k(S) + s
|
||||
|
||||
int tid =
|
||||
blockIdx.x % stride + (blockIdx.x - blockIdx.x % stride) *
|
||||
dimsize; // now, tid = i(JKS) + k(S) + s;
|
||||
|
||||
DataMaxSum dms_partial;
|
||||
dms_partial.max_tmp = -__FLT_MAX__;
|
||||
dms_partial.sum_tmp = 0.0f;
|
||||
DataMaxSum dms_input;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
|
||||
dms_input.max_tmp =
|
||||
input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
|
||||
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial = reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
}
|
||||
typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ DataMaxSum dms_total;
|
||||
DataMaxSum dms_block =
|
||||
BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op);
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
dms_total = dms_block;
|
||||
}
|
||||
__syncthreads();
|
||||
//-----------------
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> struct SumOp {
|
||||
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ MD reduce_md_op(MD a, MD b) {
|
||||
bool a_bigger = (a.data > b.data);
|
||||
MD bigger_m = a_bigger ? a : b;
|
||||
MD smaller_m = a_bigger ? b : a;
|
||||
MD res;
|
||||
res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.data - bigger_m.data);
|
||||
res.data = bigger_m.data;
|
||||
return res;
|
||||
}
|
||||
|
||||
template <int THREADBLOCK_SIZE>
|
||||
__launch_bounds__(THREADBLOCK_SIZE) __global__
|
||||
void online_softmax(const float *__restrict in, float *__restrict out,
|
||||
int dimSize, int stride) {
|
||||
|
||||
// reposition in and out to data for the current vector
|
||||
int blockOffset = blockIdx.x;
|
||||
if (blockIdx.x >= stride) {
|
||||
int tmp = blockIdx.x % stride;
|
||||
blockOffset = tmp + (blockIdx.x - tmp) * dimSize;
|
||||
template <typename T> struct MaxOp {
|
||||
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
|
||||
return max(a, b);
|
||||
}
|
||||
in += blockOffset;
|
||||
out += blockOffset;
|
||||
|
||||
MD md_partial;
|
||||
md_partial.data = -FLT_MAX;
|
||||
md_partial.d = 0.0F;
|
||||
|
||||
for (int elem_id = threadIdx.x; elem_id < dimSize;
|
||||
elem_id += THREADBLOCK_SIZE) {
|
||||
MD new_elem;
|
||||
new_elem.data = in[elem_id * stride];
|
||||
new_elem.d = 1.0F;
|
||||
md_partial = reduce_md_op(md_partial, new_elem);
|
||||
};
|
||||
template <template <typename> class ReductionOp, typename T,
|
||||
int thread_group_width>
|
||||
__inline__ __device__ T WarpAllReduce(T val) {
|
||||
for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
|
||||
val = ReductionOp<T>()(val, __shfl_xor_sync(0xffffffff, val, mask));
|
||||
}
|
||||
|
||||
// blockreduce for THREADBLOCK_SIZE threads.
|
||||
// The actrual threads num used in the block is "dimsSize"
|
||||
typedef cub::BlockReduce<MD, THREADBLOCK_SIZE> BlockReduce;
|
||||
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ MD md_total;
|
||||
|
||||
MD md = BlockReduce(temp_storage).Reduce(md_partial, reduce_md_op);
|
||||
if (threadIdx.x == 0)
|
||||
md_total = md;
|
||||
__syncthreads();
|
||||
|
||||
float d_total_inverse = __fdividef(1.0F, md_total.d);
|
||||
for (int elem_id = threadIdx.x; elem_id < dimSize;
|
||||
elem_id += THREADBLOCK_SIZE)
|
||||
out[elem_id * stride] =
|
||||
__expf(in[elem_id * stride] - md_total.data) * d_total_inverse;
|
||||
return val;
|
||||
}
|
||||
template <int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void _warpSoftmaxKernel(float *__restrict input,
|
||||
float *__restrict output, int size,
|
||||
int dimsize, int stride) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
int otherSize = size / dimsize;
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
|
||||
if (otherIdx < otherSize) {
|
||||
|
||||
__shared__ float max_total[BLOCK_DIM_y];
|
||||
__shared__ float sum_total[BLOCK_DIM_y];
|
||||
float max_data = -__FLT_MAX__;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
max_data =
|
||||
max(max_data,
|
||||
input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride]);
|
||||
}
|
||||
|
||||
max_data = WarpAllReduce<MaxOp, float, BLOCK_DIM_x>(max_data);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
max_total[threadIdx.y] = max_data;
|
||||
|
||||
//--------------------------------------------
|
||||
float sum_data = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
sum_data +=
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
max_total[threadIdx.y]);
|
||||
}
|
||||
|
||||
sum_data = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sum_data);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
sum_total[threadIdx.y] = sum_data;
|
||||
|
||||
//--------------------------------------------
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
max_total[threadIdx.y]) *
|
||||
__fdividef(1.0F, sum_total[threadIdx.y]);
|
||||
}
|
||||
}
|
||||
}
|
||||
//-----------------
|
||||
|
||||
//-----------------
|
||||
namespace infini {
|
||||
void softmax_kernel(int max_threadblock_size, int blockNum, float *in,
|
||||
float *out, int dimSize, int stride) {
|
||||
if (max_threadblock_size >= 255)
|
||||
online_softmax<256><<<blockNum, 256>>>(in, out, dimSize, stride);
|
||||
else if (max_threadblock_size >= 128)
|
||||
online_softmax<128><<<blockNum, 128>>>(in, out, dimSize, stride);
|
||||
else if (max_threadblock_size >= 64)
|
||||
online_softmax<64><<<blockNum, 64>>>(in, out, dimSize, stride);
|
||||
else
|
||||
online_softmax<32><<<blockNum, 32>>>(in, out, dimSize, stride);
|
||||
void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
||||
int dimsize, int stride) {
|
||||
|
||||
if (dimsize > 1024) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<1024>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<32, 32>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<16, 64>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<8, 128>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<4, 256>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -8,38 +8,38 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
void test_where(const Shape &inputxshape, const vector<float> &inputxdata,
|
||||
const Shape &inputyshape, const vector<float> &inputydata,
|
||||
const Shape &conditionshape,
|
||||
const vector<uint8_t> &conditiondata,
|
||||
void test_where(const Shape &inputXShape, const vector<float> &inputXData,
|
||||
const Shape &inputYShape, const vector<float> &inputYData,
|
||||
const Shape &conditionShape,
|
||||
const vector<uint8_t> &conditionData,
|
||||
const vector<float> &ExpectData) {
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||
auto condition = gCpu->addTensor(conditionshape, DataType::UInt8);
|
||||
auto inputx = gCpu->addTensor(inputxshape, DataType::Float32);
|
||||
auto inputy = gCpu->addTensor(inputyshape, DataType::Float32);
|
||||
auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
|
||||
auto inputX = gCpu->addTensor(inputXShape, DataType::Float32);
|
||||
auto inputY = gCpu->addTensor(inputYShape, DataType::Float32);
|
||||
|
||||
gCpu->dataMalloc();
|
||||
condition->copyin(conditiondata); //
|
||||
inputx->copyin(inputxdata);
|
||||
inputy->copyin(inputydata); //
|
||||
condition->copyin(conditionData); //
|
||||
inputX->copyin(inputXData);
|
||||
inputY->copyin(inputYData); //
|
||||
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto conditionGpu = gCuda->cloneTensor(condition);
|
||||
auto inputxGpu = gCuda->cloneTensor(inputx);
|
||||
auto inputyGpu = gCuda->cloneTensor(inputy);
|
||||
auto inputXGpu = gCuda->cloneTensor(inputX);
|
||||
auto inputYGpu = gCuda->cloneTensor(inputY);
|
||||
|
||||
auto op = gCuda->addOp<WhereObj>(inputxGpu, inputyGpu, conditionGpu,
|
||||
auto op = gCuda->addOp<WhereObj>(inputXGpu, inputYGpu, conditionGpu,
|
||||
nullptr); // WhereObj
|
||||
gCuda->dataMalloc();
|
||||
conditionGpu->copyin(conditiondata);
|
||||
inputxGpu->copyin(inputxdata);
|
||||
inputyGpu->copyin(inputydata);
|
||||
conditionGpu->copyin(conditionData);
|
||||
inputXGpu->copyin(inputXData);
|
||||
inputYGpu->copyin(inputYData);
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move data from gpu to cpu
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
|
||||
oCpu->printData(); //->printData
|
||||
EXPECT_TRUE(oCpu->equalData(ExpectData));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue