Modify memory allocation method

This commit is contained in:
xgqdut2016 2024-02-26 02:13:04 +00:00
parent e5d5085e6a
commit 920b23cad8
2 changed files with 100 additions and 77 deletions


@ -11,7 +11,7 @@ void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src,
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
// launch task
// launch kernel
softmaxUnion1<<<k_dim, k_type, queue>>>(mlu_destination, mlu_src, nDim, axis, othersize, frontsize, dimsize, stride);
}


@ -1,19 +1,20 @@
#include <bang.h>
#include <bang_device_functions.h>
#define EPS 1e-7
const int NRAM_MAX_SIZE = 1024 * 64;//the subsequent tree summation requires NRAM_MAX_SIZE to be a power of two
const int maxNum = NRAM_MAX_SIZE/sizeof(float); //NRAM holds at most maxNum float elements
const int NRAM_MAX_SIZE = 1024 * 256;//Reserve the maximum NRAM capacity up front
const int nramNum = NRAM_MAX_SIZE/sizeof(float);
const int SRC_MAX_SIZE = 1024 * 32;//The subsequent tree summation requires SRC_MAX_SIZE to be a power of two
const int maxNum = SRC_MAX_SIZE/sizeof(float);
const int warpSize = 32;
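
With this commit the kernels draw all scratch space from one 256 KB nram_buffer instead of separate fixed __nram__ arrays. As a rough sanity check (a host-side sketch, not part of the commit; the sizes mirror the constants above), the largest carve-out used below still fits the budget:

#include <cstddef>

constexpr std::size_t NRAM_MAX_SIZE = 1024 * 256;               // whole NRAM budget
constexpr std::size_t nramNum = NRAM_MAX_SIZE / sizeof(float);  // 65536 floats
constexpr std::size_t SRC_MAX_SIZE = 1024 * 32;                 // one data chunk
constexpr std::size_t maxNum = SRC_MAX_SIZE / sizeof(float);    // 8192 floats

// stride >= maxNum branch of softmaxKernelAxis_m: src, tmpSum, tmpNewMax and
// tmpOldMax, each maxNum floats, all carved out of the same nram_buffer.
static_assert(4 * maxNum <= nramNum, "carve-out overflows nram_buffer");
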
//strideS is the smallest power of two that is >= stride
__mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int frontsize, int dimsize, int stride) {
__mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int frontsize, int dimsize, int stride, int strideS) {
// 0 < axis < dim - 1
__nram__ float src[maxNum];
__nram__ float nram_buffer[nramNum];
if(stride >= maxNum){
__nram__ float tmpSum[maxNum];
__nram__ float tmpNewMax[maxNum];
__nram__ float tmpOldMax[maxNum];
float *src = nram_buffer;
float *tmpSum = src + maxNum;
float *tmpNewMax = tmpSum + maxNum;
float *tmpOldMax = tmpNewMax + maxNum;
int remain = stride % maxNum;
int repeat = (stride - remain) / maxNum;
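
The lines above replace four fixed-size __nram__ arrays with offsets into the single nram_buffer. A minimal C++ illustration of the same carving pattern (heap memory standing in for NRAM; names mirror the kernel, sizes are assumed):

#include <vector>

int main() {
    const int maxNum = 8192;                    // assumed chunk size, as above
    std::vector<float> nram_buffer(4 * maxNum); // stand-in for __nram__ storage
    float *src       = nram_buffer.data();      // input chunk
    float *tmpSum    = src + maxNum;            // running sum of exp(x - M)
    float *tmpNewMax = tmpSum + maxNum;         // current running max
    float *tmpOldMax = tmpNewMax + maxNum;      // max from the previous step
    (void)src; (void)tmpSum; (void)tmpNewMax; (void)tmpOldMax;
    return 0;
}
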
@ -34,7 +35,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__bang_write_zero(src, maxNum);
for(int i = 0; i < dimsize; i++){
__memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//continuously update the maximum value
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value
__bang_sub(src, src, tmpNewMax, maxNum);//x - M
__bang_active_exp_less_0(src, src, maxNum);//exp(x - M)
if(i > 0){
@ -46,8 +47,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + frontIdx + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
@ -76,8 +77,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
//-------------------
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + frontIdx + repeat * maxNum, src, remain * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + frontIdx + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM);
@ -91,22 +92,23 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
else if(stride < maxNum && dimsize * stride >= maxNum){
const int strideS = 1024;
__nram__ float tmp[strideS];
__nram__ float tmpOldMax[strideS];
__nram__ float tmpNewMax[strideS];
__nram__ float tmpSum[strideS];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* tmpOldMax = tmp + strideS;
float* tmpNewMax = tmpOldMax + strideS;
float* tmpSum = tmpNewMax + strideS;
int multiple = maxNum / stride;
int size = multiple * stride;//the maximum amount of data one src buffer can hold
int remain = dimsize % multiple;//if dimsize is not evenly divisible, this remainder needs special handling
int repeat = (dimsize - remain) / multiple;//total number of loops needed to load the entire dimsize
int size = multiple * stride;//The maximum amount of data that can be stored in one src buffer
int remain = dimsize % multiple;//If dimsize is not evenly divisible, this remainder needs special handling
int repeat = (dimsize - remain) / multiple;//The total number of loops required to load the entire dimsize
int taskRemain = frontsize % taskDim;
int stepEasy = (frontsize - taskRemain) / taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < taskRemain ? stepHard : stepEasy);//the share of frontsize handled by each taskId
int step = (taskId < taskRemain ? stepHard : stepEasy);//The number of frontsize processed per taskId
int indStart = (taskId < taskRemain ? taskId * stepHard : taskRemain * stepHard + (taskId - taskRemain) * stepEasy);
source = source + indStart * dimsize * stride;
destination = destination + indStart * dimsize * stride;
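
The stepEasy/stepHard arithmetic splits frontsize rows over taskDim cores so the first taskRemain cores take one extra row, keeping every core within one row of the others. A runnable C++ sketch of the same partitioning (the sizes are hypothetical):

#include <cstdio>

int main() {
    const int frontsize = 10, taskDim = 4;  // assumed sizes
    for (int taskId = 0; taskId < taskDim; taskId++) {
        int taskRemain = frontsize % taskDim;
        int stepEasy = (frontsize - taskRemain) / taskDim;
        int stepHard = stepEasy + 1;
        int step = (taskId < taskRemain ? stepHard : stepEasy);
        int indStart = (taskId < taskRemain
                            ? taskId * stepHard
                            : taskRemain * stepHard + (taskId - taskRemain) * stepEasy);
        std::printf("task %d: rows [%d, %d)\n", taskId, indStart, indStart + step);
    }
    return 0;
}
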
@ -114,18 +116,18 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
for(int ind = 0; ind < step; ind++){
int frontIdx = ind * dimsize * stride;
__bang_write_value(tmpNewMax, strideS, -INFINITY);//must be initialized to negative infinity
__bang_write_value(tmp, strideS, -INFINITY);//must be initialized to negative infinity
__bang_write_zero(tmpSum, strideS);//must be initialized to 0
__bang_write_value(tmpNewMax, strideS, -INFINITY);//Must be initialized to negative infinity
__bang_write_value(tmp, strideS, -INFINITY);//Must be initialized to negative infinity
__bang_write_zero(tmpSum, strideS);//Must be initialized to zero
for(int j = 0; j < repeat; j++){
__memcpy(src, source + frontIdx + j * multiple * stride, size * sizeof(float), GDRAM2NRAM);
for(int m = 0; m < multiple; m++){
__memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);//the trailing strideS - stride part of tmpNewMax is 0, but it is never written back to GDRAM, so the result is unaffected
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);//The trailing strideS - stride part of tmpNewMax is 0, but it is never written back to GDRAM, so the result is unaffected
__bang_sub(tmp, tmp, tmpNewMax, strideS);//the trailing strideS - stride part of tmp is 0
__bang_sub(tmp, tmp, tmpNewMax, strideS);//The trailing strideS - stride part of tmp is 0
__bang_active_exp_less_0(tmp, tmp, strideS);
if(j != 0 || m != 0){
__bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS);//oldM = oldM - newM
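
The tmpOldMax/tmpNewMax bookkeeping is the online (streaming) softmax recurrence: whenever the running max grows, the accumulated sum is rescaled by exp(oldM - newM) before new terms are added. A scalar C++ sketch of what the vectorized calls compute (the sample input is assumed):

#include <cmath>
#include <cstdio>

int main() {
    const float x[] = {1.0f, 3.0f, 2.0f};       // assumed sample input
    float newM = -INFINITY, oldM = -INFINITY, sum = 0.0f;
    for (float xi : x) {
        newM = std::fmax(newM, xi);             // __bang_maxequal
        float e = std::exp(xi - newM);          // __bang_sub + exp(x - M)
        sum = sum * std::exp(oldM - newM) + e;  // rescale old sum, then accumulate
        oldM = newM;                            // oldM = newM
    }
    for (float xi : x)                          // second pass: exp(x - M) / sum
        std::printf("%f\n", std::exp(xi - newM) / sum);
    return 0;
}
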
@ -143,7 +145,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
for(int m = 0; m < remain; m++){
__memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);
__bang_sub(tmp, tmp, tmpNewMax, strideS);//the trailing strideS - stride part of tmp is 0
__bang_sub(tmp, tmp, tmpNewMax, strideS);//The trailing strideS - stride part of tmp is 0
__bang_active_exp_less_0(tmp, tmp, strideS);
if(repeat != 0 || m != 0){
__bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS);//oldM = oldM - newM
@ -155,7 +157,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
//at this point tmpNewMax holds the maximum over the behindsize elements for this fixed frontIdx, and tmpSum holds the corresponding sum
//At this point, tmpNewMax stores the maximum over the behindsize elements for a fixed frontIdx, while tmpSum stores the corresponding sum
//__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
__bang_active_reciphp(tmpSum, tmpSum, strideS);
//__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
@ -183,14 +185,14 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
else if(dimsize * stride < maxNum){
const int strideS = 32;
__nram__ float tmp[strideS];
__nram__ float tmpOldMax[strideS];
__nram__ float tmpNewMax[strideS];
__nram__ float tmpSum[strideS];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* tmpOldMax = tmp + strideS;
float* tmpNewMax = tmpOldMax + strideS;
float* tmpSum = tmpNewMax + strideS;
int behindsize = dimsize * stride;
int multiple = maxNum / behindsize;//how many units of frontsize one maxNum buffer can take on
int multiple = maxNum / behindsize;//Represents how many units of frontsize one maxNum buffer can take on
int remainF = frontsize % (taskDim * multiple);
int remainT = remainF % taskDim;
@ -198,11 +200,11 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
int stepHard = stepEasy + 1;
int step = (taskId < remainT ? stepHard : stepEasy);
int taskRepeat = (frontsize - remainF) / (taskDim * multiple);
//at this point, in terms of frontsize, each taskId processes taskRepeat * multiple + step units
//At this point, corresponding to frontsize, the amount of data processed by each taskId is taskRepeat * multiple + step
int startHard = taskId * (taskRepeat * multiple + stepHard);
int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy);
int indStart = (taskId < remainT ? startHard: startEasy);
source = source + indStart * behindsize;//indStart * behindsize is the offset for each taskId
source = source + indStart * behindsize;//indStart * behindsize indicates the offset corresponding to each taskId
destination = destination + indStart * behindsize;
int tid;
for(int s = 0; s < taskRepeat; s++){
@ -226,7 +228,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, strideS);
__bang_mul(tmp, tmp, tmpSum, strideS);//the data left in tmp when the loop above ends can be reused
__bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized
//__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
__memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
for(int i = 0; i < dimsize - 1; i++){
@ -263,7 +265,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
//__bang_printf("max:%.2f,%.2f, sum:%.2f,sum:%.2f\n", tmpNewMax[0], tmpNewMax[1], tmpSum[0], tmpSum[0]);
__bang_active_reciphp(tmpSum, tmpSum, strideS);
__bang_mul(tmp, tmp, tmpSum, strideS);//the data left in tmp when the loop above ends can be reused
__bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized
//__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
__memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
for(int i = 0; i < dimsize - 1; i++){
@ -280,8 +282,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
const int dimS = 32;
__mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int othersize, int dimsize) {// axis = -1
__mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int othersize, int dimsize, int dimS) {// axis = -1
int multiple = maxNum / dimsize;
int size = taskDim * multiple;
int remainS = othersize % size;
@ -290,21 +292,22 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
int stepEasy = (remainS - remainT) / taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < remainT ? stepHard : stepEasy);
//each taskId's share of othersize is taskRepeat * multiple + step
//overall, each taskId processes (taskRepeat * multiple + step) * dimsize elements
//The amount of othersize allocated to each taskId is taskRepeat * multiple + step
//Overall, the amount of data processed by each taskId is (taskRepeat * multiple + step) * dimsize
int startHard = taskId * (taskRepeat * multiple + stepHard);
int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy);
int indStart = (taskId < remainT ? startHard: startEasy);
source = source + indStart * dimsize;
destination = destination + indStart * dimsize;
__nram__ float src[maxNum];
__nram__ float tmp[dimS];
__nram__ float destSum[dimS];//used by the summation later
int remainDim = dimsize % dimS;//dimsize may not be a power of two
__nram__ float nram_buffer[nramNum];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* destSum = tmp + dimS;
int remainDim = dimsize % dimS;//dimsize may not be a power of two
int repeatDim = (dimsize - remainDim) / dimS;
__nram__ float destSumFinal[warpSize];//reduce destSum into destSumFinal[0]
__nram__ float destSumFinal[warpSize];//Reduce destSum into destSumFinal[0]
__nram__ float srcMax[2];
__nram__ float destOldMax;
__nram__ float destNewMax;
@ -339,7 +342,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
if(destNewMax < srcMax[0]){
destNewMax = srcMax[0];
}
__bang_write_value(tmp, dimS, destNewMax);//must be re-initialized to the new max
__bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_sub_scalar(tmp, tmp, destNewMax, dimS);
__bang_active_exp_less_0(tmp, tmp, dimS);
@ -350,17 +353,17 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
destOldMax = destNewMax;
}
int segNum = dimS / warpSize;//start the numeric summation
int segNum = dimS / warpSize;//Starting numerical summation
for(int strip = segNum/2; strip > 0; strip = strip / 2){
for(int i = 0; i < strip ; i++){
__bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize);
}
}
__bang_reduce_sum(destSumFinal, destSum, warpSize);//at this point destSumFinal[0] holds the sum over the current dimsize elements
__bang_reduce_sum(destSumFinal, destSum, warpSize);//At this point, destSumFinal[0] holds the sum over the current dimsize elements
if(remainDim){
destSumFinal[0] = destSumFinal[0] - (dimS - remainDim);
}
//now write the data back
//Now write the data back
float globalSumInv = 1.0/destSumFinal[0];
if(remainDim){
__bang_mul_scalar(tmp, tmp, globalSumInv, dimS);
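
destSum has dimS entries (a power of two), so the segNum loop folds it in half, warpSize lanes at a time, until one warp-sized segment is left for __bang_reduce_sum; the padded dimS - remainDim lanes each contribute exp(0) = 1 and are subtracted afterwards. A scalar C++ sketch of that tree fold (sizes assumed):

#include <cstdio>

int main() {
    const int warpSize = 32, dimS = 128;    // assumed sizes, dimS a power of 2
    float destSum[dimS];
    for (int i = 0; i < dimS; i++) destSum[i] = 1.0f;  // padded with exp(0) = 1

    int segNum = dimS / warpSize;
    for (int strip = segNum / 2; strip > 0; strip /= 2)      // fold halves together
        for (int i = 0; i < strip; i++)
            for (int k = 0; k < warpSize; k++)
                destSum[i * warpSize + k] += destSum[(i + strip) * warpSize + k];

    float total = 0.0f;                                      // __bang_reduce_sum
    for (int k = 0; k < warpSize; k++) total += destSum[k];
    std::printf("%f\n", total);                              // prints 128
    return 0;
}
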
@ -373,17 +376,18 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_mul_scalar(tmp, tmp, globalSumInv, dimS);
__memcpy(destination + tid + j * dimsize + i * dimS, tmp, dimS * sizeof(float), NRAM2GDRAM);
}
}//must write back to GDRAM immediately: if we first wrote into src and then copied src back to GDRAM, src might be modified before that copy finishes
}
//it is necessary to write back to GDRAM immediately; if we first wrote into src and then copied src back to GDRAM,
//src might be modified before that copy to GDRAM has finished
}
if(step){//step handles the part of othersize that is not evenly divisible by multiple * dimsize
if(step){//Step handles the part of othersize that is not evenly divisible by multiple * dimsize
tid = taskRepeat * multiple * dimsize;
__memcpy(src, source + tid, step * dimsize * sizeof(float), GDRAM2NRAM);
for(int j = 0; j < step; j++){
__bang_write_zero(destSum, dimS);
__bang_write_zero(destSumFinal, warpSize);
destNewMax = -INFINITY;
for(int i = 0; i < repeatDim; i++){//repeatDim is the number of loop iterations needed to read the current dimsize data in chunks of dimS, for a fixed otherIdx
for(int i = 0; i < repeatDim; i++){//repeatDim is the number of loop iterations needed to read the current dimsize data in chunks of dimS, for a fixed otherIdx
__memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM);
__bang_argmax(srcMax, tmp, dimS);
if(destNewMax < srcMax[0]){
@ -397,7 +401,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_add(destSum, destSum, tmp, dimS);
destOldMax = destNewMax;
}
if(remainDim){//remainDim is the part of dimsize not evenly divisible by dimS, for a fixed otherIdx
if(remainDim){//remainDim is the part of dimsize not evenly divisible by dimS, for a fixed otherIdx
__bang_write_value(tmp, dimS, -INFINITY);
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_argmax(srcMax, tmp, dimS);
@ -405,7 +409,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
destNewMax = srcMax[0];
}
__bang_write_value(tmp, dimS, destNewMax);//must be re-initialized to the new max
__bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_sub_scalar(tmp, tmp, destNewMax, dimS);
__bang_active_exp_less_0(tmp, tmp, dimS);
@ -415,14 +419,14 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_add(destSum, destSum, tmp, dimS);
destOldMax = destNewMax;
}
int segNum = dimS / warpSize;//start the numeric summation
int segNum = dimS / warpSize;//Starting numerical summation
for(int strip = segNum/2; strip > 0; strip = strip / 2){
for(int i = 0; i < strip ; i++){
__bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize);
}
}
__bang_reduce_sum(destSumFinal, destSum, warpSize);//at this point destSumFinal[0] holds the sum over the current dimsize elements
__bang_reduce_sum(destSumFinal, destSum, warpSize);
//At this point, destSumFinal[0] holds the sum over the current dimsize elements
if(remainDim){
destSumFinal[0] = destSumFinal[0] - (dimS - remainDim);
}
@ -443,7 +447,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
}
}
__mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int othersize, int dimsize, int stride) {// axis = 0
__nram__ float src[maxNum];//move maxNum elements to NRAM at a time
__nram__ float src[maxNum];//Transfer maxNum elements to NRAM each time
__nram__ float tmpSum[maxNum];
__nram__ float tmpNewMax[maxNum];
__nram__ float tmpOldMax[maxNum];
@ -451,7 +455,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
int remain = othersize % taskDim;
int stepEasy = (othersize - remain)/taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < remain ? stepHard : stepEasy);//the leading taskIds each handle one extra element
int step = (taskId < remain ? stepHard : stepEasy);//The leading taskIds each handle one extra element
int indStart = (taskId < remain ? taskId * stepHard : remain * stepHard + (taskId - remain) * stepEasy);
int remainNram = step%maxNum;
int repeat = (step - remainNram)/maxNum;
@ -462,7 +466,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__bang_write_zero(tmpSum, maxNum);
for(int i = 0; i < dimsize; i++){
__memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//continuously update the maximum value
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value
__bang_sub(src, src, tmpNewMax, maxNum);//x - M
__bang_active_exp_less_0(src, src, maxNum);//exp(x - M)
if(i > 0){
@ -473,9 +477,9 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__bang_add(tmpSum, tmpSum, src, maxNum);//sum += exp(x - M)
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + indStart + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
@ -505,9 +509,9 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + indStart + repeat * maxNum, src, remainNram * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + indStart + repeat * maxNum, remainNram * sizeof(float), GDRAM2NRAM);
@ -523,13 +527,32 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__mlu_global__ void softmaxUnion1(float *mlu_destination, float *mlu_src, int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){
if(axis == nDim - 1){
softmaxKernelAxis_e(mlu_destination, mlu_src, othersize, dimsize);
int dimS;
float mi = log2(dimsize);
if(floor(mi) == mi){//dimsize is already a power of two
dimS = dimsize;
}
else{//otherwise round up to the next power of two
dimS = pow(2,floor(mi) + 1);
}
if(dimS < warpSize){//the later reduction needs at least one full warp
dimS = warpSize;
}
softmaxKernelAxis_e(mlu_destination, mlu_src, othersize, dimsize, dimS);
}
else if(axis == 0){
softmaxKernelAxis_s(mlu_destination, mlu_src, othersize, dimsize, stride);
}
else{
softmaxKernelAxis_m(mlu_destination, mlu_src, frontsize, dimsize, stride);
float mi = log2(stride);
int strideS;
if(floor(mi) == mi){//stride is already a power of two
strideS = stride;
}
else{//otherwise round up to the next power of two
strideS = pow(2,floor(mi) + 1);
}
softmaxKernelAxis_m(mlu_destination, mlu_src, frontsize, dimsize, stride, strideS);
}
}
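
Both call sites above round a runtime extent up to a power of two via log2/floor/pow, with dimS additionally clamped to warpSize. An equivalent integer-only version, shown as a C++ sketch with a hypothetical helper name rather than what the commit uses:

#include <cstdio>

// Smallest power of two >= n, clamped below by lo (e.g. warpSize for dimS).
static int nextPow2AtLeast(int n, int lo) {
    int p = 1;
    while (p < n) p <<= 1;
    return p < lo ? lo : p;
}

int main() {
    std::printf("%d %d %d\n",
                nextPow2AtLeast(1000, 32),  // 1024, like dimS for dimsize = 1000
                nextPow2AtLeast(17, 1),     // 32, like strideS for stride = 17
                nextPow2AtLeast(5, 32));    // 32, clamped up to warpSize
    return 0;
}
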