Modify memory allocation method

This commit is contained in:
xgqdut2016 2024-02-26 02:13:04 +00:00
parent e5d5085e6a
commit 920b23cad8
2 changed files with 100 additions and 77 deletions


@ -11,7 +11,7 @@ void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src,
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
// launch task
// launch kernel
softmaxUnion1<<<k_dim, k_type, queue>>>(mlu_destination, mlu_src, nDim, axis, othersize, frontsize, dimsize, stride);
}


@ -1,19 +1,20 @@
#include <bang.h>
#include <bang_device_functions.h>
#define EPS 1e-7
const int NRAM_MAX_SIZE = 1024 * 64;//the subsequent tree summation requires NRAM_MAX_SIZE to be a power of two
const int maxNum = NRAM_MAX_SIZE/sizeof(float); //NRAM holds at most maxNum float elements
const int NRAM_MAX_SIZE = 1024 * 256;//Reserve the maximum NRAM capacity up front
const int nramNum = NRAM_MAX_SIZE/sizeof(float);
const int SRC_MAX_SIZE = 1024 * 32;//The subsequent tree summation requires SRC_MAX_SIZE to be a power of two
const int maxNum = SRC_MAX_SIZE/sizeof(float);
const int warpSize = 32;
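
With this commit the kernels draw all scratch space from one 256 KB nram_buffer instead of separate fixed __nram__ arrays. As a rough sanity check (a host-side sketch, not part of the commit; the sizes mirror the constants above), the largest carve-out used below still fits the budget:

#include <cstddef>

constexpr std::size_t NRAM_MAX_SIZE = 1024 * 256;               // whole NRAM budget
constexpr std::size_t nramNum = NRAM_MAX_SIZE / sizeof(float);  // 65536 floats
constexpr std::size_t SRC_MAX_SIZE = 1024 * 32;                 // one data chunk
constexpr std::size_t maxNum = SRC_MAX_SIZE / sizeof(float);    // 8192 floats

// stride >= maxNum branch of softmaxKernelAxis_m: src, tmpSum, tmpNewMax and
// tmpOldMax, each maxNum floats, all carved out of the same nram_buffer.
static_assert(4 * maxNum <= nramNum, "carve-out overflows nram_buffer");
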
//strideS is the smallest power of two that is >= stride
__mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int frontsize, int dimsize, int stride) {
__mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int frontsize, int dimsize, int stride, int strideS) {
// 0 < axis < dim - 1
__nram__ float src[maxNum];
__nram__ float nram_buffer[nramNum];
if(stride >= maxNum){
__nram__ float tmpSum[maxNum];
__nram__ float tmpNewMax[maxNum];
__nram__ float tmpOldMax[maxNum];
float *src = nram_buffer;
float *tmpSum = src + maxNum;
float *tmpNewMax = tmpSum + maxNum;
float *tmpOldMax = tmpNewMax + maxNum;
int remain = stride % maxNum;
int repeat = (stride - remain) / maxNum;
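
The lines above replace four fixed-size __nram__ arrays with offsets into the single nram_buffer. A minimal C++ illustration of the same carving pattern (heap memory standing in for NRAM; names mirror the kernel, sizes are assumed):

#include <vector>

int main() {
    const int maxNum = 8192;                    // assumed chunk size, as above
    std::vector<float> nram_buffer(4 * maxNum); // stand-in for __nram__ storage
    float *src       = nram_buffer.data();      // input chunk
    float *tmpSum    = src + maxNum;            // running sum of exp(x - M)
    float *tmpNewMax = tmpSum + maxNum;         // current running max
    float *tmpOldMax = tmpNewMax + maxNum;      // max from the previous step
    (void)src; (void)tmpSum; (void)tmpNewMax; (void)tmpOldMax;
    return 0;
}
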
@ -34,7 +35,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__bang_write_zero(src, maxNum);
for(int i = 0; i < dimsize; i++){
__memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//continuously update the maximum value
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value
__bang_sub(src, src, tmpNewMax, maxNum);//x - M
__bang_active_exp_less_0(src, src, maxNum);//exp(x - M)
if(i > 0){
@ -46,8 +47,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + frontIdx + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + frontIdx + i * stride + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
@ -76,8 +77,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
//-------------------
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + frontIdx + repeat * maxNum, src, remain * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + frontIdx + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM);
@ -91,22 +92,23 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
else if(stride < maxNum && dimsize * stride >= maxNum){
const int strideS = 1024;
__nram__ float tmp[strideS];
__nram__ float tmpOldMax[strideS];
__nram__ float tmpNewMax[strideS];
__nram__ float tmpSum[strideS];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* tmpOldMax = tmp + strideS;
float* tmpNewMax = tmpOldMax + strideS;
float* tmpSum = tmpNewMax + strideS;
int multiple = maxNum / stride;
int size = multiple * stride;//the maximum amount of data one src buffer can hold
int remain = dimsize % multiple;//if dimsize is not evenly divisible, this remainder needs special handling
int repeat = (dimsize - remain) / multiple;//total number of loops needed to load the entire dimsize
int size = multiple * stride;//The maximum amount of data that can be stored in one src buffer
int remain = dimsize % multiple;//If dimsize is not evenly divisible, this remainder needs special handling
int repeat = (dimsize - remain) / multiple;//The total number of loops required to load the entire dimsize
int taskRemain = frontsize % taskDim;
int stepEasy = (frontsize - taskRemain) / taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < taskRemain ? stepHard : stepEasy);//the share of frontsize handled by each taskId
int step = (taskId < taskRemain ? stepHard : stepEasy);//The number of frontsize processed per taskId
int indStart = (taskId < taskRemain ? taskId * stepHard : taskRemain * stepHard + (taskId - taskRemain) * stepEasy);
source = source + indStart * dimsize * stride;
destination = destination + indStart * dimsize * stride;
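
The stepEasy/stepHard arithmetic splits frontsize rows over taskDim cores so the first taskRemain cores take one extra row, keeping every core within one row of the others. A runnable C++ sketch of the same partitioning (the sizes are hypothetical):

#include <cstdio>

int main() {
    const int frontsize = 10, taskDim = 4;  // assumed sizes
    for (int taskId = 0; taskId < taskDim; taskId++) {
        int taskRemain = frontsize % taskDim;
        int stepEasy = (frontsize - taskRemain) / taskDim;
        int stepHard = stepEasy + 1;
        int step = (taskId < taskRemain ? stepHard : stepEasy);
        int indStart = (taskId < taskRemain
                            ? taskId * stepHard
                            : taskRemain * stepHard + (taskId - taskRemain) * stepEasy);
        std::printf("task %d: rows [%d, %d)\n", taskId, indStart, indStart + step);
    }
    return 0;
}
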
@ -114,18 +116,18 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
for(int ind = 0; ind < step; ind++){
int frontIdx = ind * dimsize * stride;
__bang_write_value(tmpNewMax, strideS, -INFINITY);//must be initialized to negative infinity
__bang_write_value(tmp, strideS, -INFINITY);//must be initialized to negative infinity
__bang_write_zero(tmpSum, strideS);//must be initialized to 0
__bang_write_value(tmpNewMax, strideS, -INFINITY);//Must be initialized to negative infinity
__bang_write_value(tmp, strideS, -INFINITY);//Must be initialized to negative infinity
__bang_write_zero(tmpSum, strideS);//Must be initialized to zero
for(int j = 0; j < repeat; j++){
__memcpy(src, source + frontIdx + j * multiple * stride, size * sizeof(float), GDRAM2NRAM);
for(int m = 0; m < multiple; m++){
__memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);//the trailing strideS - stride part of tmpNewMax is 0, but it is never written back to GDRAM, so the result is unaffected
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);//The trailing strideS - stride part of tmpNewMax is 0, but it is never written back to GDRAM, so the result is unaffected
__bang_sub(tmp, tmp, tmpNewMax, strideS);//the trailing strideS - stride part of tmp is 0
__bang_sub(tmp, tmp, tmpNewMax, strideS);//The trailing strideS - stride part of tmp is 0
__bang_active_exp_less_0(tmp, tmp, strideS);
if(j != 0 || m != 0){
__bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS);//oldM = oldM - newM
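
The tmpOldMax/tmpNewMax bookkeeping is the online (streaming) softmax recurrence: whenever the running max grows, the accumulated sum is rescaled by exp(oldM - newM) before new terms are added. A scalar C++ sketch of what the vectorized calls compute (the sample input is assumed):

#include <cmath>
#include <cstdio>

int main() {
    const float x[] = {1.0f, 3.0f, 2.0f};       // assumed sample input
    float newM = -INFINITY, oldM = -INFINITY, sum = 0.0f;
    for (float xi : x) {
        newM = std::fmax(newM, xi);             // __bang_maxequal
        float e = std::exp(xi - newM);          // __bang_sub + exp(x - M)
        sum = sum * std::exp(oldM - newM) + e;  // rescale old sum, then accumulate
        oldM = newM;                            // oldM = newM
    }
    for (float xi : x)                          // second pass: exp(x - M) / sum
        std::printf("%f\n", std::exp(xi - newM) / sum);
    return 0;
}
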
@ -143,7 +145,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
for(int m = 0; m < remain; m++){
__memcpy(tmp, src + m * stride, stride * sizeof(float), NRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, tmp, strideS);
__bang_sub(tmp, tmp, tmpNewMax, strideS);//the trailing strideS - stride part of tmp is 0
__bang_sub(tmp, tmp, tmpNewMax, strideS);//The trailing strideS - stride part of tmp is 0
__bang_active_exp_less_0(tmp, tmp, strideS);
if(repeat != 0 || m != 0){
__bang_sub(tmpOldMax, tmpOldMax, tmpNewMax, strideS);//oldM = oldM - newM
@ -155,7 +157,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
//at this point tmpNewMax holds the maximum over the behindsize elements for this fixed frontIdx, and tmpSum holds the corresponding sum
//At this point, tmpNewMax stores the maximum over the behindsize elements for a fixed frontIdx, while tmpSum stores the corresponding sum
//__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
__bang_active_reciphp(tmpSum, tmpSum, strideS);
//__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
@ -183,14 +185,14 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
else if(dimsize * stride < maxNum){
const int strideS = 32;
__nram__ float tmp[strideS];
__nram__ float tmpOldMax[strideS];
__nram__ float tmpNewMax[strideS];
__nram__ float tmpSum[strideS];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* tmpOldMax = tmp + strideS;
float* tmpNewMax = tmpOldMax + strideS;
float* tmpSum = tmpNewMax + strideS;
int behindsize = dimsize * stride;
int multiple = maxNum / behindsize;//how many units of frontsize one maxNum buffer can take on
int multiple = maxNum / behindsize;//Represents how many units of frontsize one maxNum buffer can take on
int remainF = frontsize % (taskDim * multiple);
int remainT = remainF % taskDim;
@ -198,11 +200,11 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
int stepHard = stepEasy + 1;
int step = (taskId < remainT ? stepHard : stepEasy);
int taskRepeat = (frontsize - remainF) / (taskDim * multiple);
//at this point, in terms of frontsize, each taskId processes taskRepeat * multiple + step units
//At this point, corresponding to frontsize, the amount of data processed by each taskId is taskRepeat * multiple + step
int startHard = taskId * (taskRepeat * multiple + stepHard);
int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy);
int indStart = (taskId < remainT ? startHard: startEasy);
source = source + indStart * behindsize;//indStart * behindsize is the offset for each taskId
source = source + indStart * behindsize;//indStart * behindsize indicates the offset corresponding to each taskId
destination = destination + indStart * behindsize;
int tid;
for(int s = 0; s < taskRepeat; s++){
@ -226,7 +228,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
__memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, strideS);
__bang_mul(tmp, tmp, tmpSum, strideS);//the data left in tmp when the loop above ends can be reused
__bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized
//__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
__memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
for(int i = 0; i < dimsize - 1; i++){
@ -263,7 +265,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
//__bang_printf("max:%.2f,%.2f, sum:%.2f,sum:%.2f\n", tmpNewMax[0], tmpNewMax[1], tmpSum[0], tmpSum[0]);
__bang_active_reciphp(tmpSum, tmpSum, strideS);
__bang_mul(tmp, tmp, tmpSum, strideS);//the data left in tmp when the loop above ends can be reused
__bang_mul(tmp, tmp, tmpSum, strideS);//The data stored in tmp at the end of the loop above can be utilized
//__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
__memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
for(int i = 0; i < dimsize - 1; i++){
@ -280,8 +282,8 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
}
}
const int dimS = 32;
__mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int othersize, int dimsize) {// axis = -1
__mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int othersize, int dimsize, int dimS) {// axis = -1
int multiple = maxNum / dimsize;
int size = taskDim * multiple;
int remainS = othersize % size;
@ -290,21 +292,22 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
int stepEasy = (remainS - remainT) / taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < remainT ? stepHard : stepEasy);
//each taskId's share of othersize is taskRepeat * multiple + step
//overall, each taskId processes (taskRepeat * multiple + step) * dimsize elements
//The amount of othersize allocated to each taskId is taskRepeat * multiple + step
//Overall, the amount of data processed by each taskId is (taskRepeat * multiple + step) * dimsize
int startHard = taskId * (taskRepeat * multiple + stepHard);
int startEasy = remainT * (taskRepeat * multiple + stepHard) + (taskId - remainT) * (taskRepeat * multiple + stepEasy);
int indStart = (taskId < remainT ? startHard: startEasy);
source = source + indStart * dimsize;
destination = destination + indStart * dimsize;
__nram__ float src[maxNum];
__nram__ float tmp[dimS];
__nram__ float destSum[dimS];//used by the summation later
int remainDim = dimsize % dimS;//dimsize may not be a power of two
__nram__ float nram_buffer[nramNum];
float* src = nram_buffer;
float* tmp = src + maxNum;
float* destSum = tmp + dimS;
int remainDim = dimsize % dimS;//dimsize may not be a power of two
int repeatDim = (dimsize - remainDim) / dimS;
__nram__ float destSumFinal[warpSize];//reduce destSum into destSumFinal[0]
__nram__ float destSumFinal[warpSize];//Reduce destSum into destSumFinal[0]
__nram__ float srcMax[2];
__nram__ float destOldMax;
__nram__ float destNewMax;
@ -339,7 +342,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
if(destNewMax < srcMax[0]){
destNewMax = srcMax[0];
}
__bang_write_value(tmp, dimS, destNewMax);//must be re-initialized to the new max
__bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_sub_scalar(tmp, tmp, destNewMax, dimS);
__bang_active_exp_less_0(tmp, tmp, dimS);
@ -350,17 +353,17 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
destOldMax = destNewMax;
}
int segNum = dimS / warpSize;//start the numeric summation
int segNum = dimS / warpSize;//Starting numerical summation
for(int strip = segNum/2; strip > 0; strip = strip / 2){
for(int i = 0; i < strip ; i++){
__bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize);
}
}
__bang_reduce_sum(destSumFinal, destSum, warpSize);//at this point destSumFinal[0] holds the sum over the current dimsize elements
__bang_reduce_sum(destSumFinal, destSum, warpSize);//At this point, destSumFinal[0] holds the sum over the current dimsize elements
if(remainDim){
destSumFinal[0] = destSumFinal[0] - (dimS - remainDim);
}
//now write the data back
//Now write the data back
float globalSumInv = 1.0/destSumFinal[0];
if(remainDim){
__bang_mul_scalar(tmp, tmp, globalSumInv, dimS);
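
destSum has dimS entries (a power of two), so the segNum loop folds it in half, warpSize lanes at a time, until one warp-sized segment is left for __bang_reduce_sum; the padded dimS - remainDim lanes each contribute exp(0) = 1 and are subtracted afterwards. A scalar C++ sketch of that tree fold (sizes assumed):

#include <cstdio>

int main() {
    const int warpSize = 32, dimS = 128;    // assumed sizes, dimS a power of 2
    float destSum[dimS];
    for (int i = 0; i < dimS; i++) destSum[i] = 1.0f;  // padded with exp(0) = 1

    int segNum = dimS / warpSize;
    for (int strip = segNum / 2; strip > 0; strip /= 2)      // fold halves together
        for (int i = 0; i < strip; i++)
            for (int k = 0; k < warpSize; k++)
                destSum[i * warpSize + k] += destSum[(i + strip) * warpSize + k];

    float total = 0.0f;                                      // __bang_reduce_sum
    for (int k = 0; k < warpSize; k++) total += destSum[k];
    std::printf("%f\n", total);                              // prints 128
    return 0;
}
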
@ -373,17 +376,18 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_mul_scalar(tmp, tmp, globalSumInv, dimS);
__memcpy(destination + tid + j * dimsize + i * dimS, tmp, dimS * sizeof(float), NRAM2GDRAM);
}
}//must write back to GDRAM immediately: if we first wrote into src and then copied src back to GDRAM, src might be modified before that copy finishes
}
//it is necessary to write back to GDRAM immediately; if we first wrote into src and then copied src back to GDRAM,
//src might be modified before that copy to GDRAM has finished
}
if(step){//step handles the part of othersize that is not evenly divisible by multiple * dimsize
if(step){//Step handles the part of othersize that is not evenly divisible by multiple * dimsize
tid = taskRepeat * multiple * dimsize;
__memcpy(src, source + tid, step * dimsize * sizeof(float), GDRAM2NRAM);
for(int j = 0; j < step; j++){
__bang_write_zero(destSum, dimS);
__bang_write_zero(destSumFinal, warpSize);
destNewMax = -INFINITY;
for(int i = 0; i < repeatDim; i++){//repeatDim is the number of loop iterations needed to read the current dimsize data in chunks of dimS, for a fixed otherIdx
for(int i = 0; i < repeatDim; i++){//repeatDim is the number of loop iterations needed to read the current dimsize data in chunks of dimS, for a fixed otherIdx
__memcpy(tmp, src + j * dimsize + i * dimS, dimS * sizeof(float), NRAM2NRAM);
__bang_argmax(srcMax, tmp, dimS);
if(destNewMax < srcMax[0]){
@ -397,7 +401,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_add(destSum, destSum, tmp, dimS);
destOldMax = destNewMax;
}
if(remainDim){//remainDim is the part of dimsize not evenly divisible by dimS, for a fixed otherIdx
if(remainDim){//remainDim is the part of dimsize not evenly divisible by dimS, for a fixed otherIdx
__bang_write_value(tmp, dimS, -INFINITY);
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_argmax(srcMax, tmp, dimS);
@ -405,7 +409,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
destNewMax = srcMax[0];
}
__bang_write_value(tmp, dimS, destNewMax);//must be re-initialized to the new max
__bang_write_value(tmp, dimS, destNewMax);//Must be reinitialized to NewMax
__memcpy(tmp, src + j * dimsize + repeatDim * dimS, remainDim * sizeof(float), NRAM2NRAM);
__bang_sub_scalar(tmp, tmp, destNewMax, dimS);
__bang_active_exp_less_0(tmp, tmp, dimS);
@ -415,14 +419,14 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
__bang_add(destSum, destSum, tmp, dimS);
destOldMax = destNewMax;
}
int segNum = dimS / warpSize;//start the numeric summation
int segNum = dimS / warpSize;//Starting numerical summation
for(int strip = segNum/2; strip > 0; strip = strip / 2){
for(int i = 0; i < strip ; i++){
__bang_add(destSum + i * warpSize, destSum + i * warpSize, destSum + (i + strip) * warpSize, warpSize);
}
}
__bang_reduce_sum(destSumFinal, destSum, warpSize);//at this point destSumFinal[0] holds the sum over the current dimsize elements
__bang_reduce_sum(destSumFinal, destSum, warpSize);
//At this point, destSumFinal[0] holds the sum over the current dimsize elements
if(remainDim){
destSumFinal[0] = destSumFinal[0] - (dimS - remainDim);
}
@ -443,7 +447,7 @@ __mlu_device__ void softmaxKernelAxis_e(float* destination, float* source, int o
}
}
__mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int othersize, int dimsize, int stride) {// axis = 0
__nram__ float src[maxNum];//move maxNum elements to NRAM at a time
__nram__ float src[maxNum];//Transfer maxNum elements to NRAM each time
__nram__ float tmpSum[maxNum];
__nram__ float tmpNewMax[maxNum];
__nram__ float tmpOldMax[maxNum];
@ -451,7 +455,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
int remain = othersize % taskDim;
int stepEasy = (othersize - remain)/taskDim;
int stepHard = stepEasy + 1;
int step = (taskId < remain ? stepHard : stepEasy);//the leading taskIds each handle one extra element
int step = (taskId < remain ? stepHard : stepEasy);//The leading taskIds each handle one extra element
int indStart = (taskId < remain ? taskId * stepHard : remain * stepHard + (taskId - remain) * stepEasy);
int remainNram = step%maxNum;
int repeat = (step - remainNram)/maxNum;
@ -462,7 +466,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__bang_write_zero(tmpSum, maxNum);
for(int i = 0; i < dimsize; i++){
__memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//continuously update the maximum value
__bang_maxequal(tmpNewMax, tmpNewMax, src, maxNum);//Continuously updating the maximum value
__bang_sub(src, src, tmpNewMax, maxNum);//x - M
__bang_active_exp_less_0(src, src, maxNum);//exp(x - M)
if(i > 0){
@ -473,9 +477,9 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__bang_add(tmpSum, tmpSum, src, maxNum);//sum += exp(x - M)
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + indStart + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + indStart + j * maxNum, maxNum * sizeof(float), GDRAM2NRAM);
@ -505,9 +509,9 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
}
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//start the exponential transform and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//the data left in src when the loop above ends can be reused
__bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
//Start exponential transformation and write back to GDRAM
__bang_mul(src, src, tmpSum, maxNum);//The data stored in src at the end of the loop above can be reused
__memcpy(destination + (dimsize - 1) * stride + indStart + repeat * maxNum, src, remainNram * sizeof(float), NRAM2GDRAM);
for(int i = 0; i < dimsize - 1; i++){
__memcpy(src, source + i * stride + indStart + repeat * maxNum, remainNram * sizeof(float), GDRAM2NRAM);
@ -523,13 +527,32 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
__mlu_global__ void softmaxUnion1(float *mlu_destination, float *mlu_src, int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){
if(axis == nDim - 1){
softmaxKernelAxis_e(mlu_destination, mlu_src, othersize, dimsize);
int dimS;
float mi = log2(dimsize);
if(floor(mi) == mi){//dimsize is already a power of two
dimS = dimsize;
}
else{//otherwise round up to the next power of two
dimS = pow(2,floor(mi) + 1);
}
if(dimS < warpSize){//the later reduction needs at least one full warp
dimS = warpSize;
}
softmaxKernelAxis_e(mlu_destination, mlu_src, othersize, dimsize, dimS);
}
else if(axis == 0){
softmaxKernelAxis_s(mlu_destination, mlu_src, othersize, dimsize, stride);
}
else{
softmaxKernelAxis_m(mlu_destination, mlu_src, frontsize, dimsize, stride);
float mi = log2(stride);
int strideS;
if(floor(mi) == mi){//stride is already a power of two
strideS = stride;
}
else{//otherwise round up to the next power of two
strideS = pow(2,floor(mi) + 1);
}
softmaxKernelAxis_m(mlu_destination, mlu_src, frontsize, dimsize, stride, strideS);
}
}
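
Both call sites above round a runtime extent up to a power of two via log2/floor/pow, with dimS additionally clamped to warpSize. An equivalent integer-only version, shown as a C++ sketch with a hypothetical helper name rather than what the commit uses:

#include <cstdio>

// Smallest power of two >= n, clamped below by lo (e.g. warpSize for dimS).
static int nextPow2AtLeast(int n, int lo) {
    int p = 1;
    while (p < n) p <<= 1;
    return p < lo ? lo : p;
}

int main() {
    std::printf("%d %d %d\n",
                nextPow2AtLeast(1000, 32),  // 1024, like dimS for dimsize = 1000
                nextPow2AtLeast(17, 1),     // 32, like strideS for stride = 17
                nextPow2AtLeast(5, 32));    // 32, clamped up to warpSize
    return 0;
}
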