diff --git a/include/cuda/cuda_softmax.h b/include/cuda/cuda_softmax.h
new file mode 100644
index 00000000..671f46f8
--- /dev/null
+++ b/include/cuda/cuda_softmax.h
@@ -0,0 +1,6 @@
+#pragma once
+#include "utils/small_array.h"
+namespace infini {
+void softmax_kernel(int num_blocks, float *input, float *output, int size,
+                    int dimsize, int stride);
+}
diff --git a/include/cuda/softmax.h b/include/cuda/softmax.h
deleted file mode 100644
index 5c0eccf9..00000000
--- a/include/cuda/softmax.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#pragma once
-
-namespace infini {
-void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
-                    float *y, int dim, int stride);
-}
diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc
index 8975d7cd..3ea61b41 100644
--- a/src/kernels/cpu/unary.cc
+++ b/src/kernels/cpu/unary.cc
@@ -1,6 +1,7 @@
 #include "operators/unary.h"
 #include "core/constants.h"
 #include "core/kernel.h"
+#include "operators/softmax.h"
 
 namespace infini {
 template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
@@ -22,7 +23,7 @@ template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
 template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
     void compute(const Operator &_op,
                  const RuntimeObj *context) const override {
-        auto op = as<UnaryObj>(_op);
+        auto op = as<SoftmaxObj>(_op);
         T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
         T *outptr = op->getOutput()->getRawDataPtr<T *>();
diff --git a/src/kernels/cuda/softmax.cc b/src/kernels/cuda/softmax.cc
index 437ed849..024288c2 100644
--- a/src/kernels/cuda/softmax.cc
+++ b/src/kernels/cuda/softmax.cc
@@ -1,30 +1,30 @@
 #include "operators/softmax.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_runtime.h"
-#include "cuda/softmax.h"
+#include "cuda/cuda_softmax.h"
 
 namespace infini {
 
-class SoftmaxCudnn : public CudaKernelWithoutConfig {
+class SoftmaxCuda : public CudaKernelWithoutConfig {
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
         auto op = as<SoftmaxObj>(_op);
-        auto x = op->getInputs(0)->getRawDataPtr<float *>();
-        auto y = op->getOutput(0)->getRawDataPtr<float *>();
+        auto input = op->getInputs(0)->getRawDataPtr<void *>();
+        auto output = op->getOutput(0)->getRawDataPtr<void *>();
+        const auto &inShape = op->getInputs(0)->getDims(); // input shape
         auto dims = op->getInputs(0)->getDims();
-        int batch_size = 1;
-        for (size_t i = 0; i < dims.size(); ++i)
-            batch_size *= dims[i];
-        int dim = dims[op->getAxis()];
+        int size; // total element count I*J*K*S
+        size = op->getOutput(0)->size();
+        int dimsize = dims[op->getAxis()];
+        int stride = op->getInputs(0)->getStride().at(op->getAxis());
 
-        int block_num = batch_size / dim;
-        int max_threadblock_size = batch_size / block_num;
-        softmax_kernel(max_threadblock_size, block_num, x, y, dim,
-                       op->getInputs(0)->getStride().at(op->getAxis()));
+        int num_blocks = size / dimsize;
+        softmax_kernel(num_blocks, (float *)input, (float *)output, size,
+                       dimsize, stride);
     }
 };
 
-REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCudnn,
+REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCuda,
                 "Softmax_CUDA_Float32");
 } // namespace infini
diff --git a/src/kernels/cuda/softmax.cu b/src/kernels/cuda/softmax.cu
index 1f7f39e6..7e85ec43 100644
--- a/src/kernels/cuda/softmax.cu
+++ b/src/kernels/cuda/softmax.cu
@@ -1,77 +1,183 @@
 #include "cuda/cuda_common.h"
-#include "cuda/softmax.h"
 #include <cub/cub.cuh>
 
-struct __align__(8) MD {
-    float data;
-    float d;
+struct __align__(8) DataMaxSum { // running max and sum for the online
+                                 // softmax, kept in max_tmp and sum_tmp
+    float max_tmp; // store max
+    float sum_tmp; // store sum
+};
+__device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a,
+                                                    DataMaxSum b) {
+    bool a_bigger = (a.max_tmp > b.max_tmp);
+    DataMaxSum bigger = a_bigger ? a : b;
+    DataMaxSum smaller = a_bigger ? b : a;
+    bigger.sum_tmp = bigger.sum_tmp +
+                     smaller.sum_tmp * __expf(smaller.max_tmp - bigger.max_tmp);
+
+    return bigger;
+}
+template <int BLOCK_DIM>
+__launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
+    float *__restrict input, float *__restrict output, int size, int dimsize,
+    int stride) { // e.g. with axis = 1 and inputShape = [I,J,K,S]:
+    // a flat element index decomposes as i(JKS) + j(KS) + k(S) + s
+
+    // gridDim.x = size/dimsize = IKS
+    // blockIdx.x = i(KS) + k(S) + s, blockIdx.x % stride = k(S) + s
+
+    int tid =
+        blockIdx.x % stride + (blockIdx.x - blockIdx.x % stride) *
+                                  dimsize; // now, tid = i(JKS) + k(S) + s
+
+    DataMaxSum dms_partial;
+    dms_partial.max_tmp = -__FLT_MAX__;
+    dms_partial.sum_tmp = 0.0f;
+    DataMaxSum dms_input;
+    for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
+
+        dms_input.max_tmp =
+            input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
+
+        dms_input.sum_tmp = 1.0f;
+        dms_partial = reduce_dms_op(dms_partial,
+                                    dms_input); // fold into this thread's partial
+    }
+    typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    __shared__ DataMaxSum dms_total;
+    DataMaxSum dms_block =
+        BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op);
+    if (threadIdx.x ==
+        0) { // only thread 0 writes the block result to shared memory
+        dms_total = dms_block;
+    }
+    __syncthreads();
+    //-----------------
+
+    for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
+        output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
+            __expf(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
+                   dms_total.max_tmp) *
+            __fdividef(1.0F, dms_total.sum_tmp);
+    }
+}
+
+template <typename T> struct SumOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return a + b;
+    }
 };
-__device__ __forceinline__ MD reduce_md_op(MD a, MD b) {
-    bool a_bigger = (a.data > b.data);
-    MD bigger_m = a_bigger ? a : b;
-    MD smaller_m = a_bigger ? b : a;
-    MD res;
-    res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.data - bigger_m.data);
-    res.data = bigger_m.data;
-    return res;
-}
-
-template <int THREADBLOCK_SIZE>
-__launch_bounds__(THREADBLOCK_SIZE) __global__
-    void online_softmax(const float *__restrict in, float *__restrict out,
-                        int dimSize, int stride) {
-
-    // reposition in and out to data for the current vector
-    int blockOffset = blockIdx.x;
-    if (blockIdx.x >= stride) {
-        int tmp = blockIdx.x % stride;
-        blockOffset = tmp + (blockIdx.x - tmp) * dimSize;
+template <typename T> struct MaxOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
     }
-    in += blockOffset;
-    out += blockOffset;
-
-    MD md_partial;
-    md_partial.data = -FLT_MAX;
-    md_partial.d = 0.0F;
-
-    for (int elem_id = threadIdx.x; elem_id < dimSize;
-         elem_id += THREADBLOCK_SIZE) {
-        MD new_elem;
-        new_elem.data = in[elem_id * stride];
-        new_elem.d = 1.0F;
-        md_partial = reduce_md_op(md_partial, new_elem);
+};
+template
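
A note on the reduction above: reduce_dms_op implements the "online softmax" merge. Each partial carries a running (max, sum-of-exponentials) pair, and combining two partials keeps the larger max while rescaling the other partial's sum by __expf(smaller_max - bigger_max), so the result matches a two-pass max-then-sum reduction. Below is a minimal host-side check of that merge rule; it is illustrative only (merge() and the test values are not part of this patch):

    #include <cassert>
    #include <cmath>
    #include <vector>

    struct DataMaxSum {
        float max_tmp; // running max
        float sum_tmp; // running sum of exp(x - max_tmp)
    };

    // Same rule as the device-side reduce_dms_op.
    static DataMaxSum merge(DataMaxSum a, DataMaxSum b) {
        bool a_bigger = (a.max_tmp > b.max_tmp);
        DataMaxSum bigger = a_bigger ? a : b;
        DataMaxSum smaller = a_bigger ? b : a;
        bigger.sum_tmp +=
            smaller.sum_tmp * std::exp(smaller.max_tmp - bigger.max_tmp);
        return bigger;
    }

    int main() {
        std::vector<float> x = {1.0f, -2.5f, 3.0f, 0.5f};

        // Single pass: fold each element in as the pair (value, 1).
        DataMaxSum acc = {-HUGE_VALF, 0.0f};
        for (float v : x)
            acc = merge(acc, DataMaxSum{v, 1.0f});

        // Two-pass reference: global max first, then the shifted sum.
        float m = -HUGE_VALF;
        for (float v : x)
            m = std::fmax(m, v);
        float s = 0.0f;
        for (float v : x)
            s += std::exp(v - m);

        assert(std::fabs(acc.max_tmp - m) < 1e-6f);
        assert(std::fabs(acc.sum_tmp - s) < 1e-5f);
        return 0;
    }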
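The index arithmetic in _blockSoftmaxKernel is worth spelling out. One thread block handles one softmax vector; blocks enumerate the size/dimsize = IKS vectors, and the kernel converts the block index b into the flat offset of that vector's first element via tid = b % stride + (b - b % stride) * dimsize, after which element j of the vector sits at tid + j * stride. A small host-side check of this mapping for axis = 1 (again illustrative only; the shape constants are arbitrary):

    #include <cassert>

    int main() {
        // inputShape = [I, J, K, S], softmax over axis 1:
        // dimsize = J, stride = K*S.
        const int I = 2, J = 5, K = 3, S = 4;
        const int dimsize = J, stride = K * S;

        for (int i = 0; i < I; ++i)
            for (int k = 0; k < K; ++k)
                for (int s = 0; s < S; ++s) {
                    int b = i * K * S + k * S + s; // block index: i(KS) + k(S) + s
                    int tid = b % stride + (b - b % stride) * dimsize;
                    // Expected row-major offset of element (i, j=0, k, s).
                    assert(tid == i * J * K * S + k * S + s);
                }
        return 0;
    }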
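The remainder of softmax.cu is truncated above, so the definition matching the new softmax_kernel declaration is not shown. A plausible shape for it, placed after _blockSoftmaxKernel in the same file, is a thin launcher assigning one BLOCK_DIM-thread block per vector; this is a guess at the missing code, not the patch's actual implementation, which may instead dispatch to a warp-level kernel (note the SumOp/MaxOp helpers it introduces) when dimsize is small:

    namespace infini {
    void softmax_kernel(int num_blocks, float *input, float *output, int size,
                        int dimsize, int stride) {
        // num_blocks = size / dimsize: one thread block per softmax vector.
        _blockSoftmaxKernel<1024>
            <<<num_blocks, 1024>>>(input, output, size, dimsize, stride);
    }
    } // namespace infini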