diff --git a/include/bang/bang_softmax.h b/include/bang/bang_softmax.h
index ae260d4e..9fde8419 100644
--- a/include/bang/bang_softmax.h
+++ b/include/bang/bang_softmax.h
@@ -1,11 +1,11 @@
 #pragma once
 #include "bang/bang_runtime.h"
-#include "bang_highSoftmax.h"
+#include "bang_bangSoftmax.h"
 #include "operators/softmax.h"
 
 namespace infini {
 void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
-    auto op = as<SoftmaxObj>(_op);
+    auto op = as<BangSoftmaxObj>(_op);
     void *const mlu_src = (op->getInputs(0)->getRawDataPtr<void *>());
     void *const mlu_destination = (op->getOutput()->getRawDataPtr<void *>());
@@ -31,7 +31,7 @@ void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
             othersize *= shape[s];
         }
     }
-    if (op->getOpType() == OpType::Softmax)
+    if (op->getOpType() == OpType::BangSoftmax)
         softmaxKernel(context->cnnlHandle(), (float *)mlu_destination,
                       (float *)mlu_src, nDim, axis, othersize, frontsize,
                       dimsize, stride);
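For reference, softmax_kernel's othersize/frontsize/dimsize/stride arguments decompose the input tensor for a softmax along `axis`. A minimal CPU sketch of the same decomposition, assuming the usual row-major contiguous layout (the function and variable names below are illustrative, not part of this patch):

#include <cmath>
#include <vector>

// Computes softmax over `axis` of a contiguous tensor, mirroring the
// parameters the wrapper hands to softmaxKernel.
void softmaxRef(float *dst, const float *src, const std::vector<int> &shape,
                int axis) {
    int dimsize = shape[axis]; // length of the reduced axis
    int stride = 1;            // distance between neighbors along `axis`
    for (int s = axis + 1; s < (int)shape.size(); ++s)
        stride *= shape[s];
    int frontsize = 1; // product of the dimensions before `axis`
    for (int s = 0; s < axis; ++s)
        frontsize *= shape[s];
    // othersize == frontsize * stride: the number of independent rows.
    for (int f = 0; f < frontsize; ++f) {
        for (int t = 0; t < stride; ++t) {
            const int base = f * dimsize * stride + t;
            float maxv = src[base];
            for (int i = 1; i < dimsize; ++i)
                maxv = std::fmax(maxv, src[base + i * stride]);
            float sum = 0.f;
            for (int i = 0; i < dimsize; ++i)
                sum += std::exp(src[base + i * stride] - maxv);
            for (int i = 0; i < dimsize; ++i)
                dst[base + i * stride] =
                    std::exp(src[base + i * stride] - maxv) / sum;
        }
    }
}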
diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h
index ce455d62..edbacfb7 100644
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@@ -59,6 +59,7 @@ class GraphHandlerObj {
     Tensor tanh(Tensor x, Tensor y);
     Tensor erf(Tensor x, Tensor y);
     Tensor softmax(Tensor x, Tensor y, int axis);
+    Tensor bangSoftmax(Tensor x, Tensor y, int axis);
     Tensor abs(Tensor x, Tensor y);
     Tensor sqrt(Tensor x, Tensor y);
     Tensor neg(Tensor x, Tensor y);
diff --git a/include/core/op_type.h b/include/core/op_type.h
index dbcfbdb9..acfaa160 100644
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@@ -180,6 +180,7 @@ struct OpType {
         Size,
         Slice,
         Softmax,
+        BangSoftmax,
         SoftmaxCrossEntropyLoss,
         Softplus,
         Softsign,
diff --git a/include/operators/softmax.h b/include/operators/softmax.h
index b24c0ffb..88134a47 100644
--- a/include/operators/softmax.h
+++ b/include/operators/softmax.h
@@ -24,4 +24,26 @@ class SoftmaxObj : public OperatorObj {
     vector<int> getWorkloadVector() const override;
     vector<int> getOpAttrVector() const override;
 };
+class BangSoftmaxObj : public OperatorObj {
+    int axis;
+
+  public:
+    BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int axis);
+
+    OP_CLONE(BangSoftmaxObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
+        return {{inputs[0]->getDims()}};
+    };
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+    int getAxis() const { return axis; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
 } // namespace infini
diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc
index 0821121d..4b0b8dac 100644
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@@ -227,6 +227,15 @@ Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
             ->getOutput();
     }
 }
+Tensor GraphHandlerObj::bangSoftmax(Tensor input, Tensor output, int axis) {
+    if (output) {
+        g->addOpWithOutputs<BangSoftmaxObj>(std::move(input), output, axis);
+        return output;
+    } else {
+        return g->addOp<BangSoftmaxObj>(std::move(input), output, axis)
+            ->getOutput();
+    }
+}
 
 Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
     if (output) {
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 9dc43510..7a39e547 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -522,6 +522,7 @@ void init_graph_builder(py::module &m) {
         .def("hardSigmoid", &Handler::hardSigmoid, policy::move)
         .def("hardSwish", &Handler::hardSwish, policy::move)
         .def("softmax", &Handler::softmax, policy::move)
+        .def("bangSoftmax", &Handler::bangSoftmax, policy::move)
         .def("abs", &Handler::abs, policy::move)
         .def("sqrt", &Handler::sqrt, policy::move)
         .def("neg", &Handler::neg, policy::move)
diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc
index a6b98db6..4105b168 100644
--- a/src/kernels/bang/activation.cc
+++ b/src/kernels/bang/activation.cc
@@ -246,7 +246,8 @@ REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
                 "Sigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
-
+REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
+                "Softmax_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
                 "HardSigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
diff --git a/src/kernels/bang/softmax.cc b/src/kernels/bang/softmax.cc
index 985d8c46..2ac09934 100644
--- a/src/kernels/bang/softmax.cc
+++ b/src/kernels/bang/softmax.cc
@@ -10,5 +10,5 @@ class SoftmaxBang : public BangKernelWithoutConfig {
     }
 };
 
-REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxBang, "Softmax_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::BangSoftmax, SoftmaxBang, "Softmax_BANG");
 }; // namespace infini
diff --git a/src/kernels/mlu/include/highSoftmax.h b/src/kernels/mlu/include/bangSoftmax.h
similarity index 100%
rename from src/kernels/mlu/include/highSoftmax.h
rename to src/kernels/mlu/include/bangSoftmax.h
diff --git a/src/kernels/mlu/include/bang_highSoftmax.h b/src/kernels/mlu/include/bang_bangSoftmax.h
similarity index 100%
rename from src/kernels/mlu/include/bang_highSoftmax.h
rename to src/kernels/mlu/include/bang_bangSoftmax.h
diff --git a/src/kernels/mlu/src/highSoftmax.mlu b/src/kernels/mlu/src/bangSoftmax.mlu
similarity index 90%
rename from src/kernels/mlu/src/highSoftmax.mlu
rename to src/kernels/mlu/src/bangSoftmax.mlu
index 4771f243..f384d0ae 100644
--- a/src/kernels/mlu/src/highSoftmax.mlu
+++ b/src/kernels/mlu/src/bangSoftmax.mlu
@@ -1,5 +1,5 @@
-#include "bang_highSoftmax.h"
-#include "highSoftmax.h"
+#include "bang_bangSoftmax.h"
+#include "bangSoftmax.h"
 namespace infini{
 void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src,
                    int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){
diff --git a/src/kernels/mlu/src/highSoftmax_device.mlu b/src/kernels/mlu/src/bangSoftmax_device.mlu
similarity index 100%
rename from src/kernels/mlu/src/highSoftmax_device.mlu
rename to src/kernels/mlu/src/bangSoftmax_device.mlu
diff --git a/src/operators/softmax.cc b/src/operators/softmax.cc
index f9dde777..dcac0766 100644
--- a/src/operators/softmax.cc
+++ b/src/operators/softmax.cc
@@ -31,4 +31,33 @@ vector<int> SoftmaxObj::getWorkloadVector() const {
 vector<int> SoftmaxObj::getOpAttrVector() const {
     return {type.underlying(), axis};
 }
+BangSoftmaxObj::BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output,
+                               int _axis)
+    : OperatorObj(OpType::BangSoftmax, {input}, {output}) {
+    int rank = input->getRank();
+    axis = get_real_axis(_axis, rank);
+    IT_ASSERT(checkValid(graph));
+}
+
+std::string BangSoftmaxObj::toString() const {
+    std::ostringstream os;
+    os << type.toString() << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ",";
+    os << "axis=" << axis << ")";
+    return os.str();
+}
+
+vector<int> BangSoftmaxObj::getWorkloadVector() const {
+    vector<int> ret{type.underlying(), axis};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> BangSoftmaxObj::getOpAttrVector() const {
+    return {type.underlying(), axis};
+}
 } // namespace infini
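BangSoftmaxObj's constructor normalizes the axis through get_real_axis, whose body is not part of this diff. Presumably it applies the usual ONNX convention of mapping negative axes into [0, rank); a sketch of that convention (the helper name below is hypothetical, not the repo's implementation):

// Hypothetical stand-in for get_real_axis: maps a possibly negative axis
// into [0, rank), e.g. axis = -1 with rank = 4 selects dimension 3.
int normalizeAxis(int axis, int rank) {
    int real = axis < 0 ? axis + rank : axis;
    // A real implementation would also assert 0 <= real && real < rank.
    return real;
}

Note that getWorkloadVector appends the output shape after the op type and axis, so kernel performance records are keyed per shape as well as per attribute.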
diff --git a/test/kernels/bang/test_bang_softmax.cc b/test/kernels/bang/test_bang_softmax.cc
index 83fcd0e8..0da12779 100644
--- a/test/kernels/bang/test_bang_softmax.cc
+++ b/test/kernels/bang/test_bang_softmax.cc
@@ -5,134 +5,121 @@
 #include "operators/softmax.h"
 #include "test.h"
 #include <cmath>
+#include <sys/time.h>
 
 namespace infini {
 
 double eps = 3e-3;
 
-TEST(cuDNN_Softmax, run_axis1) {
-    // Runtime
+void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
+                      int axis, const vector<float> &expectData) {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto bangRuntime = make_ref<BangRuntimeObj>();
 
     // Build input data on CPU
     Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
 
     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
     bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    inputGpu->copyin(inputData);
     bangRuntime->run(bangGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->copyin(inputData);
+    bangRuntime->run(bangGraph);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
-                      0.032058604, 0.08714432, 0.23688284, 0.6439143},
-        eps));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(expectData, eps));     // cnnlSoftmax
+    EXPECT_TRUE(bangOutputGpu2Cpu->equalData(expectData, eps)); // bangSoftmax
+}
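The Shape{2, 4} case exercised further below feeds the rows {0, 1, 2, 3} and {1000, 1001, 1002, 1003} and expects the same four probabilities for both rows: softmax is shift invariant, and a numerically stable kernel subtracts the row maximum before exponentiating, so the large row cannot overflow. A standalone check of the expected numbers (illustrative, not part of the test):

#include <cmath>
#include <cstdio>

int main() {
    // Naive exp(1000.f) would overflow float to inf; after subtracting the
    // row max the largest argument is exp(0).
    float row[4] = {1000.f, 1001.f, 1002.f, 1003.f};
    float e[4], sum = 0.f;
    for (int i = 0; i < 4; ++i)
        sum += e[i] = std::exp(row[i] - 1003.f);
    for (int i = 0; i < 4; ++i)
        std::printf("%.9f ", e[i] / sum); // 0.032058604 0.087144320 ...
    return 0;
}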
+double get_walltime() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    return (double)(tp.tv_sec + tp.tv_usec * 1e-6);
+}
+float err(float *x, float *y, const Shape &inputShape, int nDim) {
+    int size = 1;
+    for (int i = 0; i < nDim; i++) {
+        size *= inputShape[i];
+    }
+    float error = 0;
+    for (int i = 0; i < size; i++) {
+        if (fabs(x[i] - y[i]) > error) {
+            error = fabs(x[i] - y[i]);
+        }
+    }
+    return error;
+}
+void test_compareSoftmaxFp32(
+    int axis, const Shape &inputShape, int nDim,
+    const std::function<void(void *, size_t, DataType)> &generator) {
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double cnnlst, cnnlela;
+    cnnlst = get_walltime();
+    bangRuntime->run(bangGraph);
+    cnnlela = 1000 * (get_walltime() - cnnlst);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double bangst, bangela;
+    bangst = get_walltime();
+    bangRuntime->run(bangGraph);
+    bangela = 1000 * (get_walltime() - bangst);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
+    // Check
+    float error =
+        err(outputGpu2Cpu->getRawDataPtr<float *>(),
+            bangOutputGpu2Cpu->getRawDataPtr<float *>(), inputShape, nDim);
+    printf("axis:%d. bang time:%.2f ms, cnnl time:%.2f ms, err:%.4e\n", axis,
+           bangela, cnnlela, error);
+}
+TEST(BANG_SoftmaxFp32, run) {
+    test_softmaxFp32(
+        Shape{2, 3, 2, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
+    test_softmaxFp32(
+        Shape{2, 4}, vector<float>{0., 1., 2., 3., 1000, 1001, 1002, 1003}, 1,
+        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
+                      0.032058604, 0.08714432, 0.23688284, 0.6439143});
+}
+TEST(BANG_CompareSoftmaxFp32, run) {
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, IncrementalGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4,
+                            IncrementalGenerator());
+}
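The expected values in the axis-0 case above can be checked by hand: in the incremental data for Shape{2, 3, 2, 2}, the two entries that share a softmax row sit 3 * 2 * 2 = 12 elements apart and differ by exactly 12, so every row reduces to a two-element softmax of {x, x + 12}, which is independent of x. A quick verification (illustrative; the printed values match the test constants up to float rounding):

#include <cmath>
#include <cstdio>

int main() {
    // Two-element softmax of {x, x + 12} for any x.
    double lo = 1.0 / (1.0 + std::exp(12.0)); // matches 6.14417422e-06
    double hi = 1.0 - lo;                     // matches 9.99993801e-01
    std::printf("%.8e %.8e\n", lo, hi);
    return 0;
}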
-
-TEST(cuDNN_Softmax, run_axis0) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
-    bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0., 0., 0., 0., 1, 1, 1, 1}, eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis1) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
-                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
-                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
-                      0.9820138},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis2) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029,
-                      0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
-                      0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971,
-                      0.8807971},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis3) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
-                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586},
-        eps));
-}
 } // namespace infini