diff --git a/include/core/tensor.h b/include/core/tensor.h
index 63efd0f7..ff4f3480 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -145,14 +145,17 @@ class TensorObj : public TensorBaseObj {
     void printData() const;
     bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
 
-    template <typename T> bool equalData(const vector<T> &dataVector) {
+    template <typename T>
+    bool equalData(const vector<T> &dataVector, double relativeError = 1e-6) {
         IT_ASSERT(size() == dataVector.size());
         if (dtype == DataType::Float16) {
             return equalDataImpl_fp16(getRawDataPtr<uint16_t *>(),
-                                      (float *)dataVector.data(), size());
+                                      (float *)dataVector.data(), size(),
+                                      relativeError);
         }
         IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
-        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
+        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size(),
+                             relativeError);
     }
 
     size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
@@ -198,24 +201,34 @@ class TensorObj : public TensorBaseObj {
                 if (a[i] != b[i])
                     return false;
             } else if constexpr (std::is_floating_point_v<T>) {
-                if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
-                    relativeError) {
-                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
-                    return false;
+                if (fabs(b[i]) < 1e-6) {
+                    if (fabs(a[i] - b[i]) > relativeError) {
+                        printf("Error on %lu: %f %f\n", i, a[i], b[i]);
+                        return false;
+                    }
+                } else {
+                    if (fabs(a[i] - b[i]) /
+                            (std::max(fabs(a[i]), fabs(b[i])) + 1e-6) >
+                        relativeError) {
+                        printf("Error on %lu: %f %f\n", i, a[i], b[i]);
+                        return false;
+                    }
                 }
+
             } else
                 static_assert(!sizeof(T), "Unsupported data type");
         }
         return true;
     }
 
-    bool equalDataImpl_fp16(const uint16_t *a, const float *b,
-                            size_t size) const {
+    bool equalDataImpl_fp16(const uint16_t *a, const float *b, size_t size,
+                            double relativeError = 1e-6) const {
         for (size_t i = 0; i < size; ++i) {
             auto a_fp32 = fp16_to_float(a[i]);
             auto b_fp32 = b[i];
-            if (fabs(a_fp32 - b_fp32) / std::max(fabs(a_fp32), fabs(b_fp32)) >
-                1e-6) {
+            if (fabs(a_fp32 - b_fp32) /
+                    (std::max(fabs(a_fp32), fabs(b_fp32)) + 1e-6) >
+                relativeError) {
                 printf("Error on %lu: %f %f\n", i, a_fp32, b_fp32);
                 return false;
             }
diff --git a/src/kernels/mlu/include/highSoftmax.h b/src/kernels/mlu/include/highSoftmax.h
index 60c3954f..df8d0539 100644
--- a/src/kernels/mlu/include/highSoftmax.h
+++ b/src/kernels/mlu/include/highSoftmax.h
@@ -1,8 +1,8 @@
-#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_
-#define BANG_KERNELS_DIVOPERATION_DIV_H_
+#ifndef BANG_KERNELS_SOFTMAXOPERATION_SOFTMAX_H_
+#define BANG_KERNELS_SOFTMAXOPERATION_SOFTMAX_H_
 
 __mlu_global__ void softmaxUnion1(float *mlu_destination, float *mlu_src,
                                   int nDim, int axis, int othersize,
                                   int frontsize, int dimsize, int stride);
 
-#endif // BANG_KERNELS_DIVOPERATION_DIV_H_
+#endif // BANG_KERNELS_SOFTMAXOPERATION_SOFTMAX_H_
diff --git a/src/kernels/mlu/src/highSoftmax_device.mlu b/src/kernels/mlu/src/highSoftmax_device.mlu
index b9e47d96..276f69a4 100644
--- a/src/kernels/mlu/src/highSoftmax_device.mlu
+++ b/src/kernels/mlu/src/highSoftmax_device.mlu
@@ -45,7 +45,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
             __bang_add(tmpSum, tmpSum, src, maxNum);//sum += exp(x - M)
             __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
         }
-        __bang_active_recip(tmpSum, tmpSum, maxNum);//compute 1/sum
+        __bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
         //apply the exponential transform and write the result back to GDRAM
         __bang_mul(src, src, tmpSum, maxNum);//after the loop above, the data held in src can be reused
         __memcpy(destination + (dimsize - 1) * stride + frontIdx + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
@@ -75,7 +75,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
             __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
         }
         //-------------------
-        __bang_active_recip(tmpSum, tmpSum, maxNum);//compute 1/sum
+        __bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
         //apply the exponential transform and write the result back to GDRAM
         __bang_mul(src, src, tmpSum, maxNum);//after the loop above, the data held in src can be reused
         __memcpy(destination + (dimsize - 1) * stride + frontIdx + repeat * maxNum, src, remain * sizeof(float), NRAM2GDRAM);
@@ -157,7 +157,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
 
         //at this point tmpNewMax holds the maximum over the behindsize elements for the fixed frontIdx, and tmpSum holds the corresponding sum
         //__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
-        __bang_active_recip(tmpSum, tmpSum, strideS);
+        __bang_active_reciphp(tmpSum, tmpSum, strideS);
         //__bang_printf("tmpOldMax[0]:%.2f,tmpSum[0]:%.2f\n", tmpNewMax[2],tmpSum[2]);
         if(remain){
             for(int m = 0; m < remain; m++){
@@ -225,7 +225,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
                 __bang_add(tmpSum, tmpSum, tmp, strideS);//sum += exp(x - M)
                 __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM
             }
-            __bang_active_recip(tmpSum, tmpSum, strideS);
+            __bang_active_reciphp(tmpSum, tmpSum, strideS);
             __bang_mul(tmp, tmp, tmpSum, strideS);//after the loop above, the data held in tmp can be reused
             //__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
             __memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
@@ -262,7 +262,7 @@ __mlu_device__ void softmaxKernelAxis_m(float* destination, float* source, int f
                 __memcpy(tmpOldMax, tmpNewMax, stride * sizeof(float), NRAM2NRAM);//oldM = newM
             }
             //__bang_printf("max:%.2f,%.2f, sum:%.2f,sum:%.2f\n", tmpNewMax[0], tmpNewMax[1], tmpSum[0], tmpSum[0]);
-            __bang_active_recip(tmpSum, tmpSum, strideS);
+            __bang_active_reciphp(tmpSum, tmpSum, strideS);
             __bang_mul(tmp, tmp, tmpSum, strideS);//after the loop above, the data held in tmp can be reused
             //__memcpy(destination + tid + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2GDRAM);
             __memcpy(src + m * behindsize + (dimsize - 1) * stride, tmp, stride * sizeof(float), NRAM2NRAM);
@@ -473,7 +473,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
             __bang_add(tmpSum, tmpSum, src, maxNum);//sum += exp(x - M)
             __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
         }
-        __bang_active_recip(tmpSum, tmpSum, maxNum);//compute 1/sum
+        __bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
         //apply the exponential transform and write the result back to GDRAM
         __bang_mul(src, src, tmpSum, maxNum);//after the loop above, the data held in src can be reused
         __memcpy(destination + (dimsize - 1) * stride + indStart + j * maxNum, src, maxNum * sizeof(float), NRAM2GDRAM);
@@ -505,7 +505,7 @@ __mlu_device__ void softmaxKernelAxis_s(float* destination, float* source, int o
             __memcpy(tmpOldMax, tmpNewMax, maxNum * sizeof(float), NRAM2NRAM);//oldM = newM
         }
 
-        __bang_active_recip(tmpSum, tmpSum, maxNum);//compute 1/sum
+        __bang_active_reciphp(tmpSum, tmpSum, maxNum);//compute 1/sum
         //apply the exponential transform and write the result back to GDRAM
         __bang_mul(src, src, tmpSum, maxNum);//after the loop above, the data held in src can be reused
        __memcpy(destination + (dimsize - 1) * stride + indStart + repeat * maxNum, src, remainNram * sizeof(float), NRAM2GDRAM);
diff --git a/test/kernels/bang/test_bang_softmax.cc b/test/kernels/bang/test_bang_softmax.cc
index 0ce65776..83fcd0e8 100644
--- a/test/kernels/bang/test_bang_softmax.cc
+++ b/test/kernels/bang/test_bang_softmax.cc
@@ -6,7 +6,7 @@
 #include "test.h"
 #include <cmath>
 namespace infini {
-
+double eps = 3e-3;
 TEST(cuDNN_Softmax, run_axis1) {
     // Runtime
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
@@ -28,7 +28,8 @@ TEST(cuDNN_Softmax, run_axis1) {
     // Check
     EXPECT_TRUE(outputGpu2Cpu->equalData(
         vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
-                      0.032058604, 0.08714432, 0.23688284, 0.6439143}));
+                      0.032058604, 0.08714432, 0.23688284, 0.6439143},
+        eps));
 }
 
 TEST(cuDNN_Softmax, run_axis0) {
@@ -50,8 +51,8 @@ TEST(cuDNN_Softmax, run_axis0) {
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(
-        outputGpu2Cpu->equalData(vector<float>{0., 0., 0., 0., 1, 1, 1, 1}));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(
+        vector<float>{0., 0., 0., 0., 1, 1, 1, 1}, eps));
 }
 
 TEST(cuDNN_Softmax2, run_axis1) {
@@ -73,10 +74,12 @@ TEST(cuDNN_Softmax2, run_axis1) {
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
-        0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
-        0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
-        0.9820138, 0.9820138, 0.9820138, 0.9820138}));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(
+        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
+                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
+                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
+                      0.9820138},
+        eps));
 }
 
 TEST(cuDNN_Softmax2, run_axis2) {
@@ -98,10 +101,12 @@ TEST(cuDNN_Softmax2, run_axis2) {
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
-        0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
-        0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
-        0.1192029, 0.1192029, 0.8807971, 0.8807971}));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(
+        vector<float>{0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029,
+                      0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
+                      0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971,
+                      0.8807971},
+        eps));
 }
 
 TEST(cuDNN_Softmax2, run_axis3) {
@@ -123,9 +128,11 @@ TEST(cuDNN_Softmax2, run_axis3) {
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
-        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
-        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
-        0.2689414, 0.7310586, 0.2689414, 0.7310586}));
+    EXPECT_TRUE(outputGpu2Cpu->equalData(
+        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
+                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
+                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
+                      0.7310586},
+        eps));
 }
 } // namespace infini