diff --git a/src/kernels/cuda/dynamic_quantize_linear.cu b/src/kernels/cuda/dynamic_quantize_linear.cu
index 035419a0..e5dc7862 100644
--- a/src/kernels/cuda/dynamic_quantize_linear.cu
+++ b/src/kernels/cuda/dynamic_quantize_linear.cu
@@ -3,7 +3,7 @@
 #include "cuda/cuda_common.h"
 
 __device__ float _saturate(float x) {
-    return x < 0.f ? 0.f : (x > 255.0 ? 255.0 : x);
+    return x < 0.f ? 0.f : (x > 255.0f ? 255.0f : x);
 }
 
 template <int BLOCK_DIM>
@@ -22,8 +22,8 @@ __launch_bounds__(BLOCK_DIM) __global__
                        float *yScale, uint8_t *yZeroPoint, int size) {
     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
 
-    float maxData = __FLT_MAX__;
-    float minData = -__FLT_MAX__;
+    float maxData = -__FLT_MAX__;
+    float minData = __FLT_MAX__;
     int remain = size % BLOCK_DIM;
     int step = (size - remain) / BLOCK_DIM + 1;
 
@@ -38,6 +38,7 @@ __launch_bounds__(BLOCK_DIM) __global__
                                             (threadIdx.x - remain) * (step - 1) + ind]);
         }
     }
+
     if (threadIdx.x < remain) {
         for (int ind = 0; ind < step; ind++) {
             minData = min___(minData, input[threadIdx.x * step + ind]);
@@ -49,25 +50,27 @@ __launch_bounds__(BLOCK_DIM) __global__
                                             (threadIdx.x - remain) * (step - 1) + ind]);
         }
     }
-    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    __shared__ float maxTotal;
-    float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
-    __shared__ float minTotal;
-    float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
-    if (threadIdx.x == 0) {
-        maxTotal = blockMax;
-        minTotal = blockMin;
-    }
-    __syncthreads();
+    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storageMax;
+
+    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
+
+    __shared__ typename BlockReduce::TempStorage temp_storageMin;
+
+    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
+
     //-----
     int qmax = 255;
     int qmin = 0;
-    float absMax = max___(abs(maxTotal), abs(minTotal));
-    yScale[0] = absMax * 2 / (254 - qmin);
-    float intermediate_zero_point = 254 - absMax / yScale[0];
-    float _yZeroPoint = round(_saturate(intermediate_zero_point));
-    yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    __shared__ float _yZeroPoint;
+    if (threadIdx.x == 0) {
+        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
+        float intermediate_zero_point = qmin - blockMin / yScale[0];
+        _yZeroPoint = round(_saturate(intermediate_zero_point));
+        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    }
+    __syncthreads();
+
     if (i < size) {
         outputY[i] = static_cast<uint8_t>(
             _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
@@ -81,8 +84,8 @@ __launch_bounds__(BLOCK_DIM) __global__
                        float *yScale, uint8_t *yZeroPoint, int size) {
     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
 
-    float maxData = __FLT_MAX__;
-    float minData = -__FLT_MAX__;
+    float maxData = -__FLT_MAX__;
+    float minData = __FLT_MAX__;
     int remain = size % BLOCK_DIM;
     int step = (size - remain) / BLOCK_DIM + 1;
     float dataPerThread[numPerThread];
@@ -99,6 +102,7 @@ __launch_bounds__(BLOCK_DIM) __global__
             maxData = max___(maxData, dataPerThread[ind]);
         }
     }
+
    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            minData = min___(minData, dataPerThread[ind]);
@@ -108,25 +112,27 @@ __launch_bounds__(BLOCK_DIM) __global__
            minData = min___(minData, dataPerThread[ind]);
        }
    }
-    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    __shared__ float maxTotal;
-    float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
-    __shared__ float minTotal;
-    float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
-    if (threadIdx.x == 0) {
-        maxTotal = blockMax;
-        minTotal = blockMin;
-    }
-    __syncthreads();
+    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storageMax;
+
+    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
+
+    __shared__ typename BlockReduce::TempStorage temp_storageMin;
+
+    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
+
     //-----
     int qmax = 255;
     int qmin = 0;
-    float absMax = max___(abs(maxTotal), abs(minTotal));
-    yScale[0] = absMax * 2 / (254 - qmin);
-    float intermediate_zero_point = 254 - absMax / yScale[0];
-    float _yZeroPoint = round(_saturate(intermediate_zero_point));
-    yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    __shared__ float _yZeroPoint;
+    if (threadIdx.x == 0) {
+        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
+        float intermediate_zero_point = qmin - blockMin / yScale[0];
+        _yZeroPoint = round(_saturate(intermediate_zero_point));
+        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    }
+    __syncthreads();
+
     if (i < size) {
         outputY[i] = static_cast<uint8_t>(
             _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
diff --git a/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc b/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc
new file mode 100644
index 00000000..350de0f1
--- /dev/null
+++ b/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc
@@ -0,0 +1,74 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/dynamic_quantize_linear.h"
+
+#include "test.h"
+
+namespace infini {
+
+void test_dynamicquantizeLinearFp32(
+    const Shape &inputShape, const vector<float> &inputData,
+    const vector<uint8_t> &outputYData, const vector<float> &outputYScaleData,
+    const vector<uint8_t> &outputYZeroPointData) {
+
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor(inputShape, DataType::Float32);
+
+    gCpu->dataMalloc();
+
+    input->copyin(inputData);
+
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+
+    auto inputGpu = gCuda->cloneTensor(input);
+
+    auto op = gCuda->addOp<DynamicQuantizeLinearObj>(
+        inputGpu,
+        std::nullopt); // DynamicQuantizeLinear
+    gCuda->dataMalloc();
+
+    inputGpu->copyin(inputData);
+
+    cudaRuntime->run(gCuda);
+
+    EXPECT_EQ(op->getOutputs().size(), (size_t)3);
+    auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
+    auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
+    auto o2Cpu = gCpu->cloneTensor(op->getOutput(2));
+
+    EXPECT_TRUE(o0Cpu->equalData(outputYData));
+    EXPECT_TRUE(o1Cpu->equalData(outputYScaleData));
+    EXPECT_TRUE(o2Cpu->equalData(outputYZeroPointData));
+}
+
+TEST(CUDA_DynamicquantizeLinearFp32, run) {
+
+    test_dynamicquantizeLinearFp32(
+        Shape{2, 3, 2, 3},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+                      18., 19., 20., 21., 22., 23., 24., 25., 26.,
+                      27., 28., 29., 30., 31., 32., 33., 34., 35.},
+        vector<uint8_t>{0,   7,   15,  22,  29,  36,  44,  51,  58,
+                        66,  73,  80,  87,  95,  102, 109, 117, 124,
+                        131, 138, 146, 153, 160, 168, 175, 182, 189,
+                        197, 204, 211, 219, 226, 233, 240, 248, 255},
+        vector<float>{0.1372549}, vector<uint8_t>{0});
+    test_dynamicquantizeLinearFp32(
+        Shape{2, 3, 2, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        vector<uint8_t>{0,   11,  22,  33,  44,  55,  67,  78,
+                        89,  100, 111, 122, 133, 144, 155, 166,
+                        177, 188, 200, 211, 222, 233, 244, 255},
+        vector<float>{0.0901961}, vector<uint8_t>{0});
+
+} // python output
+
+} // namespace infini
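
Note on the new scale/zero-point math: the updated kernel computes yScale = (max(0, blockMax) - min(0, blockMin)) / (qmax - qmin) and yZeroPoint = round(saturate(qmin - blockMin / yScale)), replacing the earlier 254-based symmetric formula. The following is a minimal host-side sketch of that arithmetic, not part of the patch; the helper name referenceScaleZeroPoint and the main() driver are invented purely for illustration. Under that assumption it reproduces the expected values used in the test above: 35 / 255 = 0.1372549... and 23 / 255 = 0.0901961..., each with a zero point of 0.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side reference for the kernel's scale / zero-point arithmetic
// (illustrative sketch only, not repository code).
static void referenceScaleZeroPoint(float xMin, float xMax, float &scale,
                                    uint8_t &zeroPoint) {
    const float qmin = 0.0f, qmax = 255.0f;
    // Extend the range so it always contains 0, then spread it over [0, 255].
    scale = (std::max(0.0f, xMax) - std::min(0.0f, xMin)) / (qmax - qmin);
    // Saturate and round the zero point into the uint8 range,
    // mirroring _saturate() in the kernel.
    float zp = qmin - xMin / scale;
    zp = std::min(255.0f, std::max(0.0f, zp));
    zeroPoint = static_cast<uint8_t>(std::round(zp));
}

int main() {
    float scale;
    uint8_t zeroPoint;
    // First test case: input 0..35 -> scale = 35 / 255 = 0.1372549..., zero point 0.
    referenceScaleZeroPoint(0.0f, 35.0f, scale, zeroPoint);
    std::printf("scale = %.7f, zero point = %u\n", scale,
                static_cast<unsigned>(zeroPoint));
    // Second test case: input 0..23 -> scale = 23 / 255 = 0.0901961..., zero point 0.
    referenceScaleZeroPoint(0.0f, 23.0f, scale, zeroPoint);
    std::printf("scale = %.7f, zero point = %u\n", scale,
                static_cast<unsigned>(zeroPoint));
    return 0;
}

Including 0 in the range via max(0.0f, blockMax) and min(0.0f, blockMin) guarantees that a real value of 0.0f maps exactly onto an integer in [0, 255]; since both test inputs start at 0, their zero points come out as 0.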