add half and float dequantizeLinear

2023-12-18 17:47:53 +08:00 · 2023-12-18 17:47:53 +08:00 · 9c82936386
parent 03ed8c4de7
commit 9c82936386
10 changed files with 693 additions and 45 deletions
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -85,6 +85,8 @@ class GraphHandlerObj {
    Tensor cast(Tensor input, Tensor output, int to);
    Tensor expand(Tensor input, Tensor output, Shape dims);
    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
+    Tensor dequantizeLinear(Tensor inputX, Tensor inputScale, Tensor output,
+                            Tensor inputZeroPoint, int axis);
    std::vector<int> getDims(Tensor x) { return x->getDims(); }

    Tensor allReduceSum(Tensor input, Tensor output);
@ -102,9 +104,6 @@ class GraphHandlerObj {
    TensorVec dynamicQuantizeLinear(Tensor input,
                                    std::optional<TensorVec> outputs);

-    Tensor dequantizeLinear(Tensor input, Tensor scale, Tensor zero_point,
-                            Tensor output, int axis);
-
    //------ modifiers

    inline bool topo_sort() { return g->topo_sort(); }
--- a/include/cuda/cuda_dequantizeLinear.h
+++ b/include/cuda/cuda_dequantizeLinear.h
@ -0,0 +1,17 @@
+#pragma once
+#include "operators/unary.h"
+
+namespace infini {
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const int size);
+}; // namespace infini
--- a/include/operators/dequantize_linear.h
+++ b/include/operators/dequantize_linear.h
@ -3,9 +3,8 @@

 namespace infini {
 /**
- * @brief The linear dequantization operator.
- * It consumes a quantized tensor, a scale, and a zero point to compute
- * the full precision tensor.
+ * @brief y = (x - x_zero_point) *x_scale
+ *
 */
 class DequantizeLinearObj : public OperatorObj {
    int axis;
@ -15,26 +14,30 @@ class DequantizeLinearObj : public OperatorObj {
     * @brief Construct a new DequantizeLinear object.
     *
     * @param graph The computation graph that this operator belongs to.
-     * @param input The input tensor.
-     * @param scale Scale for input.
-     * @param zero_point Zero point for input.
-     * @param outputs The output tensors.
-     * @param axis The axis of the dequantizing dimension of the input tensor.
+     * @param inputX The input tensor X.
+     * @param inputScale The input tensor x_scale.
+     * @param output The output tensor.
+     * @param inputZeroPoint The z_zero_point.
     */
-    DequantizeLinearObj(GraphObj *graph, Tensor input, Tensor scale,
-                        Tensor zero_pointr, Tensor output, int axis);
+    DequantizeLinearObj(GraphObj *graph, Tensor inputX, Tensor inputScale,
+                        Tensor output, Tensor inputZeroPoint = nullptr,
+                        int axis = 1);
    OP_CLONE(DequantizeLinearObj);

    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
+
+    Tensor getZeroPoint() const {
+        return inputs.size() > 2 ? inputs[2] : nullptr;
+    }
    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return 1; }
+    int getAxis() const { return axis; }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
-
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
 };

--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@ -866,19 +866,23 @@ class OnnxStub:
                    ):
                        tensors[name] = tensor
                elif node.op_type == "DequantizeLinear":
-                    attributes = _parse_attribute(
-                        node,
-                        {
-                            "axis": 1,
-                        },
+                    (inputX, inputScale) = (tensors[node.input[i]] for i in [0, 1])
+                    inputZeroPoint = (
+                        None if len(node.input) < 3 else tensors[node.input[2]]
+                    )
+                    output = tensors.get(node.output[0])
+                    axis = next(
+                        (attr.i for attr in node.attribute if attr.name == "axis"),
+                        0,
                    )
-                    axis = attributes["axis"]
                    tensors[node.output[0]] = self.handler.dequantizeLinear(
-                        tensor[node.input[0]],
-                        tensor[node.input[1]],
-                        tensor[node.input[2]] if len(node.input) > 2 else None,
+                        inputX,
+                        inputScale,
+                        output,
+                        inputZeroPoint,
                        axis,
                    )
+
                else:
                    raise Exception('Unsupported operator "{}"'.format(node.op_type))
                new_node_name.append(node.name)
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@ -6,7 +6,7 @@
 #include "operators/broadcast.h"
 #include "operators/concat.h"
 #include "operators/conv.h"
-#include "operators/dequantize_linear.h"
+#include "operators/dequantizeLinear.h"
 #include "operators/dynamic_quantize_linear.h"
 #include "operators/element_wise.h"
 #include "operators/expand.h"
@ -521,18 +521,19 @@ GraphHandlerObj::dynamicQuantizeLinear(Tensor input,
    }
 }

-Tensor GraphHandlerObj::dequantizeLinear(Tensor input, Tensor scale,
-                                         Tensor zero_point, Tensor output,
+Tensor GraphHandlerObj::dequantizeLinear(Tensor inputX, Tensor inputScale,
+                                         Tensor output, Tensor inputZeroPoint,
                                         int axis) {
    if (output) {
        g->addOpWithOutputs<DequantizeLinearObj>(
-            std::move(input), std::move(scale), std::move(zero_point), output,
-            axis);
+            std::move(inputX), std::move(inputScale), output,
+            std::move(inputZeroPoint), axis);
        return output;
    } else {
        return g
-            ->addOp<DequantizeLinearObj>(std::move(input), std::move(scale),
-                                         std::move(zero_point), output, axis)
+            ->addOp<DequantizeLinearObj>(std::move(inputX),
+                                         std::move(inputScale), output,
+                                         std::move(inputZeroPoint), axis)
            ->getOutput();
    }
 }
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@ -517,6 +517,7 @@ void init_graph_builder(py::module &m) {
        .def("expand", &Handler::expand, policy::move)
        .def("erf", &Handler::erf, policy::move)
        .def("where", &Handler::where, policy::move)
+        .def("dequantizeLinear", &Handler::dequantizeLinear, policy::move)
        .def("topo_sort", &Handler::topo_sort, policy::automatic)
        .def("optimize", &Handler::optimize, policy::automatic)
        .def("operators", &Handler::operators, policy::move)
--- a/src/kernels/cuda/dequantizeLinear.cc
+++ b/src/kernels/cuda/dequantizeLinear.cc
@ -0,0 +1,57 @@
+#include "operators/dequantizeLinear.h"
+#include "cuda/cuda_dequantizeLinear.h"
+#include "cuda/cuda_kernel_wihtout_config.h"
+#include "cuda/cuda_runtime.h"
+
+namespace infini {
+
+class DequantizeLinearCuda : public CudaKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<DequantizeLinearObj>(_op);
+
+        void *const inputX = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const inputScale = (op->getInputs(1)->getRawDataPtr<void *>());
+
+        void *const output = (op->getOutput()->getRawDataPtr<void *>());
+
+        const int axis = op->getAxis();
+        const int stride = op->getInputs(0)->getStride().at(axis);
+
+        auto dims = op->getInputs(0)->getDims();
+        int dimsize = dims[op->getAxis()];
+        int size = op->getOutput()->size();
+
+        if (op->getInputs(1)->getDType() == DataType::Float32) {
+
+            if (op->numInputs() == 3) {
+                void *const inputZeroPoint =
+                    (op->getInputs(2)->getRawDataPtr<void *>());
+
+                DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
+                                       (float *)output, dimsize, stride,
+                                       (uint8_t *)inputZeroPoint, size);
+            } else {
+                DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
+                                       (float *)output, dimsize, stride, size);
+            }
+        } else if (op->getInputs(1)->getDType() == DataType::Float16) {
+            if (op->numInputs() == 3) {
+                void *const inputZeroPoint =
+                    (op->getInputs(2)->getRawDataPtr<void *>());
+
+                DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
+                                       (half *)output, dimsize, stride,
+                                       (uint8_t *)inputZeroPoint, size);
+            } else {
+                DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
+                                       (half *)output, dimsize, stride, size);
+            }
+        }
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::DequantizeLinear, DequantizeLinearCuda,
+                "DequantizeLinear_CUDA");
+
+}; // namespace infini
--- a/src/kernels/cuda/dequantizeLinear.cu
+++ b/src/kernels/cuda/dequantizeLinear.cu
@ -0,0 +1,345 @@
+#include "cuda/cuda_common.h"
+#include <cub/cub.cuh>
+
+template <typename T, int BLOCK_DIM>
+__launch_bounds__(BLOCK_DIM) __global__
+    void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
+                                     T *output, const int dimsize,
+                                     const int stride,
+                                     const uint8_t *inputZeroPoint) {
+    // len(scale) = len(bias) = dimsize
+    int tmp = blockIdx.x % stride;
+    int tid = (blockIdx.x - tmp) * dimsize + tmp;
+    int remain = dimsize % BLOCK_DIM;
+    int step = (dimsize - remain) / BLOCK_DIM + 1;
+    if (threadIdx.x < remain) {
+        for (int ind = 0; ind < step; ind++) {
+            output[tid + (threadIdx.x * step + ind) * stride] =
+                static_cast<T>(
+                    inputX[tid + (threadIdx.x * step + ind) * stride] -
+                    inputZeroPoint[threadIdx.x * step + ind]) *
+                inputScale[threadIdx.x * step + ind];
+        }
+    } else {
+        for (int ind = 0; ind < step - 1; ind++) {
+            output[tid +
+                   (remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
+                       stride] =
+                static_cast<T>(
+                    inputX[tid + (remain * step +
+                                  (threadIdx.x - remain) * (step - 1) + ind) *
+                                     stride] -
+                    inputZeroPoint[remain * step +
+                                   (threadIdx.x - remain) * (step - 1) + ind]) *
+                inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
+                           ind];
+        }
+    }
+}
+template <typename T, int BLOCK_DIM>
+__launch_bounds__(BLOCK_DIM) __global__
+    void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
+                                     T *output, const int dimsize,
+                                     const int stride) {
+    // len(scale) = len(bias) = dimsize
+    int tmp = blockIdx.x % stride;
+    int tid = (blockIdx.x - tmp) * dimsize + tmp;
+    int remain = dimsize % BLOCK_DIM;
+    int step = (dimsize - remain) / BLOCK_DIM + 1;
+    if (threadIdx.x < remain) {
+        for (int ind = 0; ind < step; ind++) {
+            output[tid + (threadIdx.x * step + ind) * stride] =
+                static_cast<T>(
+                    inputX[tid + (threadIdx.x * step + ind) * stride]) *
+                inputScale[threadIdx.x * step + ind];
+        }
+    } else {
+        for (int ind = 0; ind < step - 1; ind++) {
+            output[tid +
+                   (remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
+                       stride] =
+                static_cast<T>(
+                    inputX[tid + (remain * step +
+                                  (threadIdx.x - remain) * (step - 1) + ind) *
+                                     stride]) *
+                inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
+                           ind];
+        }
+    }
+}
+
+template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
+__global__ void
+warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
+                           T *output, const int dimsize, const int otherSize,
+                           const int stride, const uint8_t *inputZeroPoint) {
+    int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
+
+    int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
+    int remain = dimsize % BLOCK_DIM_x;
+    int step = (dimsize - remain) / BLOCK_DIM_x + 1;
+    if (otherIdx < otherSize) {
+        if (threadIdx.x < remain) {
+            for (int ind = 0; ind < step; ind++) {
+                output[tid + (threadIdx.x * step + ind) * stride] =
+                    static_cast<T>(
+                        inputX[tid + (threadIdx.x * step + ind) * stride] -
+                        inputZeroPoint[threadIdx.x * step + ind]) *
+                    inputScale[threadIdx.x * step + ind];
+            }
+        } else {
+            for (int ind = 0; ind < step - 1; ind++) {
+                output[tid + (remain * step +
+                              (threadIdx.x - remain) * (step - 1) + ind) *
+                                 stride] =
+                    static_cast<T>(
+                        inputX[tid +
+                               (remain * step +
+                                (threadIdx.x - remain) * (step - 1) + ind) *
+                                   stride] -
+                        inputZeroPoint[remain * step +
+                                       (threadIdx.x - remain) * (step - 1) +
+                                       ind]) *
+                    inputScale[remain * step +
+                               (threadIdx.x - remain) * (step - 1) + ind];
+            }
+        }
+    }
+}
+template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
+__global__ void
+warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
+                           T *output, const int dimsize, const int otherSize,
+                           const int stride) {
+    int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
+
+    int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
+    int remain = dimsize % BLOCK_DIM_x;
+    int step = (dimsize - remain) / BLOCK_DIM_x + 1;
+    if (otherIdx < otherSize) {
+        if (threadIdx.x < remain) {
+            for (int ind = 0; ind < step; ind++) {
+                output[tid + (threadIdx.x * step + ind) * stride] =
+                    static_cast<T>(
+                        inputX[tid + (threadIdx.x * step + ind) * stride]) *
+                    inputScale[threadIdx.x * step + ind];
+            }
+        } else {
+            for (int ind = 0; ind < step - 1; ind++) {
+                output[tid + (remain * step +
+                              (threadIdx.x - remain) * (step - 1) + ind) *
+                                 stride] =
+                    static_cast<T>(
+                        inputX[tid +
+                               (remain * step +
+                                (threadIdx.x - remain) * (step - 1) + ind) *
+                                   stride]) *
+                    inputScale[remain * step +
+                               (threadIdx.x - remain) * (step - 1) + ind];
+            }
+        }
+    }
+}
+namespace infini {
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size) {
+
+    int num_block = size / dimsize;
+    if (dimsize > 1024) {
+        int BLOCK_DIM = 1024;
+
+        blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
+            inputX, inputScale, output, dimsize, stride, inputZeroPoint);
+    } else if (dimsize > 31) {
+        int BLOCK_DIM_x = 32;
+        int BLOCK_DIM_y = 32;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 32, 32>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else if (dimsize > 15) {
+        int BLOCK_DIM_x = 16;
+        int BLOCK_DIM_y = 64;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 16, 64>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else if (dimsize > 7) {
+        int BLOCK_DIM_x = 8;
+        int BLOCK_DIM_y = 128;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 8, 128>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else {
+        int BLOCK_DIM_x = 4;
+        int BLOCK_DIM_y = 256;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 4, 256>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    }
+}
+
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const int size) {
+    int num_block = size / dimsize;
+    if (dimsize > 1024) {
+        int BLOCK_DIM = 1024;
+
+        blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
+            inputX, inputScale, output, dimsize, stride);
+    } else if (dimsize > 31) {
+        int BLOCK_DIM_x = 32;
+        int BLOCK_DIM_y = 32;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 32, 32><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else if (dimsize > 15) {
+        int BLOCK_DIM_x = 16;
+        int BLOCK_DIM_y = 64;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 16, 64><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else if (dimsize > 7) {
+        int BLOCK_DIM_x = 8;
+        int BLOCK_DIM_y = 128;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 8, 128><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else {
+        int BLOCK_DIM_x = 4;
+        int BLOCK_DIM_y = 256;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<float, 4, 256><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    }
+}
+//-------------
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size) {
+    int num_block = size / dimsize;
+    if (dimsize > 1024) {
+        int BLOCK_DIM = 1024;
+
+        blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
+            inputX, inputScale, output, dimsize, stride, inputZeroPoint);
+    } else if (dimsize > 31) {
+        int BLOCK_DIM_x = 32;
+        int BLOCK_DIM_y = 32;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 32, 32>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else if (dimsize > 15) {
+        int BLOCK_DIM_x = 16;
+        int BLOCK_DIM_y = 64;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 16, 64>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else if (dimsize > 7) {
+        int BLOCK_DIM_x = 8;
+        int BLOCK_DIM_y = 128;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 8, 128>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    } else {
+        int BLOCK_DIM_x = 4;
+        int BLOCK_DIM_y = 256;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 4, 256>
+            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
+                                      num_block, stride, inputZeroPoint);
+    }
+}
+
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const int size) {
+    int num_block = size / dimsize;
+    if (dimsize > 1024) {
+        int BLOCK_DIM = 1024;
+
+        blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
+            inputX, inputScale, output, dimsize, stride);
+    } else if (dimsize > 31) {
+        int BLOCK_DIM_x = 32;
+        int BLOCK_DIM_y = 32;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 32, 32><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else if (dimsize > 15) {
+        int BLOCK_DIM_x = 16;
+        int BLOCK_DIM_y = 64;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 16, 64><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else if (dimsize > 7) {
+        int BLOCK_DIM_x = 8;
+        int BLOCK_DIM_y = 128;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 8, 128><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    } else {
+        int BLOCK_DIM_x = 4;
+        int BLOCK_DIM_y = 256;
+        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
+        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
+        dim3 grid_dim(num_block_x, 1, 1);
+
+        warpDequantizeLinearKernel<half, 4, 256><<<grid_dim, block_dim>>>(
+            inputX, inputScale, output, dimsize, num_block, stride);
+    }
+}
+
+} // namespace infini
--- a/src/operators/dequantize_linear.cc
+++ b/src/operators/dequantize_linear.cc
@ -1,13 +1,15 @@
-#include "operators/dequantize_linear.h"
+#include "operators/dequantizeLinear.h"
 #include "utils/operator_utils.h"

 namespace infini {
-DequantizeLinearObj::DequantizeLinearObj(GraphObj *graph, Tensor input,
-                                         Tensor scale, Tensor zero_point,
-                                         Tensor output, int axis)
+
+DequantizeLinearObj::DequantizeLinearObj(GraphObj *graph, Tensor inputX,
+                                         Tensor inputScale, Tensor output,
+                                         [[maybe_unused]] Tensor inputZeroPoint,
+                                         int axis)
    : OperatorObj(OpType::DequantizeLinear,
-                  zero_point ? TensorVec{input, scale, zero_point}
-                             : TensorVec{input, scale},
+                  inputZeroPoint ? TensorVec{inputX, inputScale, inputZeroPoint}
+                                 : TensorVec{inputX, inputScale},
                  {output}),
      axis(axis) {
    IT_ASSERT(checkValid(graph));
@ -15,13 +17,14 @@ DequantizeLinearObj::DequantizeLinearObj(GraphObj *graph, Tensor input,

 optional<vector<Shape>>
 DequantizeLinearObj::inferShape(const TensorVec &inputs) {
-    return {{inputs[0]->getDims()}};
+    return {{inputs[0]->getDims()}}; // x.shape = output.shape = inputs[0].shape
 }
-
 vector<DataType>
 DequantizeLinearObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 2 || inputs.size() == 3);
-    return {inputs[1]->getDType()};
+
+    return {
+        inputs[1]->getDType()}; // scale.dtype = output.dtype = inputs[1].dtype
 }

 std::string DequantizeLinearObj::toString() const {
@ -29,12 +32,11 @@ std::string DequantizeLinearObj::toString() const {
    os << "DequantizeLinear[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
-    os << "input=" << inputs[0]->getGuid() << ",";
-    os << "scale=" << inputs[1]->getGuid() << ",";
+    os << "inputX=" << inputs[0]->getGuid() << ",";
+    os << "inputScale=" << inputs[1]->getGuid() << ",";
+    // os << "inputZeroPoint=" << inputs[2]->getGuid() << ",";
    os << "axis=" << axis << ",";
-    os << "output=";
-    for (auto output : outputs)
-        os << output->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
 }

@ -45,7 +47,7 @@ vector<int> DequantizeLinearObj::getWorkloadVector() const {
 }

 vector<int> DequantizeLinearObj::getOpAttrVector() const {
-    return {type.underlying()};
+    return {type.underlying(), axis};
 }

 } // namespace infini
--- a/test/kernels/cuda/test_cuda_dequantizeLinear.cc
+++ b/test/kernels/cuda/test_cuda_dequantizeLinear.cc
@ -0,0 +1,219 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/dequantizeLinear.h"
+
+#include "test.h"
+
+namespace infini {
+
+void test_dequantizeLinearFp32(
+    const Shape &inputXShape, const vector<uint8_t> &inputXData,
+    const Shape &inputScaleShape, const vector<float> &inputScaleData, int axis,
+    const vector<float> &ExpectData,
+    const std::optional<Shape> &zeroPointShape = std::nullopt,
+    const std::optional<std::vector<uint8_t>> &inputZeroPointData =
+        std::nullopt) {
+
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
+        Shape inputZeroPointShape = *zeroPointShape;
+
+        auto inputZeroPoint =
+            gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
+        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
+        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
+        gCpu->dataMalloc();
+        inputZeroPoint->copyin(*inputZeroPointData); //
+        inputX->copyin(inputXData);
+        inputScale->copyin(inputScaleData); //
+
+        // inputX->printData();
+        // inputZeroPoint->printData();
+        // inputScale->printData();
+
+        auto cudaRuntime = make_ref<CudaRuntimeObj>();
+        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+        auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
+        auto inputXGpu = gCuda->cloneTensor(inputX);
+        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
+
+        auto op = gCuda->addOp<DequantizeLinearObj>(
+            inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
+            axis); // DequantizeLinearObj
+        gCuda->dataMalloc();
+        inputZeroPointGpu->copyin(*inputZeroPointData);
+        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
+        inputXGpu->copyin(inputXData);
+        inputScaleGpu->copyin(inputScaleData);
+        cudaRuntime->run(gCuda);
+
+        auto oCpu =
+            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
+        oCpu->printData();                      //->printData
+        EXPECT_TRUE(oCpu->equalData(ExpectData));
+    } else {
+
+        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
+        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
+        gCpu->dataMalloc();
+
+        inputX->copyin(inputXData);
+        inputScale->copyin(inputScaleData); //
+        auto cudaRuntime = make_ref<CudaRuntimeObj>();
+        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+
+        auto inputXGpu = gCuda->cloneTensor(inputX);
+        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
+        auto op = gCuda->addOp<DequantizeLinearObj>(
+            inputXGpu, inputScaleGpu, nullptr, nullptr,
+            axis); // DequantizeLinearObj
+        gCuda->dataMalloc();
+
+        inputXGpu->copyin(inputXData);
+        inputScaleGpu->copyin(inputScaleData);
+        cudaRuntime->run(gCuda);
+
+        auto oCpu =
+            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
+        oCpu->printData();                      //->printData
+        EXPECT_TRUE(oCpu->equalData(ExpectData));
+    }
+}
+void test_dequantizeLinearFp16(
+    const Shape &inputXShape, const vector<uint8_t> &inputXData,
+    const Shape &inputScaleShape,
+    const std::function<void(void *, size_t, DataType)> &generator, int axis,
+    const vector<float> &ExpectData,
+    const std::optional<Shape> &zeroPointShape = std::nullopt,
+    const std::optional<std::vector<uint8_t>> &inputZeroPointData =
+        std::nullopt) {
+
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
+        Shape inputZeroPointShape = *zeroPointShape;
+
+        auto inputZeroPoint =
+            gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
+        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
+        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
+        gCpu->dataMalloc();
+        inputZeroPoint->copyin(*inputZeroPointData); //
+        // inputZeroPoint->printData();
+        inputX->copyin(inputXData);
+        inputScale->setData(generator);
+        auto cudaRuntime = make_ref<CudaRuntimeObj>();
+        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+        auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
+        auto inputXGpu = gCuda->cloneTensor(inputX);
+        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
+        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
+        auto op = gCuda->addOp<DequantizeLinearObj>(
+            inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
+            axis); // DequantizeLinearObj
+        gCuda->dataMalloc();
+        inputZeroPointGpu->copyin(*inputZeroPointData);
+        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
+        inputXGpu->copyin(inputXData);
+        inputScaleGpu->setData(generator);
+        cudaRuntime->run(gCuda);
+
+        auto oCpu =
+            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
+        oCpu->printData();                      //->printData
+        EXPECT_TRUE(oCpu->equalData(ExpectData));
+    } else {
+
+        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
+        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
+        gCpu->dataMalloc();
+
+        inputX->copyin(inputXData);
+        inputScale->setData(generator);
+        auto cudaRuntime = make_ref<CudaRuntimeObj>();
+        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+
+        auto inputXGpu = gCuda->cloneTensor(inputX);
+        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
+        auto op = gCuda->addOp<DequantizeLinearObj>(
+            inputXGpu, inputScaleGpu, nullptr, nullptr,
+            axis); // DequantizeLinearObj
+        gCuda->dataMalloc();
+
+        inputXGpu->copyin(inputXData);
+        inputScaleGpu->setData(generator);
+        cudaRuntime->run(gCuda);
+
+        auto oCpu =
+            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
+        oCpu->printData();                      //->printData
+        EXPECT_TRUE(oCpu->equalData(ExpectData));
+    }
+}
+
+TEST(CUDA_DequantizeLinearFp32, run) {
+
+    test_dequantizeLinearFp32(
+        Shape{2, 3, 2, 3},
+        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+        Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
+        vector<float>{-0.3000000, 0.0000000,  0.3000000,  0.6000000,
+                      0.9000000,  1.2000000,  0.8000000,  1.0000000,
+                      1.2000000,  1.4000000,  1.6000000,  1.8000001,
+                      4.5000000,  5.0000000,  5.5000000,  6.0000000,
+                      6.5000000,  7.0000000,  5.1000004,  5.4000001,
+                      5.7000003,  6.0000000,  6.3000002,  6.6000004,
+                      4.4000001,  4.5999999,  4.8000002,  5.0000000,
+                      5.2000003,  5.4000001,  13.5000000, 14.0000000,
+                      14.5000000, 15.0000000, 15.5000000, 16.0000000},
+        Shape{3}, vector<uint8_t>{1, 2, 3});
+    test_dequantizeLinearFp32(
+        Shape{2, 3, 2, 3},
+        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+        Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
+        vector<float>{0.0000000,  0.3000000,  0.6000000,  0.9000000,
+                      1.2000000,  1.5000000,  1.2000000,  1.4000000,
+                      1.6000000,  1.8000001,  2.0000000,  2.2000000,
+                      6.0000000,  6.5000000,  7.0000000,  7.5000000,
+                      8.0000000,  8.5000000,  5.4000001,  5.7000003,
+                      6.0000000,  6.3000002,  6.6000004,  6.9000001,
+                      4.8000002,  5.0000000,  5.2000003,  5.4000001,
+                      5.5999999,  5.8000002,  15.0000000, 15.5000000,
+                      16.0000000, 16.5000000, 17.0000000, 17.5000000});
+
+} // python output
+TEST(CUDA_DequantizeLinearFp16, run) {
+    test_dequantizeLinearFp16(
+        Shape{2, 3, 2, 3},
+        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+        Shape{3}, ValGenerator<2>(), 1,
+        vector<float>{-2., 0.,  2.,  4.,  6.,  8.,  8.,  10., 12.,
+                      14., 16., 18., 18., 20., 22., 24., 26., 28.,
+                      34., 36., 38., 40., 42., 44., 44., 46., 48.,
+                      50., 52., 54., 54., 56., 58., 60., 62., 64.},
+        Shape{3}, vector<uint8_t>{1, 2, 3});
+    test_dequantizeLinearFp16(
+        Shape{2, 3, 2, 3},
+        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+        Shape{3}, ValGenerator<2>(), 1,
+        vector<float>{0.,  2.,  4.,  6.,  8.,  10., 12., 14., 16.,
+                      18., 20., 22., 24., 26., 28., 30., 32., 34.,
+                      36., 38., 40., 42., 44., 46., 48., 50., 52.,
+                      54., 56., 58., 60., 62., 64., 66., 68., 70.});
+
+} // python output
+
+} // namespace infini