feat: add dynamic quantize linear kernel

2023-12-19 14:40:50 +08:00 · 2023-12-19 14:40:50 +08:00 · 0e75f99e7e
parent 97e3377ca5
commit 0e75f99e7e
11 changed files with 177 additions and 9 deletions
--- a/include/cuda/cuda_dequantize_linear.h
+++ b/include/cuda/cuda_dequantize_linear.h
--- a/include/operators/dequantize_linear.h
+++ b/include/operators/dequantize_linear.h
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@ -6,7 +6,7 @@
 #include "operators/broadcast.h"
 #include "operators/concat.h"
 #include "operators/conv.h"
-#include "operators/dequantizeLinear.h"
+#include "operators/dequantize_linear.h"
 #include "operators/dynamic_quantize_linear.h"
 #include "operators/element_wise.h"
 #include "operators/expand.h"
--- a/src/kernels/cuda/dequantize_linear.cc
+++ b/src/kernels/cuda/dequantize_linear.cc
@ -1,5 +1,5 @@
-#include "operators/dequantizeLinear.h"
-#include "cuda/cuda_dequantizeLinear.h"
+#include "operators/dequantize_linear.h"
+#include "cuda/cuda_dequantize_linear.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_runtime.h"

--- a/src/kernels/cuda/dequantize_linear.cu
+++ b/src/kernels/cuda/dequantize_linear.cu
--- a/src/kernels/cuda/dynamic_quantize_linear.cc
+++ b/src/kernels/cuda/dynamic_quantize_linear.cc
--- a/src/kernels/cuda/dynamic_quantize_linear.cu
+++ b/src/kernels/cuda/dynamic_quantize_linear.cu
@ -0,0 +1,170 @@
+// #include "cuda/cuda_common.h"
+// #include <cub/cub.cuh>
+// template <int BLOCK_DIM>
+// __launch_bounds__(BLOCK_DIM) __global__
+//     void _dynamicQuantizeLinearKernel(float *input, float *outputY,
+//                                       uint8_t yScale, uint8_t yZeroPoint,
+//                                       int size) {
+//     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
+//     float maxData = __FLT_MAX__;
+//     float minData = -__FLT_MAX__;
+//     int remain = size % BLOCK_DIM;
+//     int step = (size - remain) / BLOCK_DIM + 1;
+
+//     if (threadIdx.x < remain) {
+//         for (int ind = 0; ind < step; ind++) {
+
+//             maxData = max(maxData, input[threadIdx.x * step + ind]);
+//         }
+//     } else {
+//         for (int ind = 0; ind < step - 1; ind++) {
+
+//             maxData =
+//                 max(maxData, input[remain * step +
+//                                    (threadIdx.x - remain) * (step - 1) +
+//                                    ind]);
+//         }
+//     }
+//     if (threadIdx.x < remain) {
+//         for (int ind = 0; ind < step; ind++) {
+
+//             minData = min(minData, input[threadIdx.x * step + ind]);
+//         }
+//     } else {
+//         for (int ind = 0; ind < step - 1; ind++) {
+
+//             minData =
+//                 min(minData, input[remain * step +
+//                                    (threadIdx.x - remain) * (step - 1) +
+//                                    ind]);
+//         }
+//     }
+//     typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+//     __shared__ typename BlockReduce::TempStorage temp_storage;
+//     __shared__ float maxTotal;
+//     float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
+
+//     __shared__ float minTotal;
+//     float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
+//     if (threadIdx.x == 0) {
+//         maxTotal = blockMax;
+//         minTotal = blockMin;
+//     }
+//     __syncthreads();
+//     int qmax = 255;
+//     int qmin = 0;
+//     yScale = (max(0, maxTotal) - min(0, minTotal)) / (qmax - qmin);
+//     intermediate_zero_point = qmin - minTotal / yScale;
+//     yZeroPoint = cast(round(saturate(itermediate_zero_point)));
+//     if (i < size) {
+//         outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint);
+//     }
+// }
+// //----------
+
+// template <int BLOCK_DIM, int numPerThread>
+// __launch_bounds__(BLOCK_DIM) __global__
+//     void _dynamicQuantizeLinearKernel(float *input, float *outputY,
+//                                       uint8_t yScale, uint8_t yZeroPoint,
+//                                       int size) {
+//     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
+//     float maxData = __FLT_MAX__;
+//     float minData = -__FLT_MAX__;
+//     int remain = size % BLOCK_DIM;
+//     int step = (size - remain) / BLOCK_DIM + 1;
+//     float dataPerThread[numPerThread];
+//     if (threadIdx.x < remain) {
+//         for (int ind = 0; ind < step; ind++) {
+//             dataPerThread[ind] = input[threadIdx.x * step + ind];
+//             maxData = max(maxData, dataPerThread[ind]);
+//         }
+//     } else {
+//         for (int ind = 0; ind < step - 1; ind++) {
+//             dataPerThread[ind] =
+//                 input[remain * step + (threadIdx.x - remain) * (step - 1) +
+//                       ind];
+//             maxData = max(maxData, dataPerThread[ind]);
+//         }
+//     }
+//     if (threadIdx.x < remain) {
+//         for (int ind = 0; ind < step; ind++) {
+
+//             minData = min(minData, dataPerThread[ind]);
+//         }
+//     } else {
+//         for (int ind = 0; ind < step - 1; ind++) {
+
+//             minData = min(minData, dataPerThread[ind]);
+//         }
+//     }
+//     typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+//     __shared__ typename BlockReduce::TempStorage temp_storage;
+//     __shared__ float maxTotal;
+//     float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
+
+//     __shared__ float minTotal;
+//     float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
+//     if (threadIdx.x == 0) {
+//         maxTotal = blockMax;
+//         minTotal = blockMin;
+//     }
+//     __syncthreads();
+//     int qmax = 255;
+//     int qmin = 0;
+//     yScale = (max(0.0, maxTotal) - min(0.0, minTotal)) / (qmax - qmin);
+//     intermediate_zero_point = qmin - minTotal / yScale;
+//     yZeroPoint = cast(round(saturate(itermediate_zero_point)));
+//     if (i < size) {
+//         outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint);
+//     }
+// }
+
+// namespace infini {
+// void dynamicQuantizeLinearKernel(float *input, float *outputY, uint8_t
+// yScale,
+//                                  uint8_t yZeroPoint, int size) {
+
+//     if (size > 1024 * 128) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else if (size > 1024 * 64) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 128><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else if (size > 1024 * 32) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 64><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else if (size > 1024 * 16) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 32><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else if (size > 1024 * 4) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 16><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else if (size > 1024) {
+
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 4><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     } else {
+//         int BLOCK_DIM = 1024;
+//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
+//         _dynamicQuantizeLinearKernel<1024, 1><<<num_blocks, BLOCK_DIM>>>(
+//             input, outputY, yScale, yZeroPoint, size);
+//     }
+// }
+// } // namespace infini
--- a/src/kernels/cuda/unary.cu
+++ b/src/kernels/cuda/unary.cu
@ -287,9 +287,7 @@ void unary_kernel(const Operator &_op) {
        } else {
            IT_TODO_HALT();
        }
-    }
-
-    else if (op->getOpType() == OpType::Erf) {
+    } else if (op->getOpType() == OpType::Erf) {
        if (_op->getDType() == DataType::Float32) {
            erf_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
--- a/src/operators/dequantize_linear.cc
+++ b/src/operators/dequantize_linear.cc
@ -1,4 +1,4 @@
-#include "operators/dequantizeLinear.h"
+#include "operators/dequantize_linear.h"
 #include "utils/operator_utils.h"

 namespace infini {
--- a/src/operators/dynamic_quantize_linear.cc
+++ b/src/operators/dynamic_quantize_linear.cc
@ -17,7 +17,7 @@ DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) {
 vector<DataType>
 DynamicQuantizeLinearObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 1);
-    return {inputs[1]->getDType()};
+    return {DataType(2), DataType(1), DataType(2)};
 }

 std::string DynamicQuantizeLinearObj::toString() const {
--- a/test/kernels/cuda/test_cuda_dequantize_linear.cc
+++ b/test/kernels/cuda/test_cuda_dequantize_linear.cc
@ -2,7 +2,7 @@
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
-#include "operators/dequantizeLinear.h"
+#include "operators/dequantize_linear.h"

 #include "test.h"