feat: add dynamic quantize linear kernel

This commit is contained in:
kilinchange 2023-12-19 14:40:50 +08:00
parent 97e3377ca5
commit 0e75f99e7e
11 changed files with 177 additions and 9 deletions

View File

@ -6,7 +6,7 @@
#include "operators/broadcast.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/dequantizeLinear.h"
#include "operators/dequantize_linear.h"
#include "operators/dynamic_quantize_linear.h"
#include "operators/element_wise.h"
#include "operators/expand.h"

View File

@ -1,5 +1,5 @@
#include "operators/dequantizeLinear.h"
#include "cuda/cuda_dequantizeLinear.h"
#include "operators/dequantize_linear.h"
#include "cuda/cuda_dequantize_linear.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

View File

@ -0,0 +1,170 @@
// NOTE(review): entire file is a commented-out WIP draft of the ONNX
// DynamicQuantizeLinear kernel (two variants + host dispatcher). Before
// enabling, the blockers flagged inline below must be fixed; as written
// this would not compile, and the reduction logic has inverted init values.
// #include "cuda/cuda_common.h"
// #include <cub/cub.cuh>
// template <int BLOCK_DIM>
// __launch_bounds__(BLOCK_DIM) __global__
// FIXME(review): yScale / yZeroPoint are outputs but are passed BY VALUE, so
// the assignments near the bottom are lost. They should be pointers, and per
// the ONNX spec y_scale is a float (not uint8_t); outputY is likely meant to
// be uint8_t as well — confirm intended output dtypes.
//     void _dynamicQuantizeLinearKernel(float *input, float *outputY,
//                                       uint8_t yScale, uint8_t yZeroPoint,
//                                       int size) {
//     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
// FIXME(review): initial values are swapped — a running max must start at the
// LOWEST representable value and a running min at the HIGHEST, i.e.
// maxData = -__FLT_MAX__ and minData = __FLT_MAX__. As written, maxData can
// never drop below +FLT_MAX and minData never rise above -FLT_MAX.
//     float maxData = __FLT_MAX__;
//     float minData = -__FLT_MAX__;
//     int remain = size % BLOCK_DIM;
//     int step = (size - remain) / BLOCK_DIM + 1;
// NOTE(review): the partition below uses only threadIdx.x, so every BLOCK
// scans the whole [0, size) range; with num_blocks > 1 each block redundantly
// recomputes the same global min/max. Correct but wasteful — presumably a
// single reduction pass (or grid reduction) was intended; verify.
//     if (threadIdx.x < remain) {
//         for (int ind = 0; ind < step; ind++) {
//             maxData = max(maxData, input[threadIdx.x * step + ind]);
//         }
//     } else {
//         for (int ind = 0; ind < step - 1; ind++) {
//             maxData =
//                 max(maxData, input[remain * step +
//                                    (threadIdx.x - remain) * (step - 1) +
//                                    ind]);
//         }
//     }
//     if (threadIdx.x < remain) {
//         for (int ind = 0; ind < step; ind++) {
//             minData = min(minData, input[threadIdx.x * step + ind]);
//         }
//     } else {
//         for (int ind = 0; ind < step - 1; ind++) {
//             minData =
//                 min(minData, input[remain * step +
//                                    (threadIdx.x - remain) * (step - 1) +
//                                    ind]);
//         }
//     }
//     typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
//     __shared__ typename BlockReduce::TempStorage temp_storage;
//     __shared__ float maxTotal;
//     float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
//     __shared__ float minTotal;
// FIXME(review): CUB requires a __syncthreads() before temp_storage is reused
// for a second reduction — missing between the Max and Min reductions here.
//     float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
//     if (threadIdx.x == 0) {
//         maxTotal = blockMax;
//         minTotal = blockMin;
//     }
//     __syncthreads();
//     int qmax = 255;
//     int qmin = 0;
// FIXME(review): the three lines below are ONNX-spec pseudo-code, not CUDA:
// intermediate_zero_point is never declared (and is misspelled
// "itermediate_zero_point" on the next line), and saturate()/cast() do not
// exist — needs fminf/fmaxf clamping to [qmin, qmax] and an explicit
// conversion. Also (qmax - qmin) is integer division context; scale math
// should be done in float.
//     yScale = (max(0, maxTotal) - min(0, minTotal)) / (qmax - qmin);
//     intermediate_zero_point = qmin - minTotal / yScale;
//     yZeroPoint = cast(round(saturate(itermediate_zero_point)));
//     if (i < size) {
//         outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint);
//     }
// }
// //----------
// NOTE(review): second variant caches each thread's slice in registers
// (dataPerThread) to avoid re-reading global memory for the min pass. Same
// defects as above apply (swapped init values, by-value outputs, missing
// __syncthreads() between reductions, pseudo-code epilogue).
// template <int BLOCK_DIM, int numPerThread>
// __launch_bounds__(BLOCK_DIM) __global__
//     void _dynamicQuantizeLinearKernel(float *input, float *outputY,
//                                       uint8_t yScale, uint8_t yZeroPoint,
//                                       int size) {
//     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
// FIXME(review): swapped — should be maxData = -__FLT_MAX__,
// minData = __FLT_MAX__ (see note on the first variant).
//     float maxData = __FLT_MAX__;
//     float minData = -__FLT_MAX__;
//     int remain = size % BLOCK_DIM;
//     int step = (size - remain) / BLOCK_DIM + 1;
// NOTE(review): correctness requires step <= numPerThread; the dispatcher's
// size buckets appear to guarantee this, but there is no static_assert or
// runtime check — worth adding one before enabling.
//     float dataPerThread[numPerThread];
//     if (threadIdx.x < remain) {
//         for (int ind = 0; ind < step; ind++) {
//             dataPerThread[ind] = input[threadIdx.x * step + ind];
//             maxData = max(maxData, dataPerThread[ind]);
//         }
//     } else {
//         for (int ind = 0; ind < step - 1; ind++) {
//             dataPerThread[ind] =
//                 input[remain * step + (threadIdx.x - remain) * (step - 1) +
//                       ind];
//             maxData = max(maxData, dataPerThread[ind]);
//         }
//     }
//     if (threadIdx.x < remain) {
//         for (int ind = 0; ind < step; ind++) {
//             minData = min(minData, dataPerThread[ind]);
//         }
//     } else {
//         for (int ind = 0; ind < step - 1; ind++) {
//             minData = min(minData, dataPerThread[ind]);
//         }
//     }
//     typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
//     __shared__ typename BlockReduce::TempStorage temp_storage;
//     __shared__ float maxTotal;
//     float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
//     __shared__ float minTotal;
// FIXME(review): missing __syncthreads() before reusing temp_storage (CUB
// BlockReduce requirement).
//     float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
//     if (threadIdx.x == 0) {
//         maxTotal = blockMax;
//         minTotal = blockMin;
//     }
//     __syncthreads();
//     int qmax = 255;
//     int qmin = 0;
// FIXME(review): same pseudo-code epilogue as the first variant —
// intermediate_zero_point undeclared (and misspelled "itermediate_..."),
// saturate()/cast() undefined, and writes to by-value yScale/yZeroPoint
// are discarded.
//     yScale = (max(0.0, maxTotal) - min(0.0, minTotal)) / (qmax - qmin);
//     intermediate_zero_point = qmin - minTotal / yScale;
//     yZeroPoint = cast(round(saturate(itermediate_zero_point)));
//     if (i < size) {
//         outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint);
//     }
// }
// NOTE(review): host dispatcher — picks numPerThread by size bucket so each
// thread's register slice fits dataPerThread. The first branch instantiates
// the single-parameter variant (<1024>), i.e. unbounded sizes fall back to
// the register-free kernel. The repeated BLOCK_DIM/num_blocks computation in
// every branch could be hoisted above the if-chain when this is revived.
// namespace infini {
// void dynamicQuantizeLinearKernel(float *input, float *outputY, uint8_t
// yScale,
//                                  uint8_t yZeroPoint, int size) {
//     if (size > 1024 * 128) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else if (size > 1024 * 64) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 128><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else if (size > 1024 * 32) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 64><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else if (size > 1024 * 16) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 32><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else if (size > 1024 * 4) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 16><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else if (size > 1024) {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 4><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     } else {
//         int BLOCK_DIM = 1024;
//         int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
//         _dynamicQuantizeLinearKernel<1024, 1><<<num_blocks, BLOCK_DIM>>>(
//             input, outputY, yScale, yZeroPoint, size);
//     }
// }
// } // namespace infini

View File

@ -287,9 +287,7 @@ void unary_kernel(const Operator &_op) {
} else {
IT_TODO_HALT();
}
}
else if (op->getOpType() == OpType::Erf) {
} else if (op->getOpType() == OpType::Erf) {
if (_op->getDType() == DataType::Float32) {
erf_kernel<float>((float *)inputData, (float *)outputData, num);
} else {

View File

@ -1,4 +1,4 @@
#include "operators/dequantizeLinear.h"
#include "operators/dequantize_linear.h"
#include "utils/operator_utils.h"
namespace infini {

View File

@ -17,7 +17,7 @@ DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) {
vector<DataType>
DynamicQuantizeLinearObj::inferDataType(const TensorVec &inputs) const {
IT_ASSERT(inputs.size() == 1);
return {inputs[1]->getDType()};
return {DataType(2), DataType(1), DataType(2)};
}
std::string DynamicQuantizeLinearObj::toString() const {

View File

@ -2,7 +2,7 @@
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dequantizeLinear.h"
#include "operators/dequantize_linear.h"
#include "test.h"