diff --git a/include/cuda/cuda_dequantizeLinear.h b/include/cuda/cuda_dequantize_linear.h similarity index 100% rename from include/cuda/cuda_dequantizeLinear.h rename to include/cuda/cuda_dequantize_linear.h diff --git a/include/operators/dequantizeLinear.h b/include/operators/dequantize_linear.h similarity index 100% rename from include/operators/dequantizeLinear.h rename to include/operators/dequantize_linear.h diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc index f4c8e761..2ef610ef 100644 --- a/src/core/graph_handler.cc +++ b/src/core/graph_handler.cc @@ -6,7 +6,7 @@ #include "operators/broadcast.h" #include "operators/concat.h" #include "operators/conv.h" -#include "operators/dequantizeLinear.h" +#include "operators/dequantize_linear.h" #include "operators/dynamic_quantize_linear.h" #include "operators/element_wise.h" #include "operators/expand.h" diff --git a/src/kernels/cuda/dequantizeLinear.cc b/src/kernels/cuda/dequantize_linear.cc similarity index 96% rename from src/kernels/cuda/dequantizeLinear.cc rename to src/kernels/cuda/dequantize_linear.cc index 63c78fe0..08db6eed 100644 --- a/src/kernels/cuda/dequantizeLinear.cc +++ b/src/kernels/cuda/dequantize_linear.cc @@ -1,5 +1,5 @@ -#include "operators/dequantizeLinear.h" -#include "cuda/cuda_dequantizeLinear.h" +#include "operators/dequantize_linear.h" +#include "cuda/cuda_dequantize_linear.h" #include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_runtime.h" diff --git a/src/kernels/cuda/dequantizeLinear.cu b/src/kernels/cuda/dequantize_linear.cu similarity index 100% rename from src/kernels/cuda/dequantizeLinear.cu rename to src/kernels/cuda/dequantize_linear.cu diff --git a/src/kernels/cuda/dynamic_quantize_linear.cc b/src/kernels/cuda/dynamic_quantize_linear.cc new file mode 100644 index 00000000..e69de29b diff --git a/src/kernels/cuda/dynamic_quantize_linear.cu b/src/kernels/cuda/dynamic_quantize_linear.cu new file mode 100644 index 00000000..dc8d9394 --- /dev/null +++ b/src/kernels/cuda/dynamic_quantize_linear.cu @@ -0,0 +1,170 @@ +// #include "cuda/cuda_common.h" +// #include +// template +// __launch_bounds__(BLOCK_DIM) __global__ +// void _dynamicQuantizeLinearKernel(float *input, float *outputY, +// uint8_t yScale, uint8_t yZeroPoint, +// int size) { +// int i = threadIdx.x + blockIdx.x * BLOCK_DIM; +// float maxData = __FLT_MAX__; +// float minData = -__FLT_MAX__; +// int remain = size % BLOCK_DIM; +// int step = (size - remain) / BLOCK_DIM + 1; + +// if (threadIdx.x < remain) { +// for (int ind = 0; ind < step; ind++) { + +// maxData = max(maxData, input[threadIdx.x * step + ind]); +// } +// } else { +// for (int ind = 0; ind < step - 1; ind++) { + +// maxData = +// max(maxData, input[remain * step + +// (threadIdx.x - remain) * (step - 1) + +// ind]); +// } +// } +// if (threadIdx.x < remain) { +// for (int ind = 0; ind < step; ind++) { + +// minData = min(minData, input[threadIdx.x * step + ind]); +// } +// } else { +// for (int ind = 0; ind < step - 1; ind++) { + +// minData = +// min(minData, input[remain * step + +// (threadIdx.x - remain) * (step - 1) + +// ind]); +// } +// } +// typedef cub::BlockReduce BlockReduce; +// __shared__ typename BlockReduce::TempStorage temp_storage; +// __shared__ float maxTotal; +// float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max()); + +// __shared__ float minTotal; +// float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min()); +// if (threadIdx.x == 0) { +// maxTotal = blockMax; +// minTotal = blockMin; +// } +// __syncthreads(); +// int qmax = 255; +// int qmin = 0; +// yScale = (max(0, maxTotal) - min(0, minTotal)) / (qmax - qmin); +// intermediate_zero_point = qmin - minTotal / yScale; +// yZeroPoint = cast(round(saturate(itermediate_zero_point))); +// if (i < size) { +// outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint); +// } +// } +// //---------- + +// template +// __launch_bounds__(BLOCK_DIM) __global__ +// void _dynamicQuantizeLinearKernel(float *input, float *outputY, +// uint8_t yScale, uint8_t yZeroPoint, +// int size) { +// int i = threadIdx.x + blockIdx.x * BLOCK_DIM; +// float maxData = __FLT_MAX__; +// float minData = -__FLT_MAX__; +// int remain = size % BLOCK_DIM; +// int step = (size - remain) / BLOCK_DIM + 1; +// float dataPerThread[numPerThread]; +// if (threadIdx.x < remain) { +// for (int ind = 0; ind < step; ind++) { +// dataPerThread[ind] = input[threadIdx.x * step + ind]; +// maxData = max(maxData, dataPerThread[ind]); +// } +// } else { +// for (int ind = 0; ind < step - 1; ind++) { +// dataPerThread[ind] = +// input[remain * step + (threadIdx.x - remain) * (step - 1) + +// ind]; +// maxData = max(maxData, dataPerThread[ind]); +// } +// } +// if (threadIdx.x < remain) { +// for (int ind = 0; ind < step; ind++) { + +// minData = min(minData, dataPerThread[ind]); +// } +// } else { +// for (int ind = 0; ind < step - 1; ind++) { + +// minData = min(minData, dataPerThread[ind]); +// } +// } +// typedef cub::BlockReduce BlockReduce; +// __shared__ typename BlockReduce::TempStorage temp_storage; +// __shared__ float maxTotal; +// float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max()); + +// __shared__ float minTotal; +// float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min()); +// if (threadIdx.x == 0) { +// maxTotal = blockMax; +// minTotal = blockMin; +// } +// __syncthreads(); +// int qmax = 255; +// int qmin = 0; +// yScale = (max(0.0, maxTotal) - min(0.0, minTotal)) / (qmax - qmin); +// intermediate_zero_point = qmin - minTotal / yScale; +// yZeroPoint = cast(round(saturate(itermediate_zero_point))); +// if (i < size) { +// outputY[i] = saturate(round(input[i] / yScale) + yZeroPoint); +// } +// } + +// namespace infini { +// void dynamicQuantizeLinearKernel(float *input, float *outputY, uint8_t +// yScale, +// uint8_t yZeroPoint, int size) { + +// if (size > 1024 * 128) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else if (size > 1024 * 64) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 128><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else if (size > 1024 * 32) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 64><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else if (size > 1024 * 16) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 32><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else if (size > 1024 * 4) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 16><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else if (size > 1024) { + +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 4><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } else { +// int BLOCK_DIM = 1024; +// int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM; +// _dynamicQuantizeLinearKernel<1024, 1><<>>( +// input, outputY, yScale, yZeroPoint, size); +// } +// } +// } // namespace infini diff --git a/src/kernels/cuda/unary.cu b/src/kernels/cuda/unary.cu index 37a90765..ace30fdc 100644 --- a/src/kernels/cuda/unary.cu +++ b/src/kernels/cuda/unary.cu @@ -287,9 +287,7 @@ void unary_kernel(const Operator &_op) { } else { IT_TODO_HALT(); } - } - - else if (op->getOpType() == OpType::Erf) { + } else if (op->getOpType() == OpType::Erf) { if (_op->getDType() == DataType::Float32) { erf_kernel((float *)inputData, (float *)outputData, num); } else { diff --git a/src/operators/dequantizeLinear.cc b/src/operators/dequantize_linear.cc similarity index 97% rename from src/operators/dequantizeLinear.cc rename to src/operators/dequantize_linear.cc index 50d5903c..2f165e31 100644 --- a/src/operators/dequantizeLinear.cc +++ b/src/operators/dequantize_linear.cc @@ -1,4 +1,4 @@ -#include "operators/dequantizeLinear.h" +#include "operators/dequantize_linear.h" #include "utils/operator_utils.h" namespace infini { diff --git a/src/operators/dynamic_quantize_linear.cc b/src/operators/dynamic_quantize_linear.cc index a34b4861..8317eda5 100644 --- a/src/operators/dynamic_quantize_linear.cc +++ b/src/operators/dynamic_quantize_linear.cc @@ -17,7 +17,7 @@ DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) { vector DynamicQuantizeLinearObj::inferDataType(const TensorVec &inputs) const { IT_ASSERT(inputs.size() == 1); - return {inputs[1]->getDType()}; + return {DataType(2), DataType(1), DataType(2)}; } std::string DynamicQuantizeLinearObj::toString() const { diff --git a/test/kernels/cuda/test_cuda_dequantizeLinear.cc b/test/kernels/cuda/test_cuda_dequantize_linear.cc similarity index 99% rename from test/kernels/cuda/test_cuda_dequantizeLinear.cc rename to test/kernels/cuda/test_cuda_dequantize_linear.cc index 31a85d75..65e15ba8 100644 --- a/test/kernels/cuda/test_cuda_dequantizeLinear.cc +++ b/test/kernels/cuda/test_cuda_dequantize_linear.cc @@ -2,7 +2,7 @@ #include "core/runtime.h" #include "cuda/cuda_runtime.h" #include "cuda/cuda_utility.h" -#include "operators/dequantizeLinear.h" +#include "operators/dequantize_linear.h" #include "test.h"