diff --git a/src/kernels/cuda/dynamic_quantize_linear.cu b/src/kernels/cuda/dynamic_quantize_linear.cu
index 035419a0..e5dc7862 100644
--- a/src/kernels/cuda/dynamic_quantize_linear.cu
+++ b/src/kernels/cuda/dynamic_quantize_linear.cu
@@ -3,7 +3,7 @@
 #include "cuda/cuda_common.h"
 
 __device__ float _saturate(float x) {
-    return x < 0.f ? 0.f : (x > 255.0 ? 255.0 : x);
+    return x < 0.f ? 0.f : (x > 255.0f ? 255.0f : x);
 }
 
 template <int BLOCK_DIM>
@@ -22,8 +22,8 @@ __launch_bounds__(BLOCK_DIM) __global__
                        float *yScale, uint8_t *yZeroPoint, int size) {
     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
 
-    float maxData = __FLT_MAX__;
-    float minData = -__FLT_MAX__;
+    float maxData = -__FLT_MAX__;
+    float minData = __FLT_MAX__;
     int remain = size % BLOCK_DIM;
     int step = (size - remain) / BLOCK_DIM + 1;
 
@@ -38,6 +38,7 @@ __launch_bounds__(BLOCK_DIM) __global__
                                             (threadIdx.x - remain) * (step - 1) + ind]);
         }
     }
+
     if (threadIdx.x < remain) {
         for (int ind = 0; ind < step; ind++) {
             minData = min___(minData, input[threadIdx.x * step + ind]);
@@ -49,25 +50,27 @@ __launch_bounds__(BLOCK_DIM) __global__
                                             (threadIdx.x - remain) * (step - 1) + ind]);
         }
     }
-    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    __shared__ float maxTotal;
-    float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
-    __shared__ float minTotal;
-    float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
-    if (threadIdx.x == 0) {
-        maxTotal = blockMax;
-        minTotal = blockMin;
-    }
-    __syncthreads();
+    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storageMax;
+
+    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
+
+    __shared__ typename BlockReduce::TempStorage temp_storageMin;
+
+    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
+
     //-----
     int qmax = 255;
     int qmin = 0;
-    float absMax = max___(abs(maxTotal), abs(minTotal));
-    yScale[0] = absMax * 2 / (254 - qmin);
-    float intermediate_zero_point = 254 - absMax / yScale[0];
-    float _yZeroPoint = round(_saturate(intermediate_zero_point));
-    yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    __shared__ float _yZeroPoint;
+    if (threadIdx.x == 0) {
+        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
+        float intermediate_zero_point = qmin - blockMin / yScale[0];
+        _yZeroPoint = round(_saturate(intermediate_zero_point));
+        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    }
+    __syncthreads();
+
     if (i < size) {
         outputY[i] = static_cast<uint8_t>(
             _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
@@ -81,8 +84,8 @@ __launch_bounds__(BLOCK_DIM) __global__
                        float *yScale, uint8_t *yZeroPoint, int size) {
     int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
 
-    float maxData = __FLT_MAX__;
-    float minData = -__FLT_MAX__;
+    float maxData = -__FLT_MAX__;
+    float minData = __FLT_MAX__;
     int remain = size % BLOCK_DIM;
     int step = (size - remain) / BLOCK_DIM + 1;
     float dataPerThread[numPerThread];
@@ -99,6 +102,7 @@ __launch_bounds__(BLOCK_DIM) __global__
             maxData = max___(maxData, dataPerThread[ind]);
         }
     }
+
    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            minData = min___(minData, dataPerThread[ind]);
@@ -108,25 +112,27 @@ __launch_bounds__(BLOCK_DIM) __global__
            minData = min___(minData, dataPerThread[ind]);
        }
    }
-    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    __shared__ float maxTotal;
-    float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
-    __shared__ float minTotal;
-    float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
-    if (threadIdx.x == 0) {
-        maxTotal = blockMax;
-        minTotal = blockMin;
-    }
-    __syncthreads();
+    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storageMax;
+
+    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
+
+    __shared__ typename BlockReduce::TempStorage temp_storageMin;
+
+    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
+
     //-----
     int qmax = 255;
     int qmin = 0;
-    float absMax = max___(abs(maxTotal), abs(minTotal));
-    yScale[0] = absMax * 2 / (254 - qmin);
-    float intermediate_zero_point = 254 - absMax / yScale[0];
-    float _yZeroPoint = round(_saturate(intermediate_zero_point));
-    yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    __shared__ float _yZeroPoint;
+    if (threadIdx.x == 0) {
+        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
+        float intermediate_zero_point = qmin - blockMin / yScale[0];
+        _yZeroPoint = round(_saturate(intermediate_zero_point));
+        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
+    }
+    __syncthreads();
+
     if (i < size) {
         outputY[i] = static_cast<uint8_t>(
             _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
diff --git a/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc b/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc
new file mode 100644
index 00000000..350de0f1
--- /dev/null
+++ b/test/kernels/cuda/test_cuda_dynamic_quantize_linear.cc
@@ -0,0 +1,74 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/dynamic_quantize_linear.h"
+
+#include "test.h"
+
+namespace infini {
+
+void test_dynamicquantizeLinearFp32(
+    const Shape &inputShape, const vector<float> &inputData,
+    const vector<uint8_t> &outputYData, const vector<float> &outputYScaleData,
+    const vector<uint8_t> &outputYZeroPointData) {
+
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    auto input = gCpu->addTensor(inputShape, DataType::Float32);
+
+    gCpu->dataMalloc();
+
+    input->copyin(inputData);
+
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+
+    auto inputGpu = gCuda->cloneTensor(input);
+
+    auto op = gCuda->addOp<DynamicQuantizeLinearObj>(
+        inputGpu,
+        std::nullopt); // DynamicQuantizeLinear
+    gCuda->dataMalloc();
+
+    inputGpu->copyin(inputData);
+
+    cudaRuntime->run(gCuda);
+
+    EXPECT_EQ(op->getOutputs().size(), (size_t)3);
+    auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
+    auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
+    auto o2Cpu = gCpu->cloneTensor(op->getOutput(2));
+
+    EXPECT_TRUE(o0Cpu->equalData(outputYData));
+    EXPECT_TRUE(o1Cpu->equalData(outputYScaleData));
+    EXPECT_TRUE(o2Cpu->equalData(outputYZeroPointData));
+}
+
+TEST(CUDA_DynamicquantizeLinearFp32, run) {
+
+    test_dynamicquantizeLinearFp32(
+        Shape{2, 3, 2, 3},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+                      18., 19., 20., 21., 22., 23., 24., 25., 26.,
+                      27., 28., 29., 30., 31., 32., 33., 34., 35.},
+        vector<uint8_t>{0,   7,   15,  22,  29,  36,  44,  51,  58,
+                        66,  73,  80,  87,  95,  102, 109, 117, 124,
+                        131, 138, 146, 153, 160, 168, 175, 182, 189,
+                        197, 204, 211, 219, 226, 233, 240, 248, 255},
+        vector<float>{0.1372549}, vector<uint8_t>{0});
+    test_dynamicquantizeLinearFp32(
+        Shape{2, 3, 2, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        vector<uint8_t>{0,   11,  22,  33,  44,  55,  67,  78,
+                        89,  100, 111, 122, 133, 144, 155, 166,
+                        177, 188, 200, 211, 222, 233, 244, 255},
+        vector<float>{0.0901961}, vector<uint8_t>{0});
+
+} // python output
+
+} // namespace infini
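
Note on the new scale/zero-point math: the updated kernel computes yScale = (max(0, blockMax) - min(0, blockMin)) / (qmax - qmin) and yZeroPoint = round(saturate(qmin - blockMin / yScale)), replacing the earlier 254-based symmetric formula. The following is a minimal host-side sketch of that arithmetic, not part of the patch; the helper name referenceScaleZeroPoint and the main() driver are invented purely for illustration. Under that assumption it reproduces the expected values used in the test above: 35 / 255 = 0.1372549... and 23 / 255 = 0.0901961..., each with a zero point of 0.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side reference for the kernel's scale / zero-point arithmetic
// (illustrative sketch only, not repository code).
static void referenceScaleZeroPoint(float xMin, float xMax, float &scale,
                                    uint8_t &zeroPoint) {
    const float qmin = 0.0f, qmax = 255.0f;
    // Extend the range so it always contains 0, then spread it over [0, 255].
    scale = (std::max(0.0f, xMax) - std::min(0.0f, xMin)) / (qmax - qmin);
    // Saturate and round the zero point into the uint8 range,
    // mirroring _saturate() in the kernel.
    float zp = qmin - xMin / scale;
    zp = std::min(255.0f, std::max(0.0f, zp));
    zeroPoint = static_cast<uint8_t>(std::round(zp));
}

int main() {
    float scale;
    uint8_t zeroPoint;
    // First test case: input 0..35 -> scale = 35 / 255 = 0.1372549..., zero point 0.
    referenceScaleZeroPoint(0.0f, 35.0f, scale, zeroPoint);
    std::printf("scale = %.7f, zero point = %u\n", scale,
                static_cast<unsigned>(zeroPoint));
    // Second test case: input 0..23 -> scale = 23 / 255 = 0.0901961..., zero point 0.
    referenceScaleZeroPoint(0.0f, 23.0f, scale, zeroPoint);
    std::printf("scale = %.7f, zero point = %u\n", scale,
                static_cast<unsigned>(zeroPoint));
    return 0;
}

Including 0 in the range via max(0.0f, blockMax) and min(0.0f, blockMin) guarantees that a real value of 0.0f maps exactly onto an integer in [0, 255]; since both test inputs start at 0, their zero points come out as 0.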