modified dynamic_quantize_linear

This commit is contained in:
xgqdut2016 2023-12-25 17:33:43 +08:00
parent 7b48b93fb3
commit ce23b8356f
2 changed files with 117 additions and 37 deletions

View File

@ -3,7 +3,7 @@
#include "cuda/cuda_common.h"
// Clamp x into the uint8 quantization range [0, 255].
// NOTE(review): this diff view renders the old and new line of the hunk
// back-to-back without +/- markers — the first return (double literals
// 255.0) appears to be the pre-change line, the second (255.0f, keeping
// the comparison in single precision) the post-change line. Only the
// second return exists in the actual source file.
__device__ float _saturate(float x) {
return x < 0.f ? 0.f : (x > 255.0 ? 255.0 : x);
return x < 0.f ? 0.f : (x > 255.0f ? 255.0f : x);
}
template <class T>
@ -22,8 +22,8 @@ __launch_bounds__(BLOCK_DIM) __global__
float *yScale, uint8_t *yZeroPoint,
int size) {
int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
float maxData = __FLT_MAX__;
float minData = -__FLT_MAX__;
float maxData = -__FLT_MAX__;
float minData = __FLT_MAX__;
int remain = size % BLOCK_DIM;
int step = (size - remain) / BLOCK_DIM + 1;
@ -38,6 +38,7 @@ __launch_bounds__(BLOCK_DIM) __global__
(threadIdx.x - remain) * (step - 1) + ind]);
}
}
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
minData = min___(minData, input[threadIdx.x * step + ind]);
@ -49,25 +50,27 @@ __launch_bounds__(BLOCK_DIM) __global__
(threadIdx.x - remain) * (step - 1) + ind]);
}
}
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ float maxTotal;
float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
__shared__ float minTotal;
float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
if (threadIdx.x == 0) {
maxTotal = blockMax;
minTotal = blockMin;
}
__syncthreads();
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storageMax;
float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
__shared__ typename BlockReduce::TempStorage temp_storageMin;
float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
//-----
int qmax = 255;
int qmin = 0;
float absMax = max___(abs(maxTotal), abs(minTotal));
yScale[0] = absMax * 2 / (254 - qmin);
float intermediate_zero_point = 254 - absMax / yScale[0];
float _yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
__shared__ float _yZeroPoint;
if (threadIdx.x == 0) {
yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
float intermediate_zero_point = qmin - blockMin / yScale[0];
_yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
}
__syncthreads();
if (i < size) {
outputY[i] = static_cast<uint8_t>(
_saturate(round(input[i] / yScale[0]) + _yZeroPoint));
@ -81,8 +84,8 @@ __launch_bounds__(BLOCK_DIM) __global__
float *yScale, uint8_t *yZeroPoint,
int size) {
int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
float maxData = __FLT_MAX__;
float minData = -__FLT_MAX__;
float maxData = -__FLT_MAX__;
float minData = __FLT_MAX__;
int remain = size % BLOCK_DIM;
int step = (size - remain) / BLOCK_DIM + 1;
float dataPerThread[numPerThread];
@ -99,6 +102,7 @@ __launch_bounds__(BLOCK_DIM) __global__
maxData = max___(maxData, dataPerThread[ind]);
}
}
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
minData = min___(minData, dataPerThread[ind]);
@ -108,25 +112,27 @@ __launch_bounds__(BLOCK_DIM) __global__
minData = min___(minData, dataPerThread[ind]);
}
}
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ float maxTotal;
float blockMax = BlockReduce(temp_storage).Reduce(maxData, cub::Max());
__shared__ float minTotal;
float blockMin = BlockReduce(temp_storage).Reduce(minData, cub::Min());
if (threadIdx.x == 0) {
maxTotal = blockMax;
minTotal = blockMin;
}
__syncthreads();
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storageMax;
float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
__shared__ typename BlockReduce::TempStorage temp_storageMin;
float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
//-----
int qmax = 255;
int qmin = 0;
float absMax = max___(abs(maxTotal), abs(minTotal));
yScale[0] = absMax * 2 / (254 - qmin);
float intermediate_zero_point = 254 - absMax / yScale[0];
float _yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
__shared__ float _yZeroPoint;
if (threadIdx.x == 0) {
yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
float intermediate_zero_point = qmin - blockMin / yScale[0];
_yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
}
__syncthreads();
if (i < size) {
outputY[i] = static_cast<uint8_t>(
_saturate(round(input[i] / yScale[0]) + _yZeroPoint));

View File

@ -0,0 +1,74 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dynamic_quantize_linear.h"
#include "test.h"
namespace infini {
void test_dynamicquantizeLinearFp32(
const Shape &inputShape, const vector<float> &inputData,
const vector<uint8_t> &outputYData, const vector<float> &outputYScaleData,
const vector<uint8_t> &outputYZeroPointData) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor(inputShape, DataType::Float32);
gCpu->dataMalloc();
input->copyin(inputData);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = gCuda->cloneTensor(input);
auto op = gCuda->addOp<DynamicQuantizeLinearObj>(
inputGpu,
std::nullopt); // DynamicQuantizeLinear
gCuda->dataMalloc();
inputGpu->copyin(inputData);
cudaRuntime->run(gCuda);
EXPECT_EQ(op->getOutputs().size(), (size_t)3);
auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
auto o2Cpu = gCpu->cloneTensor(op->getOutput(2));
EXPECT_TRUE(o0Cpu->equalData(outputYData));
EXPECT_TRUE(o1Cpu->equalData(outputYScaleData));
EXPECT_TRUE(o2Cpu->equalData(outputYZeroPointData));
}
// End-to-end check of the CUDA DynamicQuantizeLinear kernel on two
// monotonically increasing float inputs. Expected values were reportedly
// generated from a Python reference implementation (original note:
// "python output") — TODO confirm which script produced them.
TEST(CUDA_DynamicquantizeLinearFp32, run) {
    // Case 1: 36 values 0..35; expected scale 0.1372549 (= 35/255),
    // expected zero point 0.
    test_dynamicquantizeLinearFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
                      9., 10., 11., 12., 13., 14., 15., 16., 17.,
                      18., 19., 20., 21., 22., 23., 24., 25., 26.,
                      27., 28., 29., 30., 31., 32., 33., 34., 35.},
        vector<uint8_t>{0, 7, 15, 22, 29, 36, 44, 51, 58,
                        66, 73, 80, 87, 95, 102, 109, 117, 124,
                        131, 138, 146, 153, 160, 168, 175, 182, 189,
                        197, 204, 211, 219, 226, 233, 240, 248, 255},
        vector<float>{0.1372549}, vector<uint8_t>{0});
    // Case 2: 24 values 0..23; expected scale 0.0901961 (= 23/255),
    // expected zero point 0.
    test_dynamicquantizeLinearFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
                      8., 9., 10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        vector<uint8_t>{0, 11, 22, 33, 44, 55, 67, 78,
                        89, 100, 111, 122, 133, 144, 155, 166,
                        177, 188, 200, 211, 222, 233, 244, 255},
        vector<float>{0.0901961}, vector<uint8_t>{0});
}
} // namespace infini