fix: run int8 llama (output still has NaN)

kilinchange 2023-12-19 17:17:22 +08:00
parent 8ae5958b29
commit 8d901ba7aa
7 changed files with 21 additions and 5 deletions

View File

@@ -518,6 +518,8 @@ void init_graph_builder(py::module &m) {
         .def("erf", &Handler::erf, policy::move)
         .def("where", &Handler::where, policy::move)
         .def("dequantizeLinear", &Handler::dequantizeLinear, policy::move)
+        .def("dynamicQuantizeLinear", &Handler::dynamicQuantizeLinear,
+             policy::move)
         .def("matmulInteger", &Handler::matmulInteger, policy::move)
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
         .def("optimize", &Handler::optimize, policy::automatic)

View File

@@ -63,8 +63,9 @@ __launch_bounds__(BLOCK_DIM) __global__
     __syncthreads();
     int qmax = 255;
     int qmin = 0;
-    yScale[0] = (max___(0.f, maxTotal) - min___(0.f, minTotal)) / (qmax - qmin);
-    float intermediate_zero_point = qmin - minTotal / yScale[0];
+    float absMax = max___(abs(maxTotal), abs(minTotal));
+    yScale[0] = absMax * 2 / (254 - qmin);
+    float intermediate_zero_point = 254 - absMax / yScale[0];
     float _yZeroPoint = round(_saturate(intermediate_zero_point));
     yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
     if (i < size) {
@@ -121,8 +122,9 @@ __launch_bounds__(BLOCK_DIM) __global__
     __syncthreads();
     int qmax = 255;
     int qmin = 0;
-    yScale[0] = (max___(0.f, maxTotal) - min___(0.f, minTotal)) / (qmax - qmin);
-    float intermediate_zero_point = qmin - minTotal / yScale[0];
+    float absMax = max___(abs(maxTotal), abs(minTotal));
+    yScale[0] = absMax * 2 / (254 - qmin);
+    float intermediate_zero_point = 254 - absMax / yScale[0];
     float _yZeroPoint = round(_saturate(intermediate_zero_point));
     yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
     if (i < size) {
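Note: the removed lines follow the ONNX DynamicQuantizeLinear formula, which computes an asymmetric scale (max(x, 0) - min(x, 0)) / (qmax - qmin) and derives the zero point from the minimum. The added lines switch to a symmetric scheme: yScale = 2 * absMax / 254, and since absMax / yScale is exactly 127 for any absMax > 0, intermediate_zero_point is always 254 - 127 = 127, pinning the zero point to the middle of the uint8 range. A host-side replay of the new arithmetic (a sketch; the real code runs on device with the max___/_saturate helpers):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
        float minTotal = -2.0f, maxTotal = 6.0f; // example tensor extrema
        int qmin = 0;
        float absMax = std::max(std::fabs(maxTotal), std::fabs(minTotal)); // 6
        float yScale = absMax * 2 / (254 - qmin); // 12 / 254 ~= 0.0472
        float zeroPoint = 254 - absMax / yScale;  // 254 - 127 = 127
        assert(std::lround(zeroPoint) == 127);    // holds for any absMax > 0
        std::printf("scale=%f zero_point=%ld\n", yScale, std::lround(zeroPoint));
        return 0;
    }

Note that when the tensor is all zeros, absMax / yScale is 0/0, so the zero point is still NaN; this change alone does not rule out the NaN output mentioned in the commit title.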

View File

@@ -27,6 +27,9 @@ class GatherCuda : public CudaKernelWithoutConfig {
         } else if (op->getDType() == DataType::Int8) {
             gather_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData,
                                   metaData, op->getOutput()->size());
+        } else if (op->getDType() == DataType::UInt8) {
+            gather_kernel<uint8_t>((uint8_t *)inputData, (uint8_t *)outputData,
+                                   metaData, op->getOutput()->size());
         } else {
             IT_ASSERT(false);
         }

View File

@@ -56,4 +56,6 @@ template void gather_kernel<half>(half *in, half *out, GatherMetaData metaData,
                                   size_t num);
 template void gather_kernel<int8_t>(int8_t *in, int8_t *out,
                                     GatherMetaData metaData, size_t num);
+template void gather_kernel<uint8_t>(uint8_t *in, uint8_t *out,
+                                     GatherMetaData metaData, size_t num);
 } // namespace infini
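Note: the added template void gather_kernel<uint8_t>(...) line is an explicit instantiation. The kernel template is defined only in this .cu translation unit, so every element type used by callers in other files must be instantiated here, or linking fails with an undefined symbol; the new UInt8 dispatch branch above is what makes the uint8_t instantiation necessary. The same mechanism in miniature (plain host C++, hypothetical names):

    // lib.cpp: the template's definition is only visible here.
    #include <cstddef>
    #include <cstdint>

    template <typename T>
    void copy_kernel(const T *in, T *out, std::size_t num) {
        for (std::size_t i = 0; i < num; ++i)
            out[i] = in[i];
    }

    // Without the uint8_t line, a caller in another file that uses
    // copy_kernel<uint8_t> compiles but fails at link time.
    template void copy_kernel<int8_t>(const int8_t *, int8_t *, std::size_t);
    template void copy_kernel<uint8_t>(const uint8_t *, uint8_t *, std::size_t);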

View File

@@ -47,6 +47,11 @@ class CastCuda : public CudaKernelWithoutConfig {
                       op->getOutDType() == DataType::Float32);
             cast_kernel<int8_t, float>((int8_t *)inputData, (float *)outputData,
                                        num);
+        } else if (op->getType() == CastType::Int322Float) {
+            IT_ASSERT(op->getInDType() == DataType::Int32 &&
+                      op->getOutDType() == DataType::Float32);
+            cast_kernel<int32_t, float>((int32_t *)inputData,
+                                        (float *)outputData, num);
         } else {
             IT_ASSERT(false);
         }
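Note: the new Int322Float branch is presumably needed by the int8 llama path because MatMulInteger (bound in the first hunk) has an ONNX-specified output type of int32, so its accumulators must be cast back to float before rescaling. A host-side sketch of the cast contract (the real cast_kernel is a CUDA launch; names below are stand-ins):

    #include <cstddef>
    #include <cstdint>

    template <typename Tin, typename Tout>
    void cast_host(const Tin *input, Tout *output, std::size_t num) {
        for (std::size_t i = 0; i < num; ++i)
            output[i] = static_cast<Tout>(input[i]);
    }

    int main() {
        int32_t acc[3] = {-130, 0, 257}; // e.g. MatMulInteger accumulators
        float out[3];
        cast_host<int32_t, float>(acc, out, 3); // the Int322Float path
        return 0;
    }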

View File

@@ -313,5 +313,7 @@ template void cast_kernel<float, int8_t>(float *input, int8_t *output,
                                          size_t num);
 template void cast_kernel<int8_t, float>(int8_t *input, float *output,
                                          size_t num);
+template void cast_kernel<int32_t, float>(int32_t *input, float *output,
+                                          size_t num);
 
 }; // namespace infini

View File

@@ -11,7 +11,7 @@ DynamicQuantizeLinearObj::DynamicQuantizeLinearObj(
 
 optional<vector<Shape>>
 DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) {
-    return {{inputs[0]->getDims()}};
+    return {{inputs[0]->getDims(), {}, {}}};
 }
 
 vector<DataType>
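Note: ONNX DynamicQuantizeLinear has three outputs, y (same shape as the input x), y_scale (a scalar), and y_zero_point (a scalar), so inferShape must return three shapes; the fix uses {} (empty dims) for the two scalars. The corrected pattern in isolation (Shape here is a stand-in for the project's dim vector):

    #include <optional>
    #include <vector>

    using Shape = std::vector<int>;

    std::optional<std::vector<Shape>> inferShape(const Shape &x) {
        // y keeps x's dims; y_scale and y_zero_point are rank-0 scalars.
        return {{x, {}, {}}};
    }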