// Example code illustrating the theory exposed in doc/quantization.md /* Command line to build and run on x86: c++ doc/quantization_example.cc -I . --std=c++11 -msse4.1 -lpthread \ -o /tmp/quantization_example && \ /tmp/quantization_example */ #include #include #include #include #include #include #include #include "../public/gemmlowp.h" #include "../public/output_stages.h" // We will handle both float and quantized matrices, which we will // represent as gemmlowp::MatrixMap. // We will need to be able to print them. // Output a matrix to a std::ostream template std::ostream& operator<<(std::ostream& s, const gemmlowp::MatrixMap& m) { for (int i = 0; i < m.rows(); i++) { for (int j = 0; j < m.cols(); j++) { if (j) { s << '\t'; } s << static_cast(m(i, j)); } s << '\n'; } return s; } // Find the min and max value in a float matrix. template void FindMinMax(const gemmlowp::MatrixMap& m, float* min, float* max) { *min = *max = m(0, 0); for (int i = 0; i < m.rows(); i++) { for (int j = 0; j < m.cols(); j++) { const float val = m(i, j); *min = std::min(*min, val); *max = std::max(*max, val); } } } // A structure to hold quantization parameters 'scale' and 'zero_point' // as discussed in doc/quantization.md. As explained there, the meaning // of these values is as the constants in the quantization equation // // real_value = scale * (quantized_value - zero_point) // // In other words, 'zero_point' is the quantized value that corresponds // to the real value 0, and 'scale' is the difference of real values // corresponding to consecutive quantized values. struct QuantizationParams { float scale; std::uint8_t zero_point; }; // Given the min and max values of a float array, return // reasonable quantization parameters to use for this array. QuantizationParams ChooseQuantizationParams(float min, float max) { // We extend the [min, max] interval to ensure that it contains 0. // Otherwise, we would not meet the requirement that 0 be an exactly // representable value. min = std::min(min, 0.f); max = std::max(max, 0.f); // the min and max quantized values, as floating-point values const float qmin = 0; const float qmax = 255; // First determine the scale. const double scale = (max - min) / (qmax - qmin); // Zero-point computation. // First the initial floating-point computation. The zero-point can be // determined from solving an affine equation for any known pair // (real value, corresponding quantized value). // We know two such pairs: (rmin, qmin) and (rmax, qmax). // Let's use the first one here. const double initial_zero_point = qmin - min / scale; // Now we need to nudge the zero point to be an integer // (our zero points are integer, and this is motivated by the requirement // to be able to represent the real value "0" exactly as a quantized value, // which is required in multiple places, for example in Im2col with SAME // padding). std::uint8_t nudged_zero_point = 0; if (initial_zero_point < qmin) { nudged_zero_point = qmin; } else if (initial_zero_point > qmax) { nudged_zero_point = qmax; } else { nudged_zero_point = static_cast(std::round(initial_zero_point)); } QuantizationParams result; result.scale = scale; result.zero_point = nudged_zero_point; return result; } template void FloatMatrixMultiplication( const gemmlowp::MatrixMap& lhs, const gemmlowp::MatrixMap& rhs, gemmlowp::MatrixMap* result) { assert(lhs.cols() == rhs.rows()); assert(lhs.rows() == result->rows()); assert(rhs.cols() == result->cols()); for (int i = 0; i < lhs.rows(); i++) { for (int k = 0; k < rhs.cols(); k++) { (*result)(i, k) = 0; for (int j = 0; j < lhs.cols(); j++) { (*result)(i, k) += lhs(i, j) * rhs(j, k); } } } } void Quantize(const QuantizationParams& qparams, const std::vector& src, std::vector* dst) { assert(src.size() == dst->size()); for (std::size_t i = 0; i < src.size(); i++) { const float real_val = src[i]; const float transformed_val = qparams.zero_point + real_val / qparams.scale; const float clamped_val = std::max(0.f, std::min(255.f, transformed_val)); (*dst)[i] = static_cast(std::round(clamped_val)); } } void Dequantize(const QuantizationParams& qparams, const std::vector& src, std::vector* dst) { assert(src.size() == dst->size()); for (std::size_t i = 0; i < src.size(); i++) { const std::uint8_t quantized_val = src[i]; (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point); } } template class MatrixWithStorage { public: MatrixWithStorage(int rows, int cols) : storage(rows * cols), matrix_map(storage.data(), rows, cols) {} void MakeRandom() { static std::mt19937 random_engine; std::uniform_real_distribution distribution(-1, 1); for (auto& x : storage) { x = static_cast(distribution(random_engine)); } } gemmlowp::MatrixMap ConstMap() const { return gemmlowp::MatrixMap( storage.data(), matrix_map.rows(), matrix_map.cols()); } gemmlowp::MatrixMap Map() { return gemmlowp::MatrixMap( storage.data(), matrix_map.rows(), matrix_map.cols()); } const std::vector& Storage() const { return storage; } std::vector& Storage() { return storage; } private: std::vector storage; gemmlowp::MatrixMap matrix_map; }; template std::ostream& operator<<(std::ostream& s, const MatrixWithStorage& m) { return s << m.ConstMap(); } // Given a real_multiplier in the interval (0, 1), // produces a pair (quantized_multiplier, right_shift) where // quantized_multiplier is an int32 representing a fixed-point value // in the interval [-1, 1) (in practice we only produce positive values) // and right_shift is an amount to shift right by, so that the // floating-point multiplication of some int32 input value by real_multiplier, // // return static_cast(int32_value * real_multiplier); // // is best approximated by the integer-arithmetic-only code // // return RoundingRightShift( // FixedPointMultiplication(int32_value, quantized_multiplier), // right_shift); // // This is how to obtain the fixed-point multiplier and right shift // parameters to pass to // OutputStageQuantizeDownInt32ByFixedPoint. // // Note: all this code only needs to run offline to generate the quantized // neural network workload, not at runtime on the // device on which quantized neural networks need to run. So it's not // performance-critical at all. void QuantizeMultiplierSmallerThanOne(float real_multiplier, std::int32_t* quantized_multiplier, int* right_shift) { assert(real_multiplier > 0.f); assert(real_multiplier < 1.f); int s = 0; // We want to bring the real multiplier into the interval [1/2, 1). // We can do so by multiplying it by two, and recording how many times // we multiplied by two so that we can compensate that by a right // shift by the same amount. while (real_multiplier < 0.5f) { real_multiplier *= 2.0f; s++; } // Now that the real multiplier is in [1/2, 1), we convert it // into a fixed-point number. std::int64_t q = static_cast(std::round(real_multiplier * (1ll << 31))); assert(q <= (1ll << 31)); // Handle the special case when the real multiplier was so close to 1 // that its fixed-point approximation was undistinguishable from 1. // We handle this by dividing it by two, and remembering to decrement // the right shift amount. if (q == (1ll << 31)) { q /= 2; s--; } assert(s >= 0); assert(q <= std::numeric_limits::max()); *quantized_multiplier = static_cast(q); *right_shift = s; } int main() { std::cout.precision(3); const int rows = 2; const int depth = 4; const int cols = 3; const auto kOrder = gemmlowp::MapOrder::ColMajor; std::cout << "First, let us make some float matrices LHS and RHS, " << "and compute their product.\n" << std::endl; MatrixWithStorage float_lhs(rows, depth); float_lhs.MakeRandom(); MatrixWithStorage float_rhs(depth, cols); float_rhs.MakeRandom(); MatrixWithStorage reference_float_result(rows, cols); auto reference_float_result_map = reference_float_result.Map(); FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(), &reference_float_result_map); std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl; std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl; std::cout << "Here is the float product (LHS * RHS) matrix obtained by " << "ordinary float matrix multiplication, i.e. as far as we are " << "concerned, the REFERENCE RESULT:\n" << reference_float_result << std::endl; std::cout << "Now we embark on reproducing this result using " << "quantized arithmetic. The code below splits into two parts: " << "quantization code that only needs to run offline (e.g. to " << "generate a quantized neural network workload), and actual " << "runtime quantized code, which is typically performance-critical " << "and where we typically do not want to use any floating-point " << "arithmetic. We want to clearly distinguish between the two.\n" << std::endl; std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some " << "floating-point arithmetic in the process of generating the " << "quantized workload to be run on-device.\n" << std::endl; std::cout << "Now, let us choose quantization parameters for these matrices. " << "You might ask, what good is quantization if we need to pick " << "quantization parameters for the result before we can run the " << "quantized computation to obtain the result? The idea is that we " << "target applications such as neural networks, where unknown results " << "are only allowed to vary within preexisting bounds. In practice, the " << "bounds for the results are typically learned during the neural " "network " << "training process. The min and max of the result do not have to be " << "exact. If they are too broad, we just get lower quantization " "accuracy. " << "If they are too narrow, we just get clamping at the bounds.\n" << std::endl; float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max; FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max); FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max); FindMinMax(reference_float_result.Map(), &result_min, &result_max); const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max); const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max); const auto result_qparams = ChooseQuantizationParams(result_min, result_max); std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max << ", scale = " << lhs_qparams.scale << ", zero_point = " << static_cast(lhs_qparams.zero_point) << std::endl; std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max << ", scale = " << rhs_qparams.scale << ", zero_point = " << static_cast(rhs_qparams.zero_point) << std::endl; std::cout << "For the result, we have min = " << result_min << ", max = " << result_max << ", scale = " << result_qparams.scale << ", zero_point = " << static_cast(result_qparams.zero_point) << std::endl; std::cout << std::endl; MatrixWithStorage uint8_lhs(rows, depth); MatrixWithStorage uint8_rhs(depth, cols); MatrixWithStorage actual_uint8_result(rows, cols); Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage()); Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage()); std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl; std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl; const int lhs_offset = -lhs_qparams.zero_point; const int rhs_offset = -rhs_qparams.zero_point; const int result_offset = result_qparams.zero_point; const float real_multiplier = lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale; std::int32_t quantized_multiplier; int right_shift; QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier, &right_shift); std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl; std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. " << "This is the part that is performance-critical and may only " << "use quantized arithmetic.\n" << std::endl; gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage; quantize_down_stage.result_offset_after_shift = result_offset; quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier; quantize_down_stage.result_shift = right_shift; gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; const auto& output_pipeline = std::make_tuple(quantize_down_stage, saturating_cast_stage); auto actual_uint8_result_map = actual_uint8_result.Map(); gemmlowp::GemmContext gemm_context; gemmlowp::GemmWithOutputPipeline( &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(), &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline); std::cout << "Quantized uint8 result matrix obtained by quantized " << "multiplication:\n" << actual_uint8_result << std::endl; std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl; MatrixWithStorage actual_float_result(rows, cols); Dequantize(result_qparams, actual_uint8_result.Storage(), &actual_float_result.Storage()); std::cout << "Here is the actual float product (LHS * RHS) matrix obtained by " << "dequantizing the above uint8 result, i.e. " << "as far as we are concerned, the ACTUAL RESULT:\n" << actual_float_result << std::endl; MatrixWithStorage diff_float_result(rows, cols); for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { diff_float_result.Map()(i, j) = actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j); } } std::cout << "Difference between ACTUAL and REFERENCE float results:\n" << diff_float_result << std::endl; }