345 lines
12 KiB
C++
345 lines
12 KiB
C++
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <unistd.h>
|
|
#ifdef __APPLE__
|
|
#include <sys/time.h>
|
|
#endif
|
|
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <ctime>
|
|
#include <iostream>
|
|
#include <map>
|
|
#include <vector>
|
|
|
|
#include "../meta/legacy_multi_thread_gemm.h"
|
|
#include "../public/gemmlowp.h"
|
|
#include "test.h"
|
|
// lets include these so we make sure they always compile
|
|
#include "../meta/multi_thread_gemm.h"
|
|
#include "../meta/multi_thread_transform.h"
|
|
#include "../meta/legacy_multi_thread_common.h"
|
|
|
|
#if defined(__arm__) && !defined(GEMMLOWP_NEON)
|
|
#warning "Building without NEON support on ARM, check your compiler setup!"
|
|
#endif
|
|
|
|
double time() {
|
|
#ifdef __APPLE__
|
|
timeval t;
|
|
gettimeofday(&t, nullptr);
|
|
return t.tv_sec + 1e-6 * t.tv_usec;
|
|
#else
|
|
timespec t;
|
|
clock_gettime(CLOCK_REALTIME, &t);
|
|
return t.tv_sec + 1e-9 * t.tv_nsec;
|
|
#endif
|
|
}
|
|
|
|
void prepare_test_data(std::uint8_t* data, std::int32_t rows, std::int32_t cols,
|
|
std::int32_t seed, std::int32_t seed_2) {
|
|
std::int32_t value = seed;
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
data[i * cols + j] = static_cast<std::uint8_t>(value);
|
|
value = ((value * seed_2) + seed) % 256;
|
|
}
|
|
}
|
|
}
|
|
|
|
void check_result(std::uint8_t* left, std::uint8_t* right, std::uint8_t* result,
|
|
std::int32_t rows, std::int32_t cols, std::int32_t depth,
|
|
std::int32_t lhs_offset, std::int32_t rhs_offset,
|
|
std::int32_t sum_offset, std::int32_t mul_offset,
|
|
std::int32_t shift) {
|
|
std::int32_t rounding = (1 << (shift - 1));
|
|
std::int32_t wrong = 0;
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
std::int32_t expected = 0;
|
|
for (int k = 0; k < depth; ++k) {
|
|
expected +=
|
|
(static_cast<std::int32_t>(left[depth * i + k]) + lhs_offset) *
|
|
(static_cast<std::int32_t>(right[depth * j + k]) + rhs_offset);
|
|
}
|
|
expected += sum_offset;
|
|
expected *= mul_offset;
|
|
expected += rounding;
|
|
expected = (expected >> shift);
|
|
if (expected < 0) {
|
|
expected = 0;
|
|
} else if (expected > 255) {
|
|
expected = 255;
|
|
}
|
|
expected = static_cast<std::int32_t>(static_cast<std::uint8_t>(expected));
|
|
std::int32_t actual = static_cast<std::int32_t>(result[i * cols + j]);
|
|
if (actual != expected) {
|
|
std::cout << "(" << i << ", " << j << "): " << expected << "!="
|
|
<< actual << std::endl;
|
|
wrong++;
|
|
}
|
|
}
|
|
}
|
|
if (wrong > 0) {
|
|
std::cout << "Wrong: " << rows << "x" << cols << "x" << depth << " : "
|
|
<< wrong << "/" << (rows * cols) << std::endl
|
|
<< std::flush;
|
|
std::exit(1);
|
|
} else {
|
|
std::cout << "." << std::flush;
|
|
}
|
|
}
|
|
|
|
void check_result_f(std::uint8_t* left, std::uint8_t* right, float* result,
|
|
std::int32_t rows, std::int32_t cols, std::int32_t depth,
|
|
std::int32_t lhs_offset, std::int32_t rhs_offset,
|
|
float result_offset) {
|
|
std::int32_t wrong = 0;
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
std::int32_t expected = 0;
|
|
for (int k = 0; k < depth; ++k) {
|
|
expected +=
|
|
(static_cast<std::int32_t>(left[depth * i + k]) + lhs_offset) *
|
|
(static_cast<std::int32_t>(right[depth * j + k]) + rhs_offset);
|
|
}
|
|
float expected_float = static_cast<float>(expected) * result_offset;
|
|
float actual_float = result[i * cols + j];
|
|
if (actual_float != expected_float) {
|
|
std::cout << "(" << i << ", " << j << "): " << expected_float << "!="
|
|
<< actual_float << std::endl;
|
|
wrong++;
|
|
}
|
|
}
|
|
}
|
|
if (wrong > 0) {
|
|
std::cout << "Wrong: " << rows << "x" << cols << "x" << depth << " : "
|
|
<< wrong << "/" << (rows * cols) << std::endl
|
|
<< std::flush;
|
|
std::exit(1);
|
|
} else {
|
|
std::cout << "." << std::flush;
|
|
}
|
|
}
|
|
|
|
|
|
void check_result_i32(std::uint8_t* left, std::uint8_t* right,
|
|
std::int32_t* result, std::int32_t rows,
|
|
std::int32_t cols, std::int32_t depth,
|
|
std::int32_t lhs_offset, std::int32_t rhs_offset) {
|
|
std::int32_t wrong = 0;
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
std::int32_t expected = 0;
|
|
for (int k = 0; k < depth; ++k) {
|
|
expected +=
|
|
(static_cast<std::int32_t>(left[depth * i + k]) + lhs_offset) *
|
|
(static_cast<std::int32_t>(right[depth * j + k]) + rhs_offset);
|
|
}
|
|
std::int32_t actual = result[i * cols + j];
|
|
if (actual != expected) {
|
|
std::cout << "(" << i << ", " << j << "): " << expected << "!="
|
|
<< actual << std::endl;
|
|
wrong++;
|
|
}
|
|
}
|
|
}
|
|
if (wrong > 0) {
|
|
std::cout << "Wrong: " << rows << "x" << cols << "x" << depth << " : "
|
|
<< wrong << "/" << (rows * cols) << std::endl
|
|
<< std::flush;
|
|
std::exit(1);
|
|
} else {
|
|
std::cout << "." << std::flush;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void clear(T* result, std::int32_t rows, std::int32_t cols) {
|
|
for (int i = 0; i < rows * cols; ++i) {
|
|
result[i] = static_cast<T>(0);
|
|
}
|
|
}
|
|
|
|
void test(std::uint8_t* scratch, std::uint8_t* lhs, std::uint8_t* rhs,
|
|
std::int32_t m, std::int32_t n, std::int32_t k, std::uint8_t* result,
|
|
gemmlowp::WorkersPool* pool, std::int32_t pool_size) {
|
|
prepare_test_data(lhs, m, k, 11, 13);
|
|
prepare_test_data(rhs, n, k, 177, 19);
|
|
|
|
clear(result, m, n);
|
|
gemmlowp::meta::multi_thread_gemm_q8(pool, pool_size, scratch, lhs, rhs, m, n,
|
|
k, -127, -127, 127 * k, 1, 7, result);
|
|
check_result(lhs, rhs, result, m, n, k, -127, -127, 127 * k, 1, 7);
|
|
}
|
|
|
|
void test_f(std::uint8_t* scratch, std::uint8_t* lhs, std::uint8_t* rhs,
|
|
std::int32_t m, std::int32_t n, std::int32_t k, float* result,
|
|
gemmlowp::WorkersPool* pool, std::int32_t pool_size) {
|
|
prepare_test_data(lhs, m, k, 11, 13);
|
|
prepare_test_data(rhs, n, k, 177, 19);
|
|
|
|
clear(result, m, n);
|
|
float scale = 1.0f / 1234567.8f;
|
|
gemmlowp::meta::multi_thread_gemm_f(pool, pool_size, scratch, lhs, rhs, m, n,
|
|
k, -127, -127, scale, result);
|
|
check_result_f(lhs, rhs, result, m, n, k, -127, -127, scale);
|
|
}
|
|
|
|
void test_i32(std::uint8_t* scratch, std::uint8_t* lhs, std::uint8_t* rhs,
|
|
std::int32_t m, std::int32_t n, std::int32_t k,
|
|
std::int32_t* result, gemmlowp::WorkersPool* pool,
|
|
std::int32_t pool_size) {
|
|
prepare_test_data(lhs, m, k, 11, 13);
|
|
prepare_test_data(rhs, n, k, 177, 19);
|
|
|
|
clear(result, m, n);
|
|
gemmlowp::meta::multi_thread_gemm_i32(pool, pool_size, scratch, lhs, rhs, m,
|
|
n, k, -127, -127, result);
|
|
check_result_i32(lhs, rhs, result, m, n, k, -127, -127);
|
|
}
|
|
|
|
void q_suite(int mi, int ni, int ki, int mx, int nx, int kx, int md, int nd,
|
|
int kd, std::uint8_t* scratch, std::uint8_t* left,
|
|
std::uint8_t* right, std::uint8_t* result,
|
|
gemmlowp::WorkersPool* pool, int t) {
|
|
for (int m = mi; m < mx; m += md) {
|
|
for (int n = ni; n < nx; n += nd) {
|
|
for (int k = ki; k < kx; k += kd) {
|
|
test(scratch, left, right, m, n, k, result, pool, t);
|
|
}
|
|
}
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
void f_suite(int mi, int ni, int ki, int mx, int nx, int kx, int md, int nd,
|
|
int kd, std::uint8_t* scratch, std::uint8_t* left,
|
|
std::uint8_t* right, float* result, gemmlowp::WorkersPool* pool,
|
|
int t) {
|
|
for (int m = mi; m < mx; m += md) {
|
|
for (int n = ni; n < nx; n += nd) {
|
|
for (int k = ki; k < kx; k += kd) {
|
|
test_f(scratch, left, right, m, n, k, result, pool, t);
|
|
}
|
|
}
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
void i32_suite(int mi, int ni, int ki, int mx, int nx, int kx, int md, int nd,
|
|
int kd, std::uint8_t* scratch, std::uint8_t* left,
|
|
std::uint8_t* right, std::int32_t* result,
|
|
gemmlowp::WorkersPool* pool, int t) {
|
|
for (int m = mi; m < mx; m += md) {
|
|
for (int n = ni; n < nx; n += nd) {
|
|
for (int k = ki; k < kx; k += kd) {
|
|
test_i32(scratch, left, right, m, n, k, result, pool, t);
|
|
}
|
|
}
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
bool run_long_test = false;
|
|
|
|
if (argc > 1 && strcmp(argv[1], "long")) {
|
|
run_long_test = true;
|
|
}
|
|
|
|
const std::int32_t min_n = 1;
|
|
const std::int32_t min_m = 1;
|
|
const std::int32_t min_k = 8;
|
|
|
|
const std::int32_t max_n = 1024;
|
|
const std::int32_t max_m = 1024;
|
|
const std::int32_t max_k = 2048;
|
|
|
|
std::uint8_t* left = new std::uint8_t[max_m * max_k];
|
|
std::uint8_t* right = new std::uint8_t[max_n * max_k];
|
|
std::uint8_t* result = new std::uint8_t[max_m * max_n];
|
|
float* result_float = new float[max_m * max_n];
|
|
std::int32_t* result_i32 = new std::int32_t[max_m * max_n];
|
|
std::uint8_t* scratch = new std::uint8_t[1024 * 1024 * 64];
|
|
|
|
gemmlowp::WorkersPool pool;
|
|
|
|
int max_repetitions = run_long_test ? 10 : 1;
|
|
|
|
for (int repetitions = 0; repetitions < max_repetitions; ++repetitions) {
|
|
int t = std::min(repetitions + 1, 4);
|
|
std::cout << "Threads: " << t << std::endl << std::flush;
|
|
|
|
std::cout << "Quantized 8 bit." << std::endl << std::flush;
|
|
|
|
std::cout << "Small." << std::endl << std::flush;
|
|
q_suite(1, 1, 1, 16, 16, 32, 1, 1, 1, scratch, left, right, result, &pool,
|
|
t);
|
|
|
|
if (run_long_test) {
|
|
std::cout << "Big." << std::endl << std::flush;
|
|
q_suite(1, 1, 1, 512, 512, 2048, 111, 111, 111, scratch, left, right,
|
|
result, &pool, t);
|
|
}
|
|
|
|
std::cout << "Gemv." << std::endl << std::flush;
|
|
q_suite(1, 1, 1, 2, 512, 2048, 1, 111, 111, scratch, left, right, result,
|
|
&pool, t);
|
|
q_suite(1, 1, 1, 512, 2, 2048, 111, 1, 111, scratch, left, right, result,
|
|
&pool, t);
|
|
|
|
std::cout << std::endl << "Floats." << std::endl << std::flush;
|
|
|
|
std::cout << "Small." << std::endl << std::flush;
|
|
f_suite(1, 1, 1, 16, 16, 32, 1, 1, 1, scratch, left, right, result_float,
|
|
&pool, t);
|
|
|
|
if (run_long_test) {
|
|
std::cout << "Big." << std::endl << std::flush;
|
|
f_suite(1, 1, 1, 512, 512, 2048, 111, 111, 111, scratch, left, right,
|
|
result_float, &pool, t);
|
|
}
|
|
|
|
std::cout << "Gemv." << std::endl << std::flush;
|
|
f_suite(1, 1, 1, 2, 512, 2048, 1, 111, 111, scratch, left, right,
|
|
result_float, &pool, t);
|
|
f_suite(1, 1, 1, 512, 2, 2048, 111, 1, 111, scratch, left, right,
|
|
result_float, &pool, t);
|
|
|
|
std::cout << std::endl << "Int32." << std::endl << std::flush;
|
|
|
|
std::cout << "Small." << std::endl << std::flush;
|
|
i32_suite(1, 1, 1, 16, 16, 32, 1, 1, 1, scratch, left, right, result_i32,
|
|
&pool, t);
|
|
|
|
if (run_long_test) {
|
|
std::cout << "Big." << std::endl << std::flush;
|
|
i32_suite(1, 1, 1, 512, 512, 2048, 111, 111, 111, scratch, left, right,
|
|
result_i32, &pool, t);
|
|
}
|
|
|
|
std::cout << "Gemv." << std::endl << std::flush;
|
|
i32_suite(1, 1, 1, 2, 512, 2048, 1, 111, 111, scratch, left, right,
|
|
result_i32, &pool, t);
|
|
i32_suite(1, 1, 1, 512, 2, 2048, 111, 1, 111, scratch, left, right,
|
|
result_i32, &pool, t);
|
|
|
|
std::cout << std::endl << std::flush;
|
|
}
|
|
|
|
std::cout << "Done." << std::endl << std::flush;
|
|
}
|