InfiniTensor/generated_code/tmp.cu

85 lines
2.7 KiB
Plaintext

#include "cuda.h"
#include "cuda_utils.h"
#include <cstddef>
#include <iostream>
#include <vector>
// Routine under test; its definition is not part of this view.
// main() calls it with two cudaMalloc'ed buffers of identical size
// (src = input, dst = output). Presumably it launches the generated
// transpose kernel -- confirm against its definition.
void invoke_func_2(float *src, float *dst);
// Correctness check and micro-benchmark driver for invoke_func_2.
//
// Steps:
//   1. Fill a host buffer with 0, 1, 2, ... and upload it to the device.
//   2. Run invoke_func_2 once and compare every output element against the
//      element of the input selected by the permutation `perm`.
//   3. Run invoke_func_2 `cnt` times as warm-up, then time another `cnt`
//      runs with CUDA events and report the average time and throughput.
//
// Indexing convention used by the check: dimension 0 is the fastest-varying
// one, `shape` holds the extents of the destination tensor, and destination
// dimension j corresponds to source dimension perm[j].
//
// Always returns 0; the result of the check is reported on stdout only.
// NOTE(review): a failed comparison still ends with "[Exit] successful." and
// exit code 0 -- kept as in the original because a caller may parse this
// output; confirm whether a non-zero exit code is wanted.
int main() {
    const std::vector<int> shape = {7 * 7, 232, 2};
    const std::vector<int> perm = {0, 2, 1};
    const size_t rank = shape.size();

    // Total number of elements and the matching byte count.
    size_t size = 1;
    for (int extent : shape) {
        size *= static_cast<size_t>(extent);
    }
    const size_t bytes = size * sizeof(float);

    // stride_src[i] is the element stride of source dimension i in a densely
    // packed source tensor. The extent of source dimension i is shape[j] for
    // the j that satisfies perm[j] == i.
    std::vector<size_t> stride_src(rank, 0);
    size_t this_stride = 1;
    for (size_t i = 0; i < rank; i++) {
        for (size_t j = 0; j < rank; j++) {
            if (static_cast<size_t>(perm[j]) == i) {
                stride_src[i] = this_stride;
                this_stride *= static_cast<size_t>(shape[j]);
            }
        }
    }

    float *src = nullptr;
    float *dst = nullptr;
    cudaSafeCall(cudaMalloc((void **)&src, bytes));
    cudaSafeCall(cudaMalloc((void **)&dst, bytes));

    // Host buffers are std::vector so they are released automatically.
    std::vector<float> src_host(size);
    std::vector<float> dst_host(size);
    for (size_t i = 0; i < size; i++) {
        // Each element stores its own linear index, so a mismatch report
        // directly shows which source element ended up in the output.
        src_host[i] = static_cast<float>(i);
    }

    cudaSafeCall(cudaMemcpy(src, src_host.data(), bytes, cudaMemcpyHostToDevice));
    invoke_func_2(src, dst);
    // Surface a failed launch here instead of at some later, unrelated call.
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaMemcpy(dst_host.data(), dst, bytes, cudaMemcpyDeviceToHost));

    bool flag = false;
    for (size_t i = 0; i < size; i++) {
        // Decompose the linear destination index i into per-dimension
        // coordinates (dimension 0 first) and accumulate the offset of the
        // matching source element.
        size_t base = i;
        size_t offset_src = 0;
        for (size_t j = 0; j < rank; j++) {
            const size_t extent = static_cast<size_t>(shape[j]);
            offset_src += base % extent * stride_src[perm[j]];
            base /= extent;
        }
        // Exact comparison is intended: the expected output is a copy of an
        // input element, not a computed value.
        if (dst_host[i] != src_host[offset_src]) {
            flag = true;
            std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
            break;
        }
    }
    if (!flag) {
        std::cout << "[INFO] transpose correct." << std::endl;
    } else {
        std::cout << "[ERROR] transpose incorrect." << std::endl;
    }

    float duration = 0;
    cudaEvent_t st, ed;
    cudaSafeCall(cudaEventCreate(&st));
    cudaSafeCall(cudaEventCreate(&ed));
    const int cnt = 128;
    // Warm-up runs, excluded from the measurement.
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaSafeCall(cudaEventRecord(st, 0));
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaSafeCall(cudaEventRecord(ed, 0));
    cudaSafeCall(cudaEventSynchronize(st));
    cudaSafeCall(cudaEventSynchronize(ed));
    // cudaEventElapsedTime reports milliseconds.
    cudaSafeCall(cudaEventElapsedTime(&duration, st, ed));
    cudaSafeCall(cudaGetLastError());
    std::cout << "[INFO] time: " << duration / cnt << std::endl;

    // 8 bytes are counted per element -- presumably one 4-byte read plus one
    // 4-byte write. duration is converted from ms to s, bytes to GiB.
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;

    // Release the events and device buffers created above.
    cudaSafeCall(cudaEventDestroy(st));
    cudaSafeCall(cudaEventDestroy(ed));
    cudaSafeCall(cudaFree(src));
    cudaSafeCall(cudaFree(dst));

    std::cout << "[Exit] successful." << std::endl;
    return 0;
}