// InfiniTensor/generated_code/tmp.cu

#include "cuda.h"
#include "cuda_utils.h"

#include <cstddef>
#include <iostream>
#include <vector>

// Launcher under test. Only the declaration is visible here; the definition
// is not in the visible part of this file.
// main() calls it with two cudaMalloc'd buffers of identical element count:
// `src` is the input that main() fills, `dst` is the output that main()
// copies back and checks against a host-side permutation of `src`.
// NOTE(review): presumably this launches the generated transpose kernel on
// the default stream -- confirm against the definition.
void invoke_func_2(float *src, float *dst);

int main() {
    std::vector<int> shape = {7 * 7, 232, 2};
    std::vector<int> perm = {0, 2, 1};
    float *src, *dst;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }
    std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
    stride_dst[0] = 1;
    for (int i = 1; i < shape.size(); i++) {
        stride_dst[i] = stride_dst[i-1] * shape[i-1];
    }
    size_t this_stride = 1;
    for (int i = 0; i < shape.size(); i++) {
        for (int j = 0; j < shape.size(); j++) {
            if (perm[j] == i) {
                stride_src[i] = this_stride;
                this_stride *= shape[j];
            }
        }
    }

    cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));

    float *src_host, *dst_host;
    src_host = (float *)malloc(size * sizeof(float));
    dst_host = (float *)malloc(size * sizeof(float));
    for (size_t i = 0; i < size; i++) {
        src_host[i] = i;
    }
    cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
    invoke_func_2(src, dst);
    cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
    bool flag = 0;
    for (size_t i = 0; i < size; i++) {
        size_t base = i;
        size_t offset_src = 0;
        for (int j = 0; j < shape.size(); j++) {
            offset_src += base % shape[j] * stride_src[perm[j]];
            base /= shape[j];
        }
        if (dst_host[i] != src_host[offset_src]) {
            flag = 1;
            std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
            break;
        }
    }

    if (!flag) {
        std::cout << "[INFO] transpose correct." << std::endl;
    } else {
        std::cout << "[ERROR] transpose incorrect." << std::endl;
    }

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}