Merge branch 'benchmark_softmax' of github.com:InfiniTensor/InfiniTensor into benchmark_conv

2023-07-20 14:15:37 +08:00 · 2023-07-20 14:15:37 +08:00 · 4b762cc8d9
parent ddaddf375e 1c55c74151
commit 4b762cc8d9
1 changed files with 19 additions and 13 deletions
--- a/benchmark/kernels/cuda/softmax.cc
+++ b/benchmark/kernels/cuda/softmax.cc
@ -12,23 +12,24 @@

 using namespace infini;

-namespace ch {
-    using namespace std::chrono;
-}
+#define M 1048576

 int main() {

+    // Benchmark Settings
    int warmupRounds = 200;
    int timingRounds = 200;
    Shape INPUT_SHAPE = {16, 3, 128, 128};
    DataType dtype = DataType::Float32;

+    // Get data size
    size_t size = 1;
    for (auto dim: INPUT_SHAPE) {
        size *= dim;
    }
    size_t sizeInBytes = size * sizeof(dtype);

+    // Init time variables
    double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
    double time_op = 0.0;

@ -56,25 +57,35 @@ int main() {
        warmupRounds, timingRounds
    );

+    // Build output data on GPU
    auto outputGpu = inputGpu->clone(cudaRuntime);

+    // Build output data on CPU
+    Tensor outputCpu =
+        make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
+    outputCpu->dataMalloc();
+
+    // Build cudnn descriptors
    cudnnTensorDescriptor_t inputDesc, outputDesc;

-    // get inputs
+    // input descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

-    // get outputs
+    // output descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

+    // cudnn operator settings
    float alpha = 1.0, beta = 0.0;
    cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST;
    cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;
+
+    // Do forward
    time_op += timeit(
        [&]() {
            cudnnSoftmaxForward(cudaRuntime->cudnnHandle(), algo, mode, 
@ -87,11 +98,8 @@ int main() {
    
    checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
    checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));
-
-    Tensor outputCpu =
-        make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
-    outputCpu->dataMalloc();
-
+    
+    // Do memcpy device to host
    time_memcpy_dtoh += timeit(
        [&]() {
            outputCpu = outputGpu->clone(cpuRuntime);
@ -100,15 +108,13 @@ int main() {
        warmupRounds, timingRounds
    );

+    // Print Results
    printf("Operator - Softmax:\n");
    printf("Input shape: (%d, %d, %d, %d)\n", 
        INPUT_SHAPE[0], INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]);
    printf("Input size: %ld, dtype: %s, size in bytes: %ld\n", 
        size, dtype.toString().c_str(), sizeInBytes);

-    // int G = 1073741824;
-    int M = 1048576;
-
    printf("TFlops: %.5lf tflops\n", 5 * size / 1e9 / time_op);
    printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n",
        time_memcpy_htod, time_memcpy_dtoh);