forked from jiuyuan/InfiniTensor

Compare commits: master...benchmark_ (7 commits)
Author | SHA1 | Date
---|---|---
bolun | 1c55c74151 |
bolun | 7945693131 |
bolun | fdb2d30868 |
zhangyue207 | f532784d4f |
zhangyue207 | 454b7651a8 |
zhangyue207 | 48322dbf27 |
zhangyue207 | 523946cb8b |
CMakeLists.txt

@@ -262,3 +262,19 @@ if(BUILD_TEST)
        target_link_libraries(nnet_reader InfiniTensor)
    endif()
endif()

function(build_bench files)
    file(GLOB BENCH_SOURCES ${files})
    foreach(benchsourcefile ${BENCH_SOURCES})
        get_filename_component(benchname ${benchsourcefile} NAME_WE)
        add_executable("benchmark_${benchname}" ${benchsourcefile})
        target_link_libraries("benchmark_${benchname}" InfiniTensor)
        # add_custom_target(NAME ${benchname} COMMAND ${benchname})
    endforeach(benchsourcefile ${BENCH_SOURCES})
endfunction()

if (BENCH)
    if (USE_CUDA)
        build_bench(benchmark/kernels/cuda/*.cc)
    endif()
endif()
Makefile
@@ -6,12 +6,14 @@ BANG ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
BENCH ?= ON

CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBENCH=$(BENCH)

ifeq ($(INTELCPU), ON)
  CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@@ -0,0 +1,126 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/softmax.h"
#include "benchmark.h"
#include <iostream>
#include <cmath>
#include <chrono>
#include <sys/time.h>

using namespace infini;

#define M 1048576

int main() {
    // Benchmark settings
    int warmupRounds = 200;
    int timingRounds = 200;
    Shape INPUT_SHAPE = {16, 3, 128, 128};
    DataType dtype = DataType::Float32;

    // Get data size
    size_t size = 1;
    for (auto dim : INPUT_SHAPE) {
        size *= dim;
    }
    size_t sizeInBytes = size * sizeof(float); // dtype is Float32

    // Init time variables
    double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
    double time_op = 0.0;

    // Create runtimes
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(RandomGenerator());

    // Build input data on GPU
    Tensor inputGpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
    inputGpu->dataMalloc();

    // Do memcpy host to device
    time_memcpy_htod += timeit(
        [&]() { inputGpu = inputCpu->clone(cudaRuntime); },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    // Build output data on GPU
    auto outputGpu = inputGpu->clone(cudaRuntime);

    // Build output data on CPU
    Tensor outputCpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
    outputCpu->dataMalloc();

    // Build cudnn descriptors
    cudnnTensorDescriptor_t inputDesc, outputDesc;

    // Input descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

    // Output descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

    // cudnn operator settings
    float alpha = 1.0, beta = 0.0;
    cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST;
    cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;

    // Do forward
    time_op += timeit(
        [&]() {
            cudnnSoftmaxForward(cudaRuntime->cudnnHandle(), algo, mode, &alpha,
                                inputDesc, inputGpu->getRawDataPtr<void *>(),
                                &beta, outputDesc,
                                outputGpu->getRawDataPtr<void *>());
        },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
    checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));

    // Do memcpy device to host
    time_memcpy_dtoh += timeit(
        [&]() { outputCpu = outputGpu->clone(cpuRuntime); },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    // Print results
    printf("Operator - Softmax:\n");
    printf("Input shape: (%d, %d, %d, %d)\n", INPUT_SHAPE[0], INPUT_SHAPE[1],
           INPUT_SHAPE[2], INPUT_SHAPE[3]);
    printf("Input size: %zu, dtype: %s, size in bytes: %zu\n", size,
           dtype.toString().c_str(), sizeInBytes);

    printf("TFlops: %.5lf tflops\n", 5 * size / 1e9 / time_op);
    printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n", time_memcpy_htod,
           time_memcpy_dtoh);
    printf("Memcpy throughput: h2d - %.6lf MB/ms, d2h - %.6lf MB/ms\n",
           sizeInBytes / M / time_memcpy_htod,
           sizeInBytes / M / time_memcpy_dtoh);
    printf("Operation: %.6lf ms\n", time_op);

    return 0;
}
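The benchmark times each phase with a timeit helper that is not part of this diff: the call sites above pass a work lambda, a sync lambda, and warmup/timing round counts, and treat the returned value as milliseconds. Below is a minimal sketch of what such a helper could look like; it reflects assumptions drawn from those call sites, not the repository's actual implementation.

#include <chrono>
#include <functional>

// Hypothetical sketch of a timeit helper: runs `func` for warmupRounds
// iterations, then times timingRounds iterations and returns the average
// wall-clock time per iteration in milliseconds. `sync` flushes pending
// asynchronous work (e.g. cudaRuntime->sync()) before each measurement point.
inline double timeit(const std::function<void()> &func,
                     const std::function<void()> &sync,
                     int warmupRounds, int timingRounds) {
    for (int i = 0; i < warmupRounds; ++i)
        func();
    sync();
    auto begin = std::chrono::steady_clock::now();
    for (int i = 0; i < timingRounds; ++i)
        func();
    sync();
    auto end = std::chrono::steady_clock::now();
    double totalMs =
        std::chrono::duration<double, std::milli>(end - begin).count();
    return totalMs / timingRounds;
}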
@@ -0,0 +1,4 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include "utils/data_generator.h"
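Taken together, the changes above define the workflow for adding a new benchmark: drop a source file into the directory matched by the build_bench glob and it is compiled into its own benchmark_* executable when BENCH is enabled. A minimal sketch of such a file follows; the file name, the kernel placeholder, and the assumption that a timeit helper like the one sketched above is visible to benchmark sources are illustrative, not part of this diff.

// Hypothetical new benchmark: benchmark/kernels/cuda/example.cc
// build_bench() globs benchmark/kernels/cuda/*.cc, so this file would be
// built as ./benchmark_example when configured with `make CUDA=ON BENCH=ON`.
#include "benchmark.h"          // common benchmark includes (added in this diff)
#include "cuda/cuda_runtime.h"
#include <cstdio>

using namespace infini;

int main() {
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    int warmupRounds = 200, timingRounds = 200;

    // ... allocate tensors and prepare the kernel under test here ...

    // Assumes a timeit helper with the signature used by the softmax
    // benchmark above: (run, sync, warmupRounds, timingRounds) -> ms.
    double time_op = timeit([&]() { /* launch the kernel under test */ },
                            [&]() { cudaRuntime->sync(); },
                            warmupRounds, timingRounds);

    printf("Operation: %.6lf ms\n", time_op);
    return 0;
}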