InfiniTensor/include/cuda/cuda_runtime.h

#pragma once
#include "core/runtime.h"
#include "cuda/cuda_common.h"

namespace infini {

/**
 * Runtime that targets an NVIDIA GPU.
 *
 * Construction creates a cuDNN handle and a cuBLAS handle, allocates one
 * device workspace buffer through alloc(), and then creates a driver-API
 * context on device 0. The destructor releases all of these resources.
 */
class CudaRuntimeObj : public RuntimeObj {
  private:
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
    // Device buffer returned by getWorkspace(); allocated in the constructor
    // and freed in the destructor.
    CudaPtr workspace;
    // Capacity of `workspace` in bytes.
    size_t workspaceSize;

  public:
    // Driver-API device (ordinal 0) and the context created on it by the
    // constructor.
    CUdevice cuDevice;
    CUcontext newContext;

  public:
    /**
     * Creates a runtime with the default 7 GB workspace.
     *
     * NOTE(review): earlier comments mentioned a larger workspace for
     * Longformer; use the one-argument constructor to request another size.
     */
    CudaRuntimeObj() : CudaRuntimeObj(7ll << 30) {}

    /**
     * Creates a runtime whose workspace holds `workspaceBytes` bytes.
     *
     * @param workspaceBytes Size in bytes of the device workspace buffer
     *                       that getWorkspace() hands out.
     */
    explicit CudaRuntimeObj(size_t workspaceBytes)
        : RuntimeObj(Device::CUDA), workspaceSize(workspaceBytes) {
        checkCudnnError(cudnnCreate(&cudnn));
        checkCublasError(cublasCreate(&cublas));
        // alloc() is virtual; inside a constructor the call always resolves
        // to CudaRuntimeObj::alloc.
        workspace = alloc(workspaceSize);

        // Driver-API setup is done after the runtime-API allocation above;
        // keep this order.
        checkCUresult(cuInit(0));
        checkCUresult(cuDeviceGet(&cuDevice, 0));
        checkCUresult(cuCtxCreate(&newContext, 0, cuDevice));
    }

    /// Frees the workspace, then destroys the cuDNN handle, the cuBLAS
    /// handle and finally the driver-API context.
    virtual ~CudaRuntimeObj() {
        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
        checkCUresult(cuCtxDestroy(newContext));
    }

    /// Runs `graph` on this runtime; defined out of line. `tune` and
    /// `profiling` are forwarded options (see the definition for semantics).
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;

    /// Defined out of line.
    void sync() const;

    /// Allocates `size` bytes with cudaMalloc and returns the device pointer.
    CudaPtr alloc(size_t size) override {
        // Start from a null pointer so the returned value is never
        // indeterminate, whatever cudaMalloc writes for a given size.
        void *ptr = nullptr;
        checkCudaError(cudaMalloc(&ptr, size));
        return ptr;
    }

    /// Releases a pointer previously returned by alloc() with cudaFree.
    void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }

    /// Returns the cuDNN handle created in the constructor.
    cudnnHandle_t cudnnHandle() const { return cudnn; }

    /// Returns the cuBLAS handle created in the constructor.
    cublasHandle_t cublasHandle() const { return cublas; }

    /**
     * Returns the workspace buffer after asserting that it can hold `size`
     * bytes. Every call returns the same pointer, so callers share one
     * buffer.
     */
    CudaPtr getWorkspace(size_t size) const {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }

    /// Copies `bytes` bytes from host memory `src` to device memory `dst`.
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
    }

    /// Copies `bytes` bytes from device memory `src` to host memory `dst`.
    void copyBlobToCPU(void *dst, const void *src,
                       size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
    }

    /// Copies `bytes` bytes between two device buffers.
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
    }

  private:
    /// Defined out of line; takes the same arguments as run().
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};
} // namespace infini
Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`#pragma once`
			`#include "core/runtime.h"`
			`#include "cuda/cuda_common.h"`

			`namespace infini {`

			`class CudaRuntimeObj : public RuntimeObj {`
			`private:`
			`cudnnHandle_t cudnn;`
			`cublasHandle_t cublas;`
			`CudaPtr workspace;`
			`size_t workspaceSize;`

Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00			`public:`
			`CUdevice cuDevice;`
			`CUcontext newContext;`

Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`public:`
			`CudaRuntimeObj() : RuntimeObj(Device::CUDA) {`
			`checkCudnnError(cudnnCreate(&cudnn));`
			`checkCublasError(cublasCreate(&cublas));`
			`// 10GB for Longformer`
			`// size_t longformerNum = 3lu * (1 << 30);`
			`workspaceSize = 7ll << 30; // 7 GB`
			`workspace = alloc(workspaceSize);`
Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00
			`checkCUresult(cuInit(0));`
			`checkCUresult(cuDeviceGet(&cuDevice, 0));`
			`checkCUresult(cuCtxCreate(&newContext, 0, cuDevice));`
Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`}`
			`virtual ~CudaRuntimeObj() {`
cuDNN conv tuning (#16) * Function tune and corresponding testcase. Add: Tune function in /src/kernel/cuda/conv.cc and corresponding testcase in test_conv. Fix: A little bug of perfRecord using in /src/core/runtime.cc. * Tune part debug Add: recover the code, fixed the commit error. Add: some anotations in tune function * clang formmat test * Fix: mem leak in CUDA Runtime and Conv * Fix: sync in conv and default sync in timeit * Change the way to tune operator conv. Timeit function cudNNUnfused -> Timeit function cudnnConvolutionForward. * Change: merge the common part of cudnnunfused&tune into cudnndescriptoraccess * clang test * clang-format * clang-format bash. * Chore: remove print and blank lines Co-authored-by: wcz112 <wcz19@mails.tsinghua.edu.cn> Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-29 21:37:07 +08:00			`dealloc(workspace);`
Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`checkCudnnError(cudnnDestroy(cudnn));`
			`checkCublasError(cublasDestroy(cublas));`
Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00			`checkCUresult(cuCtxDestroy(newContext));`
Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`}`

			`void run(const Graph &graph, bool tune = false,`
			`bool profiling = false) const;`
			`// double runEvaluation(const Graph &graph, int nWarmups,`
			`// int nEvaluations) const;`
			`void sync() const;`
			`CudaPtr alloc(size_t size) override {`
			`void *ptr;`
			`checkCudaError(cudaMalloc(&ptr, size));`
			`return ptr;`
			`}`
			`void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }`
			`cudnnHandle_t cudnnHandle() const { return cudnn; }`
			`cublasHandle_t cublasHandle() const { return cublas; }`
			`CudaPtr getWorkspace(size_t size) const {`
			`IT_ASSERT(size <= workspaceSize);`
			`return workspace;`
			`}`

Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00			`void copyBlobFromCPU(void *dst, const void *src,`
			`size_t bytes) const override {`
Extended DataType class and Runtime interaction (#9) * Add: DataType class * Add: data-type-oblivious tensor interface * Rename: copyBlobToCPU Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-23 16:55:59 +08:00			`checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));`
			`}`

Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00			`void copyBlobToCPU(void *dst, const void *src,`
			`size_t bytes) const override {`
Extended DataType class and Runtime interaction (#9) * Add: DataType class * Add: data-type-oblivious tensor interface * Rename: copyBlobToCPU Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-23 16:55:59 +08:00			`checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));`
			`}`

Add TVM codegen for MemboundOp (#35) * Add: interface for membound TVM kernel and test * add getAnsorCode * add evaluation, but link failed * add evaluation of kernel, but link failed * Fix: link libcuda and nvrtc * add print * Add: const for source of copy * compile and evaluate the kernel * add compute * fix gen_ansor_op.py * fix membound_TVM * format and fix CMakeLists.txt * fix memory leak Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> Co-authored-by: huangshuhong <huangsh19@mails.tsinghua.edu.cn> 2022-09-22 18:06:45 +08:00			`void copyBlobInsideRuntime(void *dst, const void *src,`
Extended DataType class and Runtime interaction (#9) * Add: DataType class * Add: data-type-oblivious tensor interface * Rename: copyBlobToCPU Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-23 16:55:59 +08:00			`size_t bytes) const override {`
			`checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));`
			`}`

Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`private:`
cuDNN conv tuning (#16) * Function tune and corresponding testcase. Add: Tune function in /src/kernel/cuda/conv.cc and corresponding testcase in test_conv. Fix: A little bug of perfRecord using in /src/core/runtime.cc. * Tune part debug Add: recover the code, fixed the commit error. Add: some anotations in tune function * clang formmat test * Fix: mem leak in CUDA Runtime and Conv * Fix: sync in conv and default sync in timeit * Change the way to tune operator conv. Timeit function cudNNUnfused -> Timeit function cudnnConvolutionForward. * Change: merge the common part of cudnnunfused&tune into cudnndescriptoraccess * clang test * clang-format * clang-format bash. * Chore: remove print and blank lines Co-authored-by: wcz112 <wcz19@mails.tsinghua.edu.cn> Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-29 21:37:07 +08:00			`void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;`
Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-08-22 15:01:03 +08:00			`};`
			`} // namespace infini`