From 04ea5eed3881812aa1e78b489b179d46e0988919 Mon Sep 17 00:00:00 2001 From: zhengly123 Date: Mon, 22 Aug 2022 15:01:03 +0800 Subject: [PATCH] Add CUDA runtime (#6) * Fix: add warm-up and repetition in timing * Add: CUDA runtime and float support * Refactor: Cuda and Cpu runtimes inherit Runtime * Add: environment script for Lotus * Add: Lotus build instructions * Update README.md Co-authored-by: Liyan Zheng --- CMakeLists.txt | 19 ++- README.md | 10 +- include/core/blob.h | 24 +++ include/core/common.h | 4 +- include/core/graph.h | 4 +- include/core/kernel.h | 18 ++- include/core/run_enigne.h | 26 ---- include/core/runtime.h | 62 ++++++++ include/core/tensor.h | 36 ++++- include/core/tensor_base.h | 19 ++- include/cuda/cuda_common.h | 111 +++++++++++++ include/cuda/cuda_runtime.h | 49 ++++++ include/cuda/cuda_utility.h | 11 ++ include/test.h | 34 +++- src/core/blob.cc | 10 ++ src/core/common.cc | 15 +- src/core/graph.cc | 5 +- src/core/{run_engine.cc => runtime.cc} | 34 ++-- src/core/tensor.cc | 78 ++++++++-- src/core/tensor_base.cc | 9 +- src/cuda/cuda_runtime.cc | 32 ++++ src/cuda/cuda_utility.cu | 20 +++ src/kerels/cpu/conv.cc | 18 ++- src/kerels/cpu/matmul.cc | 20 ++- src/kerels/cuda/conv.cc | 208 +++++++++++++++++++++++++ test/core/test_graph.cc | 31 ++-- test/core/test_hash.cc | 6 +- test/operators/test_conv.cc | 87 +++++++++-- test/script/env_lotus.sh | 2 + 29 files changed, 875 insertions(+), 127 deletions(-) create mode 100644 include/core/blob.h delete mode 100644 include/core/run_enigne.h create mode 100644 include/core/runtime.h create mode 100644 include/cuda/cuda_common.h create mode 100644 include/cuda/cuda_runtime.h create mode 100644 include/cuda/cuda_utility.h create mode 100644 src/core/blob.cc rename src/core/{run_engine.cc => runtime.cc} (75%) create mode 100644 src/cuda/cuda_runtime.cc create mode 100644 src/cuda/cuda_utility.cu create mode 100644 src/kerels/cuda/conv.cc create mode 100644 test/script/env_lotus.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 70935137..8220b3ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ include(CMakeDependentOption) project(InfiniTensor C CXX) # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them. +option(USE_CUDA "Support CUDA GPU" ON) option(BUILD_TEST "Build tests" ON) cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF) cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF) @@ -21,7 +22,6 @@ find_package( Python COMPONENTS Interpreter Development REQUIRED) -find_package(CUDA REQUIRED) # OpenMP find_package(OpenMP) if(OpenMP_C_FOUND) @@ -53,14 +53,19 @@ if(BUILD_TEST) endif() file(GLOB_RECURSE SRC src/*.cc src/*.cu) -# file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc) -# list(REMOVE_ITEM SRC ${TEST} ${FFI}) add_library(InfiniTensor SHARED ${SRC}) -# Target -# cuda_add_library(it SHARED ${SRC}) -# cuda_add_cublas_to_target(it) # cublas -# # target_link_libraries(infini_cpp cudnn curand nlohmann_json::nlohmann_json pybind11::embed) + +if(USE_CUDA) + # set(CUDA_HOST_COMPILER /home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc) + enable_language(CUDA) + # TODO: how to set option for CUDA_HOST_COMPILER. Now env var CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc takes effect. + # option(CUDA_HOST_COMPILER "" ${CMAKE_C_COMPILER}) + # TODO: find_package seems unnecessary for CMake >= 3.8 + find_package(CUDA REQUIRED) + # message("CUBLAS_LIBRARIES: ${CUDA_LIBRARIES}") + target_link_libraries(InfiniTensor cudnn curand cublas ${CUDA_LIBRARIES}) +endif() # # Python bindings # pybind11_add_module(infini MODULE ${FFI}) diff --git a/README.md b/README.md index 6d449566..d3778328 100644 --- a/README.md +++ b/README.md @@ -1 +1,9 @@ -# InfiniTensor \ No newline at end of file +# InfiniTensor + +## Compilation on Lotus +``` bash +# Enter the root of InfiniTensor +source test/script/env_lotus.sh +mkdir build && cd build +cmake .. && make -j 12 +``` diff --git a/include/core/blob.h b/include/core/blob.h new file mode 100644 index 00000000..4ff05ea8 --- /dev/null +++ b/include/core/blob.h @@ -0,0 +1,24 @@ +#pragma once +#include "core/common.h" +#include "core/ref.h" + +namespace infini { + +class RuntimeObj; +using Runtime = Ref; + +class BlobObj { + // Runtime might be replaced with a raw pointer for optimization + Runtime runtime; + void *ptr; + + public: + BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {} + BlobObj(BlobObj &other) = delete; + BlobObj &operator=(BlobObj const &) = delete; + ~BlobObj(); + + template T getPtr() const { return reinterpret_cast(ptr); } +}; + +} // namespace infini \ No newline at end of file diff --git a/include/core/common.h b/include/core/common.h index 54ad8065..e296c331 100644 --- a/include/core/common.h +++ b/include/core/common.h @@ -69,6 +69,8 @@ template std::string vecToString(const std::vector &vec) { return ret; } -double timeit(const std::function &func); +double timeit(const std::function &func, int warmupRounds = 200, + int timingRounds = 200, + const std::function &sync = {}); } // namespace infini diff --git a/include/core/graph.h b/include/core/graph.h index 7abe5f0e..3e159792 100644 --- a/include/core/graph.h +++ b/include/core/graph.h @@ -4,16 +4,16 @@ namespace infini { -// TODO: graph should be attached to a context class GraphObj : public Object { protected: + Runtime runtime; TensorVec tensors; TensorVec inputs; TensorVec outputs; OpVec ops; public: - // Graph(OpVec oplist); + GraphObj(Runtime runtime) : runtime(runtime){}; string toString() const override; Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32); diff --git a/include/core/kernel.h b/include/core/kernel.h index 4ecb2065..4d1a97d5 100644 --- a/include/core/kernel.h +++ b/include/core/kernel.h @@ -5,8 +5,14 @@ namespace infini { +class RuntimeObj; // Forward declaration for Kernel::compute + struct PerfRecord { - double time; // in milliseconds + PerfRecord(){}; + PerfRecord(double time) : time(time){}; + virtual ~PerfRecord() {} + + double time = 0; // in milliseconds }; class Kernel { @@ -20,14 +26,16 @@ class Kernel { * are required, inherit from PerfRecord and add extra parameters. * Otherwire, use PerfRecord directly. */ - virtual void compute(const Operator &op, - const PerfRecord &record) const = 0; + virtual void compute(const Operator &op, const PerfRecord &record, + const RuntimeObj *context) const = 0; /** * @brief Executes an op with a default parameter. */ - virtual void compute(const Operator &op) const = 0; + virtual void compute(const Operator &op, + const RuntimeObj *context) const = 0; // Premise: op is idempotent since it is called multiple times. - virtual PerfRecord tune(const Operator &op) const = 0; + virtual PerfRecord tune(const Operator &op, + const RuntimeObj *context) const = 0; }; class KernelRegistry { diff --git a/include/core/run_enigne.h b/include/core/run_enigne.h deleted file mode 100644 index 1b4877a1..00000000 --- a/include/core/run_enigne.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include "core/graph.h" -#include "core/kernel.h" -#include "core/perf_engine.h" - -namespace infini { - -class RunEngine { - private: - Device device; - - public: - RunEngine(Device device) : device(device) {} - ~RunEngine() {} - - void run(const Graph &graph, bool tune = false, - bool profiling = false) const; - double getPerfTime(const Graph &graph, bool profiling = false) const; - - private: - void printProfilingData(double totTime, - const std::map &opTime, - const std::map &opCnt) const; -}; - -} // namespace infini \ No newline at end of file diff --git a/include/core/runtime.h b/include/core/runtime.h new file mode 100644 index 00000000..da15578e --- /dev/null +++ b/include/core/runtime.h @@ -0,0 +1,62 @@ +#pragma once +#include "core/graph.h" +#include "core/kernel.h" +#include "core/perf_engine.h" +namespace infini { + +class RuntimeObj : public std::enable_shared_from_this { + protected: + Device device; + + public: + RuntimeObj(Device device) : device(device) {} + RuntimeObj(RuntimeObj &other) = delete; + RuntimeObj &operator=(RuntimeObj const &) = delete; + virtual ~RuntimeObj() {} + + /** + * @brief Execute a graph. + * + * @param graph + * @param tune If there is no performance record, whether to tune it. These + * can be independent method. + * @param profiling Whether to print breakdown of time + */ + virtual void run(const Graph &graph, bool tune = false, + bool profiling = false) const = 0; + virtual void *alloc(size_t size) = 0; + virtual void dealloc(void *ptr) = 0; + + /** + * @brief Get the execution time of each operator in performance record. No + * execution happens. + * + * @param graph + * @param profiling Whether to print breakdown of time + * @return double Return the sum of perf time for each operator + */ + double getPerfTime(const Graph &graph, bool profiling = false) const; + Blob allocBlob(size_t size); + + protected: + void printProfilingData(double totTime, + const std::map &opTime, + const std::map &opCnt) const; +}; + +// TODO: change inheritance relation +class CpuRuntimeObj : public RuntimeObj { + public: + CpuRuntimeObj() : RuntimeObj(Device::CPU) {} + + void run(const Graph &graph, bool tune = false, + bool profiling = false) const override; + void dealloc(void *ptr) override { return free(ptr); }; + + void *alloc(size_t size) override { + return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t), + sizeof(uint64_t)); + }; +}; + +} // namespace infini \ No newline at end of file diff --git a/include/core/tensor.h b/include/core/tensor.h index dcdd4aef..67dddc95 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -1,5 +1,6 @@ #pragma once #include "core/tensor_base.h" +#include namespace infini { @@ -16,21 +17,46 @@ class TensorObj : public TensorBaseObj { string toString() const override; size_t size() const; - void dataMalloc(); Shape getDims() const { return shape; } size_t getOffset(const Shape &ds) const; using TensorBaseObj::getData; VType getData(const Shape &pos) const; - void copyData(VType *dptr); + void dataMalloc(const Runtime &runtime); + // void copyData(VType *dptr); + template void copyData(const T *dptr); void copyData(vector dataVector); + void copyData(vector dataVector); void printData() const; + // TODO: merge these methods bool equalData(const Tensor &rhs) const; - void - setData(std::function generator) const { - generator((void *)(data.get()), size(), dtype); + template bool equalData(const Tensor &rhs) const { + IT_ASSERT(data != nullptr); + IT_ASSERT(rhs->data != nullptr); + // TODO: deal with data type + auto ptr = data->getPtr(); + auto ptrRhs = rhs->data->getPtr(); + if (shape != rhs->getDims()) + return false; + size_t sz = size(); + for (size_t i = 0; i < sz; ++i) + if (fabs(ptr[i] - ptrRhs[i]) / + std::max(fabs(ptr[i]), fabs(ptrRhs[i])) > + 1e-6) { + printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]); + return false; + } + return true; } + void setData( + const std::function &generator) const { + generator(data->getPtr(), size(), dtype); + } + + private: + void printDataFloat() const; + void printDataUint32_t() const; // void setDims(const Dim &dms) { dims = dms; } // bool dataRand(int seed = 0) { diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h index d746f273..d38458a7 100644 --- a/include/core/tensor_base.h +++ b/include/core/tensor_base.h @@ -1,19 +1,23 @@ #pragma once +#include "core/blob.h" #include "core/object.h" #include "core/ref.h" namespace infini { -// class Tensor; class TensorBaseObj; class TensorObj; class OperatorObj; class GraphObj; +class RuntimeObj; +class BlobObj; using TensorBase = Ref; using Tensor = Ref; using Operator = Ref; using Graph = Ref; +using Runtime = Ref; +using Blob = Ref; using TensorVec = vector; using OpVec = vector; @@ -40,8 +44,7 @@ class TensorBaseObj : public Object { DataType dtype; vector> inputOf; WRef outputOf; - // TODO: Ref -> Ref - Ref data; + Blob data; // ComputeState computed; // static int random_seed[256 * 16]; // static bool random_inited; @@ -50,7 +53,15 @@ class TensorBaseObj : public Object { TensorBaseObj(int dim, DataType dtype); virtual ~TensorBaseObj() {} - Ref getDataPtr() const { return data; } + void dataMalloc(const Blob &blob) { + IT_ASSERT(data == nullptr); + data = blob; + } + Blob getDataPtr() const { return data; } + template T getDataRawPtr() const { + IT_ASSERT(data != nullptr); + return data->getPtr(); + } VType getData(size_t offset) const; DataType getDType() const { return dtype; } diff --git a/include/cuda/cuda_common.h b/include/cuda/cuda_common.h new file mode 100644 index 00000000..fdf7b1de --- /dev/null +++ b/include/cuda/cuda_common.h @@ -0,0 +1,111 @@ +#pragma once +#include "core/common.h" +#include +#include +#include +#include + +// TODO: replace with Exception (IT_ASSERT) +#define checkCudaError(call) \ + { \ + auto err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define checkCublasError(call) \ + { \ + auto err = call; \ + if (CUBLAS_STATUS_SUCCESS != err) { \ + fprintf(stderr, "cuBLAS error in %s:%i : %s.\n", __FILE__, \ + __LINE__, cublasGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define checkCudnnError(call) \ + { \ + auto err = call; \ + if (CUDNN_STATUS_SUCCESS != err) { \ + fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__, \ + __LINE__, cudnnGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define checkCurandError(call) \ + { \ + auto err = call; \ + if (CURAND_STATUS_SUCCESS != err) { \ + fprintf(stderr, "cuRAND error in %s:%i : %s.\n", __FILE__, \ + __LINE__, curandGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +namespace infini { + +inline const char *cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return ""; +} + +inline const char *curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + return ""; +} + +using CudaPtr = void *; + +} // namespace infini \ No newline at end of file diff --git a/include/cuda/cuda_runtime.h b/include/cuda/cuda_runtime.h new file mode 100644 index 00000000..9ad15f0c --- /dev/null +++ b/include/cuda/cuda_runtime.h @@ -0,0 +1,49 @@ +#pragma once +#include "core/runtime.h" +#include "cuda/cuda_common.h" + +namespace infini { + +class CudaRuntimeObj : public RuntimeObj { + private: + cudnnHandle_t cudnn; + cublasHandle_t cublas; + CudaPtr workspace; + size_t workspaceSize; + + public: + CudaRuntimeObj() : RuntimeObj(Device::CUDA) { + checkCudnnError(cudnnCreate(&cudnn)); + checkCublasError(cublasCreate(&cublas)); + // 10GB for Longformer + // size_t longformerNum = 3lu * (1 << 30); + workspaceSize = 7ll << 30; // 7 GB + workspace = alloc(workspaceSize); + } + virtual ~CudaRuntimeObj() { + checkCudnnError(cudnnDestroy(cudnn)); + checkCublasError(cublasDestroy(cublas)); + } + + void run(const Graph &graph, bool tune = false, + bool profiling = false) const; + // double runEvaluation(const Graph &graph, int nWarmups, + // int nEvaluations) const; + void sync() const; + CudaPtr alloc(size_t size) override { + void *ptr; + checkCudaError(cudaMalloc(&ptr, size)); + return ptr; + } + void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); } + cudnnHandle_t cudnnHandle() const { return cudnn; } + cublasHandle_t cublasHandle() const { return cublas; } + CudaPtr getWorkspace(size_t size) const { + IT_ASSERT(size <= workspaceSize); + return workspace; + } + + private: + void runWithoutSync(const Graph &graph) const; +}; +} // namespace infini \ No newline at end of file diff --git a/include/cuda/cuda_utility.h b/include/cuda/cuda_utility.h new file mode 100644 index 00000000..354a38ff --- /dev/null +++ b/include/cuda/cuda_utility.h @@ -0,0 +1,11 @@ +#include "core/tensor.h" + +namespace infini { + +void cudaPrintFloat(float *x, int len); + +void cudaPrintTensor(const Tensor &tensor) { + cudaPrintFloat(tensor->getDataRawPtr(), tensor->size()); +} + +} // namespace infini \ No newline at end of file diff --git a/include/test.h b/include/test.h index ad8fc8a5..35131fae 100644 --- a/include/test.h +++ b/include/test.h @@ -5,12 +5,14 @@ namespace infini { +// TODO: isolate these class class DataGenerator { private: - virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }; - virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }; + virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); } + virtual void fill(float *data, size_t size) { IT_TODO_HALT(); } public: + virtual ~DataGenerator() {} void operator()(void *data, size_t size, DataType dataType) { switch (dataType) { case DataType::UInt32: @@ -26,10 +28,36 @@ class DataGenerator { }; class IncrementalGenerator : public DataGenerator { - void fill(uint32_t *data, size_t size) override { + public: + virtual ~IncrementalGenerator() {} + + private: + template void fill(T *data, size_t size) { for (size_t i = 0; i < size; i++) { data[i] = i; } } + + void fill(uint32_t *data, size_t size) override { + fill(data, size); + } + void fill(float *data, size_t size) override { fill(data, size); } +}; + +class OneGenerator : public DataGenerator { + public: + virtual ~OneGenerator() {} + + private: + template void fill(T *data, size_t size) { + for (size_t i = 0; i < size; i++) { + data[i] = 1; + } + } + + void fill(uint32_t *data, size_t size) override { + fill(data, size); + } + void fill(float *data, size_t size) override { fill(data, size); } }; } // namespace infini \ No newline at end of file diff --git a/src/core/blob.cc b/src/core/blob.cc new file mode 100644 index 00000000..72b00be4 --- /dev/null +++ b/src/core/blob.cc @@ -0,0 +1,10 @@ +#include "core/runtime.h" + +namespace infini { + +BlobObj::~BlobObj() { + // Avoid cycled inclusion + runtime->dealloc(ptr); +} + +} // namespace infini \ No newline at end of file diff --git a/src/core/common.cc b/src/core/common.cc index d1c7fd40..43192441 100644 --- a/src/core/common.cc +++ b/src/core/common.cc @@ -4,11 +4,20 @@ namespace infini { -double timeit(const std::function &func) { +double timeit(const std::function &func, int warmupRounds, + int timingRounds, const std::function &sync) { + for (int i = 0; i < warmupRounds; ++i) + func(); + if (sync) + sync(); auto start = std::chrono::high_resolution_clock::now(); - func(); + for (int i = 0; i < timingRounds; ++i) + func(); + if (sync) + sync(); auto end = std::chrono::high_resolution_clock::now(); - return std::chrono::duration(end - start).count(); + return std::chrono::duration(end - start).count() / + timingRounds; } } // namespace infini \ No newline at end of file diff --git a/src/core/graph.cc b/src/core/graph.cc index f8d85122..13e4678f 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -13,8 +13,9 @@ string GraphObj::toString() const { } void GraphObj::dataMalloc() { - for (auto &tensor : tensors) - tensor->dataMalloc(); + for (auto &tensor : tensors) { + tensor->dataMalloc(runtime); + } } Tensor GraphObj::addTensor(Shape dim, DataType dtype) { diff --git a/src/core/run_engine.cc b/src/core/runtime.cc similarity index 75% rename from src/core/run_engine.cc rename to src/core/runtime.cc index eace9218..38e3cc46 100644 --- a/src/core/run_engine.cc +++ b/src/core/runtime.cc @@ -1,9 +1,14 @@ -#include "core/run_enigne.h" +#include "core/runtime.h" +#include "core/blob.h" #include +#include +#include +#include +#include namespace infini { -void RunEngine::run(const Graph &graph, bool tune, bool profiling) const { +void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const { if (!tune && profiling) IT_TODO_HALT(); const auto &kernelRegistry = KernelRegistry::getInstance(); @@ -12,7 +17,6 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const { double totalTime = 0; std::map opTime; std::map opCnt; - std::chrono::system_clock::time_point begin, end; for (auto &op : graph->getOperators()) { // HACK: set correct data type @@ -24,7 +28,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const { // If no record and disable tuning, run with the default argument if (!perfData && !tune) { - kernel->compute(op); + kernel->compute(op, this); continue; } @@ -32,16 +36,18 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const { PerfRecord record; // Tune the kernel if there is no record if (!perfData) { - record = kernel->tune(op); + // TODO: record is not used + record = kernel->tune(op, this); perfEngine.setPerfData(perfKey, record); } else record = *perfData; if (!profiling) { - kernel->compute(op, *perfData); + kernel->compute(op, *perfData, this); continue; } else { - double t = timeit([&]() { kernel->compute(op, *perfData); }); + double t = + timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1); op->print(); printf(" op_time %lf\n", t); totalTime += t; @@ -53,7 +59,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const { printProfilingData(totalTime, opTime, opCnt); } -double RunEngine::getPerfTime(const Graph &graph, bool profiling) const { +double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const { const auto &kernelRegistry = KernelRegistry::getInstance(); auto perfEngine = PerfEngine::getInstance(); // Statistics @@ -72,7 +78,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const { PerfRecord record; // Tune the kernel if there is no record if (!perfData) { - record = kernel->tune(op); + record = kernel->tune(op, this); perfEngine.setPerfData(perfKey, record); } else record = *perfData; @@ -91,9 +97,9 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const { return totalTime; } -void RunEngine::printProfilingData(double totalTime, - const std::map &opTime, - const std::map &opCnt) const { +void RuntimeObj::printProfilingData(double totalTime, + const std::map &opTime, + const std::map &opCnt) const { printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean"); for (const auto &[type, t] : opTime) { printf("%11s %3d %7.3f %7.1f %7.3f\n", @@ -102,4 +108,8 @@ void RunEngine::printProfilingData(double totalTime, } } +Blob RuntimeObj::allocBlob(size_t size) { + return make_ref(shared_from_this(), alloc(size)); +} + } // namespace infini \ No newline at end of file diff --git a/src/core/tensor.cc b/src/core/tensor.cc index c6aac89d..ec6991ae 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -1,15 +1,12 @@ -#include +#include "core/tensor.h" +#include "core/blob.h" +#include "core/runtime.h" + namespace infini { TensorObj::TensorObj(const Shape &shape, DataType dtype) : TensorBaseObj(shape.size(), dtype), shape(shape) {} -void TensorObj::dataMalloc() { - IT_ASSERT(data == nullptr); - // initialized to zero - data.reset(reinterpret_cast(calloc(size(), sizeof(VType)))); -} - VType TensorObj::getData(const Shape &pos) const { return getData(getOffset(pos)); } @@ -37,24 +34,42 @@ size_t TensorObj::size() const { return ret; } -void TensorObj::copyData(VType *dptr) { +template void TensorObj::copyData(const T *dptr) { + // TODO: cuda IT_ASSERT(data != nullptr); + auto ptr = data->getPtr(); size_t sz = size(); #pragma omp parallel for for (size_t i = 0; i < sz; ++i) { - data[i] = dptr[i]; + ptr[i] = dptr[i]; } } + void TensorObj::copyData(vector dataVector) { IT_ASSERT(dataVector.size() >= size()); copyData(dataVector.data()); } +void TensorObj::copyData(vector dataVector) { + IT_ASSERT(dataVector.size() >= size()); + copyData(dataVector.data()); +} + void TensorObj::printData() const { IT_ASSERT(data != nullptr); + if (dtype == DataType::Float32) + printDataFloat(); + else if (dtype == DataType::UInt32) + printDataUint32_t(); + else + IT_TODO_HALT(); +} + +void TensorObj::printDataFloat() const { std::cout << "Tensor: " << guid << std::endl; auto numDims = shape.size(); auto dimSzVec = std::vector(numDims, 1); + auto ptr = data->getPtr(); dimSzVec[numDims - 1] = shape[numDims - 1]; for (int i = numDims - 1; i != 0; --i) dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; @@ -64,7 +79,35 @@ void TensorObj::printData() const { std::cout << "["; } } - std::cout << data[i]; + printf("%.1f", ptr[i]); + for (size_t j = 0; j < numDims; ++j) { + if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { + std::cout << "]"; + } + } + if (i != size() - 1) + std::cout << ", "; + if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1) + std::cout << std::endl; + } +} + +void TensorObj::printDataUint32_t() const { + IT_ASSERT(data != nullptr); + std::cout << "Tensor: " << guid << std::endl; + auto numDims = shape.size(); + auto dimSzVec = std::vector(numDims, 1); + auto ptr = data->getPtr(); + dimSzVec[numDims - 1] = shape[numDims - 1]; + for (int i = numDims - 1; i != 0; --i) + dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; + for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { + for (size_t j = 0; j < numDims; ++j) { + if (i % dimSzVec[j] == 0) { + std::cout << "["; + } + } + std::cout << ptr[i]; for (size_t j = 0; j < numDims; ++j) { if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { std::cout << "]"; @@ -80,13 +123,26 @@ void TensorObj::printData() const { bool TensorObj::equalData(const Tensor &rhs) const { IT_ASSERT(data != nullptr); IT_ASSERT(rhs->data != nullptr); + // TODO: deal with data type + auto ptr = data->getPtr(); + auto ptrRhs = rhs->data->getPtr(); if (shape != rhs->getDims()) return false; size_t sz = size(); for (size_t i = 0; i < sz; ++i) - if (data[i] != rhs->data[i]) + if (ptr[i] != ptrRhs[i]) return false; return true; } +void TensorObj::dataMalloc(const Runtime &runtime) { + IT_ASSERT(data == nullptr); + size_t bytesPerElement; + if (getDType() == DataType::Float32) + bytesPerElement = sizeof(float); + else if (getDType() == DataType::UInt32) + bytesPerElement = sizeof(uint32_t); + data = runtime->allocBlob(size() * bytesPerElement); +} + }; // namespace infini \ No newline at end of file diff --git a/src/core/tensor_base.cc b/src/core/tensor_base.cc index 84cad959..9414361b 100644 --- a/src/core/tensor_base.cc +++ b/src/core/tensor_base.cc @@ -1,9 +1,14 @@ -#include +#include "core/tensor_base.h" +#include "core/blob.h" +#include "core/runtime.h" namespace infini { TensorBaseObj::TensorBaseObj(int dim, DataType dtype) : dim(dim), dtype(dtype) {} -VType TensorBaseObj::getData(size_t offset) const { return data[offset]; } +VType TensorBaseObj::getData(size_t offset) const { + // TODO: check cuda array + return (data->getPtr())[offset]; +} }; // namespace infini \ No newline at end of file diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc new file mode 100644 index 00000000..59fcabe2 --- /dev/null +++ b/src/cuda/cuda_runtime.cc @@ -0,0 +1,32 @@ +#include "cuda/cuda_runtime.h" + +namespace infini { + +void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { + const auto &kernelRegistry = KernelRegistry::getInstance(); + auto perfEngine = PerfEngine::getInstance(); + + for (auto &op : graph->getOperators()) { + // HACK: set correct data type + auto kernelAttrs = + KernelAttrs{device, op->getOpType(), DataType::Float32}; + Kernel *kernel = kernelRegistry.getKernel(kernelAttrs); + auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; + std::optional perfData = perfEngine.getPerfData(perfKey); + if (perfData) + kernel->compute(op, *perfData, this); + else + kernel->compute(op, this); + } +} + +void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const { + if (tune || profiling) + IT_TODO_HALT(); + runWithoutSync(graph); + sync(); +} + +void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); } + +} // namespace infini \ No newline at end of file diff --git a/src/cuda/cuda_utility.cu b/src/cuda/cuda_utility.cu new file mode 100644 index 00000000..ca0a2d01 --- /dev/null +++ b/src/cuda/cuda_utility.cu @@ -0,0 +1,20 @@ +#include "cuda/cuda_common.h" +#include + +__global__ void cudaPrintFloatImpl(float *x, int len) { + int start = threadIdx.x + blockDim.x * blockIdx.x; + if (start == 0) { + for (int i = 0; i < len; ++i) { + printf("%.3f ", x[i]); + } + printf("\n"); + } +} + +namespace infini { + +void cudaPrintFloat(float *x, int len) { + cudaPrintFloatImpl<<<1, 1>>>(x, len); + cudaDeviceSynchronize(); +} +} // namespace infini \ No newline at end of file diff --git a/src/kerels/cpu/conv.cc b/src/kerels/cpu/conv.cc index f68479d4..ac8f7b43 100644 --- a/src/kerels/cpu/conv.cc +++ b/src/kerels/cpu/conv.cc @@ -4,11 +4,12 @@ namespace infini { template class NaiveConv : public Kernel { - void compute(const Operator &_op, const PerfRecord &record) const override { + void compute(const Operator &_op, const PerfRecord &record, + const RuntimeObj *context) const override { auto op = as(_op); - T *iptr = reinterpret_cast(op->getInputs(0)->getDataPtr().get()); - T *wptr = reinterpret_cast(op->getInputs(1)->getDataPtr().get()); - T *optr = reinterpret_cast(op->getOutput()->getDataPtr().get()); + T *iptr = op->getInputs(0)->getDataRawPtr(); + T *wptr = op->getInputs(1)->getDataRawPtr(); + T *optr = op->getOutput()->getDataRawPtr(); auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); int cpg = op->getChannelPerGroup(); @@ -45,10 +46,13 @@ template class NaiveConv : public Kernel { } } - void compute(const Operator &op) const override { compute(op, {}); } + void compute(const Operator &op, const RuntimeObj *context) const override { + compute(op, {}, context); + } - PerfRecord tune(const Operator &op) const override { - return PerfRecord{.time = timeit([this, &op]() { compute(op); })}; + PerfRecord tune(const Operator &op, + const RuntimeObj *context) const override { + return PerfRecord(timeit([&]() { compute(op, context); })); } }; diff --git a/src/kerels/cpu/matmul.cc b/src/kerels/cpu/matmul.cc index a1e83103..d7182954 100644 --- a/src/kerels/cpu/matmul.cc +++ b/src/kerels/cpu/matmul.cc @@ -4,11 +4,12 @@ namespace infini { template class NaiveMatmul : public Kernel { - void compute(const Operator &_op, const PerfRecord &record) const override { + void compute(const Operator &_op, const PerfRecord &record, + const RuntimeObj *context) const override { auto op = as(_op); - T *A = reinterpret_cast(op->getInputs(0)->getDataPtr().get()); - T *B = reinterpret_cast(op->getInputs(1)->getDataPtr().get()); - T *C = reinterpret_cast(op->getOutput()->getDataPtr().get()); + T *A = op->getInputs(0)->getDataRawPtr(); + T *B = op->getInputs(1)->getDataRawPtr(); + T *C = op->getOutput()->getDataRawPtr(); IT_ASSERT(op->getTransA() == false && op->getTransB() == false); IT_ASSERT(op->getAct() == ActType::None); IT_ASSERT(op->getB() == 1); @@ -23,10 +24,15 @@ template class NaiveMatmul : public Kernel { } } - void compute(const Operator &op) const override { compute(op, {}); } + void compute(const Operator &op, const RuntimeObj *context) const override { + compute(op, {}, context); + } - PerfRecord tune(const Operator &op) const override { - return PerfRecord{.time = timeit([this, &op]() { compute(op); })}; + PerfRecord tune(const Operator &op, + const RuntimeObj *context) const override { + PerfRecord ret; + ret.time = timeit([&]() { compute(op, context); }); + return ret; } }; diff --git a/src/kerels/cuda/conv.cc b/src/kerels/cuda/conv.cc new file mode 100644 index 00000000..8289ea4f --- /dev/null +++ b/src/kerels/cuda/conv.cc @@ -0,0 +1,208 @@ +#include "operators/conv.h" +#include "core/kernel.h" +#include "cuda/cuda_runtime.h" + +namespace infini { + +static constexpr int N_ALGO = 8; +static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED}; + +struct ConvCuDnnPerfRecord : public PerfRecord { + int algo = 0; // cudnnConvolutionFwdAlgo_t + size_t workspaceSize = 100000; + bool fuseAct = false; +}; + +class convCudnn : public Kernel { + + bool cuDNNUnfused(const Ref &op, const ConvCuDnnPerfRecord &record, + const CudaRuntimeObj *context) const { + cudnnStatus_t stat; + void *const inData = (op->getInputs(0)->getDataRawPtr()); + void *const knData = (op->getInputs(1)->getDataRawPtr()); + if (op->getInputs(2) != nullptr) + IT_TODO_HALT(); + // void *const biasData = (op->getInputs(2)->getDataRawPtr()); + void *const outData = (op->getOutput()->getDataRawPtr()); + + const auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); + const int cpg = op->getChannelPerGroup(); + const int g = c / cpg; + const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); + + int channelsPerGrp = cpg, channels = c; + + // get inputs + cudnnTensorDescriptor_t inDesc; + checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); + checkCudnnError(cudnnSetTensor4dDescriptor( + inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w)); + + // get kernels + cudnnFilterDescriptor_t knDesc; + checkCudnnError(cudnnCreateFilterDescriptor(&knDesc)); + checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT, + CUDNN_TENSOR_NCHW, f, + channelsPerGrp, r, s)); + + // get bias + cudnnTensorDescriptor_t biasDesc; + checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc)); + checkCudnnError(cudnnSetTensor4dDescriptor( + biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1)); + + // get convlution descriptor + cudnnConvolutionDescriptor_t convDesc; + checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc)); + // TODO: CUDNN_CONVOLUTION is a tunable argument + checkCudnnError(cudnnSetConvolution2dDescriptor( + convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION, + CUDNN_DATA_FLOAT)); + if (g > 1) { + checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g)); + } + + // get activation descriptor + cudnnActivationDescriptor_t actDesc; + checkCudnnError(cudnnCreateActivationDescriptor(&actDesc)); + // NOT_PROPAGATE_NAN is requierd by + // cudnnConvolotionBiasActivationForward + switch (op->getAct()) { + case ActType::Relu: + checkCudnnError(cudnnSetActivationDescriptor( + actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0)); + break; + case ActType::Sigmoid: + checkCudnnError(cudnnSetActivationDescriptor( + actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0)); + break; + case ActType::None: + checkCudnnError( + cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY, + CUDNN_NOT_PROPAGATE_NAN, 0)); + break; + default: + assert(false); + } + + // get outputs + int outn, outc, outh, outw; + checkCudnnError(cudnnGetConvolution2dForwardOutputDim( + convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw)); + cudnnTensorDescriptor_t outDesc; + checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); + checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, outn, outc, + outh, outw)); + IT_ASSERT((vector{outn, outc, outh, outw}) == + op->getOutput()->getDims(), + "cuDNN output shape mismatches with OP output shape"); + + // get workspace + size_t wsSize = record.workspaceSize; + stat = cudnnGetConvolutionForwardWorkspaceSize( + context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc, + ALGOS[record.algo], &wsSize); + if (stat != CUDNN_STATUS_SUCCESS) + return false; + // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024); + // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024) + // continue; + CudaPtr wsData = context->getWorkspace(wsSize); + float alpha = 1.f, beta = 0.f; + + stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc, + inData, knDesc, knData, convDesc, + ALGOS[record.algo], wsData, wsSize, + &beta, outDesc, outData); + if (stat != CUDNN_STATUS_SUCCESS) + return false; + // TODO: + // // bias + // if (bias != nullptr) { + // auto sz = op.getOutputs()[0]->size(); + // // TODO: element wise + // t += sz * 2 / 400; + // } + // // act + // if (act != None) { + // stat = cudnnActivationForward(cudnnHandle(), actDesc, + // &alpha, inDesc, inData, + // &beta, outDesc, outData); + // checkCudaError(cudaDeviceSynchronize()); + // end = ch::high_resolution_clock::now(); + // if (stat != CUDNN_STATUS_SUCCESS) { + // durtime = INFINITY; + // break; + // } + // t += + // ch::duration_cast>(end - + // beg).count() * 1000; // ms + // } + + // best = ConvResult{durtime, ALGOS[i], wsSize, false}; + + // // w/ bias & act + // for (int j = 0; j < rounds + warmupRounds; ++j) { + // cudnnStatus_t stat; + // if (j == warmupRounds) { + // checkCudaError(cudaDeviceSynchronize()); + // beg = ch::high_resolution_clock::now(); + // } + // stat = cudnnConvolutionBiasActivationForward( + // cudnnHandle(), &alpha, inDesc, inData, knDesc, knData, + // convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc, + // outData, biasDesc, biasData, actDesc, outDesc, outData); + // if (stat != CUDNN_STATUS_SUCCESS) { + // // checkCudnnError(stat); + // // Do not checkCudnnError since not all algorithms are + // // supported + // durtime_fuse = INFINITY; + // break; + // } + // } + + // Destories in CUDA does not require sync. But cuDNN does not state + // whether sync is required before destories. + checkCudnnError(cudnnDestroyTensorDescriptor(inDesc)); + checkCudnnError(cudnnDestroyTensorDescriptor(outDesc)); + checkCudnnError(cudnnDestroyFilterDescriptor(knDesc)); + checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc)); + return true; + } + + void compute(const Operator &op, const RuntimeObj *context) const override { + ConvCuDnnPerfRecord record; // with paramters in default ctor + compute(op, record, context); + } + + PerfRecord tune(const Operator &_op, + const RuntimeObj *_context) const override { + // TODO: real tuning + ConvCuDnnPerfRecord ret; + ret.time = timeit([&]() { compute(_op, _context); }); + return ret; + } + + void compute(const Operator &_op, const PerfRecord &_record, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto &record = dynamic_cast(_record); + auto context = dynamic_cast(_context); + bool success = cuDNNUnfused(op, record, context); + IT_ASSERT(success); + } +}; + +REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn, + "Conv_cuDNN_CUDA_Float32"); + +} // namespace infini \ No newline at end of file diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc index 7c764141..ad1752f8 100644 --- a/test/core/test_graph.cc +++ b/test/core/test_graph.cc @@ -1,45 +1,48 @@ +#include "core/blob.h" #include "core/graph.h" -#include "core/run_enigne.h" +#include "core/runtime.h" #include "operators/matmul.h" #include "test.h" namespace infini { TEST(Graph, build_and_run) { - Graph g = make_ref(); + Runtime runtime = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32); Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); g->dataMalloc(); - i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); g->addOpWithOutputs(i0, w0, o0); - RunEngine(Device::CPU).run(g); + runtime->run(g); // check answer auto ans = make_ref(Shape{1, 2, 4}, DataType::UInt32); - ans->dataMalloc(); - ans->copyData({38, 44, 50, 56, 83, 98, 113, 128}); + ans->dataMalloc(runtime); + ans->copyData(vector{38, 44, 50, 56, 83, 98, 113, 128}); EXPECT_TRUE(o0->equalData(ans)); } TEST(Graph, perf_engine) { - Graph g = make_ref(); + Runtime runtime = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32); Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); auto matmul = g->addOp(i0, w0, nullptr); g->dataMalloc(); - i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - RunEngine(Device::CPU).run(g, true, true); - double perfTime = RunEngine(Device::CPU).getPerfTime(g); + i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + runtime->run(g, true, true); + double perfTime = runtime->getPerfTime(g); // The example matmul takes 0.0036ms with one core EXPECT_GT(perfTime, 0); EXPECT_LT(perfTime, 0.01); // check answer auto ans = make_ref(Shape{1, 2, 4}, DataType::UInt32); - ans->dataMalloc(); - ans->copyData({38, 44, 50, 56, 83, 98, 113, 128}); + ans->dataMalloc(runtime); + ans->copyData(vector{38, 44, 50, 56, 83, 98, 113, 128}); EXPECT_TRUE(matmul->getOutput()->equalData(ans)); } diff --git a/test/core/test_hash.cc b/test/core/test_hash.cc index 4f3f29d4..3c244acb 100644 --- a/test/core/test_hash.cc +++ b/test/core/test_hash.cc @@ -1,5 +1,5 @@ #include "core/graph.h" -#include "core/run_enigne.h" +#include "core/runtime.h" #include "operators/matmul.h" #include "test.h" @@ -8,7 +8,7 @@ namespace infini { TEST(Hash, OperatorHash) { OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown); { // build with addOpWithOutputs - Graph g = make_ref(); + Graph g = make_ref(nullptr); Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32); Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); @@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) { EXPECT_GT(key1.attrs.size(), 5); } { // build with addOp - Graph g = make_ref(); + Graph g = make_ref(nullptr); Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32); auto matmul = g->addOp(i0, w0, nullptr); diff --git a/test/operators/test_conv.cc b/test/operators/test_conv.cc index 4584c660..f5c6f7d9 100644 --- a/test/operators/test_conv.cc +++ b/test/operators/test_conv.cc @@ -1,21 +1,24 @@ #include "core/graph.h" -#include "core/run_enigne.h" +#include "core/runtime.h" +#include "cuda/cuda_runtime.h" +#include "cuda/cuda_utility.h" #include "operators/conv.h" #include "test.h" namespace infini { TEST(Conv, ShapeInference) { + auto runtime = make_ref(); // Padding modes { - Graph g = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); auto conv = g->addOp(i0, w0, nullptr, 1, 1); EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4})); } { - Graph g = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); auto conv = @@ -23,7 +26,7 @@ TEST(Conv, ShapeInference) { EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4})); } { - Graph g = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); auto conv = @@ -31,7 +34,7 @@ TEST(Conv, ShapeInference) { EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2})); } { // dilation & stride - Graph g = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); auto conv = g->addOp(i0, w0, nullptr, 1, 1, 2, 1, 1, 2); @@ -40,7 +43,8 @@ TEST(Conv, ShapeInference) { } TEST(Conv, NaiveCPU) { - Graph g = make_ref(); + auto runtime = make_ref(); + Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); auto conv = g->addOp(i0, w0, nullptr, 1, 1, 2, 1, 1, 2); @@ -48,16 +52,75 @@ TEST(Conv, NaiveCPU) { g->dataMalloc(); i0->setData(IncrementalGenerator()); w0->setData(IncrementalGenerator()); - RunEngine(Device::CPU).run(g, true, true); - double perfTime = RunEngine(Device::CPU).getPerfTime(g); - // The example matmul takes 0.0036ms with one core + runtime->run(g, true, true); + double perfTime = runtime->getPerfTime(g); + // The example Conv takes 0.015ms with one core EXPECT_GT(perfTime, 0); - EXPECT_LT(perfTime, 5); + EXPECT_LT(perfTime, 0.1); // check answer auto ans = make_ref(Shape{1, 2, 2, 2}, DataType::UInt32); - ans->dataMalloc(); - ans->copyData({4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); + ans->dataMalloc(runtime); + ans->copyData( + vector{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); EXPECT_TRUE(conv->getOutput()->equalData(ans)); } +void testConvCudnn( + const std::function &generator, + vector ansVec) { + auto cpuRuntime = make_ref(); + auto cudaRuntime = make_ref(); + // Build CUDA graph + Graph g = make_ref(cudaRuntime); + Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32); + Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32); + auto conv = g->addOp(i0, w0, nullptr, 1, 1, 2, 1, 1, 2); + + // allocate CUDA memory + g->dataMalloc(); + + // Build input and output data on CPU + auto cpui0 = make_ref(Shape{1, 3, 4, 4}, DataType::Float32); + cpui0->dataMalloc(cpuRuntime); + cpui0->setData(generator); + + auto cpuw0 = make_ref(Shape{2, 3, 3, 3}, DataType::Float32); + cpuw0->dataMalloc(cpuRuntime); + cpuw0->setData(generator); + + auto ans = make_ref(Shape{1, 2, 2, 2}, DataType::Float32); + ans->dataMalloc(cpuRuntime); + ans->copyData(ansVec); + + // Copy inputs from CPU to CUDA + cudaMemcpy(i0->getDataRawPtr(), cpui0->getDataRawPtr(), + cpui0->size() * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(w0->getDataRawPtr(), cpuw0->getDataRawPtr(), + cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice); + // Execute on CUDA + cudaRuntime->run(g); + // double perfTime = cudaRuntime->getPerfTime(g); + // // The example Conv takes 0.015ms with one core + // EXPECT_GT(perfTime, 0); + // EXPECT_LT(perfTime, 0.1); + + // copy CUDA output to CPU + auto o0 = conv->getOutput(); + auto cpuo0 = make_ref(Shape{1, 2, 2, 2}, DataType::Float32); + cpuo0->dataMalloc(cpuRuntime); + cudaMemcpy(cpuo0->getDataRawPtr(), + conv->getOutput()->getDataRawPtr(), + cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost); + + // check results on CPU + EXPECT_TRUE(cpuo0->equalData(ans)); +} + +TEST(Conv, cuDNN) { + testConvCudnn(OneGenerator(), + vector{12, 12, 18, 18, 12, 12, 18, 18}); + testConvCudnn( + IncrementalGenerator(), + vector{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); +} } // namespace infini \ No newline at end of file diff --git a/test/script/env_lotus.sh b/test/script/env_lotus.sh new file mode 100644 index 00000000..a65a746d --- /dev/null +++ b/test/script/env_lotus.sh @@ -0,0 +1,2 @@ +spack load cuda@11.0.2 cudnn@8.0.3.33-11.0 +export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc