forked from jiuyuan/InfiniTensor

Add CUDA runtime (#6)

* Fix: add warm-up and repetition in timing
* Add: CUDA runtime and float support
* Refactor: Cuda and Cpu runtimes inherit Runtime
* Add: environment script for Lotus
* Add: Lotus build instructions
* Update README.md

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>

parent 9303ddda8e
commit 04ea5eed38
CMakeLists.txt
@@ -4,6 +4,7 @@ include(CMakeDependentOption)
 project(InfiniTensor C CXX)

 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
+option(USE_CUDA "Support CUDA GPU" ON)
 option(BUILD_TEST "Build tests" ON)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
@@ -21,7 +22,6 @@ find_package(
   Python
   COMPONENTS Interpreter Development
   REQUIRED)
-find_package(CUDA REQUIRED)

 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@@ -53,14 +53,19 @@ if(BUILD_TEST)
 endif()

 file(GLOB_RECURSE SRC src/*.cc src/*.cu)
-# file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc)
-# list(REMOVE_ITEM SRC ${TEST} ${FFI})
-
 add_library(InfiniTensor SHARED ${SRC})
-# Target
-# cuda_add_library(it SHARED ${SRC})
-# cuda_add_cublas_to_target(it) # cublas
-# # target_link_libraries(infini_cpp cudnn curand nlohmann_json::nlohmann_json pybind11::embed)
+if(USE_CUDA)
+  # set(CUDA_HOST_COMPILER /home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc)
+  enable_language(CUDA)
+  # TODO: how to set option for CUDA_HOST_COMPILER. Now env var CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc takes effect.
+  # option(CUDA_HOST_COMPILER "" ${CMAKE_C_COMPILER})
+  # TODO: find_package seems unnecessary for CMake >= 3.8
+  find_package(CUDA REQUIRED)
+  # message("CUBLAS_LIBRARIES: ${CUDA_LIBRARIES}")
+  target_link_libraries(InfiniTensor cudnn curand cublas ${CUDA_LIBRARIES})
+endif()

 # # Python bindings
 # pybind11_add_module(infini MODULE ${FFI})
README.md
@@ -1 +1,9 @@
 # InfiniTensor
+
+## Compilation on Lotus
+``` bash
+# Enter the root of InfiniTensor
+source test/script/env_lotus.sh
+mkdir build && cd build
+cmake .. && make -j 12
+```
include/core/blob.h (new file)
@@ -0,0 +1,24 @@
+#pragma once
+#include "core/common.h"
+#include "core/ref.h"
+
+namespace infini {
+
+class RuntimeObj;
+using Runtime = Ref<RuntimeObj>;
+
+class BlobObj {
+    // Runtime might be replaced with a raw pointer for optimization
+    Runtime runtime;
+    void *ptr;
+
+  public:
+    BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {}
+    BlobObj(BlobObj &other) = delete;
+    BlobObj &operator=(BlobObj const &) = delete;
+    ~BlobObj();
+
+    template <typename T> T getPtr() const { return reinterpret_cast<T>(ptr); }
+};
+
+} // namespace infini
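A note on the ownership model this header sets up: a BlobObj ties a device allocation to the Runtime that produced it, so the memory is returned through the matching deallocator when the last Ref<BlobObj> is dropped. A minimal sketch of how the pieces fit together, assuming the CpuRuntimeObj and allocBlob added in core/runtime.h below:

``` cpp
#include "core/runtime.h"

using namespace infini;

int main() {
    Runtime runtime = make_ref<CpuRuntimeObj>();
    {
        // allocBlob forwards to the runtime's alloc() and wraps the raw
        // pointer together with a shared_ptr to the runtime itself.
        Blob blob = runtime->allocBlob(64 * sizeof(float));
        float *p = blob->getPtr<float *>();
        p[0] = 1.0f;
    } // ~BlobObj() runs here and calls runtime->dealloc(ptr)
    return 0;
}
```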
include/core/common.h
@@ -69,6 +69,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
     return ret;
 }

-double timeit(const std::function<void()> &func);
+double timeit(const std::function<void()> &func, int warmupRounds = 200,
+              int timingRounds = 200,
+              const std::function<void(void)> &sync = {});

 } // namespace infini
include/core/graph.h
@@ -4,16 +4,16 @@

 namespace infini {

-// TODO: graph should be attached to a context
 class GraphObj : public Object {
   protected:
+    Runtime runtime;
     TensorVec tensors;
     TensorVec inputs;
     TensorVec outputs;
     OpVec ops;

   public:
-    // Graph(OpVec oplist);
+    GraphObj(Runtime runtime) : runtime(runtime){};
     string toString() const override;

     Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
include/core/kernel.h
@@ -5,8 +5,14 @@

 namespace infini {

+class RuntimeObj; // Forward declaration for Kernel::compute
+
 struct PerfRecord {
-    double time; // in milliseconds
+    PerfRecord(){};
+    PerfRecord(double time) : time(time){};
+    virtual ~PerfRecord() {}
+
+    double time = 0; // in milliseconds
 };

 class Kernel {
@@ -20,14 +26,16 @@ class Kernel {
      * are required, inherit from PerfRecord and add extra parameters.
      * Otherwire, use PerfRecord directly.
      */
-    virtual void compute(const Operator &op,
-                         const PerfRecord &record) const = 0;
+    virtual void compute(const Operator &op, const PerfRecord &record,
+                         const RuntimeObj *context) const = 0;
     /**
     /**
      * @brief Executes an op with a default parameter.
      */
-    virtual void compute(const Operator &op) const = 0;
+    virtual void compute(const Operator &op,
+                         const RuntimeObj *context) const = 0;
     // Premise: op is idempotent since it is called multiple times.
-    virtual PerfRecord tune(const Operator &op) const = 0;
+    virtual PerfRecord tune(const Operator &op,
+                            const RuntimeObj *context) const = 0;
 };

 class KernelRegistry {
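The interface change threads the executing RuntimeObj through every kernel entry point, which is what lets one registry serve both CPU and CUDA backends. A minimal sketch of a conforming kernel; the NaiveRelu name and its element-wise body are hypothetical, shown only to illustrate the three virtuals (the concrete examples are the NaiveConv and NaiveMatmul kernels further down):

``` cpp
template <typename T> class NaiveRelu : public Kernel {
    // Hypothetical element-wise kernel over raw blob pointers.
    void compute(const Operator &op, const PerfRecord &record,
                 const RuntimeObj *context) const override {
        T *in = op->getInputs(0)->getDataRawPtr<T *>();
        T *out = op->getOutput()->getDataRawPtr<T *>();
        size_t sz = op->getOutput()->size();
        for (size_t i = 0; i < sz; ++i)
            out[i] = in[i] > 0 ? in[i] : 0;
    }
    void compute(const Operator &op, const RuntimeObj *context) const override {
        compute(op, {}, context);
    }
    PerfRecord tune(const Operator &op,
                    const RuntimeObj *context) const override {
        return PerfRecord(timeit([&]() { compute(op, context); }));
    }
};
```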
include/core/run_enigne.h (deleted)
@@ -1,26 +0,0 @@
-#pragma once
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/perf_engine.h"
-
-namespace infini {
-
-class RunEngine {
-  private:
-    Device device;
-
-  public:
-    RunEngine(Device device) : device(device) {}
-    ~RunEngine() {}
-
-    void run(const Graph &graph, bool tune = false,
-             bool profiling = false) const;
-    double getPerfTime(const Graph &graph, bool profiling = false) const;
-
-  private:
-    void printProfilingData(double totTime,
-                            const std::map<OpType, double> &opTime,
-                            const std::map<OpType, int> &opCnt) const;
-};
-
-} // namespace infini
include/core/runtime.h (new file)
@@ -0,0 +1,62 @@
+#pragma once
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/perf_engine.h"
+namespace infini {
+
+class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
+  protected:
+    Device device;
+
+  public:
+    RuntimeObj(Device device) : device(device) {}
+    RuntimeObj(RuntimeObj &other) = delete;
+    RuntimeObj &operator=(RuntimeObj const &) = delete;
+    virtual ~RuntimeObj() {}
+
+    /**
+     * @brief Execute a graph.
+     *
+     * @param graph
+     * @param tune If there is no performance record, whether to tune it. These
+     * can be independent method.
+     * @param profiling Whether to print breakdown of time
+     */
+    virtual void run(const Graph &graph, bool tune = false,
+                     bool profiling = false) const = 0;
+    virtual void *alloc(size_t size) = 0;
+    virtual void dealloc(void *ptr) = 0;
+
+    /**
+     * @brief Get the execution time of each operator in performance record. No
+     * execution happens.
+     *
+     * @param graph
+     * @param profiling Whether to print breakdown of time
+     * @return double Return the sum of perf time for each operator
+     */
+    double getPerfTime(const Graph &graph, bool profiling = false) const;
+    Blob allocBlob(size_t size);
+
+  protected:
+    void printProfilingData(double totTime,
+                            const std::map<OpType, double> &opTime,
+                            const std::map<OpType, int> &opCnt) const;
+};
+
+// TODO: change inheritance relation
+class CpuRuntimeObj : public RuntimeObj {
+  public:
+    CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
+
+    void run(const Graph &graph, bool tune = false,
+             bool profiling = false) const override;
+    void dealloc(void *ptr) override { return free(ptr); };
+
+    void *alloc(size_t size) override {
+        return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
+                      sizeof(uint64_t));
+    };
+};
+
+} // namespace infini
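How the refactor is meant to be used from client code: a graph is now constructed against a Runtime, and the same object both allocates tensor memory and dispatches kernels. A minimal end-to-end sketch, assembled from the updated tests at the end of this commit:

``` cpp
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/matmul.h"

using namespace infini;

int main() {
    Runtime runtime = make_ref<CpuRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
    auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
    g->dataMalloc(); // every tensor gets a Blob from `runtime`
    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    runtime->run(g); // replaces the old RunEngine(Device::CPU).run(g)
    matmul->getOutput()->printData();
    return 0;
}
```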
include/core/tensor.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "core/tensor_base.h"
+#include <cmath>

 namespace infini {

@@ -16,21 +17,46 @@ class TensorObj : public TensorBaseObj {
     string toString() const override;

     size_t size() const;
-    void dataMalloc();

     Shape getDims() const { return shape; }

     size_t getOffset(const Shape &ds) const;
     using TensorBaseObj::getData;
     VType getData(const Shape &pos) const;
-    void copyData(VType *dptr);
+    void dataMalloc(const Runtime &runtime);
+    // void copyData(VType *dptr);
+    template <typename T> void copyData(const T *dptr);
     void copyData(vector<VType> dataVector);
+    void copyData(vector<float> dataVector);
     void printData() const;
+    // TODO: merge these methods
     bool equalData(const Tensor &rhs) const;
-    void
-    setData(std::function<void(void *, size_t, DataType)> generator) const {
-        generator((void *)(data.get()), size(), dtype);
+    template <typename T> bool equalData(const Tensor &rhs) const {
+        IT_ASSERT(data != nullptr);
+        IT_ASSERT(rhs->data != nullptr);
+        // TODO: deal with data type
+        auto ptr = data->getPtr<T *>();
+        auto ptrRhs = rhs->data->getPtr<T *>();
+        if (shape != rhs->getDims())
+            return false;
+        size_t sz = size();
+        for (size_t i = 0; i < sz; ++i)
+            if (fabs(ptr[i] - ptrRhs[i]) /
+                    std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
+                1e-6) {
+                printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
+                return false;
+            }
+        return true;
     }
+    void setData(
+        const std::function<void(void *, size_t, DataType)> &generator) const {
+        generator(data->getPtr<void *>(), size(), dtype);
+    }
+
+  private:
+    void printDataFloat() const;
+    void printDataUint32_t() const;
     // void setDims(const Dim &dms) { dims = dms; }

     // bool dataRand(int seed = 0) {
include/core/tensor_base.h
@@ -1,19 +1,23 @@
 #pragma once
+#include "core/blob.h"
 #include "core/object.h"
 #include "core/ref.h"

 namespace infini {

-// class Tensor;
 class TensorBaseObj;
 class TensorObj;
 class OperatorObj;
 class GraphObj;
+class RuntimeObj;
+class BlobObj;

 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
 using Operator = Ref<OperatorObj>;
 using Graph = Ref<GraphObj>;
+using Runtime = Ref<RuntimeObj>;
+using Blob = Ref<BlobObj>;

 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@@ -40,8 +44,7 @@ class TensorBaseObj : public Object {
     DataType dtype;
     vector<WRef<TensorBaseObj>> inputOf;
     WRef<TensorBaseObj> outputOf;
-    // TODO: Ref<void> -> Ref<Blob>
-    Ref<VType[]> data;
+    Blob data;
     // ComputeState computed;
     // static int random_seed[256 * 16];
     // static bool random_inited;
@@ -50,7 +53,15 @@ class TensorBaseObj : public Object {
     TensorBaseObj(int dim, DataType dtype);
     virtual ~TensorBaseObj() {}

-    Ref<VType[]> getDataPtr() const { return data; }
+    void dataMalloc(const Blob &blob) {
+        IT_ASSERT(data == nullptr);
+        data = blob;
+    }
+    Blob getDataPtr() const { return data; }
+    template <typename T> T getDataRawPtr() const {
+        IT_ASSERT(data != nullptr);
+        return data->getPtr<T>();
+    }
     VType getData(size_t offset) const;

     DataType getDType() const { return dtype; }
include/cuda/cuda_common.h (new file)
@@ -0,0 +1,111 @@
+#pragma once
+#include "core/common.h"
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cudnn.h>
+#include <curand.h>
+
+// TODO: replace with Exception (IT_ASSERT)
+#define checkCudaError(call)                                                  \
+    {                                                                         \
+        auto err = call;                                                      \
+        if (cudaSuccess != err) {                                             \
+            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__,          \
+                    __LINE__, cudaGetErrorString(err));                       \
+            exit(EXIT_FAILURE);                                               \
+        }                                                                     \
+    }
+
+#define checkCublasError(call)                                                \
+    {                                                                         \
+        auto err = call;                                                      \
+        if (CUBLAS_STATUS_SUCCESS != err) {                                   \
+            fprintf(stderr, "cuBLAS error in %s:%i : %s.\n", __FILE__,        \
+                    __LINE__, cublasGetErrorString(err));                     \
+            exit(EXIT_FAILURE);                                               \
+        }                                                                     \
+    }
+
+#define checkCudnnError(call)                                                 \
+    {                                                                         \
+        auto err = call;                                                      \
+        if (CUDNN_STATUS_SUCCESS != err) {                                    \
+            fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__,         \
+                    __LINE__, cudnnGetErrorString(err));                      \
+            exit(EXIT_FAILURE);                                               \
+        }                                                                     \
+    }
+
+#define checkCurandError(call)                                                \
+    {                                                                         \
+        auto err = call;                                                      \
+        if (CURAND_STATUS_SUCCESS != err) {                                   \
+            fprintf(stderr, "cuRAND error in %s:%i : %s.\n", __FILE__,        \
+                    __LINE__, curandGetErrorString(err));                     \
+            exit(EXIT_FAILURE);                                               \
+        }                                                                     \
+    }
+
+namespace infini {
+
+inline const char *cublasGetErrorString(cublasStatus_t error) {
+    switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+        return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+        return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+        return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:
+        return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+        return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+        return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+        return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+        return "CUBLAS_STATUS_NOT_SUPPORTED";
+    case CUBLAS_STATUS_LICENSE_ERROR:
+        return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "<unknown>";
+}
+
+inline const char *curandGetErrorString(curandStatus_t error) {
+    switch (error) {
+    case CURAND_STATUS_SUCCESS:
+        return "CURAND_STATUS_SUCCESS";
+    case CURAND_STATUS_VERSION_MISMATCH:
+        return "CURAND_STATUS_VERSION_MISMATCH";
+    case CURAND_STATUS_NOT_INITIALIZED:
+        return "CURAND_STATUS_NOT_INITIALIZED";
+    case CURAND_STATUS_ALLOCATION_FAILED:
+        return "CURAND_STATUS_ALLOCATION_FAILED";
+    case CURAND_STATUS_TYPE_ERROR:
+        return "CURAND_STATUS_TYPE_ERROR";
+    case CURAND_STATUS_OUT_OF_RANGE:
+        return "CURAND_STATUS_OUT_OF_RANGE";
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+        return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+        return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case CURAND_STATUS_LAUNCH_FAILURE:
+        return "CURAND_STATUS_LAUNCH_FAILURE";
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+        return "CURAND_STATUS_PREEXISTING_FAILURE";
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+        return "CURAND_STATUS_INITIALIZATION_FAILED";
+    case CURAND_STATUS_ARCH_MISMATCH:
+        return "CURAND_STATUS_ARCH_MISMATCH";
+    case CURAND_STATUS_INTERNAL_ERROR:
+        return "CURAND_STATUS_INTERNAL_ERROR";
+    }
+    return "<unknown>";
+}
+
+using CudaPtr = void *;
+
+} // namespace infini
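These macros wrap every CUDA-family call so that a failing status aborts with the file and line of the call site. A small sketch of the intended call pattern (the buffer size is arbitrary):

``` cpp
#include "cuda/cuda_common.h"

int main() {
    void *buf = nullptr;
    // Any cudaError_t expression can be wrapped; on failure the macro
    // prints file:line plus the error string and exits.
    checkCudaError(cudaMalloc(&buf, 1024));
    checkCudaError(cudaMemset(buf, 0, 1024));
    checkCudaError(cudaFree(buf));
    return 0;
}
```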
include/cuda/cuda_runtime.h (new file)
@@ -0,0 +1,49 @@
+#pragma once
+#include "core/runtime.h"
+#include "cuda/cuda_common.h"
+
+namespace infini {
+
+class CudaRuntimeObj : public RuntimeObj {
+  private:
+    cudnnHandle_t cudnn;
+    cublasHandle_t cublas;
+    CudaPtr workspace;
+    size_t workspaceSize;
+
+  public:
+    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
+        checkCudnnError(cudnnCreate(&cudnn));
+        checkCublasError(cublasCreate(&cublas));
+        // 10GB for Longformer
+        // size_t longformerNum = 3lu * (1 << 30);
+        workspaceSize = 7ll << 30; // 7 GB
+        workspace = alloc(workspaceSize);
+    }
+    virtual ~CudaRuntimeObj() {
+        checkCudnnError(cudnnDestroy(cudnn));
+        checkCublasError(cublasDestroy(cublas));
+    }
+
+    void run(const Graph &graph, bool tune = false,
+             bool profiling = false) const;
+    // double runEvaluation(const Graph &graph, int nWarmups,
+    //                      int nEvaluations) const;
+    void sync() const;
+    CudaPtr alloc(size_t size) override {
+        void *ptr;
+        checkCudaError(cudaMalloc(&ptr, size));
+        return ptr;
+    }
+    void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
+    cudnnHandle_t cudnnHandle() const { return cudnn; }
+    cublasHandle_t cublasHandle() const { return cublas; }
+    CudaPtr getWorkspace(size_t size) const {
+        IT_ASSERT(size <= workspaceSize);
+        return workspace;
+    }
+
+  private:
+    void runWithoutSync(const Graph &graph) const;
+};
+} // namespace infini
include/cuda/cuda_utility.h (new file)
@@ -0,0 +1,11 @@
+#include "core/tensor.h"
+
+namespace infini {
+
+void cudaPrintFloat(float *x, int len);
+
+void cudaPrintTensor(const Tensor &tensor) {
+    cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
+}
+
+} // namespace infini
include/test.h
@@ -5,12 +5,14 @@

 namespace infini {

+// TODO: isolate these class
 class DataGenerator {
   private:
-    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); };
-    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); };
+    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }
+    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }

   public:
+    virtual ~DataGenerator() {}
     void operator()(void *data, size_t size, DataType dataType) {
         switch (dataType) {
         case DataType::UInt32:
@@ -26,10 +28,36 @@ class DataGenerator {
 };

 class IncrementalGenerator : public DataGenerator {
-    void fill(uint32_t *data, size_t size) override {
+  public:
+    virtual ~IncrementalGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
         for (size_t i = 0; i < size; i++) {
             data[i] = i;
         }
     }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
+};
+
+class OneGenerator : public DataGenerator {
+  public:
+    virtual ~OneGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = 1;
+        }
+    }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
 };
 } // namespace infini
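The generators plug into TensorObj::setData, which hands the raw blob pointer, element count, and dtype to the functor; the switch in operator() dispatches to the typed fill. A short sketch of the intended use, assuming a graph and runtime set up as in the tests below:

``` cpp
Runtime runtime = make_ref<CpuRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Tensor t = g->addTensor({2, 3}, DataType::Float32);
g->dataMalloc();
// Fills the tensor with 0, 1, 2, ... via the dtype dispatch in operator().
t->setData(IncrementalGenerator());
t->printData();
```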
src/core/blob.cc (new file)
@@ -0,0 +1,10 @@
+#include "core/runtime.h"
+
+namespace infini {
+
+BlobObj::~BlobObj() {
+    // Avoid cycled inclusion
+    runtime->dealloc(ptr);
+}
+
+} // namespace infini
src/core/common.cc
@@ -4,11 +4,20 @@

 namespace infini {

-double timeit(const std::function<void()> &func) {
+double timeit(const std::function<void()> &func, int warmupRounds,
+              int timingRounds, const std::function<void(void)> &sync) {
+    for (int i = 0; i < warmupRounds; ++i)
+        func();
+    if (sync)
+        sync();
     auto start = std::chrono::high_resolution_clock::now();
-    func();
+    for (int i = 0; i < timingRounds; ++i)
+        func();
+    if (sync)
+        sync();
     auto end = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double, std::milli>(end - start).count();
+    return std::chrono::duration<double, std::milli>(end - start).count() /
+           timingRounds;
 }

 } // namespace infini
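The warm-up and repetition fix matters most for asynchronous backends: the optional sync callback lets the caller drain the device queue before each clock read, so launch latency is not mistaken for kernel time, and the result is the mean over timingRounds runs in milliseconds. A usage sketch:

``` cpp
// Average over 200 timed runs after 200 warm-ups (the defaults), and
// synchronize the device around the timed region so the measured
// interval covers actual device work.
double ms = timeit([&]() { kernel->compute(op, context); },
                   /*warmupRounds=*/200, /*timingRounds=*/200,
                   []() { cudaDeviceSynchronize(); });
```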
src/core/graph.cc
@@ -13,8 +13,9 @@ string GraphObj::toString() const {
 }

 void GraphObj::dataMalloc() {
-    for (auto &tensor : tensors)
-        tensor->dataMalloc();
+    for (auto &tensor : tensors) {
+        tensor->dataMalloc(runtime);
+    }
 }

 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
src/core/runtime.cc
@@ -1,9 +1,14 @@
-#include "core/run_enigne.h"
+#include "core/runtime.h"
+#include "core/blob.h"
 #include <chrono>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cudnn.h>
+#include <curand.h>

 namespace infini {

-void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
+void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
     if (!tune && profiling)
         IT_TODO_HALT();
     const auto &kernelRegistry = KernelRegistry::getInstance();
@@ -12,7 +17,6 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
     double totalTime = 0;
     std::map<OpType, double> opTime;
     std::map<OpType, int> opCnt;
-    std::chrono::system_clock::time_point begin, end;

     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
@@ -24,7 +28,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {

         // If no record and disable tuning, run with the default argument
         if (!perfData && !tune) {
-            kernel->compute(op);
+            kernel->compute(op, this);
             continue;
         }

@@ -32,16 +36,18 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
         PerfRecord record;
         // Tune the kernel if there is no record
         if (!perfData) {
-            record = kernel->tune(op);
+            // TODO: record is not used
+            record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
         } else
             record = *perfData;

         if (!profiling) {
-            kernel->compute(op, *perfData);
+            kernel->compute(op, *perfData, this);
             continue;
         } else {
-            double t = timeit([&]() { kernel->compute(op, *perfData); });
+            double t =
+                timeit([&]() { kernel->compute(op, *perfData, this); }, 1, 1);
             op->print();
             printf(" op_time %lf\n", t);
             totalTime += t;
@@ -53,7 +59,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
         printProfilingData(totalTime, opTime, opCnt);
 }

-double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
+double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto perfEngine = PerfEngine::getInstance();
     // Statistics
@@ -72,7 +78,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
         PerfRecord record;
         // Tune the kernel if there is no record
         if (!perfData) {
-            record = kernel->tune(op);
+            record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
         } else
             record = *perfData;
@@ -91,9 +97,9 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
     return totalTime;
 }

-void RunEngine::printProfilingData(double totalTime,
+void RuntimeObj::printProfilingData(double totalTime,
                                    const std::map<OpType, double> &opTime,
                                    const std::map<OpType, int> &opCnt) const {
     printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
     for (const auto &[type, t] : opTime) {
         printf("%11s %3d %7.3f %7.1f %7.3f\n",
@@ -102,4 +108,8 @@ void RunEngine::printProfilingData(double totalTime,
     }
 }

+Blob RuntimeObj::allocBlob(size_t size) {
+    return make_ref<BlobObj>(shared_from_this(), alloc(size));
+}
+
 } // namespace infini
src/core/tensor.cc
@@ -1,15 +1,12 @@
-#include <core/tensor.h>
+#include "core/tensor.h"
+#include "core/blob.h"
+#include "core/runtime.h"

 namespace infini {

 TensorObj::TensorObj(const Shape &shape, DataType dtype)
     : TensorBaseObj(shape.size(), dtype), shape(shape) {}

-void TensorObj::dataMalloc() {
-    IT_ASSERT(data == nullptr);
-    // initialized to zero
-    data.reset(reinterpret_cast<VType *>(calloc(size(), sizeof(VType))));
-}
-
 VType TensorObj::getData(const Shape &pos) const {
     return getData(getOffset(pos));
 }
@@ -37,24 +34,42 @@ size_t TensorObj::size() const {
     return ret;
 }

-void TensorObj::copyData(VType *dptr) {
+template <typename T> void TensorObj::copyData(const T *dptr) {
+    // TODO: cuda
     IT_ASSERT(data != nullptr);
+    auto ptr = data->getPtr<T *>();
     size_t sz = size();
 #pragma omp parallel for
     for (size_t i = 0; i < sz; ++i) {
-        data[i] = dptr[i];
+        ptr[i] = dptr[i];
     }
 }

 void TensorObj::copyData(vector<VType> dataVector) {
     IT_ASSERT(dataVector.size() >= size());
     copyData(dataVector.data());
 }

+void TensorObj::copyData(vector<float> dataVector) {
+    IT_ASSERT(dataVector.size() >= size());
+    copyData(dataVector.data());
+}
+
 void TensorObj::printData() const {
     IT_ASSERT(data != nullptr);
+    if (dtype == DataType::Float32)
+        printDataFloat();
+    else if (dtype == DataType::UInt32)
+        printDataUint32_t();
+    else
+        IT_TODO_HALT();
+}
+
+void TensorObj::printDataFloat() const {
     std::cout << "Tensor: " << guid << std::endl;
     auto numDims = shape.size();
     auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<float *>();
     dimSzVec[numDims - 1] = shape[numDims - 1];
     for (int i = numDims - 1; i != 0; --i)
         dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
@@ -64,7 +79,35 @@ void TensorObj::printData() const {
                 std::cout << "[";
             }
         }
-        std::cout << data[i];
+        printf("%.1f", ptr[i]);
+        for (size_t j = 0; j < numDims; ++j) {
+            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
+                std::cout << "]";
+            }
+        }
+        if (i != size() - 1)
+            std::cout << ", ";
+        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
+            std::cout << std::endl;
+    }
+}
+
+void TensorObj::printDataUint32_t() const {
+    IT_ASSERT(data != nullptr);
+    std::cout << "Tensor: " << guid << std::endl;
+    auto numDims = shape.size();
+    auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<VType *>();
+    dimSzVec[numDims - 1] = shape[numDims - 1];
+    for (int i = numDims - 1; i != 0; --i)
+        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+        for (size_t j = 0; j < numDims; ++j) {
+            if (i % dimSzVec[j] == 0) {
+                std::cout << "[";
+            }
+        }
+        std::cout << ptr[i];
         for (size_t j = 0; j < numDims; ++j) {
             if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                 std::cout << "]";
@@ -80,13 +123,26 @@ void TensorObj::printData() const {
 bool TensorObj::equalData(const Tensor &rhs) const {
     IT_ASSERT(data != nullptr);
     IT_ASSERT(rhs->data != nullptr);
+    // TODO: deal with data type
+    auto ptr = data->getPtr<VType *>();
+    auto ptrRhs = rhs->data->getPtr<VType *>();
     if (shape != rhs->getDims())
         return false;
     size_t sz = size();
     for (size_t i = 0; i < sz; ++i)
-        if (data[i] != rhs->data[i])
+        if (ptr[i] != ptrRhs[i])
             return false;
     return true;
 }

+void TensorObj::dataMalloc(const Runtime &runtime) {
+    IT_ASSERT(data == nullptr);
+    size_t bytesPerElement;
+    if (getDType() == DataType::Float32)
+        bytesPerElement = sizeof(float);
+    else if (getDType() == DataType::UInt32)
+        bytesPerElement = sizeof(uint32_t);
+    data = runtime->allocBlob(size() * bytesPerElement);
+}
+
 }; // namespace infini
src/core/tensor_base.cc
@@ -1,9 +1,14 @@
-#include <core/tensor_base.h>
+#include "core/tensor_base.h"
+#include "core/blob.h"
+#include "core/runtime.h"
 namespace infini {

 TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
     : dim(dim), dtype(dtype) {}

-VType TensorBaseObj::getData(size_t offset) const { return data[offset]; }
+VType TensorBaseObj::getData(size_t offset) const {
+    // TODO: check cuda array
+    return (data->getPtr<VType *>())[offset];
+}
+
 }; // namespace infini
src/cuda/cuda_runtime.cc (new file)
@@ -0,0 +1,32 @@
+#include "cuda/cuda_runtime.h"
+
+namespace infini {
+
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto perfEngine = PerfEngine::getInstance();
+
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
+        if (perfData)
+            kernel->compute(op, *perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+    if (tune || profiling)
+        IT_TODO_HALT();
+    runWithoutSync(graph);
+    sync();
+}
+
+void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
+
+} // namespace infini
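The run/runWithoutSync split keeps every kernel launch asynchronous and pays for a single device-wide synchronization at the end of the graph. A hedged sketch of driving the CUDA runtime; it mirrors the CPU flow in the tests below, and since host-to-device copy helpers are not part of this commit, the sketch stops at sync:

``` cpp
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(cuda);
// ... add tensors and operators, then g->dataMalloc() as on the CPU path;
// blobs now come from cudaMalloc via CudaRuntimeObj::alloc.
cuda->run(g); // launches every kernel, then cudaDeviceSynchronize()
```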
src/cuda/cuda_utility.cu (new file)
@@ -0,0 +1,20 @@
+#include "cuda/cuda_common.h"
+#include <cstdio>
+
+__global__ void cudaPrintFloatImpl(float *x, int len) {
+    int start = threadIdx.x + blockDim.x * blockIdx.x;
+    if (start == 0) {
+        for (int i = 0; i < len; ++i) {
+            printf("%.3f ", x[i]);
+        }
+        printf("\n");
+    }
+}
+
+namespace infini {
+
+void cudaPrintFloat(float *x, int len) {
+    cudaPrintFloatImpl<<<1, 1>>>(x, len);
+    cudaDeviceSynchronize();
+}
+} // namespace infini
src/kernels/cpu/conv.cc
@@ -4,11 +4,12 @@
 namespace infini {

 template <typename T> class NaiveConv : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
+                 const RuntimeObj *context) const override {
         auto op = as<ConvObj>(_op);
-        T *iptr = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
-        T *wptr = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
-        T *optr = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
+        T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
+        T *optr = op->getOutput()->getDataRawPtr<T *>();
         auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
         auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
         int cpg = op->getChannelPerGroup();
@@ -45,10 +46,13 @@ template <typename T> class NaiveConv : public Kernel {
         }
     }

-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        compute(op, {}, context);
+    }

-    PerfRecord tune(const Operator &op) const override {
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+    PerfRecord tune(const Operator &op,
+                    const RuntimeObj *context) const override {
+        return PerfRecord(timeit([&]() { compute(op, context); }));
     }
 };
src/kernels/cpu/matmul.cc
@@ -4,11 +4,12 @@
 namespace infini {

 template <typename T> class NaiveMatmul : public Kernel {
-    void compute(const Operator &_op, const PerfRecord &record) const override {
+    void compute(const Operator &_op, const PerfRecord &record,
+                 const RuntimeObj *context) const override {
         auto op = as<MatmulObj>(_op);
-        T *A = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
-        T *B = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
-        T *C = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
+        T *A = op->getInputs(0)->getDataRawPtr<T *>();
+        T *B = op->getInputs(1)->getDataRawPtr<T *>();
+        T *C = op->getOutput()->getDataRawPtr<T *>();
         IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
         IT_ASSERT(op->getAct() == ActType::None);
         IT_ASSERT(op->getB() == 1);
@@ -23,10 +24,15 @@ template <typename T> class NaiveMatmul : public Kernel {
         }
     }

-    void compute(const Operator &op) const override { compute(op, {}); }
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        compute(op, {}, context);
+    }

-    PerfRecord tune(const Operator &op) const override {
-        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
+    PerfRecord tune(const Operator &op,
+                    const RuntimeObj *context) const override {
+        PerfRecord ret;
+        ret.time = timeit([&]() { compute(op, context); });
+        return ret;
     }
 };
src/kernels/cuda/conv.cc (new file)
@@ -0,0 +1,208 @@
+#include "operators/conv.h"
+#include "core/kernel.h"
+#include "cuda/cuda_runtime.h"
+
+namespace infini {
+
+static constexpr int N_ALGO = 8;
+static constexpr cudnnConvolutionFwdAlgo_t ALGOS[N_ALGO] = {
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
+
+struct ConvCuDnnPerfRecord : public PerfRecord {
+    int algo = 0; // cudnnConvolutionFwdAlgo_t
+    size_t workspaceSize = 100000;
+    bool fuseAct = false;
+};
+
+class convCudnn : public Kernel {
+
+    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
+                      const CudaRuntimeObj *context) const {
+        cudnnStatus_t stat;
+        void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
+        void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
+        if (op->getInputs(2) != nullptr)
+            IT_TODO_HALT();
+        // void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
+        void *const outData = (op->getOutput()->getDataRawPtr<void *>());
+
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+
+        int channelsPerGrp = cpg, channels = c;
+
+        // get inputs
+        cudnnTensorDescriptor_t inDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
+
+        // get kernels
+        cudnnFilterDescriptor_t knDesc;
+        checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
+        checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
+                                                   CUDNN_TENSOR_NCHW, f,
+                                                   channelsPerGrp, r, s));
+
+        // get bias
+        cudnnTensorDescriptor_t biasDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(
+            biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
+
+        // get convlution descriptor
+        cudnnConvolutionDescriptor_t convDesc;
+        checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
+        // TODO: CUDNN_CONVOLUTION is a tunable argument
+        checkCudnnError(cudnnSetConvolution2dDescriptor(
+            convDesc, ph, pw, sh, sw, dh, dw, CUDNN_CROSS_CORRELATION,
+            CUDNN_DATA_FLOAT));
+        if (g > 1) {
+            checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
+        }
+
+        // get activation descriptor
+        cudnnActivationDescriptor_t actDesc;
+        checkCudnnError(cudnnCreateActivationDescriptor(&actDesc));
+        // NOT_PROPAGATE_NAN is requierd by
+        // cudnnConvolotionBiasActivationForward
+        switch (op->getAct()) {
+        case ActType::Relu:
+            checkCudnnError(cudnnSetActivationDescriptor(
+                actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        case ActType::Sigmoid:
+            checkCudnnError(cudnnSetActivationDescriptor(
+                actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        case ActType::None:
+            checkCudnnError(
+                cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY,
+                                             CUDNN_NOT_PROPAGATE_NAN, 0));
+            break;
+        default:
+            assert(false);
+        }
+
+        // get outputs
+        int outn, outc, outh, outw;
+        checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
+            convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
+        cudnnTensorDescriptor_t outDesc;
+        checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
+        checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
+                                                   CUDNN_DATA_FLOAT, outn, outc,
+                                                   outh, outw));
+        IT_ASSERT((vector{outn, outc, outh, outw}) ==
+                      op->getOutput()->getDims(),
+                  "cuDNN output shape mismatches with OP output shape");
+
+        // get workspace
+        size_t wsSize = record.workspaceSize;
+        stat = cudnnGetConvolutionForwardWorkspaceSize(
+            context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
+            ALGOS[record.algo], &wsSize);
+        if (stat != CUDNN_STATUS_SUCCESS)
+            return false;
+        // assert(wsSize < (size_t)3 * 1024 * 1024 * 1024);
+        // if (wsSize >= (size_t)10 * 1024 * 1024 * 1024)
+        //     continue;
+        CudaPtr wsData = context->getWorkspace(wsSize);
+        float alpha = 1.f, beta = 0.f;
+
+        stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc,
+                                       inData, knDesc, knData, convDesc,
+                                       ALGOS[record.algo], wsData, wsSize,
+                                       &beta, outDesc, outData);
+        if (stat != CUDNN_STATUS_SUCCESS)
+            return false;
+        // TODO:
+        // // bias
+        // if (bias != nullptr) {
+        //     auto sz = op.getOutputs()[0]->size();
+        //     // TODO: element wise
+        //     t += sz * 2 / 400;
+        // }
+        // // act
+        // if (act != None) {
+        //     stat = cudnnActivationForward(cudnnHandle(), actDesc,
+        //                                   &alpha, inDesc, inData,
+        //                                   &beta, outDesc, outData);
+        //     checkCudaError(cudaDeviceSynchronize());
+        //     end = ch::high_resolution_clock::now();
+        //     if (stat != CUDNN_STATUS_SUCCESS) {
+        //         durtime = INFINITY;
+        //         break;
+        //     }
+        //     t +=
+        //         ch::duration_cast<ch::duration<double>>(end -
+        //         beg).count() * 1000; // ms
+        // }
+
+        // best = ConvResult{durtime, ALGOS[i], wsSize, false};
+
+        // // w/ bias & act
+        // for (int j = 0; j < rounds + warmupRounds; ++j) {
+        //     cudnnStatus_t stat;
+        //     if (j == warmupRounds) {
+        //         checkCudaError(cudaDeviceSynchronize());
+        //         beg = ch::high_resolution_clock::now();
+        //     }
+        //     stat = cudnnConvolutionBiasActivationForward(
+        //         cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
+        //         convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc,
+        //         outData, biasDesc, biasData, actDesc, outDesc, outData);
+        //     if (stat != CUDNN_STATUS_SUCCESS) {
+        //         // checkCudnnError(stat);
+        //         // Do not checkCudnnError since not all algorithms are
+        //         // supported
+        //         durtime_fuse = INFINITY;
+        //         break;
+        //     }
+        // }
+
+        // Destories in CUDA does not require sync. But cuDNN does not state
+        // whether sync is required before destories.
+        checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
+        checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
+        checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
+        checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
+        return true;
+    }
+
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        ConvCuDnnPerfRecord record; // with paramters in default ctor
+        compute(op, record, context);
+    }
+
+    PerfRecord tune(const Operator &_op,
+                    const RuntimeObj *_context) const override {
+        // TODO: real tuning
+        ConvCuDnnPerfRecord ret;
+        ret.time = timeit([&]() { compute(_op, _context); });
+        return ret;
+    }
+
+    void compute(const Operator &_op, const PerfRecord &_record,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvObj>(_op);
+        auto &record = dynamic_cast<const ConvCuDnnPerfRecord &>(_record);
+        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
+        bool success = cuDNNUnfused(op, record, context);
+        IT_ASSERT(success);
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
+                "Conv_cuDNN_CUDA_Float32");
+
+} // namespace infini
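Registration ties the kernel to a (device, op type, data type) triple, so dispatch inside a runtime's run loop reduces to a registry lookup. A hedged sketch of the lookup path this enables, with the KernelAttrs layout as used in cuda_runtime.cc above:

``` cpp
// Inside a runtime's run loop: pick the CUDA float32 conv kernel that
// REGISTER_KERNEL published, then execute it with this runtime as context.
auto attrs = KernelAttrs{Device::CUDA, OpType::Conv, DataType::Float32};
Kernel *kernel = KernelRegistry::getInstance().getKernel(attrs);
kernel->compute(op, this);
```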
test/core/test_graph.cc
@@ -1,45 +1,48 @@
+#include "core/blob.h"
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"

 namespace infini {

 TEST(Graph, build_and_run) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
     Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
     Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
     g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
     g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
-    RunEngine(Device::CPU).run(g);
+    runtime->run(g);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->dataMalloc(runtime);
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
     EXPECT_TRUE(o0->equalData(ans));
 }

 TEST(Graph, perf_engine) {
-    Graph g = make_ref<GraphObj>();
+    Runtime runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
     Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
     auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);

     g->dataMalloc();
-    i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-    RunEngine(Device::CPU).run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
+    runtime->run(g, true, true);
+    double perfTime = runtime->getPerfTime(g);
     // The example matmul takes 0.0036ms with one core
     EXPECT_GT(perfTime, 0);
     EXPECT_LT(perfTime, 0.01);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
-    ans->dataMalloc();
-    ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
+    ans->dataMalloc(runtime);
+    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
     EXPECT_TRUE(matmul->getOutput()->equalData(ans));
 }
@@ -1,5 +1,5 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
 #include "operators/matmul.h"
 #include "test.h"

@@ -8,7 +8,7 @@ namespace infini {
 TEST(Hash, OperatorHash) {
     OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
     { // build with addOpWithOutputs
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
         Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
         Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
         Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
@@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) {
         EXPECT_GT(key1.attrs.size(), 5);
     }
     { // build with addOp
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(nullptr);
         Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
         auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
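For context, `OpPerfKey` fingerprints an operator so tuned-kernel timings can be cached and reused; the two blocks build comparable operators through `addOpWithOutputs` and `addOp`. A hypothetical sketch of the lookup such a key enables (assumes `OpPerfKey` is ordered or hashable; none of this is code from the diff):

```cpp
#include <map>

// Hypothetical perf-record table keyed by OpPerfKey.
std::map<OpPerfKey, double> perfMs;
perfMs[key1] = 0.0036;              // record a tuned time under key1
bool hit = perfMs.count(key2) > 0;  // a structurally identical op -> cache hit
```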
@@ -1,21 +1,24 @@
 #include "core/graph.h"
-#include "core/run_enigne.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
 #include "operators/conv.h"
 #include "test.h"

 namespace infini {

 TEST(Conv, ShapeInference) {
+    auto runtime = make_ref<CpuRuntimeObj>();
     // Padding modes
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
     }
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv =
@@ -23,7 +26,7 @@ TEST(Conv, ShapeInference) {
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
     }
     {
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv =
@@ -31,7 +34,7 @@ TEST(Conv, ShapeInference) {
         EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2}));
     }
     { // dilation & stride
-        Graph g = make_ref<GraphObj>();
+        Graph g = make_ref<GraphObj>(runtime);
         Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
         Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
         auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@@ -40,7 +43,8 @@ TEST(Conv, ShapeInference) {
 }

 TEST(Conv, NaiveCPU) {
-    Graph g = make_ref<GraphObj>();
+    auto runtime = make_ref<CpuRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
     Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
     Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
     auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
@@ -48,16 +52,75 @@
     g->dataMalloc();
     i0->setData(IncrementalGenerator());
     w0->setData(IncrementalGenerator());
-    RunEngine(Device::CPU).run(g, true, true);
+    runtime->run(g, true, true);
-    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
+    double perfTime = runtime->getPerfTime(g);
-    // The example matmul takes 0.0036ms with one core
+    // The example Conv takes 0.015ms with one core
     EXPECT_GT(perfTime, 0);
-    EXPECT_LT(perfTime, 5);
+    EXPECT_LT(perfTime, 0.1);
     // check answer
     auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
-    ans->dataMalloc();
+    ans->dataMalloc(runtime);
-    ans->copyData({4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
+    ans->copyData(
+        vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
     EXPECT_TRUE(conv->getOutput()->equalData(ans));
 }
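As a cross-check on the shapes asserted above, the usual convolution output arithmetic (an assumption about `ConvObj`'s semantics, but consistent with every `EXPECT_EQ` in this test):

```cpp
// 4x4 input, 3x3 kernel: pad 1/stride 1 -> 4x4; pad 1/stride 2 -> 2x2.
// With dilation d, the kernel's effective extent grows to d*(k-1)+1.
int convOutDim(int in, int kernel, int pad, int stride, int dilation) {
    int effKernel = dilation * (kernel - 1) + 1;
    return (in + 2 * pad - effKernel) / stride + 1;
}
// convOutDim(4, 3, 1, 1, 1) == 4    convOutDim(4, 3, 1, 2, 1) == 2
```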
+void testConvCudnn(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    vector<float> ansVec) {
+    auto cpuRuntime = make_ref<CpuRuntimeObj>();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    // Build CUDA graph
+    Graph g = make_ref<GraphObj>(cudaRuntime);
+    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
+    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
+    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
+
+    // allocate CUDA memory
+    g->dataMalloc();
+
+    // Build input and output data on CPU
+    auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
+    cpui0->dataMalloc(cpuRuntime);
+    cpui0->setData(generator);
+
+    auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
+    cpuw0->dataMalloc(cpuRuntime);
+    cpuw0->setData(generator);
+
+    auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
+    ans->dataMalloc(cpuRuntime);
+    ans->copyData(ansVec);
+
+    // Copy inputs from CPU to CUDA
+    cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
+               cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
+               cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
+    // Execute on CUDA
+    cudaRuntime->run(g);
+    // double perfTime = cudaRuntime->getPerfTime(g);
+    // // The example Conv takes 0.015ms with one core
+    // EXPECT_GT(perfTime, 0);
+    // EXPECT_LT(perfTime, 0.1);
+
+    // copy CUDA output to CPU
+    auto o0 = conv->getOutput();
+    auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
+    cpuo0->dataMalloc(cpuRuntime);
+    cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
+               conv->getOutput()->getDataRawPtr<void *>(),
+               cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // check results on CPU
+    EXPECT_TRUE(cpuo0->equalData<float>(ans));
+}
+
+TEST(Conv, cuDNN) {
+    testConvCudnn(OneGenerator(),
+                  vector<float>{12, 12, 18, 18, 12, 12, 18, 18});
+    testConvCudnn(
+        IncrementalGenerator(),
+        vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
+}
 } // namespace infini
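One note on `testConvCudnn`: the raw `cudaMemcpy` calls discard their `cudaError_t` return values. A common hardening, shown here as a hypothetical helper that is not part of this commit:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: fail fast with a readable message if a CUDA call errors.
#define CHECK_CUDA(call)                                                       \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                        \
                    cudaGetErrorString(err_), __FILE__, __LINE__);             \
            abort();                                                           \
        }                                                                      \
    } while (0)

// Usage: CHECK_CUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
```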
@@ -0,0 +1,2 @@
+spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
+export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc
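This two-line environment script is why the CMakeLists TODO above says the env var "takes effect": CMake's CUDA language support reads `CUDAHOSTCXX` to choose nvcc's host compiler. The script has to be sourced, not executed, so the variables land in the shell that later runs CMake; a quick check under that assumption:

``` bash
# After sourcing the script in the current shell (paths are Lotus-specific):
echo "$CUDAHOSTCXX"   # the gcc 9.4.0 nvcc will use as its host compiler
which nvcc            # provided by `spack load cuda@11.0.2`
```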