Extended DataType class and Runtime interaction (#9)

* Add: DataType class

* Add: data-type-oblivious tensor interface

* Rename: copyBlobToCPU

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
zhengly123 2022-08-23 16:55:59 +08:00 committed by GitHub
parent bd5934279b
commit af08df32d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 223 additions and 117 deletions

View File

@ -45,9 +45,10 @@ using HashType = uint64_t; // compatible with std::hash
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
"] Assertion failed (" + #name + "): " + #info))
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
#define IT_TODO_HALT() IT_ASSERT(false, "Unimplemented")
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg)
#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" __LINE__)
// Other utilities

34
include/core/data_type.h Normal file
View File

@ -0,0 +1,34 @@
#include "core/common.h"
namespace infini {
class DataType {
public:
static const DataType Float32;
static const DataType UInt32;
static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
static constexpr std::string_view names[]{"Float32", "UInt32"};
private:
int index;
public:
constexpr DataType(int index) : index(index) {}
bool operator==(const DataType &rhs) const { return index == rhs.index; }
bool operator<(const DataType &rhs) const { return index < rhs.index; }
template <typename T> static DataType get() {
IT_TODO_HALT_MSG("Unsupported data type");
}
size_t getSize() const { return sizePerElement[index]; }
string toString() const { return string(names[index]); }
};
inline const DataType DataType::Float32(0);
inline const DataType DataType::UInt32(1);
// Method definitions are out of the declaration due to GCC bug:
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
template <> inline DataType DataType::get<float>() { return Float32; }
template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
} // namespace infini

View File

@ -37,8 +37,6 @@ enum class OpType {
MemBound = 300,
};
enum class Device { CPU = 1, CUDA };
using KernelAttrs = std::tuple<Device, OpType, DataType>;
class OpRegistry {

View File

@ -1,9 +1,33 @@
#pragma once
#include "core/graph.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/common.h"
#include "core/ref.h"
#include <memory>
namespace infini {
/***************** Forward declaration begin *****************/
class TensorBaseObj;
class TensorObj;
class OperatorObj;
class GraphObj;
class RuntimeObj;
class BlobObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
using Operator = Ref<OperatorObj>;
using Graph = Ref<GraphObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
enum class OpType;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;
using VType = uint32_t;
enum class Device { CPU = 1, CUDA };
/***************** Forward declaration end *****************/
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
protected:
Device device;
@ -37,17 +61,27 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
*/
double getPerfTime(const Graph &graph, bool profiling = false) const;
Blob allocBlob(size_t size);
bool isCpu() const { return device == Device::CPU; }
bool isCuda() const { return device == Device::CUDA; }
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
protected:
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt) const;
virtual void copyBlobFromCPU(void *dst, void *src, size_t bytes) const = 0;
virtual void copyBlobToCPU(void *dst, void *src, size_t bytes) const = 0;
virtual void copyBlobInsideRuntime(void *dst, void *src,
size_t bytes) const = 0;
};
// TODO: change inheritance relation
class CpuRuntimeObj : public RuntimeObj {
public:
CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
static Ref<CpuRuntimeObj> &getInstance() {
static Ref<CpuRuntimeObj> instance = make_ref<CpuRuntimeObj>();
return instance;
}
void run(const Graph &graph, bool tune = false,
bool profiling = false) const override;
@ -57,6 +91,11 @@ class CpuRuntimeObj : public RuntimeObj {
return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
sizeof(uint64_t));
};
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override;
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override;
void copyBlobInsideRuntime(void *dst, void *src,
size_t bytes) const override;
};
} // namespace infini

View File

@ -12,11 +12,12 @@ class TensorObj : public TensorBaseObj {
Shape shape;
public:
TensorObj(const Shape &shape, DataType dtype);
TensorObj(const Shape &shape, DataType dtype, Runtime runtime);
virtual ~TensorObj() {}
string toString() const override;
size_t size() const;
size_t getBytes() const;
Shape getDims() const { return shape; }
@ -24,39 +25,40 @@ class TensorObj : public TensorBaseObj {
using TensorBaseObj::getData;
VType getData(const Shape &pos) const;
void dataMalloc(const Runtime &runtime);
// void copyData(VType *dptr);
template <typename T> void copyData(const T *dptr);
void copyData(vector<VType> dataVector);
void copyData(vector<float> dataVector);
void printData() const;
// TODO: merge these methods
bool equalData(const Tensor &rhs) const;
template <typename T> bool equalData(const Tensor &rhs) const {
template <typename T> void copyData(const T *dptr) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
// TODO: deal with data type
if (!runtime->isCpu())
IT_TODO_HALT();
auto ptr = data->getPtr<T *>();
auto ptrRhs = rhs->data->getPtr<T *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (fabs(ptr[i] - ptrRhs[i]) /
std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
return false;
}
return true;
#pragma omp parallel for
for (size_t i = 0; i < sz; ++i) {
ptr[i] = dptr[i];
}
}
template <typename T> void copyData(vector<T> dataVector) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(dataVector.size() >= size());
copyData(dataVector.data());
}
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
void setData(
const std::function<void(void *, size_t, DataType)> &generator) const {
generator(data->getPtr<void *>(), size(), dtype);
}
void printData() const;
bool equalData(const Tensor &rhs) const;
private:
void printDataFloat() const;
void printDataUint32_t() const;
template <typename T> bool equalDataInt(const Tensor &rhs) const;
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
// void setDims(const Dim &dms) { dims = dms; }
// bool dataRand(int seed = 0) {

View File

@ -1,34 +1,11 @@
#pragma once
#include "core/blob.h"
#include "core/data_type.h"
#include "core/object.h"
#include "core/ref.h"
#include "core/runtime.h"
namespace infini {
class TensorBaseObj;
class TensorObj;
class OperatorObj;
class GraphObj;
class RuntimeObj;
class BlobObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
using Operator = Ref<OperatorObj>;
using Graph = Ref<GraphObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;
using VType = uint32_t;
enum class DataType {
Float32,
UInt32,
};
class TensorBaseObj : public Object {
public:
// enum TensorType {
@ -45,12 +22,10 @@ class TensorBaseObj : public Object {
vector<WRef<TensorBaseObj>> inputOf;
WRef<TensorBaseObj> outputOf;
Blob data;
// ComputeState computed;
// static int random_seed[256 * 16];
// static bool random_inited;
Runtime runtime;
public:
TensorBaseObj(int dim, DataType dtype);
TensorBaseObj(int dim, DataType dtype, Runtime runtime);
virtual ~TensorBaseObj() {}
void dataMalloc(const Blob &blob) {
@ -65,6 +40,7 @@ class TensorBaseObj : public Object {
VType getData(size_t offset) const;
DataType getDType() const { return dtype; }
Runtime getRuntime() const { return runtime; }
// uint64_t getHash() const { return hash; }

View File

@ -43,6 +43,19 @@ class CudaRuntimeObj : public RuntimeObj {
return workspace;
}
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
}
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
}
void copyBlobInsideRuntime(void *dst, void *src,
size_t bytes) const override {
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
}
private:
void runWithoutSync(const Graph &graph) const;
};

View File

@ -14,16 +14,12 @@ class DataGenerator {
public:
virtual ~DataGenerator() {}
void operator()(void *data, size_t size, DataType dataType) {
switch (dataType) {
case DataType::UInt32:
if (dataType == DataType::UInt32)
fill(reinterpret_cast<uint32_t *>(data), size);
break;
case DataType::Float32:
else if (dataType == DataType::Float32)
fill(reinterpret_cast<float *>(data), size);
break;
default:
else
IT_TODO_HALT();
}
}
};

View File

@ -1,3 +1,4 @@
#include "core/blob.h"
#include "core/runtime.h"
namespace infini {

View File

@ -19,7 +19,7 @@ void GraphObj::dataMalloc() {
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
Tensor tensor = make_ref<TensorObj>(dim, dtype);
Tensor tensor = make_ref<TensorObj>(dim, dtype, runtime);
tensors.emplace_back(tensor);
return tensor;
}

View File

@ -1,6 +1,9 @@
#include "core/runtime.h"
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include <chrono>
#include <cstring>
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <cudnn.h>
@ -112,4 +115,34 @@ Blob RuntimeObj::allocBlob(size_t size) {
return make_ref<BlobObj>(shared_from_this(), alloc(size));
}
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
void *dstPtr = dst->getDataRawPtr<void *>();
void *srcPtr = src->getDataRawPtr<void *>();
size_t bytes = dst->getBytes();
auto dstRuntime = dst->getRuntime();
auto srcRuntime = src->getRuntime();
if (dstRuntime.get() == srcRuntime.get()) {
dstRuntime->copyBlobInsideRuntime(dstPtr, srcPtr, bytes);
} else if (src->getRuntime()->isCpu()) {
dstRuntime->copyBlobFromCPU(dstPtr, srcPtr, bytes);
} else if (dst->getRuntime()->isCpu()) {
srcRuntime->copyBlobToCPU(dstPtr, srcPtr, bytes);
} else
IT_TODO_HALT();
}
void CpuRuntimeObj::copyBlobFromCPU(void *dst, void *src, size_t bytes) const {
copyBlobInsideRuntime(dst, src, bytes);
}
void CpuRuntimeObj::copyBlobToCPU(void *dst, void *src, size_t bytes) const {
copyBlobInsideRuntime(dst, src, bytes);
}
void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, void *src,
size_t bytes) const {
memcpy(dst, src, bytes);
}
} // namespace infini

View File

@ -4,8 +4,8 @@
namespace infini {
TensorObj::TensorObj(const Shape &shape, DataType dtype)
: TensorBaseObj(shape.size(), dtype), shape(shape) {}
TensorObj::TensorObj(const Shape &shape, DataType dtype, Runtime runtime)
: TensorBaseObj(shape.size(), dtype, runtime), shape(shape) {}
VType TensorObj::getData(const Shape &pos) const {
return getData(getOffset(pos));
@ -34,29 +34,12 @@ size_t TensorObj::size() const {
return ret;
}
template <typename T> void TensorObj::copyData(const T *dptr) {
// TODO: cuda
IT_ASSERT(data != nullptr);
auto ptr = data->getPtr<T *>();
size_t sz = size();
#pragma omp parallel for
for (size_t i = 0; i < sz; ++i) {
ptr[i] = dptr[i];
}
}
void TensorObj::copyData(vector<VType> dataVector) {
IT_ASSERT(dataVector.size() >= size());
copyData(dataVector.data());
}
void TensorObj::copyData(vector<float> dataVector) {
IT_ASSERT(dataVector.size() >= size());
copyData(dataVector.data());
}
size_t TensorObj::getBytes() const { return size() * dtype.getSize(); }
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
if (dtype == DataType::Float32)
printDataFloat();
else if (dtype == DataType::UInt32)
@ -120,12 +103,9 @@ void TensorObj::printDataUint32_t() const {
}
}
bool TensorObj::equalData(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
// TODO: deal with data type
auto ptr = data->getPtr<VType *>();
auto ptrRhs = rhs->data->getPtr<VType *>();
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
auto ptr = data->getPtr<uint32_t *>();
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
@ -135,6 +115,36 @@ bool TensorObj::equalData(const Tensor &rhs) const {
return true;
}
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
// TODO: deal with data type
auto ptr = data->getPtr<T *>();
auto ptrRhs = rhs->data->getPtr<T *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
return false;
}
return true;
}
bool TensorObj::equalData(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
IT_ASSERT(getDType() == rhs->getDType());
if (getDType() == DataType::UInt32)
return equalDataInt<uint32_t>(rhs);
else if (getDType() == DataType::Float32)
return equalDataInt<float>(rhs);
else
IT_TODO_HALT();
}
void TensorObj::dataMalloc(const Runtime &runtime) {
IT_ASSERT(data == nullptr);
size_t bytesPerElement;

View File

@ -3,8 +3,8 @@
#include "core/runtime.h"
namespace infini {
TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
: dim(dim), dtype(dtype) {}
TensorBaseObj::TensorBaseObj(int dim, DataType dtype, Runtime runtime)
: dim(dim), dtype(dtype), runtime(runtime) {}
VType TensorBaseObj::getData(size_t offset) const {
// TODO: check cuda array

View File

@ -1,4 +1,6 @@
#include "cuda/cuda_runtime.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
namespace infini {

View File

@ -7,7 +7,7 @@
namespace infini {
TEST(Graph, build_and_run) {
Runtime runtime = make_ref<CpuRuntimeObj>();
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
@ -18,14 +18,14 @@ TEST(Graph, build_and_run) {
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
runtime->run(g);
// check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(o0->equalData(ans));
}
TEST(Graph, perf_engine) {
Runtime runtime = make_ref<CpuRuntimeObj>();
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
@ -40,7 +40,7 @@ TEST(Graph, perf_engine) {
EXPECT_GT(perfTime, 0);
EXPECT_LT(perfTime, 0.01);
// check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(matmul->getOutput()->equalData(ans));

View File

@ -8,7 +8,7 @@
namespace infini {
TEST(Conv, ShapeInference) {
auto runtime = make_ref<CpuRuntimeObj>();
Runtime runtime = CpuRuntimeObj::getInstance();
// Padding modes
{
Graph g = make_ref<GraphObj>(runtime);
@ -43,7 +43,7 @@ TEST(Conv, ShapeInference) {
}
TEST(Conv, NaiveCPU) {
auto runtime = make_ref<CpuRuntimeObj>();
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
@ -58,7 +58,8 @@ TEST(Conv, NaiveCPU) {
EXPECT_GT(perfTime, 0);
EXPECT_LT(perfTime, 0.1);
// check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->copyData(
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
@ -68,7 +69,7 @@ TEST(Conv, NaiveCPU) {
void testConvCudnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
auto cpuRuntime = make_ref<CpuRuntimeObj>();
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -80,23 +81,24 @@ void testConvCudnn(
g->dataMalloc();
// Build input and output data on CPU
auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
auto cpui0 =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
cpui0->dataMalloc(cpuRuntime);
cpui0->setData(generator);
auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
auto cpuw0 =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
cpuw0->dataMalloc(cpuRuntime);
cpuw0->setData(generator);
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
ans->dataMalloc(cpuRuntime);
ans->copyData(ansVec);
// Copy inputs from CPU to CUDA
cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
i0->copyData(cpui0);
w0->copyData(cpuw0);
// Execute on CUDA
cudaRuntime->run(g);
// double perfTime = cudaRuntime->getPerfTime(g);
@ -106,14 +108,13 @@ void testConvCudnn(
// copy CUDA output to CPU
auto o0 = conv->getOutput();
auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
auto cpuo0 =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
cpuo0->dataMalloc(cpuRuntime);
cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
conv->getOutput()->getDataRawPtr<void *>(),
cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
cpuo0->copyData(o0);
// check results on CPU
EXPECT_TRUE(cpuo0->equalData<float>(ans));
EXPECT_TRUE(cpuo0->equalData(ans));
}
TEST(Conv, cuDNN) {