forked from jiuyuan/InfiniTensor
Extended DataType class and Runtime interaction (#9)
* Add: DataType class
* Add: data-type-oblivious tensor interface
* Rename: copyBlobToCPU

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
parent
bd5934279b
commit
af08df32d2
|
@ -45,9 +45,10 @@ using HashType = uint64_t; // compatible with std::hash
|
||||||
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
|
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
|
||||||
"] Assertion failed (" + #name + "): " + #info))
|
"] Assertion failed (" + #name + "): " + #info))
|
||||||
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
|
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
|
||||||
|
|
||||||
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
|
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
|
||||||
#define IT_TODO_HALT() IT_ASSERT(false, "Unimplemented")
|
|
||||||
|
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
|
||||||
|
#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg)
|
||||||
#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" __LINE__)
|
#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" __LINE__)
|
||||||
|
|
||||||
// Other utilities
|
// Other utilities
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
#include "core/common.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class DataType {
|
||||||
|
public:
|
||||||
|
static const DataType Float32;
|
||||||
|
static const DataType UInt32;
|
||||||
|
static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
|
||||||
|
static constexpr std::string_view names[]{"Float32", "UInt32"};
|
||||||
|
|
||||||
|
private:
|
||||||
|
int index;
|
||||||
|
|
||||||
|
public:
|
||||||
|
constexpr DataType(int index) : index(index) {}
|
||||||
|
bool operator==(const DataType &rhs) const { return index == rhs.index; }
|
||||||
|
bool operator<(const DataType &rhs) const { return index < rhs.index; }
|
||||||
|
|
||||||
|
template <typename T> static DataType get() {
|
||||||
|
IT_TODO_HALT_MSG("Unsupported data type");
|
||||||
|
}
|
||||||
|
size_t getSize() const { return sizePerElement[index]; }
|
||||||
|
string toString() const { return string(names[index]); }
|
||||||
|
};
|
||||||
|
|
||||||
|
inline const DataType DataType::Float32(0);
|
||||||
|
inline const DataType DataType::UInt32(1);
|
||||||
|
// The get<T>() specializations are defined outside the class declaration to work around a GCC bug:
|
||||||
|
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
|
||||||
|
template <> inline DataType DataType::get<float>() { return Float32; }
|
||||||
|
template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -37,8 +37,6 @@ enum class OpType {
|
||||||
MemBound = 300,
|
MemBound = 300,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class Device { CPU = 1, CUDA };
|
|
||||||
|
|
||||||
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
||||||
|
|
||||||
class OpRegistry {
|
class OpRegistry {
|
||||||
|
|
|
@ -1,9 +1,33 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/graph.h"
|
#include "core/common.h"
|
||||||
#include "core/kernel.h"
|
#include "core/ref.h"
|
||||||
#include "core/perf_engine.h"
|
#include <memory>
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
/***************** Forward declaration begin *****************/
|
||||||
|
class TensorBaseObj;
|
||||||
|
class TensorObj;
|
||||||
|
class OperatorObj;
|
||||||
|
class GraphObj;
|
||||||
|
class RuntimeObj;
|
||||||
|
class BlobObj;
|
||||||
|
|
||||||
|
using TensorBase = Ref<TensorBaseObj>;
|
||||||
|
using Tensor = Ref<TensorObj>;
|
||||||
|
using Operator = Ref<OperatorObj>;
|
||||||
|
using Graph = Ref<GraphObj>;
|
||||||
|
using Runtime = Ref<RuntimeObj>;
|
||||||
|
using Blob = Ref<BlobObj>;
|
||||||
|
enum class OpType;
|
||||||
|
|
||||||
|
using TensorVec = vector<Tensor>;
|
||||||
|
using OpVec = vector<Operator>;
|
||||||
|
|
||||||
|
using VType = uint32_t;
|
||||||
|
|
||||||
|
enum class Device { CPU = 1, CUDA };
|
||||||
|
/***************** Forward declaration end *****************/
|
||||||
|
|
||||||
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||||
protected:
|
protected:
|
||||||
Device device;
|
Device device;
|
||||||
|
@ -37,17 +61,27 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||||
*/
|
*/
|
||||||
double getPerfTime(const Graph &graph, bool profiling = false) const;
|
double getPerfTime(const Graph &graph, bool profiling = false) const;
|
||||||
Blob allocBlob(size_t size);
|
Blob allocBlob(size_t size);
|
||||||
|
bool isCpu() const { return device == Device::CPU; }
|
||||||
|
bool isCuda() const { return device == Device::CUDA; }
|
||||||
|
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void printProfilingData(double totTime,
|
void printProfilingData(double totTime,
|
||||||
const std::map<OpType, double> &opTime,
|
const std::map<OpType, double> &opTime,
|
||||||
const std::map<OpType, int> &opCnt) const;
|
const std::map<OpType, int> &opCnt) const;
|
||||||
|
virtual void copyBlobFromCPU(void *dst, void *src, size_t bytes) const = 0;
|
||||||
|
virtual void copyBlobToCPU(void *dst, void *src, size_t bytes) const = 0;
|
||||||
|
virtual void copyBlobInsideRuntime(void *dst, void *src,
|
||||||
|
size_t bytes) const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: change inheritance relation
|
|
||||||
class CpuRuntimeObj : public RuntimeObj {
|
class CpuRuntimeObj : public RuntimeObj {
|
||||||
public:
|
public:
|
||||||
CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
|
CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
|
||||||
|
// Returns a reference to a function-local static Ref<CpuRuntimeObj>. The
// instance is created with make_ref on the first call and the same Ref is
// returned by every later call.
static Ref<CpuRuntimeObj> &getInstance() {
|
||||||
|
static Ref<CpuRuntimeObj> instance = make_ref<CpuRuntimeObj>();
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
void run(const Graph &graph, bool tune = false,
|
void run(const Graph &graph, bool tune = false,
|
||||||
bool profiling = false) const override;
|
bool profiling = false) const override;
|
||||||
|
@ -57,6 +91,11 @@ class CpuRuntimeObj : public RuntimeObj {
|
||||||
return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
|
return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
|
||||||
sizeof(uint64_t));
|
sizeof(uint64_t));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override;
|
||||||
|
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override;
|
||||||
|
void copyBlobInsideRuntime(void *dst, void *src,
|
||||||
|
size_t bytes) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
|
@ -12,11 +12,12 @@ class TensorObj : public TensorBaseObj {
|
||||||
Shape shape;
|
Shape shape;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TensorObj(const Shape &shape, DataType dtype);
|
TensorObj(const Shape &shape, DataType dtype, Runtime runtime);
|
||||||
virtual ~TensorObj() {}
|
virtual ~TensorObj() {}
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
|
||||||
size_t size() const;
|
size_t size() const;
|
||||||
|
size_t getBytes() const;
|
||||||
|
|
||||||
Shape getDims() const { return shape; }
|
Shape getDims() const { return shape; }
|
||||||
|
|
||||||
|
@ -24,39 +25,40 @@ class TensorObj : public TensorBaseObj {
|
||||||
using TensorBaseObj::getData;
|
using TensorBaseObj::getData;
|
||||||
VType getData(const Shape &pos) const;
|
VType getData(const Shape &pos) const;
|
||||||
void dataMalloc(const Runtime &runtime);
|
void dataMalloc(const Runtime &runtime);
|
||||||
// void copyData(VType *dptr);
|
|
||||||
template <typename T> void copyData(const T *dptr);
|
template <typename T> void copyData(const T *dptr) {
|
||||||
void copyData(vector<VType> dataVector);
|
IT_ASSERT(DataType::get<T>() == dtype);
|
||||||
void copyData(vector<float> dataVector);
|
|
||||||
void printData() const;
|
|
||||||
// TODO: merge these methods
|
|
||||||
bool equalData(const Tensor &rhs) const;
|
|
||||||
template <typename T> bool equalData(const Tensor &rhs) const {
|
|
||||||
IT_ASSERT(data != nullptr);
|
IT_ASSERT(data != nullptr);
|
||||||
IT_ASSERT(rhs->data != nullptr);
|
if (!runtime->isCpu())
|
||||||
// TODO: deal with data type
|
IT_TODO_HALT();
|
||||||
auto ptr = data->getPtr<T *>();
|
auto ptr = data->getPtr<T *>();
|
||||||
auto ptrRhs = rhs->data->getPtr<T *>();
|
|
||||||
if (shape != rhs->getDims())
|
|
||||||
return false;
|
|
||||||
size_t sz = size();
|
size_t sz = size();
|
||||||
for (size_t i = 0; i < sz; ++i)
|
#pragma omp parallel for
|
||||||
if (fabs(ptr[i] - ptrRhs[i]) /
|
for (size_t i = 0; i < sz; ++i) {
|
||||||
std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
ptr[i] = dptr[i];
|
||||||
1e-6) {
|
|
||||||
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copies elements from `dataVector` into this tensor by forwarding to the
// pointer overload of copyData.
// Asserts that T maps to this tensor's dtype (DataType::get<T>()) and that the
// vector holds at least size() elements; extra trailing elements are allowed.
template <typename T> void copyData(vector<T> dataVector) {
|
||||||
|
IT_ASSERT(DataType::get<T>() == dtype);
|
||||||
|
IT_ASSERT(dataVector.size() >= size());
|
||||||
|
copyData(dataVector.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
|
||||||
void setData(
|
void setData(
|
||||||
const std::function<void(void *, size_t, DataType)> &generator) const {
|
const std::function<void(void *, size_t, DataType)> &generator) const {
|
||||||
generator(data->getPtr<void *>(), size(), dtype);
|
generator(data->getPtr<void *>(), size(), dtype);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void printData() const;
|
||||||
|
bool equalData(const Tensor &rhs) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void printDataFloat() const;
|
void printDataFloat() const;
|
||||||
void printDataUint32_t() const;
|
void printDataUint32_t() const;
|
||||||
|
template <typename T> bool equalDataInt(const Tensor &rhs) const;
|
||||||
|
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
|
||||||
// void setDims(const Dim &dms) { dims = dms; }
|
// void setDims(const Dim &dms) { dims = dms; }
|
||||||
|
|
||||||
// bool dataRand(int seed = 0) {
|
// bool dataRand(int seed = 0) {
|
||||||
|
|
|
@ -1,34 +1,11 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/blob.h"
|
#include "core/blob.h"
|
||||||
|
#include "core/data_type.h"
|
||||||
#include "core/object.h"
|
#include "core/object.h"
|
||||||
#include "core/ref.h"
|
#include "core/runtime.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
class TensorBaseObj;
|
|
||||||
class TensorObj;
|
|
||||||
class OperatorObj;
|
|
||||||
class GraphObj;
|
|
||||||
class RuntimeObj;
|
|
||||||
class BlobObj;
|
|
||||||
|
|
||||||
using TensorBase = Ref<TensorBaseObj>;
|
|
||||||
using Tensor = Ref<TensorObj>;
|
|
||||||
using Operator = Ref<OperatorObj>;
|
|
||||||
using Graph = Ref<GraphObj>;
|
|
||||||
using Runtime = Ref<RuntimeObj>;
|
|
||||||
using Blob = Ref<BlobObj>;
|
|
||||||
|
|
||||||
using TensorVec = vector<Tensor>;
|
|
||||||
using OpVec = vector<Operator>;
|
|
||||||
|
|
||||||
using VType = uint32_t;
|
|
||||||
|
|
||||||
enum class DataType {
|
|
||||||
Float32,
|
|
||||||
UInt32,
|
|
||||||
};
|
|
||||||
|
|
||||||
class TensorBaseObj : public Object {
|
class TensorBaseObj : public Object {
|
||||||
public:
|
public:
|
||||||
// enum TensorType {
|
// enum TensorType {
|
||||||
|
@ -45,12 +22,10 @@ class TensorBaseObj : public Object {
|
||||||
vector<WRef<TensorBaseObj>> inputOf;
|
vector<WRef<TensorBaseObj>> inputOf;
|
||||||
WRef<TensorBaseObj> outputOf;
|
WRef<TensorBaseObj> outputOf;
|
||||||
Blob data;
|
Blob data;
|
||||||
// ComputeState computed;
|
Runtime runtime;
|
||||||
// static int random_seed[256 * 16];
|
|
||||||
// static bool random_inited;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TensorBaseObj(int dim, DataType dtype);
|
TensorBaseObj(int dim, DataType dtype, Runtime runtime);
|
||||||
virtual ~TensorBaseObj() {}
|
virtual ~TensorBaseObj() {}
|
||||||
|
|
||||||
void dataMalloc(const Blob &blob) {
|
void dataMalloc(const Blob &blob) {
|
||||||
|
@ -65,6 +40,7 @@ class TensorBaseObj : public Object {
|
||||||
VType getData(size_t offset) const;
|
VType getData(size_t offset) const;
|
||||||
|
|
||||||
DataType getDType() const { return dtype; }
|
DataType getDType() const { return dtype; }
|
||||||
|
Runtime getRuntime() const { return runtime; }
|
||||||
|
|
||||||
// uint64_t getHash() const { return hash; }
|
// uint64_t getHash() const { return hash; }
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,19 @@ class CudaRuntimeObj : public RuntimeObj {
|
||||||
return workspace;
|
return workspace;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copies `bytes` bytes from `src` to `dst` with cudaMemcpy using
// cudaMemcpyHostToDevice; the cudaMemcpy result is passed to checkCudaError.
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
|
||||||
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies `bytes` bytes from `src` to `dst` with cudaMemcpy using
// cudaMemcpyDeviceToHost; the cudaMemcpy result is passed to checkCudaError.
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
|
||||||
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies `bytes` bytes from `src` to `dst` with cudaMemcpy using
// cudaMemcpyDeviceToDevice; the cudaMemcpy result is passed to checkCudaError.
void copyBlobInsideRuntime(void *dst, void *src,
|
||||||
|
size_t bytes) const override {
|
||||||
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void runWithoutSync(const Graph &graph) const;
|
void runWithoutSync(const Graph &graph) const;
|
||||||
};
|
};
|
||||||
|
|
|
@ -14,17 +14,13 @@ class DataGenerator {
|
||||||
public:
|
public:
|
||||||
virtual ~DataGenerator() {}
|
virtual ~DataGenerator() {}
|
||||||
void operator()(void *data, size_t size, DataType dataType) {
|
void operator()(void *data, size_t size, DataType dataType) {
|
||||||
switch (dataType) {
|
if (dataType == DataType::UInt32)
|
||||||
case DataType::UInt32:
|
|
||||||
fill(reinterpret_cast<uint32_t *>(data), size);
|
fill(reinterpret_cast<uint32_t *>(data), size);
|
||||||
break;
|
else if (dataType == DataType::Float32)
|
||||||
case DataType::Float32:
|
|
||||||
fill(reinterpret_cast<float *>(data), size);
|
fill(reinterpret_cast<float *>(data), size);
|
||||||
break;
|
else
|
||||||
default:
|
|
||||||
IT_TODO_HALT();
|
IT_TODO_HALT();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class IncrementalGenerator : public DataGenerator {
|
class IncrementalGenerator : public DataGenerator {
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#include "core/blob.h"
|
||||||
#include "core/runtime.h"
|
#include "core/runtime.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
|
@ -19,7 +19,7 @@ void GraphObj::dataMalloc() {
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
||||||
Tensor tensor = make_ref<TensorObj>(dim, dtype);
|
Tensor tensor = make_ref<TensorObj>(dim, dtype, runtime);
|
||||||
tensors.emplace_back(tensor);
|
tensors.emplace_back(tensor);
|
||||||
return tensor;
|
return tensor;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
#include "core/runtime.h"
|
#include "core/runtime.h"
|
||||||
#include "core/blob.h"
|
#include "core/blob.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/perf_engine.h"
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <cstring>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_profiler_api.h>
|
#include <cuda_profiler_api.h>
|
||||||
#include <cudnn.h>
|
#include <cudnn.h>
|
||||||
|
@ -112,4 +115,34 @@ Blob RuntimeObj::allocBlob(size_t size) {
|
||||||
return make_ref<BlobObj>(shared_from_this(), alloc(size));
|
return make_ref<BlobObj>(shared_from_this(), alloc(size));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
|
||||||
|
void *dstPtr = dst->getDataRawPtr<void *>();
|
||||||
|
void *srcPtr = src->getDataRawPtr<void *>();
|
||||||
|
size_t bytes = dst->getBytes();
|
||||||
|
auto dstRuntime = dst->getRuntime();
|
||||||
|
auto srcRuntime = src->getRuntime();
|
||||||
|
|
||||||
|
if (dstRuntime.get() == srcRuntime.get()) {
|
||||||
|
dstRuntime->copyBlobInsideRuntime(dstPtr, srcPtr, bytes);
|
||||||
|
} else if (src->getRuntime()->isCpu()) {
|
||||||
|
dstRuntime->copyBlobFromCPU(dstPtr, srcPtr, bytes);
|
||||||
|
} else if (dst->getRuntime()->isCpu()) {
|
||||||
|
srcRuntime->copyBlobToCPU(dstPtr, srcPtr, bytes);
|
||||||
|
} else
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to copyBlobInsideRuntime with the same dst, src and bytes.
void CpuRuntimeObj::copyBlobFromCPU(void *dst, void *src, size_t bytes) const {
|
||||||
|
copyBlobInsideRuntime(dst, src, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to copyBlobInsideRuntime with the same dst, src and bytes.
void CpuRuntimeObj::copyBlobToCPU(void *dst, void *src, size_t bytes) const {
|
||||||
|
copyBlobInsideRuntime(dst, src, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies `bytes` bytes from `src` to `dst` with memcpy.
// NOTE(review): memcpy requires non-overlapping ranges — confirm callers never
// pass overlapping buffers.
void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, void *src,
|
||||||
|
size_t bytes) const {
|
||||||
|
memcpy(dst, src, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
|
@ -4,8 +4,8 @@
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
TensorObj::TensorObj(const Shape &shape, DataType dtype)
|
TensorObj::TensorObj(const Shape &shape, DataType dtype, Runtime runtime)
|
||||||
: TensorBaseObj(shape.size(), dtype), shape(shape) {}
|
: TensorBaseObj(shape.size(), dtype, runtime), shape(shape) {}
|
||||||
|
|
||||||
VType TensorObj::getData(const Shape &pos) const {
|
VType TensorObj::getData(const Shape &pos) const {
|
||||||
return getData(getOffset(pos));
|
return getData(getOffset(pos));
|
||||||
|
@ -34,29 +34,12 @@ size_t TensorObj::size() const {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> void TensorObj::copyData(const T *dptr) {
|
size_t TensorObj::getBytes() const { return size() * dtype.getSize(); }
|
||||||
// TODO: cuda
|
|
||||||
IT_ASSERT(data != nullptr);
|
|
||||||
auto ptr = data->getPtr<T *>();
|
|
||||||
size_t sz = size();
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < sz; ++i) {
|
|
||||||
ptr[i] = dptr[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void TensorObj::copyData(vector<VType> dataVector) {
|
|
||||||
IT_ASSERT(dataVector.size() >= size());
|
|
||||||
copyData(dataVector.data());
|
|
||||||
}
|
|
||||||
|
|
||||||
void TensorObj::copyData(vector<float> dataVector) {
|
|
||||||
IT_ASSERT(dataVector.size() >= size());
|
|
||||||
copyData(dataVector.data());
|
|
||||||
}
|
|
||||||
|
|
||||||
void TensorObj::printData() const {
|
void TensorObj::printData() const {
|
||||||
IT_ASSERT(data != nullptr);
|
IT_ASSERT(data != nullptr);
|
||||||
|
if (!runtime->isCpu())
|
||||||
|
IT_TODO_HALT();
|
||||||
if (dtype == DataType::Float32)
|
if (dtype == DataType::Float32)
|
||||||
printDataFloat();
|
printDataFloat();
|
||||||
else if (dtype == DataType::UInt32)
|
else if (dtype == DataType::UInt32)
|
||||||
|
@ -120,12 +103,9 @@ void TensorObj::printDataUint32_t() const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
|
||||||
IT_ASSERT(data != nullptr);
|
auto ptr = data->getPtr<uint32_t *>();
|
||||||
IT_ASSERT(rhs->data != nullptr);
|
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
|
||||||
// TODO: deal with data type
|
|
||||||
auto ptr = data->getPtr<VType *>();
|
|
||||||
auto ptrRhs = rhs->data->getPtr<VType *>();
|
|
||||||
if (shape != rhs->getDims())
|
if (shape != rhs->getDims())
|
||||||
return false;
|
return false;
|
||||||
size_t sz = size();
|
size_t sz = size();
|
||||||
|
@ -135,6 +115,36 @@ bool TensorObj::equalData(const Tensor &rhs) const {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
|
||||||
|
IT_ASSERT(data != nullptr);
|
||||||
|
IT_ASSERT(rhs->data != nullptr);
|
||||||
|
// TODO: deal with data type
|
||||||
|
auto ptr = data->getPtr<T *>();
|
||||||
|
auto ptrRhs = rhs->data->getPtr<T *>();
|
||||||
|
if (shape != rhs->getDims())
|
||||||
|
return false;
|
||||||
|
size_t sz = size();
|
||||||
|
for (size_t i = 0; i < sz; ++i)
|
||||||
|
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
||||||
|
1e-6) {
|
||||||
|
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||||
|
IT_ASSERT(data != nullptr);
|
||||||
|
IT_ASSERT(rhs->data != nullptr);
|
||||||
|
IT_ASSERT(getDType() == rhs->getDType());
|
||||||
|
if (getDType() == DataType::UInt32)
|
||||||
|
return equalDataInt<uint32_t>(rhs);
|
||||||
|
else if (getDType() == DataType::Float32)
|
||||||
|
return equalDataInt<float>(rhs);
|
||||||
|
else
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
|
||||||
void TensorObj::dataMalloc(const Runtime &runtime) {
|
void TensorObj::dataMalloc(const Runtime &runtime) {
|
||||||
IT_ASSERT(data == nullptr);
|
IT_ASSERT(data == nullptr);
|
||||||
size_t bytesPerElement;
|
size_t bytesPerElement;
|
||||||
|
|
|
@ -3,8 +3,8 @@
|
||||||
#include "core/runtime.h"
|
#include "core/runtime.h"
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
|
TensorBaseObj::TensorBaseObj(int dim, DataType dtype, Runtime runtime)
|
||||||
: dim(dim), dtype(dtype) {}
|
: dim(dim), dtype(dtype), runtime(runtime) {}
|
||||||
|
|
||||||
VType TensorBaseObj::getData(size_t offset) const {
|
VType TensorBaseObj::getData(size_t offset) const {
|
||||||
// TODO: check cuda array
|
// TODO: check cuda array
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
#include "cuda/cuda_runtime.h"
|
#include "cuda/cuda_runtime.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/perf_engine.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
TEST(Graph, build_and_run) {
|
TEST(Graph, build_and_run) {
|
||||||
Runtime runtime = make_ref<CpuRuntimeObj>();
|
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
||||||
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
||||||
|
@ -18,14 +18,14 @@ TEST(Graph, build_and_run) {
|
||||||
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
|
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
|
||||||
runtime->run(g);
|
runtime->run(g);
|
||||||
// check answer
|
// check answer
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
|
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc(runtime);
|
||||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||||
EXPECT_TRUE(o0->equalData(ans));
|
EXPECT_TRUE(o0->equalData(ans));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Graph, perf_engine) {
|
TEST(Graph, perf_engine) {
|
||||||
Runtime runtime = make_ref<CpuRuntimeObj>();
|
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
||||||
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
||||||
|
@ -40,7 +40,7 @@ TEST(Graph, perf_engine) {
|
||||||
EXPECT_GT(perfTime, 0);
|
EXPECT_GT(perfTime, 0);
|
||||||
EXPECT_LT(perfTime, 0.01);
|
EXPECT_LT(perfTime, 0.01);
|
||||||
// check answer
|
// check answer
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
|
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc(runtime);
|
||||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||||
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
TEST(Conv, ShapeInference) {
|
TEST(Conv, ShapeInference) {
|
||||||
auto runtime = make_ref<CpuRuntimeObj>();
|
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||||
// Padding modes
|
// Padding modes
|
||||||
{
|
{
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
|
@ -43,7 +43,7 @@ TEST(Conv, ShapeInference) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Conv, NaiveCPU) {
|
TEST(Conv, NaiveCPU) {
|
||||||
auto runtime = make_ref<CpuRuntimeObj>();
|
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
|
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
|
||||||
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
|
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
|
||||||
|
@ -58,7 +58,8 @@ TEST(Conv, NaiveCPU) {
|
||||||
EXPECT_GT(perfTime, 0);
|
EXPECT_GT(perfTime, 0);
|
||||||
EXPECT_LT(perfTime, 0.1);
|
EXPECT_LT(perfTime, 0.1);
|
||||||
// check answer
|
// check answer
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
|
auto ans =
|
||||||
|
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc(runtime);
|
||||||
ans->copyData(
|
ans->copyData(
|
||||||
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
||||||
|
@ -68,7 +69,7 @@ TEST(Conv, NaiveCPU) {
|
||||||
void testConvCudnn(
|
void testConvCudnn(
|
||||||
const std::function<void(void *, size_t, DataType)> &generator,
|
const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
vector<float> ansVec) {
|
vector<float> ansVec) {
|
||||||
auto cpuRuntime = make_ref<CpuRuntimeObj>();
|
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||||
// Build CUDA graph
|
// Build CUDA graph
|
||||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||||
|
@ -80,23 +81,24 @@ void testConvCudnn(
|
||||||
g->dataMalloc();
|
g->dataMalloc();
|
||||||
|
|
||||||
// Build input and output data on CPU
|
// Build input and output data on CPU
|
||||||
auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
|
auto cpui0 =
|
||||||
|
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
|
||||||
cpui0->dataMalloc(cpuRuntime);
|
cpui0->dataMalloc(cpuRuntime);
|
||||||
cpui0->setData(generator);
|
cpui0->setData(generator);
|
||||||
|
|
||||||
auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
|
auto cpuw0 =
|
||||||
|
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
|
||||||
cpuw0->dataMalloc(cpuRuntime);
|
cpuw0->dataMalloc(cpuRuntime);
|
||||||
cpuw0->setData(generator);
|
cpuw0->setData(generator);
|
||||||
|
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
|
auto ans =
|
||||||
|
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||||
ans->dataMalloc(cpuRuntime);
|
ans->dataMalloc(cpuRuntime);
|
||||||
ans->copyData(ansVec);
|
ans->copyData(ansVec);
|
||||||
|
|
||||||
// Copy inputs from CPU to CUDA
|
// Copy inputs from CPU to CUDA
|
||||||
cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
|
i0->copyData(cpui0);
|
||||||
cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
|
w0->copyData(cpuw0);
|
||||||
cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
|
|
||||||
cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
|
|
||||||
// Execute on CUDA
|
// Execute on CUDA
|
||||||
cudaRuntime->run(g);
|
cudaRuntime->run(g);
|
||||||
// double perfTime = cudaRuntime->getPerfTime(g);
|
// double perfTime = cudaRuntime->getPerfTime(g);
|
||||||
|
@ -106,14 +108,13 @@ void testConvCudnn(
|
||||||
|
|
||||||
// copy CUDA output to CPU
|
// copy CUDA output to CPU
|
||||||
auto o0 = conv->getOutput();
|
auto o0 = conv->getOutput();
|
||||||
auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
|
auto cpuo0 =
|
||||||
|
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||||
cpuo0->dataMalloc(cpuRuntime);
|
cpuo0->dataMalloc(cpuRuntime);
|
||||||
cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
|
cpuo0->copyData(o0);
|
||||||
conv->getOutput()->getDataRawPtr<void *>(),
|
|
||||||
cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
|
|
||||||
|
|
||||||
// check results on CPU
|
// check results on CPU
|
||||||
EXPECT_TRUE(cpuo0->equalData<float>(ans));
|
EXPECT_TRUE(cpuo0->equalData(ans));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Conv, cuDNN) {
|
TEST(Conv, cuDNN) {
|
||||||
|
|
Loading…
Reference in New Issue