forked from jiuyuan/InfiniTensor
Extended DataType class and Runtime interaction (#9)
* Add: DataType class * Add: data-type-oblivious tensor interface * Rename: copyBlobToCPU Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
parent
bd5934279b
commit
af08df32d2
|
@ -45,9 +45,10 @@ using HashType = uint64_t; // compatible with std::hash
|
|||
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
|
||||
"] Assertion failed (" + #name + "): " + #info))
|
||||
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
|
||||
|
||||
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
|
||||
#define IT_TODO_HALT() IT_ASSERT(false, "Unimplemented")
|
||||
|
||||
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
|
||||
#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg)
|
||||
// Prints an "unimplemented" notice with the call site, without halting.
// Note: __LINE__ expands to an int, so it cannot be concatenated into a
// string literal (the original `puts("..." __FILE__ ":" __LINE__)` is
// ill-formed); format it with printf instead.
#define IT_TODO_SKIP() printf("Unimplemented %s:%d\n", __FILE__, __LINE__)
|
||||
|
||||
// Other utilities
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
#include "core/common.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class DataType {
|
||||
public:
|
||||
static const DataType Float32;
|
||||
static const DataType UInt32;
|
||||
static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
|
||||
static constexpr std::string_view names[]{"Float32", "UInt32"};
|
||||
|
||||
private:
|
||||
int index;
|
||||
|
||||
public:
|
||||
constexpr DataType(int index) : index(index) {}
|
||||
bool operator==(const DataType &rhs) const { return index == rhs.index; }
|
||||
bool operator<(const DataType &rhs) const { return index < rhs.index; }
|
||||
|
||||
template <typename T> static DataType get() {
|
||||
IT_TODO_HALT_MSG("Unsupported data type");
|
||||
}
|
||||
size_t getSize() const { return sizePerElement[index]; }
|
||||
string toString() const { return string(names[index]); }
|
||||
};
|
||||
|
||||
inline const DataType DataType::Float32(0);
|
||||
inline const DataType DataType::UInt32(1);
|
||||
// Method definitions are out of the declaration due to GCC bug:
|
||||
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
|
||||
template <> inline DataType DataType::get<float>() { return Float32; }
|
||||
template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
|
||||
|
||||
} // namespace infini
|
|
@ -37,8 +37,6 @@ enum class OpType {
|
|||
MemBound = 300,
|
||||
};
|
||||
|
||||
enum class Device { CPU = 1, CUDA };
|
||||
|
||||
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
||||
|
||||
class OpRegistry {
|
||||
|
|
|
@ -1,9 +1,33 @@
|
|||
#pragma once
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
#include "core/common.h"
|
||||
#include "core/ref.h"
|
||||
#include <memory>
|
||||
namespace infini {
|
||||
|
||||
/***************** Forward declaration begin *****************/
|
||||
class TensorBaseObj;
|
||||
class TensorObj;
|
||||
class OperatorObj;
|
||||
class GraphObj;
|
||||
class RuntimeObj;
|
||||
class BlobObj;
|
||||
|
||||
using TensorBase = Ref<TensorBaseObj>;
|
||||
using Tensor = Ref<TensorObj>;
|
||||
using Operator = Ref<OperatorObj>;
|
||||
using Graph = Ref<GraphObj>;
|
||||
using Runtime = Ref<RuntimeObj>;
|
||||
using Blob = Ref<BlobObj>;
|
||||
enum class OpType;
|
||||
|
||||
using TensorVec = vector<Tensor>;
|
||||
using OpVec = vector<Operator>;
|
||||
|
||||
using VType = uint32_t;
|
||||
|
||||
enum class Device { CPU = 1, CUDA };
|
||||
/***************** Forward declaration end *****************/
|
||||
|
||||
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||
protected:
|
||||
Device device;
|
||||
|
@ -37,17 +61,27 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
*/
|
||||
double getPerfTime(const Graph &graph, bool profiling = false) const;
|
||||
Blob allocBlob(size_t size);
|
||||
bool isCpu() const { return device == Device::CPU; }
|
||||
bool isCuda() const { return device == Device::CUDA; }
|
||||
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
|
||||
|
||||
protected:
|
||||
void printProfilingData(double totTime,
|
||||
const std::map<OpType, double> &opTime,
|
||||
const std::map<OpType, int> &opCnt) const;
|
||||
virtual void copyBlobFromCPU(void *dst, void *src, size_t bytes) const = 0;
|
||||
virtual void copyBlobToCPU(void *dst, void *src, size_t bytes) const = 0;
|
||||
virtual void copyBlobInsideRuntime(void *dst, void *src,
|
||||
size_t bytes) const = 0;
|
||||
};
|
||||
|
||||
// TODO: change inheritance relation
|
||||
class CpuRuntimeObj : public RuntimeObj {
|
||||
public:
|
||||
CpuRuntimeObj() : RuntimeObj(Device::CPU) {}
|
||||
static Ref<CpuRuntimeObj> &getInstance() {
|
||||
static Ref<CpuRuntimeObj> instance = make_ref<CpuRuntimeObj>();
|
||||
return instance;
|
||||
}
|
||||
|
||||
void run(const Graph &graph, bool tune = false,
|
||||
bool profiling = false) const override;
|
||||
|
@ -57,6 +91,11 @@ class CpuRuntimeObj : public RuntimeObj {
|
|||
return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
|
||||
sizeof(uint64_t));
|
||||
};
|
||||
|
||||
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override;
|
||||
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override;
|
||||
void copyBlobInsideRuntime(void *dst, void *src,
|
||||
size_t bytes) const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -12,11 +12,12 @@ class TensorObj : public TensorBaseObj {
|
|||
Shape shape;
|
||||
|
||||
public:
|
||||
TensorObj(const Shape &shape, DataType dtype);
|
||||
TensorObj(const Shape &shape, DataType dtype, Runtime runtime);
|
||||
virtual ~TensorObj() {}
|
||||
string toString() const override;
|
||||
|
||||
size_t size() const;
|
||||
size_t getBytes() const;
|
||||
|
||||
Shape getDims() const { return shape; }
|
||||
|
||||
|
@ -24,39 +25,40 @@ class TensorObj : public TensorBaseObj {
|
|||
using TensorBaseObj::getData;
|
||||
VType getData(const Shape &pos) const;
|
||||
void dataMalloc(const Runtime &runtime);
|
||||
// void copyData(VType *dptr);
|
||||
template <typename T> void copyData(const T *dptr);
|
||||
void copyData(vector<VType> dataVector);
|
||||
void copyData(vector<float> dataVector);
|
||||
void printData() const;
|
||||
// TODO: merge these methods
|
||||
bool equalData(const Tensor &rhs) const;
|
||||
template <typename T> bool equalData(const Tensor &rhs) const {
|
||||
|
||||
template <typename T> void copyData(const T *dptr) {
|
||||
IT_ASSERT(DataType::get<T>() == dtype);
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
// TODO: deal with data type
|
||||
if (!runtime->isCpu())
|
||||
IT_TODO_HALT();
|
||||
auto ptr = data->getPtr<T *>();
|
||||
auto ptrRhs = rhs->data->getPtr<T *>();
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
size_t sz = size();
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
if (fabs(ptr[i] - ptrRhs[i]) /
|
||||
std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
||||
1e-6) {
|
||||
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < sz; ++i) {
|
||||
ptr[i] = dptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Copies a host vector into this tensor's buffer.
// The element type T must match the tensor's dtype, and the vector must
// contain at least size() elements; delegates to copyData(const T *).
template <typename T> void copyData(vector<T> dataVector) {
    IT_ASSERT(DataType::get<T>() == dtype);
    IT_ASSERT(dataVector.size() >= size());
    copyData(dataVector.data());
}
|
||||
|
||||
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
|
||||
// Fills the tensor in place by invoking `generator` on the raw data pointer.
// NOTE(review): the second argument is size() (element count, not bytes) --
// generators are expected to index by element; confirm against callers.
void setData(
    const std::function<void(void *, size_t, DataType)> &generator) const {
    generator(data->getPtr<void *>(), size(), dtype);
}
|
||||
|
||||
void printData() const;
|
||||
bool equalData(const Tensor &rhs) const;
|
||||
|
||||
private:
|
||||
void printDataFloat() const;
|
||||
void printDataUint32_t() const;
|
||||
template <typename T> bool equalDataInt(const Tensor &rhs) const;
|
||||
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
|
||||
// void setDims(const Dim &dms) { dims = dms; }
|
||||
|
||||
// bool dataRand(int seed = 0) {
|
||||
|
|
|
@ -1,34 +1,11 @@
|
|||
#pragma once
|
||||
#include "core/blob.h"
|
||||
#include "core/data_type.h"
|
||||
#include "core/object.h"
|
||||
#include "core/ref.h"
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class TensorBaseObj;
|
||||
class TensorObj;
|
||||
class OperatorObj;
|
||||
class GraphObj;
|
||||
class RuntimeObj;
|
||||
class BlobObj;
|
||||
|
||||
using TensorBase = Ref<TensorBaseObj>;
|
||||
using Tensor = Ref<TensorObj>;
|
||||
using Operator = Ref<OperatorObj>;
|
||||
using Graph = Ref<GraphObj>;
|
||||
using Runtime = Ref<RuntimeObj>;
|
||||
using Blob = Ref<BlobObj>;
|
||||
|
||||
using TensorVec = vector<Tensor>;
|
||||
using OpVec = vector<Operator>;
|
||||
|
||||
using VType = uint32_t;
|
||||
|
||||
enum class DataType {
|
||||
Float32,
|
||||
UInt32,
|
||||
};
|
||||
|
||||
class TensorBaseObj : public Object {
|
||||
public:
|
||||
// enum TensorType {
|
||||
|
@ -45,12 +22,10 @@ class TensorBaseObj : public Object {
|
|||
vector<WRef<TensorBaseObj>> inputOf;
|
||||
WRef<TensorBaseObj> outputOf;
|
||||
Blob data;
|
||||
// ComputeState computed;
|
||||
// static int random_seed[256 * 16];
|
||||
// static bool random_inited;
|
||||
Runtime runtime;
|
||||
|
||||
public:
|
||||
TensorBaseObj(int dim, DataType dtype);
|
||||
TensorBaseObj(int dim, DataType dtype, Runtime runtime);
|
||||
virtual ~TensorBaseObj() {}
|
||||
|
||||
void dataMalloc(const Blob &blob) {
|
||||
|
@ -65,6 +40,7 @@ class TensorBaseObj : public Object {
|
|||
VType getData(size_t offset) const;
|
||||
|
||||
DataType getDType() const { return dtype; }
|
||||
Runtime getRuntime() const { return runtime; }
|
||||
|
||||
// uint64_t getHash() const { return hash; }
|
||||
|
||||
|
|
|
@ -43,6 +43,19 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
return workspace;
|
||||
}
|
||||
|
||||
// Host -> device upload via cudaMemcpy.
void copyBlobFromCPU(void *dst, void *src, size_t bytes) const override {
    checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
}
|
||||
|
||||
// Device -> host download via cudaMemcpy.
void copyBlobToCPU(void *dst, void *src, size_t bytes) const override {
    checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
}
|
||||
|
||||
// Device -> device copy within this CUDA runtime via cudaMemcpy.
void copyBlobInsideRuntime(void *dst, void *src,
                           size_t bytes) const override {
    checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
}
|
||||
|
||||
private:
|
||||
void runWithoutSync(const Graph &graph) const;
|
||||
};
|
||||
|
|
|
@ -14,16 +14,12 @@ class DataGenerator {
|
|||
public:
|
||||
virtual ~DataGenerator() {}
|
||||
void operator()(void *data, size_t size, DataType dataType) {
|
||||
switch (dataType) {
|
||||
case DataType::UInt32:
|
||||
if (dataType == DataType::UInt32)
|
||||
fill(reinterpret_cast<uint32_t *>(data), size);
|
||||
break;
|
||||
case DataType::Float32:
|
||||
else if (dataType == DataType::Float32)
|
||||
fill(reinterpret_cast<float *>(data), size);
|
||||
break;
|
||||
default:
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "core/blob.h"
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
|
|
@ -19,7 +19,7 @@ void GraphObj::dataMalloc() {
|
|||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
||||
Tensor tensor = make_ref<TensorObj>(dim, dtype);
|
||||
Tensor tensor = make_ref<TensorObj>(dim, dtype, runtime);
|
||||
tensors.emplace_back(tensor);
|
||||
return tensor;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
#include "core/runtime.h"
|
||||
#include "core/blob.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <cuda.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cudnn.h>
|
||||
|
@ -112,4 +115,34 @@ Blob RuntimeObj::allocBlob(size_t size) {
|
|||
return make_ref<BlobObj>(shared_from_this(), alloc(size));
|
||||
}
|
||||
|
||||
// Copies src's payload into dst, dispatching on where each tensor's runtime
// lives: same-runtime copy, CPU->device upload, or device->CPU download.
// Cross-device copies between two different non-CPU runtimes are
// unimplemented and halt.
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
    void *dstPtr = dst->getDataRawPtr<void *>();
    void *srcPtr = src->getDataRawPtr<void *>();
    // Byte count comes from the destination tensor only; assumes src holds
    // at least that many bytes -- TODO confirm callers guarantee this.
    size_t bytes = dst->getBytes();
    auto dstRuntime = dst->getRuntime();
    auto srcRuntime = src->getRuntime();

    if (dstRuntime.get() == srcRuntime.get()) {
        // Same runtime instance: copy entirely within that runtime.
        dstRuntime->copyBlobInsideRuntime(dstPtr, srcPtr, bytes);
    } else if (src->getRuntime()->isCpu()) {
        // Source is host memory: let the destination runtime upload it.
        dstRuntime->copyBlobFromCPU(dstPtr, srcPtr, bytes);
    } else if (dst->getRuntime()->isCpu()) {
        // Destination is host memory: let the source runtime download it.
        srcRuntime->copyBlobToCPU(dstPtr, srcPtr, bytes);
    } else
        IT_TODO_HALT();
}
|
||||
|
||||
// On the CPU runtime a "from CPU" copy is just a host-to-host copy.
void CpuRuntimeObj::copyBlobFromCPU(void *dst, void *src, size_t bytes) const {
    copyBlobInsideRuntime(dst, src, bytes);
}
|
||||
|
||||
// On the CPU runtime a "to CPU" copy is just a host-to-host copy.
void CpuRuntimeObj::copyBlobToCPU(void *dst, void *src, size_t bytes) const {
    copyBlobInsideRuntime(dst, src, bytes);
}
|
||||
|
||||
// CPU blobs live in ordinary host memory, so an intra-runtime copy is a
// plain memcpy. Ranges must not overlap (memcpy precondition).
void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, void *src,
                                          size_t bytes) const {
    memcpy(dst, src, bytes);
}
|
||||
|
||||
} // namespace infini
|
|
@ -4,8 +4,8 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
TensorObj::TensorObj(const Shape &shape, DataType dtype)
|
||||
: TensorBaseObj(shape.size(), dtype), shape(shape) {}
|
||||
TensorObj::TensorObj(const Shape &shape, DataType dtype, Runtime runtime)
|
||||
: TensorBaseObj(shape.size(), dtype, runtime), shape(shape) {}
|
||||
|
||||
VType TensorObj::getData(const Shape &pos) const {
|
||||
return getData(getOffset(pos));
|
||||
|
@ -34,29 +34,12 @@ size_t TensorObj::size() const {
|
|||
return ret;
|
||||
}
|
||||
|
||||
template <typename T> void TensorObj::copyData(const T *dptr) {
|
||||
// TODO: cuda
|
||||
IT_ASSERT(data != nullptr);
|
||||
auto ptr = data->getPtr<T *>();
|
||||
size_t sz = size();
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < sz; ++i) {
|
||||
ptr[i] = dptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
void TensorObj::copyData(vector<VType> dataVector) {
|
||||
IT_ASSERT(dataVector.size() >= size());
|
||||
copyData(dataVector.data());
|
||||
}
|
||||
|
||||
void TensorObj::copyData(vector<float> dataVector) {
|
||||
IT_ASSERT(dataVector.size() >= size());
|
||||
copyData(dataVector.data());
|
||||
}
|
||||
size_t TensorObj::getBytes() const { return size() * dtype.getSize(); }
|
||||
|
||||
void TensorObj::printData() const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
IT_TODO_HALT();
|
||||
if (dtype == DataType::Float32)
|
||||
printDataFloat();
|
||||
else if (dtype == DataType::UInt32)
|
||||
|
@ -120,12 +103,9 @@ void TensorObj::printDataUint32_t() const {
|
|||
}
|
||||
}
|
||||
|
||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
// TODO: deal with data type
|
||||
auto ptr = data->getPtr<VType *>();
|
||||
auto ptrRhs = rhs->data->getPtr<VType *>();
|
||||
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
|
||||
auto ptr = data->getPtr<uint32_t *>();
|
||||
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
size_t sz = size();
|
||||
|
@ -135,6 +115,36 @@ bool TensorObj::equalData(const Tensor &rhs) const {
|
|||
return true;
|
||||
}
|
||||
|
||||
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
// TODO: deal with data type
|
||||
auto ptr = data->getPtr<T *>();
|
||||
auto ptrRhs = rhs->data->getPtr<T *>();
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
size_t sz = size();
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
||||
1e-6) {
|
||||
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
IT_ASSERT(getDType() == rhs->getDType());
|
||||
if (getDType() == DataType::UInt32)
|
||||
return equalDataInt<uint32_t>(rhs);
|
||||
else if (getDType() == DataType::Float32)
|
||||
return equalDataInt<float>(rhs);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
void TensorObj::dataMalloc(const Runtime &runtime) {
|
||||
IT_ASSERT(data == nullptr);
|
||||
size_t bytesPerElement;
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
#include "core/runtime.h"
|
||||
namespace infini {
|
||||
|
||||
TensorBaseObj::TensorBaseObj(int dim, DataType dtype)
|
||||
: dim(dim), dtype(dtype) {}
|
||||
TensorBaseObj::TensorBaseObj(int dim, DataType dtype, Runtime runtime)
|
||||
: dim(dim), dtype(dtype), runtime(runtime) {}
|
||||
|
||||
VType TensorBaseObj::getData(size_t offset) const {
|
||||
// TODO: check cuda array
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#include "cuda/cuda_runtime.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
namespace infini {
|
||||
|
||||
TEST(Graph, build_and_run) {
|
||||
Runtime runtime = make_ref<CpuRuntimeObj>();
|
||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
||||
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
||||
|
@ -18,14 +18,14 @@ TEST(Graph, build_and_run) {
|
|||
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
|
||||
runtime->run(g);
|
||||
// check answer
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||
EXPECT_TRUE(o0->equalData(ans));
|
||||
}
|
||||
|
||||
TEST(Graph, perf_engine) {
|
||||
Runtime runtime = make_ref<CpuRuntimeObj>();
|
||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
||||
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
||||
|
@ -40,7 +40,7 @@ TEST(Graph, perf_engine) {
|
|||
EXPECT_GT(perfTime, 0);
|
||||
EXPECT_LT(perfTime, 0.01);
|
||||
// check answer
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
namespace infini {
|
||||
|
||||
TEST(Conv, ShapeInference) {
|
||||
auto runtime = make_ref<CpuRuntimeObj>();
|
||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
// Padding modes
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
|
@ -43,7 +43,7 @@ TEST(Conv, ShapeInference) {
|
|||
}
|
||||
|
||||
TEST(Conv, NaiveCPU) {
|
||||
auto runtime = make_ref<CpuRuntimeObj>();
|
||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
|
||||
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
|
||||
|
@ -58,7 +58,8 @@ TEST(Conv, NaiveCPU) {
|
|||
EXPECT_GT(perfTime, 0);
|
||||
EXPECT_LT(perfTime, 0.1);
|
||||
// check answer
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
|
||||
auto ans =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->copyData(
|
||||
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
||||
|
@ -68,7 +69,7 @@ TEST(Conv, NaiveCPU) {
|
|||
void testConvCudnn(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
vector<float> ansVec) {
|
||||
auto cpuRuntime = make_ref<CpuRuntimeObj>();
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -80,23 +81,24 @@ void testConvCudnn(
|
|||
g->dataMalloc();
|
||||
|
||||
// Build input and output data on CPU
|
||||
auto cpui0 = make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32);
|
||||
auto cpui0 =
|
||||
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
|
||||
cpui0->dataMalloc(cpuRuntime);
|
||||
cpui0->setData(generator);
|
||||
|
||||
auto cpuw0 = make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32);
|
||||
auto cpuw0 =
|
||||
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
|
||||
cpuw0->dataMalloc(cpuRuntime);
|
||||
cpuw0->setData(generator);
|
||||
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
|
||||
auto ans =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
ans->dataMalloc(cpuRuntime);
|
||||
ans->copyData(ansVec);
|
||||
|
||||
// Copy inputs from CPU to CUDA
|
||||
cudaMemcpy(i0->getDataRawPtr<void *>(), cpui0->getDataRawPtr<void *>(),
|
||||
cpui0->size() * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(w0->getDataRawPtr<void *>(), cpuw0->getDataRawPtr<void *>(),
|
||||
cpuw0->size() * sizeof(float), cudaMemcpyHostToDevice);
|
||||
i0->copyData(cpui0);
|
||||
w0->copyData(cpuw0);
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
// double perfTime = cudaRuntime->getPerfTime(g);
|
||||
|
@ -106,14 +108,13 @@ void testConvCudnn(
|
|||
|
||||
// copy CUDA output to CPU
|
||||
auto o0 = conv->getOutput();
|
||||
auto cpuo0 = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32);
|
||||
auto cpuo0 =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
cpuo0->dataMalloc(cpuRuntime);
|
||||
cudaMemcpy(cpuo0->getDataRawPtr<void *>(),
|
||||
conv->getOutput()->getDataRawPtr<void *>(),
|
||||
cpuo0->size() * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
cpuo0->copyData(o0);
|
||||
|
||||
// check results on CPU
|
||||
EXPECT_TRUE(cpuo0->equalData<float>(ans));
|
||||
EXPECT_TRUE(cpuo0->equalData(ans));
|
||||
}
|
||||
|
||||
TEST(Conv, cuDNN) {
|
||||
|
|
Loading…
Reference in New Issue