forked from jiuyuan/InfiniTensor
Simplify tensor transfer between CPU and CUDA (#10)
* Add: OP infers data type & Graph clones tensor * Fix: vecToString format * Add: static assert for Tensor methods * Rename: getDataRawPtr -> getRawDataPtr Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
parent
af08df32d2
commit
93f86d3f4d
|
@ -63,9 +63,10 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
|||
ret.append("[");
|
||||
for (auto d : vec) {
|
||||
ret.append(std::to_string(d));
|
||||
ret.append(", ");
|
||||
ret.append(",");
|
||||
}
|
||||
ret.pop_back();
|
||||
if (!vec.empty())
|
||||
ret.pop_back();
|
||||
ret.append("]");
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,12 @@ class GraphObj : public Object {
|
|||
string toString() const override;
|
||||
|
||||
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
|
||||
/**
 * @brief Clone a tensor into this graph.
 *
 * Adds a new tensor with the dims and data type of `tensor`, allocates its
 * storage and copies the contents of `tensor` into it.
 *
 * @param tensor Tensor to duplicate.
 * @return The newly added tensor.
 */
Tensor cloneTensor(const Tensor &tensor) {
    Tensor cloned = addTensor(tensor->getDims(), tensor->getDType());
    cloned->dataMalloc();
    cloned->copyData(tensor);
    return cloned;
}
|
||||
|
||||
/**
|
||||
* @brief Add an operator and create its outputs. Output tensor arguments
|
||||
|
|
|
@ -138,6 +138,7 @@ class OperatorObj : public Object {
|
|||
: type(opType), inputs(inputs), outputs(outputs) {}
|
||||
virtual optional<vector<Shape>>
|
||||
inferShape(const TensorVec &inputs) const = 0;
|
||||
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
|
||||
/**
|
||||
* @brief Constructs outputs (if required) and checks whether the operator is
|
||||
* valid.
|
||||
|
@ -180,6 +181,7 @@ class OperatorObj : public Object {
|
|||
|
||||
protected:
|
||||
optional<vector<Shape>> inferShape() const;
|
||||
vector<DataType> inferDataType() const;
|
||||
|
||||
private:
|
||||
/**
|
||||
|
|
|
@ -24,7 +24,7 @@ class TensorObj : public TensorBaseObj {
|
|||
size_t getOffset(const Shape &ds) const;
|
||||
using TensorBaseObj::getData;
|
||||
VType getData(const Shape &pos) const;
|
||||
void dataMalloc(const Runtime &runtime);
|
||||
void dataMalloc();
|
||||
|
||||
template <typename T> void copyData(const T *dptr) {
|
||||
IT_ASSERT(DataType::get<T>() == dtype);
|
||||
|
@ -45,7 +45,8 @@ class TensorObj : public TensorBaseObj {
|
|||
copyData(dataVector.data());
|
||||
}
|
||||
|
||||
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
|
||||
void copyData(const TensorObj *src);
|
||||
void copyData(const Tensor &src) { copyData(src.get()); }
|
||||
void setData(
|
||||
const std::function<void(void *, size_t, DataType)> &generator) const {
|
||||
generator(data->getPtr<void *>(), size(), dtype);
|
||||
|
@ -54,11 +55,33 @@ class TensorObj : public TensorBaseObj {
|
|||
void printData() const;
|
||||
bool equalData(const Tensor &rhs) const;
|
||||
|
||||
template <typename T> bool equalData(const vector<T> &dataVector) {
|
||||
IT_ASSERT(DataType::get<T>() == dtype);
|
||||
IT_ASSERT(size() == dataVector.size());
|
||||
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
|
||||
}
|
||||
|
||||
private:
|
||||
void printDataFloat() const;
|
||||
void printDataUint32_t() const;
|
||||
template <typename T> bool equalDataInt(const Tensor &rhs) const;
|
||||
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
|
||||
|
||||
/**
 * @brief Element-wise comparison of two raw buffers.
 *
 * Integral types must match exactly. Floating-point types match when the
 * values are identical or their relative error is at most 1e-6. NaN never
 * compares equal to anything, and an infinity only matches an identical
 * infinity. The first floating-point mismatch is reported on stdout.
 *
 * The function uses no instance state, hence it is static.
 *
 * @param a First buffer, at least `size` elements.
 * @param b Second buffer, at least `size` elements.
 * @param size Number of elements to compare.
 * @return true if all `size` elements match (trivially true for size 0).
 */
template <typename T>
static bool equalDataImpl(const T *a, const T *b, size_t size) {
    static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>,
                  "Unsupported data type");
    for (size_t i = 0; i < size; ++i) {
        // Exact match: also covers 0 vs 0 and equal infinities, which would
        // otherwise produce NaN in the relative error below.
        if (a[i] == b[i])
            continue;
        if constexpr (std::is_integral_v<T>) {
            return false;
        } else {
            const auto diff = fabs(a[i] - b[i]);
            const auto scale = std::max(fabs(a[i]), fabs(b[i]));
            // Negated "<=" so that a NaN relative error counts as a mismatch.
            if (!(diff / scale <= 1e-6)) {
                printf("Error on %zu: %f %f\n", i, static_cast<double>(a[i]),
                       static_cast<double>(b[i]));
                return false;
            }
        }
    }
    return true;
}
|
||||
// void setDims(const Dim &dms) { dims = dms; }
|
||||
|
||||
// bool dataRand(int seed = 0) {
|
||||
|
|
|
@ -32,8 +32,10 @@ class TensorBaseObj : public Object {
|
|||
IT_ASSERT(data == nullptr);
|
||||
data = blob;
|
||||
}
|
||||
Blob getDataPtr() const { return data; }
|
||||
template <typename T> T getDataRawPtr() const {
|
||||
Blob getDataBlob() const { return data; }
|
||||
template <typename T> T getRawDataPtr() const {
|
||||
static_assert(std::is_pointer_v<T>,
|
||||
"Raw data pointer has a type of pointer");
|
||||
IT_ASSERT(data != nullptr);
|
||||
return data->getPtr<T>();
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ namespace infini {
|
|||
void cudaPrintFloat(float *x, int len);
|
||||
|
||||
void cudaPrintTensor(const Tensor &tensor) {
|
||||
cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
|
||||
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -36,7 +36,7 @@ class ConvObj : public OperatorObj {
|
|||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 3; }
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
Tensor getBias() const { return inputs[2]; }
|
||||
|
|
|
@ -33,7 +33,7 @@ class MatmulObj : public OperatorObj {
|
|||
std::string toString() const override;
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
int numInputs() const override { return 3; }
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
Tensor getBias() const { return inputs[2]; }
|
||||
|
|
|
@ -6,6 +6,10 @@ void GraphObj::updateConnection() { IT_TODO_HALT(); }
|
|||
|
||||
string GraphObj::toString() const {
|
||||
std::ostringstream oss;
|
||||
oss << "Graph Tensors:\n";
|
||||
for (const auto &tensor : tensors)
|
||||
oss << tensor << "\n";
|
||||
|
||||
oss << "Graph operators:\n";
|
||||
for (const auto &op : ops)
|
||||
oss << op << "\n";
|
||||
|
@ -14,7 +18,7 @@ string GraphObj::toString() const {
|
|||
|
||||
void GraphObj::dataMalloc() {
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->dataMalloc(runtime);
|
||||
tensor->dataMalloc();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -57,9 +57,10 @@ bool OperatorObj::checkValid(GraphObj *graph) {
|
|||
if (shapes.size() != outputs.size())
|
||||
return false;
|
||||
if (graph) { // if graph != nullptr, outputs should be created
|
||||
auto dataTypes = inferDataType();
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
IT_ASSERT(!outputs[i]);
|
||||
outputs[i] = graph->addTensor(shapes[i]);
|
||||
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
|
||||
}
|
||||
} else { // if graph is nullptr, check outputs match inferred shapes
|
||||
for (size_t i = 0; i < shapes.size(); ++i) {
|
||||
|
@ -74,4 +75,15 @@ optional<vector<Shape>> OperatorObj::inferShape() const {
|
|||
return inferShape(inputs);
|
||||
}
|
||||
|
||||
/**
 * @brief Default data type inference.
 *
 * Requires every input tensor to have the same data type and assigns that
 * type to each of the numOutputs() outputs.
 *
 * @param inputs Input tensors; must contain at least one tensor.
 * @return One data type per output.
 */
vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
    // inputs[0] is read below, so an empty input list must be rejected
    // explicitly instead of indexing out of bounds.
    IT_ASSERT(inputs.size() > 0);
    auto dataType = inputs[0]->getDType();
    for (const auto &tensor : inputs)
        IT_ASSERT(dataType == tensor->getDType());
    return vector<DataType>(numOutputs(), dataType);
}

/**
 * @brief Infer the output data types from this operator's own inputs.
 */
vector<DataType> OperatorObj::inferDataType() const {
    return inferDataType(inputs);
}
|
||||
|
||||
} // namespace infini
|
|
@ -116,8 +116,8 @@ Blob RuntimeObj::allocBlob(size_t size) {
|
|||
}
|
||||
|
||||
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
|
||||
void *dstPtr = dst->getDataRawPtr<void *>();
|
||||
void *srcPtr = src->getDataRawPtr<void *>();
|
||||
void *dstPtr = dst->getRawDataPtr<void *>();
|
||||
void *srcPtr = src->getRawDataPtr<void *>();
|
||||
size_t bytes = dst->getBytes();
|
||||
auto dstRuntime = dst->getRuntime();
|
||||
auto srcRuntime = src->getRuntime();
|
||||
|
|
|
@ -11,7 +11,9 @@ VType TensorObj::getData(const Shape &pos) const {
|
|||
return getData(getOffset(pos));
|
||||
}
|
||||
|
||||
string TensorObj::toString() const { return "Tensor " + std::to_string(guid); }
|
||||
// Human-readable description: "Tensor <guid> shape [d0,d1,...]".
string TensorObj::toString() const {
    string repr = "Tensor ";
    repr += std::to_string(guid);
    repr += " shape ";
    repr += vecToString(shape);
    return repr;
}
|
||||
|
||||
size_t TensorObj::getOffset(const Shape &pos) const {
|
||||
auto nDim = pos.size();
|
||||
|
@ -103,50 +105,28 @@ void TensorObj::printDataUint32_t() const {
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
|
||||
auto ptr = data->getPtr<uint32_t *>();
|
||||
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
size_t sz = size();
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
if (ptr[i] != ptrRhs[i])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
// TODO: deal with data type
|
||||
auto ptr = data->getPtr<T *>();
|
||||
auto ptrRhs = rhs->data->getPtr<T *>();
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
size_t sz = size();
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
||||
1e-6) {
|
||||
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
IT_ASSERT(getDType() == rhs->getDType());
|
||||
IT_ASSERT(runtime->isCpu());
|
||||
IT_ASSERT(rhs->getRuntime()->isCpu());
|
||||
if (shape != rhs->getDims())
|
||||
return false;
|
||||
if (getDType() == DataType::UInt32)
|
||||
return equalDataInt<uint32_t>(rhs);
|
||||
return equalDataImpl(getRawDataPtr<uint32_t *>(),
|
||||
rhs->getRawDataPtr<uint32_t *>(), size());
|
||||
else if (getDType() == DataType::Float32)
|
||||
return equalDataInt<float>(rhs);
|
||||
return equalDataImpl(getRawDataPtr<float *>(),
|
||||
rhs->getRawDataPtr<float *>(), size());
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
void TensorObj::dataMalloc(const Runtime &runtime) {
|
||||
IT_ASSERT(data == nullptr);
|
||||
void TensorObj::dataMalloc() {
|
||||
if (data != nullptr)
|
||||
return;
|
||||
// IT_ASSERT(data == nullptr);
|
||||
size_t bytesPerElement;
|
||||
if (getDType() == DataType::Float32)
|
||||
bytesPerElement = sizeof(float);
|
||||
|
@ -155,4 +135,10 @@ void TensorObj::dataMalloc(const Runtime &runtime) {
|
|||
data = runtime->allocBlob(size() * bytesPerElement);
|
||||
}
|
||||
|
||||
/**
 * @brief Copy the contents of another tensor into this one.
 *
 * The source must be non-null and must have the same data type and element
 * count as this tensor. The copy itself is delegated to this tensor's
 * runtime via copyBlob.
 *
 * @param src Tensor to copy from.
 */
void TensorObj::copyData(const TensorObj *src) {
    // copyData(const Tensor &) forwards src.get(), which may be null.
    IT_ASSERT(src != nullptr);
    IT_ASSERT(dtype == src->getDType());
    IT_ASSERT(size() == src->size());
    runtime->copyBlob(this, src);
}
|
||||
|
||||
}; // namespace infini
|
|
@ -7,9 +7,9 @@ template <typename T> class NaiveConv : public Kernel {
|
|||
void compute(const Operator &_op, const PerfRecord &record,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<ConvObj>(_op);
|
||||
T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
|
||||
T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
|
||||
T *optr = op->getOutput()->getDataRawPtr<T *>();
|
||||
T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
|
||||
T *optr = op->getOutput()->getRawDataPtr<T *>();
|
||||
auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
int cpg = op->getChannelPerGroup();
|
||||
|
|
|
@ -7,9 +7,10 @@ template <typename T> class NaiveMatmul : public Kernel {
|
|||
void compute(const Operator &_op, const PerfRecord &record,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<MatmulObj>(_op);
|
||||
T *A = op->getInputs(0)->getDataRawPtr<T *>();
|
||||
T *B = op->getInputs(1)->getDataRawPtr<T *>();
|
||||
T *C = op->getOutput()->getDataRawPtr<T *>();
|
||||
IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
|
||||
T *A = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *B = op->getInputs(1)->getRawDataPtr<T *>();
|
||||
T *C = op->getOutput()->getRawDataPtr<T *>();
|
||||
IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
|
||||
IT_ASSERT(op->getAct() == ActType::None);
|
||||
IT_ASSERT(op->getB() == 1);
|
||||
|
|
|
@ -26,12 +26,12 @@ class convCudnn : public Kernel {
|
|||
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
|
||||
const CudaRuntimeObj *context) const {
|
||||
cudnnStatus_t stat;
|
||||
void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
|
||||
void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
|
||||
if (op->getInputs(2) != nullptr)
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
if (op->getInputs().size() > 2) // Bias is not supported yet
|
||||
IT_TODO_HALT();
|
||||
// void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getDataRawPtr<void *>());
|
||||
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
const int cpg = op->getChannelPerGroup();
|
||||
|
|
|
@ -3,20 +3,19 @@
|
|||
namespace infini {
|
||||
|
||||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
|
||||
ActType act)
|
||||
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph),
|
||||
pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act),
|
||||
padding(PaddingMode::Other) {
|
||||
int ph, int pw, int sh, int sw, int dh, int dw,
|
||||
[[maybe_unused]] Tensor bias, ActType act)
|
||||
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(ph), pw(pw),
|
||||
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(PaddingMode::Other) {
|
||||
setAuxilaryAttributes(PaddingMode::Other);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
|
||||
ActType act)
|
||||
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1),
|
||||
pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
|
||||
PaddingMode mode, int sh, int sw, int dh, int dw,
|
||||
[[maybe_unused]] Tensor bias, ActType act)
|
||||
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(-1), pw(-1),
|
||||
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
|
||||
IT_ASSERT(mode != PaddingMode::Other);
|
||||
setAuxilaryAttributes(mode);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
namespace infini {
|
||||
|
||||
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
|
||||
bool transB, Tensor bias, ActType act)
|
||||
: OperatorObj(OpType::Matmul, {A, B, bias}, {C}), transA(transA),
|
||||
transB(transB), act(act), b(A->getDims()[0]),
|
||||
bool transB, [[maybe_unused]] Tensor bias, ActType act)
|
||||
: OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
|
||||
act(act), b(A->getDims()[0]),
|
||||
m(transA ? A->getDims()[2] : A->getDims()[1]),
|
||||
n(transB ? B->getDims()[1] : B->getDims()[2]),
|
||||
k(transA ? A->getDims()[1] : A->getDims()[2]) {
|
||||
|
|
|
@ -19,7 +19,7 @@ TEST(Graph, build_and_run) {
|
|||
runtime->run(g);
|
||||
// check answer
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->dataMalloc();
|
||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||
EXPECT_TRUE(o0->equalData(ans));
|
||||
}
|
||||
|
@ -41,7 +41,7 @@ TEST(Graph, perf_engine) {
|
|||
EXPECT_LT(perfTime, 0.01);
|
||||
// check answer
|
||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->dataMalloc();
|
||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ TEST(Conv, NaiveCPU) {
|
|||
// check answer
|
||||
auto ans =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
|
||||
ans->dataMalloc(runtime);
|
||||
ans->dataMalloc();
|
||||
ans->copyData(
|
||||
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
||||
EXPECT_TRUE(conv->getOutput()->equalData(ans));
|
||||
|
@ -69,52 +69,35 @@ TEST(Conv, NaiveCPU) {
|
|||
void testConvCudnn(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
vector<float> ansVec) {
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
// Construct Runtime and graph for CPU and CUDA
|
||||
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
Runtime cuda = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cuda);
|
||||
// Set input data on CPU in a CPU Graph
|
||||
Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
|
||||
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
|
||||
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||
gCpu->dataMalloc();
|
||||
i0Cpu->setData(generator);
|
||||
w0Cpu->setData(generator);
|
||||
|
||||
// Copy input tensors from CPU to CUDA
|
||||
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
|
||||
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
|
||||
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
|
||||
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
|
||||
|
||||
auto conv =
|
||||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
|
||||
// Build input and output data on CPU
|
||||
auto cpui0 =
|
||||
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
|
||||
cpui0->dataMalloc(cpuRuntime);
|
||||
cpui0->setData(generator);
|
||||
|
||||
auto cpuw0 =
|
||||
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
|
||||
cpuw0->dataMalloc(cpuRuntime);
|
||||
cpuw0->setData(generator);
|
||||
|
||||
auto ans =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
ans->dataMalloc(cpuRuntime);
|
||||
ans->copyData(ansVec);
|
||||
|
||||
// Copy inputs from CPU to CUDA
|
||||
i0->copyData(cpui0);
|
||||
w0->copyData(cpuw0);
|
||||
gCuda->dataMalloc();
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
// double perfTime = cudaRuntime->getPerfTime(g);
|
||||
// // The example Conv takes 0.015ms with one core
|
||||
// EXPECT_GT(perfTime, 0);
|
||||
// EXPECT_LT(perfTime, 0.1);
|
||||
|
||||
// copy CUDA output to CPU
|
||||
auto o0 = conv->getOutput();
|
||||
auto cpuo0 =
|
||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
cpuo0->dataMalloc(cpuRuntime);
|
||||
cpuo0->copyData(o0);
|
||||
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
|
||||
// check results on CPU
|
||||
EXPECT_TRUE(cpuo0->equalData(ans));
|
||||
EXPECT_TRUE(o0Cpu->equalData(ansVec));
|
||||
// print a tensor/operator/graph by print()
|
||||
gCuda->print();
|
||||
}
|
||||
|
||||
TEST(Conv, cuDNN) {
|
||||
|
|
Loading…
Reference in New Issue