Simplify tensor transfer between CPU and CUDA (#10)

* Add: OP infers data type & Graph clones tensor

* Fix: vecToString format

* Add: static assert for Tensor methods

* Rename: getDataRawPtr -> getRawDataPtr

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
zhengly123 2022-08-25 11:29:16 +08:00 committed by GitHub
parent af08df32d2
commit 93f86d3f4d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 137 additions and 118 deletions

View File

@ -63,9 +63,10 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
ret.append("["); ret.append("[");
for (auto d : vec) { for (auto d : vec) {
ret.append(std::to_string(d)); ret.append(std::to_string(d));
ret.append(", "); ret.append(",");
} }
ret.pop_back(); if (!vec.empty())
ret.pop_back();
ret.append("]"); ret.append("]");
return ret; return ret;
} }

View File

@ -17,6 +17,12 @@ class GraphObj : public Object {
string toString() const override; string toString() const override;
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32); Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
Tensor cloneTensor(const Tensor &tensor) {
auto ret = addTensor(tensor->getDims(), tensor->getDType());
ret->dataMalloc();
ret->copyData(tensor);
return ret;
}
/** /**
* @brief Add an operator and create its outputs. Output tensor arguments * @brief Add an operator and create its outputs. Output tensor arguments

View File

@ -138,6 +138,7 @@ class OperatorObj : public Object {
: type(opType), inputs(inputs), outputs(outputs) {} : type(opType), inputs(inputs), outputs(outputs) {}
virtual optional<vector<Shape>> virtual optional<vector<Shape>>
inferShape(const TensorVec &inputs) const = 0; inferShape(const TensorVec &inputs) const = 0;
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
/** /**
* @brief Constructs outputs (if required) and checks whether the operator is * @brief Constructs outputs (if required) and checks whether the operator is
* valid. * valid.
@ -180,6 +181,7 @@ class OperatorObj : public Object {
protected: protected:
optional<vector<Shape>> inferShape() const; optional<vector<Shape>> inferShape() const;
vector<DataType> inferDataType() const;
private: private:
/** /**

View File

@ -24,7 +24,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const Shape &ds) const; size_t getOffset(const Shape &ds) const;
using TensorBaseObj::getData; using TensorBaseObj::getData;
VType getData(const Shape &pos) const; VType getData(const Shape &pos) const;
void dataMalloc(const Runtime &runtime); void dataMalloc();
template <typename T> void copyData(const T *dptr) { template <typename T> void copyData(const T *dptr) {
IT_ASSERT(DataType::get<T>() == dtype); IT_ASSERT(DataType::get<T>() == dtype);
@ -45,7 +45,8 @@ class TensorObj : public TensorBaseObj {
copyData(dataVector.data()); copyData(dataVector.data());
} }
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); } void copyData(const TensorObj *src);
void copyData(const Tensor &src) { copyData(src.get()); }
void setData( void setData(
const std::function<void(void *, size_t, DataType)> &generator) const { const std::function<void(void *, size_t, DataType)> &generator) const {
generator(data->getPtr<void *>(), size(), dtype); generator(data->getPtr<void *>(), size(), dtype);
@ -54,11 +55,33 @@ class TensorObj : public TensorBaseObj {
void printData() const; void printData() const;
bool equalData(const Tensor &rhs) const; bool equalData(const Tensor &rhs) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(size() == dataVector.size());
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
}
private: private:
void printDataFloat() const; void printDataFloat() const;
void printDataUint32_t() const; void printDataUint32_t() const;
template <typename T> bool equalDataInt(const Tensor &rhs) const;
template <typename T> bool equalDataFloat(const Tensor &rhs) const; template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else
static_assert(!sizeof(T), "Unsupported data type");
}
return true;
}
// void setDims(const Dim &dms) { dims = dms; } // void setDims(const Dim &dms) { dims = dms; }
// bool dataRand(int seed = 0) { // bool dataRand(int seed = 0) {

View File

@ -32,8 +32,10 @@ class TensorBaseObj : public Object {
IT_ASSERT(data == nullptr); IT_ASSERT(data == nullptr);
data = blob; data = blob;
} }
Blob getDataPtr() const { return data; } Blob getDataBlob() const { return data; }
template <typename T> T getDataRawPtr() const { template <typename T> T getRawDataPtr() const {
static_assert(std::is_pointer_v<T>,
"Raw data pointer has a type of pointer");
IT_ASSERT(data != nullptr); IT_ASSERT(data != nullptr);
return data->getPtr<T>(); return data->getPtr<T>();
} }

View File

@ -5,7 +5,7 @@ namespace infini {
void cudaPrintFloat(float *x, int len); void cudaPrintFloat(float *x, int len);
void cudaPrintTensor(const Tensor &tensor) { void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size()); cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
} }
} // namespace infini } // namespace infini

View File

@ -36,7 +36,7 @@ class ConvObj : public OperatorObj {
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override; std::string toString() const override;
int numInputs() const override { return 3; } int numInputs() const override { return 2; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
Tensor getBias() const { return inputs[2]; } Tensor getBias() const { return inputs[2]; }

View File

@ -33,7 +33,7 @@ class MatmulObj : public OperatorObj {
std::string toString() const override; std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override; optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 3; } int numInputs() const override { return 2; }
int numOutputs() const override { return 1; } int numOutputs() const override { return 1; }
Tensor getBias() const { return inputs[2]; } Tensor getBias() const { return inputs[2]; }

View File

@ -6,6 +6,10 @@ void GraphObj::updateConnection() { IT_TODO_HALT(); }
string GraphObj::toString() const { string GraphObj::toString() const {
std::ostringstream oss; std::ostringstream oss;
oss << "Graph Tensors:\n";
for (const auto &tensor : tensors)
oss << tensor << "\n";
oss << "Graph operators:\n"; oss << "Graph operators:\n";
for (const auto &op : ops) for (const auto &op : ops)
oss << op << "\n"; oss << op << "\n";
@ -14,7 +18,7 @@ string GraphObj::toString() const {
void GraphObj::dataMalloc() { void GraphObj::dataMalloc() {
for (auto &tensor : tensors) { for (auto &tensor : tensors) {
tensor->dataMalloc(runtime); tensor->dataMalloc();
} }
} }

View File

@ -57,9 +57,10 @@ bool OperatorObj::checkValid(GraphObj *graph) {
if (shapes.size() != outputs.size()) if (shapes.size() != outputs.size())
return false; return false;
if (graph) { // if graph != nullptr, outputs should be created if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i]); IT_ASSERT(!outputs[i]);
outputs[i] = graph->addTensor(shapes[i]); outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
} }
} else { // if graph is not empty, check outputs match inferred shapes } else { // if graph is not empty, check outputs match inferred shapes
for (size_t i = 0; i < shapes.size(); ++i) { for (size_t i = 0; i < shapes.size(); ++i) {
@ -74,4 +75,15 @@ optional<vector<Shape>> OperatorObj::inferShape() const {
return inferShape(inputs); return inferShape(inputs);
} }
vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
auto dataType = inputs[0]->getDType();
for (const auto &tensor : inputs)
IT_ASSERT(dataType == tensor->getDType());
return vector(numOutputs(), dataType);
}
vector<DataType> OperatorObj::inferDataType() const {
return inferDataType(inputs);
}
} // namespace infini } // namespace infini

View File

@ -116,8 +116,8 @@ Blob RuntimeObj::allocBlob(size_t size) {
} }
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const { void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
void *dstPtr = dst->getDataRawPtr<void *>(); void *dstPtr = dst->getRawDataPtr<void *>();
void *srcPtr = src->getDataRawPtr<void *>(); void *srcPtr = src->getRawDataPtr<void *>();
size_t bytes = dst->getBytes(); size_t bytes = dst->getBytes();
auto dstRuntime = dst->getRuntime(); auto dstRuntime = dst->getRuntime();
auto srcRuntime = src->getRuntime(); auto srcRuntime = src->getRuntime();

View File

@ -11,7 +11,9 @@ VType TensorObj::getData(const Shape &pos) const {
return getData(getOffset(pos)); return getData(getOffset(pos));
} }
string TensorObj::toString() const { return "Tensor " + std::to_string(guid); } string TensorObj::toString() const {
return "Tensor " + std::to_string(guid) + " shape " + vecToString(shape);
}
size_t TensorObj::getOffset(const Shape &pos) const { size_t TensorObj::getOffset(const Shape &pos) const {
auto nDim = pos.size(); auto nDim = pos.size();
@ -103,50 +105,28 @@ void TensorObj::printDataUint32_t() const {
} }
} }
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
auto ptr = data->getPtr<uint32_t *>();
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (ptr[i] != ptrRhs[i])
return false;
return true;
}
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
// TODO: deal with data type
auto ptr = data->getPtr<T *>();
auto ptrRhs = rhs->data->getPtr<T *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
return false;
}
return true;
}
bool TensorObj::equalData(const Tensor &rhs) const { bool TensorObj::equalData(const Tensor &rhs) const {
IT_ASSERT(data != nullptr); IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr); IT_ASSERT(rhs->data != nullptr);
IT_ASSERT(getDType() == rhs->getDType()); IT_ASSERT(getDType() == rhs->getDType());
IT_ASSERT(runtime->isCpu());
IT_ASSERT(rhs->getRuntime()->isCpu());
if (shape != rhs->getDims())
return false;
if (getDType() == DataType::UInt32) if (getDType() == DataType::UInt32)
return equalDataInt<uint32_t>(rhs); return equalDataImpl(getRawDataPtr<uint32_t *>(),
rhs->getRawDataPtr<uint32_t *>(), size());
else if (getDType() == DataType::Float32) else if (getDType() == DataType::Float32)
return equalDataInt<float>(rhs); return equalDataImpl(getRawDataPtr<float *>(),
rhs->getRawDataPtr<float *>(), size());
else else
IT_TODO_HALT(); IT_TODO_HALT();
} }
void TensorObj::dataMalloc(const Runtime &runtime) { void TensorObj::dataMalloc() {
IT_ASSERT(data == nullptr); if (data != nullptr)
return;
// IT_ASSERT(data == nullptr);
size_t bytesPerElement; size_t bytesPerElement;
if (getDType() == DataType::Float32) if (getDType() == DataType::Float32)
bytesPerElement = sizeof(float); bytesPerElement = sizeof(float);
@ -155,4 +135,10 @@ void TensorObj::dataMalloc(const Runtime &runtime) {
data = runtime->allocBlob(size() * bytesPerElement); data = runtime->allocBlob(size() * bytesPerElement);
} }
void TensorObj::copyData(const TensorObj *src) {
IT_ASSERT(dtype == src->getDType());
IT_ASSERT(size() == src->size());
runtime->copyBlob(this, src);
}
}; // namespace infini }; // namespace infini

View File

@ -7,9 +7,9 @@ template <typename T> class NaiveConv : public Kernel {
void compute(const Operator &_op, const PerfRecord &record, void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *context) const override { const RuntimeObj *context) const override {
auto op = as<ConvObj>(_op); auto op = as<ConvObj>(_op);
T *iptr = op->getInputs(0)->getDataRawPtr<T *>(); T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
T *wptr = op->getInputs(1)->getDataRawPtr<T *>(); T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
T *optr = op->getOutput()->getDataRawPtr<T *>(); T *optr = op->getOutput()->getRawDataPtr<T *>();
auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
int cpg = op->getChannelPerGroup(); int cpg = op->getChannelPerGroup();

View File

@ -7,9 +7,10 @@ template <typename T> class NaiveMatmul : public Kernel {
void compute(const Operator &_op, const PerfRecord &record, void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *context) const override { const RuntimeObj *context) const override {
auto op = as<MatmulObj>(_op); auto op = as<MatmulObj>(_op);
T *A = op->getInputs(0)->getDataRawPtr<T *>(); IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
T *B = op->getInputs(1)->getDataRawPtr<T *>(); T *A = op->getInputs(0)->getRawDataPtr<T *>();
T *C = op->getOutput()->getDataRawPtr<T *>(); T *B = op->getInputs(1)->getRawDataPtr<T *>();
T *C = op->getOutput()->getRawDataPtr<T *>();
IT_ASSERT(op->getTransA() == false && op->getTransB() == false); IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
IT_ASSERT(op->getAct() == ActType::None); IT_ASSERT(op->getAct() == ActType::None);
IT_ASSERT(op->getB() == 1); IT_ASSERT(op->getB() == 1);

View File

@ -26,12 +26,12 @@ class convCudnn : public Kernel {
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record, bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const { const CudaRuntimeObj *context) const {
cudnnStatus_t stat; cudnnStatus_t stat;
void *const inData = (op->getInputs(0)->getDataRawPtr<void *>()); void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getDataRawPtr<void *>()); void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
if (op->getInputs(2) != nullptr) if (op->getInputs().size() > 2) // Bias is not supported yet
IT_TODO_HALT(); IT_TODO_HALT();
// void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>()); // void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getDataRawPtr<void *>()); void *const outData = (op->getOutput()->getRawDataPtr<void *>());
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup(); const int cpg = op->getChannelPerGroup();

View File

@ -3,20 +3,19 @@
namespace infini { namespace infini {
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias, int ph, int pw, int sh, int sw, int dh, int dw,
ActType act) [[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph), : OperatorObj(OpType::Conv, {input, weight}, {output}), ph(ph), pw(pw),
pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(PaddingMode::Other) {
padding(PaddingMode::Other) {
setAuxilaryAttributes(PaddingMode::Other); setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph)); IT_ASSERT(checkValid(graph));
} }
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias, PaddingMode mode, int sh, int sw, int dh, int dw,
ActType act) [[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1), : OperatorObj(OpType::Conv, {input, weight}, {output}), ph(-1), pw(-1),
pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) { sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
IT_ASSERT(mode != PaddingMode::Other); IT_ASSERT(mode != PaddingMode::Other);
setAuxilaryAttributes(mode); setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph)); IT_ASSERT(checkValid(graph));

View File

@ -3,9 +3,9 @@
namespace infini { namespace infini {
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA, MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
bool transB, Tensor bias, ActType act) bool transB, [[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Matmul, {A, B, bias}, {C}), transA(transA), : OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
transB(transB), act(act), b(A->getDims()[0]), act(act), b(A->getDims()[0]),
m(transA ? A->getDims()[2] : A->getDims()[1]), m(transA ? A->getDims()[2] : A->getDims()[1]),
n(transB ? B->getDims()[1] : B->getDims()[2]), n(transB ? B->getDims()[1] : B->getDims()[2]),
k(transA ? A->getDims()[1] : A->getDims()[2]) { k(transA ? A->getDims()[1] : A->getDims()[2]) {

View File

@ -19,7 +19,7 @@ TEST(Graph, build_and_run) {
runtime->run(g); runtime->run(g);
// check answer // check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime); auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime); ans->dataMalloc();
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128}); ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(o0->equalData(ans)); EXPECT_TRUE(o0->equalData(ans));
} }
@ -41,7 +41,7 @@ TEST(Graph, perf_engine) {
EXPECT_LT(perfTime, 0.01); EXPECT_LT(perfTime, 0.01);
// check answer // check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime); auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime); ans->dataMalloc();
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128}); ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(matmul->getOutput()->equalData(ans)); EXPECT_TRUE(matmul->getOutput()->equalData(ans));
} }

View File

@ -60,7 +60,7 @@ TEST(Conv, NaiveCPU) {
// check answer // check answer
auto ans = auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime); make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
ans->dataMalloc(runtime); ans->dataMalloc();
ans->copyData( ans->copyData(
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
EXPECT_TRUE(conv->getOutput()->equalData(ans)); EXPECT_TRUE(conv->getOutput()->equalData(ans));
@ -69,52 +69,35 @@ TEST(Conv, NaiveCPU) {
void testConvCudnn( void testConvCudnn(
const std::function<void(void *, size_t, DataType)> &generator, const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) { vector<float> ansVec) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance(); // Construct Runtime and graph for CPU and CUDA
auto cudaRuntime = make_ref<CudaRuntimeObj>(); Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
// Build CUDA graph // Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime); auto conv =
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32); gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory // allocate CUDA memory
g->dataMalloc(); gCuda->dataMalloc();
// Build input and output data on CPU
auto cpui0 =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
cpui0->dataMalloc(cpuRuntime);
cpui0->setData(generator);
auto cpuw0 =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
cpuw0->dataMalloc(cpuRuntime);
cpuw0->setData(generator);
auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
ans->dataMalloc(cpuRuntime);
ans->copyData(ansVec);
// Copy inputs from CPU to CUDA
i0->copyData(cpui0);
w0->copyData(cpuw0);
// Execute on CUDA // Execute on CUDA
cudaRuntime->run(g); cuda->run(gCuda);
// double perfTime = cudaRuntime->getPerfTime(g); // copy output from CUDA to CPU
// // The example Conv takes 0.015ms with one core auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// EXPECT_GT(perfTime, 0);
// EXPECT_LT(perfTime, 0.1);
// copy CUDA output to CPU
auto o0 = conv->getOutput();
auto cpuo0 =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
cpuo0->dataMalloc(cpuRuntime);
cpuo0->copyData(o0);
// check results on CPU // check results on CPU
EXPECT_TRUE(cpuo0->equalData(ans)); EXPECT_TRUE(o0Cpu->equalData(ansVec));
// print a tensor/operator/graph by print()
gCuda->print();
} }
TEST(Conv, cuDNN) { TEST(Conv, cuDNN) {