Simplify tensor transfer between CPU and CUDA (#10)

* Add: OP infers data type  & Graph clones tensor

* Fix: vecToString format

* Add: static assert for Tensor methods

* Rename: getDataRawPtr -> getRawDataPtr

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
zhengly123 2022-08-25 11:29:16 +08:00 committed by GitHub
parent af08df32d2
commit 93f86d3f4d
19 changed files with 137 additions and 118 deletions

View File

@@ -65,6 +65,7 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
ret.append(std::to_string(d));
ret.append(",");
}
if (!vec.empty())
ret.pop_back();
ret.append("]");
return ret;
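Note: without the new guard, an empty vector would hit pop_back() on a string holding only the opening bracket, so the result would be "]" instead of "[]". A minimal sketch of the fixed helper (the opening "[" is assumed from the elided part of the hunk):

#include <string>
#include <vector>

template <typename T> std::string vecToString(const std::vector<T> &vec) {
    std::string ret = "[";                // assumed from the context above the hunk
    for (const auto &d : vec) {
        ret.append(std::to_string(d));
        ret.append(",");
    }
    if (!vec.empty())
        ret.pop_back();                   // drop the trailing comma, never the "["
    ret.append("]");
    return ret;
}
// vecToString(std::vector<int>{})        -> "[]"
// vecToString(std::vector<int>{1, 2, 3}) -> "[1,2,3]"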

View File

@@ -17,6 +17,12 @@ class GraphObj : public Object {
string toString() const override;
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
Tensor cloneTensor(const Tensor &tensor) {
auto ret = addTensor(tensor->getDims(), tensor->getDType());
ret->dataMalloc();
ret->copyData(tensor);
return ret;
}
/**
* @brief Add an operator and create its outputs. Output tensor arguments
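Note: cloneTensor is the piece that lets the tests below replace the old make_ref<TensorObj> + dataMalloc + copyData sequence with a single call. A hedged usage sketch (include paths are assumptions; the calls themselves all appear in this diff):

#include "core/graph.h"        // assumed path
#include "cuda/cuda_runtime.h" // assumed path
using namespace infini;

void transferToCuda() {
    Runtime cpu = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);
    Tensor aCpu = gCpu->addTensor({2, 3}, DataType::Float32);
    gCpu->dataMalloc();
    // ... fill aCpu via setData or copyData ...

    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cuda);
    // Allocates a tensor with the same dims/dtype on the CUDA runtime and
    // copies the data over (dataMalloc + copyData inside cloneTensor).
    Tensor aCuda = gCuda->cloneTensor(aCpu);
}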

View File

@@ -138,6 +138,7 @@ class OperatorObj : public Object {
: type(opType), inputs(inputs), outputs(outputs) {}
virtual optional<vector<Shape>>
inferShape(const TensorVec &inputs) const = 0;
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
/**
* @brief Constructs outputs (if required) and checks whether the operator is
* valid.
@@ -180,6 +181,7 @@ class OperatorObj : public Object {
protected:
optional<vector<Shape>> inferShape() const;
vector<DataType> inferDataType() const;
private:
/**

View File

@@ -24,7 +24,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const Shape &ds) const;
using TensorBaseObj::getData;
VType getData(const Shape &pos) const;
void dataMalloc(const Runtime &runtime);
void dataMalloc();
template <typename T> void copyData(const T *dptr) {
IT_ASSERT(DataType::get<T>() == dtype);
@@ -45,7 +45,8 @@ class TensorObj : public TensorBaseObj {
copyData(dataVector.data());
}
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
void copyData(const TensorObj *src);
void copyData(const Tensor &src) { copyData(src.get()); }
void setData(
const std::function<void(void *, size_t, DataType)> &generator) const {
generator(data->getPtr<void *>(), size(), dtype);
@@ -54,11 +55,33 @@ class TensorObj : public TensorBaseObj {
void printData() const;
bool equalData(const Tensor &rhs) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(size() == dataVector.size());
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
}
private:
void printDataFloat() const;
void printDataUint32_t() const;
template <typename T> bool equalDataInt(const Tensor &rhs) const;
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
} else
static_assert(!sizeof(T), "Unsupported data type");
}
return true;
}
// void setDims(const Dim &dms) { dims = dms; }
// bool dataRand(int seed = 0) {
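Note: the single equalDataImpl template takes over from the separate equalDataInt/equalDataFloat helpers removed in tensor.cc below: integral types compare exactly, floating-point types compare by relative error, and any other element type is rejected at compile time through a type-dependent static_assert. A self-contained sketch of the same dispatch pattern (the free-function name is illustrative):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <type_traits>

template <typename T> bool equalBuffers(const T *a, const T *b, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        if constexpr (std::is_integral_v<T>) {
            if (a[i] != b[i])
                return false;
        } else if constexpr (std::is_floating_point_v<T>) {
            // relative-error check, matching the 1e-6 threshold used above
            if (std::fabs(a[i] - b[i]) /
                    std::max(std::fabs(a[i]), std::fabs(b[i])) > 1e-6)
                return false;
        } else {
            // !sizeof(T) depends on T, so the assert only fires when this
            // branch is actually instantiated for an unsupported type.
            static_assert(!sizeof(T), "Unsupported data type");
        }
    }
    return true;
}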

View File

@@ -32,8 +32,10 @@ class TensorBaseObj : public Object {
IT_ASSERT(data == nullptr);
data = blob;
}
Blob getDataPtr() const { return data; }
template <typename T> T getDataRawPtr() const {
Blob getDataBlob() const { return data; }
template <typename T> T getRawDataPtr() const {
static_assert(std::is_pointer_v<T>,
"Raw data pointer has a type of pointer");
IT_ASSERT(data != nullptr);
return data->getPtr<T>();
}
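Note: because the template parameter is now checked with std::is_pointer_v, a misuse of the renamed accessor is caught at the call site rather than deep inside Blob. A short illustration (tensor stands for any Tensor from this repo):

float *f = tensor->getRawDataPtr<float *>(); // OK: pointer type
void  *p = tensor->getRawDataPtr<void *>();  // OK: pointer type
// float v = tensor->getRawDataPtr<float>();    // rejected by the static_assert at compile time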

View File

@@ -5,7 +5,7 @@ namespace infini {
void cudaPrintFloat(float *x, int len);
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
} // namespace infini

View File

@@ -36,7 +36,7 @@ class ConvObj : public OperatorObj {
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 3; }
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
Tensor getBias() const { return inputs[2]; }

View File

@@ -33,7 +33,7 @@ class MatmulObj : public OperatorObj {
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return 3; }
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
Tensor getBias() const { return inputs[2]; }

View File

@@ -6,6 +6,10 @@ void GraphObj::updateConnection() { IT_TODO_HALT(); }
string GraphObj::toString() const {
std::ostringstream oss;
oss << "Graph Tensors:\n";
for (const auto &tensor : tensors)
oss << tensor << "\n";
oss << "Graph operators:\n";
for (const auto &op : ops)
oss << op << "\n";
@@ -14,7 +18,7 @@ string GraphObj::toString() const {
void GraphObj::dataMalloc() {
for (auto &tensor : tensors) {
tensor->dataMalloc(runtime);
tensor->dataMalloc();
}
}
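Note: GraphObj::dataMalloc now allocates every tensor on the graph's own runtime in one pass, which is what lets the updated tests allocate first and fill afterwards. A sketch mirroring the test below (generator is any std::function<void(void *, size_t, DataType)>):

Graph gCpu = make_ref<GraphObj>(CpuRuntimeObj::getInstance());
Tensor i0 = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
Tensor w0 = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
gCpu->dataMalloc();      // allocates blobs for all tensors in gCpu
i0->setData(generator);  // fill after allocation
w0->setData(generator);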

View File

@@ -57,9 +57,10 @@ bool OperatorObj::checkValid(GraphObj *graph) {
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i]);
outputs[i] = graph->addTensor(shapes[i]);
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
}
} else { // if graph is nullptr, check that outputs match the inferred shapes
for (size_t i = 0; i < shapes.size(); ++i) {
@@ -74,4 +75,15 @@ optional<vector<Shape>> OperatorObj::inferShape() const {
return inferShape(inputs);
}
vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
auto dataType = inputs[0]->getDType();
for (const auto &tensor : inputs)
IT_ASSERT(dataType == tensor->getDType());
return vector(numOutputs(), dataType);
}
vector<DataType> OperatorObj::inferDataType() const {
return inferDataType(inputs);
}
} // namespace infini
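Note: the default inferDataType propagates the dtype of the first input to every output and asserts that all inputs agree, so checkValid can now create outputs with the right dtype instead of the previous UInt32 default from addTensor. A sketch of the effect, reusing the calls from the cuDNN test below (runtime is assumed to be an existing Runtime):

Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
// conv->getOutput()->getDType() == DataType::Float32, inferred from the inputs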

View File

@@ -116,8 +116,8 @@ Blob RuntimeObj::allocBlob(size_t size) {
}
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
void *dstPtr = dst->getDataRawPtr<void *>();
void *srcPtr = src->getDataRawPtr<void *>();
void *dstPtr = dst->getRawDataPtr<void *>();
void *srcPtr = src->getRawDataPtr<void *>();
size_t bytes = dst->getBytes();
auto dstRuntime = dst->getRuntime();
auto srcRuntime = src->getRuntime();

View File

@@ -11,7 +11,9 @@ VType TensorObj::getData(const Shape &pos) const {
return getData(getOffset(pos));
}
string TensorObj::toString() const { return "Tensor " + std::to_string(guid); }
string TensorObj::toString() const {
return "Tensor " + std::to_string(guid) + " shape " + vecToString(shape);
}
size_t TensorObj::getOffset(const Shape &pos) const {
auto nDim = pos.size();
@@ -103,50 +105,28 @@ void TensorObj::printDataUint32_t() const {
}
}
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
auto ptr = data->getPtr<uint32_t *>();
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (ptr[i] != ptrRhs[i])
return false;
return true;
}
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
// TODO: deal with data type
auto ptr = data->getPtr<T *>();
auto ptrRhs = rhs->data->getPtr<T *>();
if (shape != rhs->getDims())
return false;
size_t sz = size();
for (size_t i = 0; i < sz; ++i)
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
1e-6) {
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
return false;
}
return true;
}
bool TensorObj::equalData(const Tensor &rhs) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
IT_ASSERT(getDType() == rhs->getDType());
IT_ASSERT(runtime->isCpu());
IT_ASSERT(rhs->getRuntime()->isCpu());
if (shape != rhs->getDims())
return false;
if (getDType() == DataType::UInt32)
return equalDataInt<uint32_t>(rhs);
return equalDataImpl(getRawDataPtr<uint32_t *>(),
rhs->getRawDataPtr<uint32_t *>(), size());
else if (getDType() == DataType::Float32)
return equalDataInt<float>(rhs);
return equalDataImpl(getRawDataPtr<float *>(),
rhs->getRawDataPtr<float *>(), size());
else
IT_TODO_HALT();
}
void TensorObj::dataMalloc(const Runtime &runtime) {
IT_ASSERT(data == nullptr);
void TensorObj::dataMalloc() {
if (data != nullptr)
return;
// IT_ASSERT(data == nullptr);
size_t bytesPerElement;
if (getDType() == DataType::Float32)
bytesPerElement = sizeof(float);
@@ -155,4 +135,10 @@ void TensorObj::dataMalloc(const Runtime &runtime) {
data = runtime->allocBlob(size() * bytesPerElement);
}
void TensorObj::copyData(const TensorObj *src) {
IT_ASSERT(dtype == src->getDType());
IT_ASSERT(size() == src->size());
runtime->copyBlob(this, src);
}
}; // namespace infini
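Note: with dataMalloc now a no-op when a blob already exists and copyData(const TensorObj *) delegating to runtime->copyBlob, the same two calls cover both CPU-to-CUDA and CUDA-to-CPU transfers. A hedged sketch of copying a CUDA result back without cloneTensor (gCpu and outCuda are assumed to exist, as in the test below):

Tensor outCpu = gCpu->addTensor(outCuda->getDims(), outCuda->getDType());
outCpu->dataMalloc();        // safe even if gCpu->dataMalloc() already ran
outCpu->copyData(outCuda);   // TensorObj::copyData -> runtime->copyBlob(dst, src)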

View File

@@ -7,9 +7,9 @@ template <typename T> class NaiveConv : public Kernel {
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *context) const override {
auto op = as<ConvObj>(_op);
T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
T *optr = op->getOutput()->getDataRawPtr<T *>();
T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
T *optr = op->getOutput()->getRawDataPtr<T *>();
auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
int cpg = op->getChannelPerGroup();

View File

@@ -7,9 +7,10 @@ template <typename T> class NaiveMatmul : public Kernel {
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *context) const override {
auto op = as<MatmulObj>(_op);
T *A = op->getInputs(0)->getDataRawPtr<T *>();
T *B = op->getInputs(1)->getDataRawPtr<T *>();
T *C = op->getOutput()->getDataRawPtr<T *>();
IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
T *A = op->getInputs(0)->getRawDataPtr<T *>();
T *B = op->getInputs(1)->getRawDataPtr<T *>();
T *C = op->getOutput()->getRawDataPtr<T *>();
IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
IT_ASSERT(op->getAct() == ActType::None);
IT_ASSERT(op->getB() == 1);
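Note: with bias no longer part of the inputs, the naive kernel only has to compute a plain A*B product under the asserted constraints (no transpose, batch 1, no activation). A generic sketch of that computation, not necessarily the repo's exact loop (m, n, k would come from the MatmulObj):

// Row-major C[m x n] = A[m x k] * B[k x n]; illustrative only.
template <typename T>
void naiveMatmul(const T *A, const T *B, T *C, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) {
            T acc = 0;
            for (int p = 0; p < k; ++p)
                acc += A[i * k + p] * B[p * n + j];
            C[i * n + j] = acc;
        }
}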

View File

@@ -26,12 +26,12 @@ class convCudnn : public Kernel {
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;
void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
if (op->getInputs(2) != nullptr)
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
if (op->getInputs().size() > 2) // Bias is not supported yet
IT_TODO_HALT();
// void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
void *const outData = (op->getOutput()->getDataRawPtr<void *>());
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();

View File

@@ -3,20 +3,19 @@
namespace infini {
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph),
pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act),
padding(PaddingMode::Other) {
int ph, int pw, int sh, int sw, int dh, int dw,
[[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(ph), pw(pw),
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(PaddingMode::Other) {
setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph));
}
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1),
pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
PaddingMode mode, int sh, int sw, int dh, int dw,
[[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(-1), pw(-1),
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
IT_ASSERT(mode != PaddingMode::Other);
setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph));

View File

@@ -3,9 +3,9 @@
namespace infini {
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
bool transB, Tensor bias, ActType act)
: OperatorObj(OpType::Matmul, {A, B, bias}, {C}), transA(transA),
transB(transB), act(act), b(A->getDims()[0]),
bool transB, [[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
act(act), b(A->getDims()[0]),
m(transA ? A->getDims()[2] : A->getDims()[1]),
n(transB ? B->getDims()[1] : B->getDims()[2]),
k(transA ? A->getDims()[1] : A->getDims()[2]) {

View File

@@ -19,7 +19,7 @@ TEST(Graph, build_and_run) {
runtime->run(g);
// check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->dataMalloc();
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(o0->equalData(ans));
}
@@ -41,7 +41,7 @@ TEST(Graph, perf_engine) {
EXPECT_LT(perfTime, 0.01);
// check answer
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->dataMalloc();
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
}

View File

@@ -60,7 +60,7 @@ TEST(Conv, NaiveCPU) {
// check answer
auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
ans->dataMalloc(runtime);
ans->dataMalloc();
ans->copyData(
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
EXPECT_TRUE(conv->getOutput()->equalData(ans));
@@ -69,52 +69,35 @@
void testConvCudnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Construct Runtime and graph for CPU and CUDA
Runtime cpu = CpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
auto conv =
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory
g->dataMalloc();
// Build input and output data on CPU
auto cpui0 =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
cpui0->dataMalloc(cpuRuntime);
cpui0->setData(generator);
auto cpuw0 =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
cpuw0->dataMalloc(cpuRuntime);
cpuw0->setData(generator);
auto ans =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
ans->dataMalloc(cpuRuntime);
ans->copyData(ansVec);
// Copy inputs from CPU to CUDA
i0->copyData(cpui0);
w0->copyData(cpuw0);
gCuda->dataMalloc();
// Execute on CUDA
cudaRuntime->run(g);
// double perfTime = cudaRuntime->getPerfTime(g);
// // The example Conv takes 0.015ms with one core
// EXPECT_GT(perfTime, 0);
// EXPECT_LT(perfTime, 0.1);
// copy CUDA output to CPU
auto o0 = conv->getOutput();
auto cpuo0 =
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
cpuo0->dataMalloc(cpuRuntime);
cpuo0->copyData(o0);
cuda->run(gCuda);
// copy output from CUDA to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(cpuo0->equalData(ans));
EXPECT_TRUE(o0Cpu->equalData(ansVec));
// print a tensor/operator/graph by print()
gCuda->print();
}
TEST(Conv, cuDNN) {