Simplify tensor transfer between CPU and CUDA (#10)

* Add: OP infers data type & Graph clones tensor * Fix: vecToString format * Add: static assert for Tensor methods * Rename: getDataRawPtr -> getRawDataPtr Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
2022-08-25 11:29:16 +08:00 · 2022-08-25 11:29:16 +08:00 · 93f86d3f4d
parent af08df32d2
commit 93f86d3f4d
19 changed files with 137 additions and 118 deletions
--- a/include/core/common.h
+++ b/include/core/common.h
@ -63,9 +63,10 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
    ret.append("[");
    for (auto d : vec) {
        ret.append(std::to_string(d));
-        ret.append(", ");
+        ret.append(",");
    }
-    ret.pop_back();
+    if (!vec.empty())
        ret.pop_back();
    ret.append("]");
    return ret;
 }
--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -17,6 +17,12 @@ class GraphObj : public Object {
    string toString() const override;
    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
    /**
     * @brief Add a tensor to this graph that duplicates `tensor`.
     *
     * The new tensor is added with the dims and data type of `tensor`; its
     * data is then allocated and filled by copying from `tensor`.
     *
     * @param tensor Tensor whose dims, data type and contents are duplicated.
     * @return The newly added tensor.
     */
    Tensor cloneTensor(const Tensor &tensor) {
        Tensor cloned = addTensor(tensor->getDims(), tensor->getDType());
        // Allocation must precede the copy: copyData writes into the blob.
        cloned->dataMalloc();
        cloned->copyData(tensor);
        return cloned;
    }
    /**
     * @brief Add an operator and create its outputs. Output tensor arguments
--- a/include/core/operator.h
+++ b/include/core/operator.h
@ -138,6 +138,7 @@ class OperatorObj : public Object {
        : type(opType), inputs(inputs), outputs(outputs) {}
    virtual optional<vector<Shape>>
    inferShape(const TensorVec &inputs) const = 0;
    virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
    /**
     * @brief Constructs outputs (if required) and checks whether the operator is
     * valid.
@ -180,6 +181,7 @@ class OperatorObj : public Object {
  protected:
    optional<vector<Shape>> inferShape() const;
    vector<DataType> inferDataType() const;
  private:
    /**
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -24,7 +24,7 @@ class TensorObj : public TensorBaseObj {
    size_t getOffset(const Shape &ds) const;
    using TensorBaseObj::getData;
    VType getData(const Shape &pos) const;
-    void dataMalloc(const Runtime &runtime);
+    void dataMalloc();
    template <typename T> void copyData(const T *dptr) {
        IT_ASSERT(DataType::get<T>() == dtype);
@ -45,7 +45,8 @@ class TensorObj : public TensorBaseObj {
        copyData(dataVector.data());
    }
-    void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
+    void copyData(const TensorObj *src);
    // Convenience overload: forwards the pointer held by `src` to
    // copyData(const TensorObj *).
    void copyData(const Tensor &src) { copyData(src.get()); }
    void setData(
        const std::function<void(void *, size_t, DataType)> &generator) const {
        generator(data->getPtr<void *>(), size(), dtype);
@ -54,11 +55,33 @@ class TensorObj : public TensorBaseObj {
    void printData() const;
    bool equalData(const Tensor &rhs) const;
    template <typename T> bool equalData(const vector<T> &dataVector) {
        IT_ASSERT(DataType::get<T>() == dtype);
        IT_ASSERT(size() == dataVector.size());
        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
    }
  private:
    void printDataFloat() const;
    void printDataUint32_t() const;
-    template <typename T> bool equalDataInt(const Tensor &rhs) const;
+
-    template <typename T> bool equalDataFloat(const Tensor &rhs) const;
+    template <typename T>
    // Element-wise comparison of two buffers holding `size` values of type T.
    //
    // Integral types must match exactly. Floating-point types match when the
    // values are identical or their relative error is at most 1e-6; the first
    // mismatch is printed. Any other T is rejected at compile time.
    //
    // Returns true when every element matches, false at the first mismatch.
    bool equalDataImpl(const T *a, const T *b, size_t size) const {
        for (size_t i = 0; i < size; ++i) {
            if constexpr (std::is_integral_v<T>) {
                if (a[i] != b[i])
                    return false;
            } else if constexpr (std::is_floating_point_v<T>) {
                // Identical values (matching zeros or infinities included)
                // are equal; testing this first avoids a 0/0 relative error.
                if (a[i] == b[i])
                    continue;
                const auto relErr =
                    fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i]));
                // Negated form so that a NaN error (NaN element, or infinity
                // against a finite value) is reported as a mismatch.
                if (!(relErr <= 1e-6)) {
                    // %zu is the portable conversion for size_t.
                    printf("Error on %zu: %f %f\n", i, a[i], b[i]);
                    return false;
                }
            } else
                static_assert(!sizeof(T), "Unsupported data type");
        }
        return true;
    }
    // void setDims(const Dim &dms) { dims = dms; }
    //     bool dataRand(int seed = 0) {
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@ -32,8 +32,10 @@ class TensorBaseObj : public Object {
        IT_ASSERT(data == nullptr);
        data = blob;
    }
-    Blob getDataPtr() const { return data; }
+    Blob getDataBlob() const { return data; }
-    template <typename T> T getDataRawPtr() const {
+    template <typename T> T getRawDataPtr() const {
        // Returns the pointer held by the data blob, requested as type T.
        // T must itself be a pointer type (e.g. float *); checked at compile
        // time.
        static_assert(std::is_pointer_v<T>,
                      "Raw data pointer has a type of pointer");
        // The tensor must already have a blob attached.
        IT_ASSERT(data != nullptr);
        return data->getPtr<T>();
    }
--- a/include/cuda/cuda_utility.h
+++ b/include/cuda/cuda_utility.h
@ -5,7 +5,7 @@ namespace infini {
 void cudaPrintFloat(float *x, int len);
 void cudaPrintTensor(const Tensor &tensor) {
-    cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
+    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
 }
 } // namespace infini
--- a/include/operators/conv.h
+++ b/include/operators/conv.h
@ -36,7 +36,7 @@ class ConvObj : public OperatorObj {
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
-    int numInputs() const override { return 3; }
+    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
    /**
     * @brief Return the bias tensor, if one is stored as the third input.
     *
     * The operator may hold only two inputs (input and weight), so indexing
     * `inputs[2]` unconditionally would read past the end of the vector.
     *
     * @return The third input when present, otherwise a null tensor.
     */
    Tensor getBias() const {
        if (inputs.size() > 2)
            return inputs[2];
        return nullptr;
    }
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@ -33,7 +33,7 @@ class MatmulObj : public OperatorObj {
    std::string toString() const override;
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
-    int numInputs() const override { return 3; }
+    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
    /**
     * @brief Return the bias tensor, if one is stored as the third input.
     *
     * The operator may hold only two inputs (A and B), so indexing
     * `inputs[2]` unconditionally would read past the end of the vector.
     *
     * @return The third input when present, otherwise a null tensor.
     */
    Tensor getBias() const {
        if (inputs.size() > 2)
            return inputs[2];
        return nullptr;
    }
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@ -6,6 +6,10 @@ void GraphObj::updateConnection() { IT_TODO_HALT(); }
 string GraphObj::toString() const {
    std::ostringstream oss;
    oss << "Graph Tensors:\n";
    for (const auto &tensor : tensors)
        oss << tensor << "\n";
    oss << "Graph operators:\n";
    for (const auto &op : ops)
        oss << op << "\n";
@ -14,7 +18,7 @@ string GraphObj::toString() const {
 void GraphObj::dataMalloc() {
    for (auto &tensor : tensors) {
-        tensor->dataMalloc(runtime);
+        tensor->dataMalloc();
    }
 }
--- a/src/core/operator.cc
+++ b/src/core/operator.cc
@ -57,9 +57,10 @@ bool OperatorObj::checkValid(GraphObj *graph) {
    if (shapes.size() != outputs.size())
        return false;
    if (graph) { // if graph != nullptr, outputs should be created
        auto dataTypes = inferDataType();
        for (size_t i = 0; i < outputs.size(); i++) {
            IT_ASSERT(!outputs[i]);
-            outputs[i] = graph->addTensor(shapes[i]);
+            outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
        }
    } else { // if graph is nullptr, check that outputs match inferred shapes
        for (size_t i = 0; i < shapes.size(); ++i) {
@ -74,4 +75,15 @@ optional<vector<Shape>> OperatorObj::inferShape() const {
    return inferShape(inputs);
 }
 // Default data-type inference: every input must share one data type, and
 // each of the numOutputs() outputs is given that same type.
 //
 // Asserts that `inputs` is non-empty (the type is taken from its first
 // element) and that all inputs agree on the data type.
 vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
    // Guard the inputs[0] access below against an empty input list.
    IT_ASSERT(!inputs.empty());
    auto dataType = inputs[0]->getDType();
    for (const auto &tensor : inputs)
        IT_ASSERT(dataType == tensor->getDType());
    return vector(numOutputs(), dataType);
 }
 // Convenience overload: infers the output data types from this operator's
 // own `inputs`.
 vector<DataType> OperatorObj::inferDataType() const {
    return inferDataType(inputs);
 }
 } // namespace infini
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@ -116,8 +116,8 @@ Blob RuntimeObj::allocBlob(size_t size) {
 }
 void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
-    void *dstPtr = dst->getDataRawPtr<void *>();
+    void *dstPtr = dst->getRawDataPtr<void *>();
-    void *srcPtr = src->getDataRawPtr<void *>();
+    void *srcPtr = src->getRawDataPtr<void *>();
    size_t bytes = dst->getBytes();
    auto dstRuntime = dst->getRuntime();
    auto srcRuntime = src->getRuntime();
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@ -11,7 +11,9 @@ VType TensorObj::getData(const Shape &pos) const {
    return getData(getOffset(pos));
 }
-string TensorObj::toString() const { return "Tensor " + std::to_string(guid); }
+string TensorObj::toString() const {
    // Renders as "Tensor <guid> shape <shape>", with the shape formatted by
    // vecToString.
    return "Tensor " + std::to_string(guid) + " shape " + vecToString(shape);
 }
 size_t TensorObj::getOffset(const Shape &pos) const {
    auto nDim = pos.size();
@ -103,50 +105,28 @@ void TensorObj::printDataUint32_t() const {
    }
 }
 template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
    auto ptr = data->getPtr<uint32_t *>();
    auto ptrRhs = rhs->data->getPtr<uint32_t *>();
    if (shape != rhs->getDims())
        return false;
    size_t sz = size();
    for (size_t i = 0; i < sz; ++i)
        if (ptr[i] != ptrRhs[i])
            return false;
    return true;
 }
 template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
    IT_ASSERT(data != nullptr);
    IT_ASSERT(rhs->data != nullptr);
    // TODO: deal with data type
    auto ptr = data->getPtr<T *>();
    auto ptrRhs = rhs->data->getPtr<T *>();
    if (shape != rhs->getDims())
        return false;
    size_t sz = size();
    for (size_t i = 0; i < sz; ++i)
        if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
            1e-6) {
            printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
            return false;
        }
    return true;
 }
 bool TensorObj::equalData(const Tensor &rhs) const {
    IT_ASSERT(data != nullptr);
    IT_ASSERT(rhs->data != nullptr);
    IT_ASSERT(getDType() == rhs->getDType());
    IT_ASSERT(runtime->isCpu());
    IT_ASSERT(rhs->getRuntime()->isCpu());
    if (shape != rhs->getDims())
        return false;
    if (getDType() == DataType::UInt32)
-        return equalDataInt<uint32_t>(rhs);
+        return equalDataImpl(getRawDataPtr<uint32_t *>(),
                             rhs->getRawDataPtr<uint32_t *>(), size());
    else if (getDType() == DataType::Float32)
-        return equalDataInt<float>(rhs);
+        return equalDataImpl(getRawDataPtr<float *>(),
                             rhs->getRawDataPtr<float *>(), size());
    else
        IT_TODO_HALT();
 }
-void TensorObj::dataMalloc(const Runtime &runtime) {
+void TensorObj::dataMalloc() {
-    IT_ASSERT(data == nullptr);
+    if (data != nullptr)
        return;
    // IT_ASSERT(data == nullptr);
    size_t bytesPerElement;
    if (getDType() == DataType::Float32)
        bytesPerElement = sizeof(float);
@ -155,4 +135,10 @@ void TensorObj::dataMalloc(const Runtime &runtime) {
    data = runtime->allocBlob(size() * bytesPerElement);
 }
 // Copy the contents of `src` into this tensor through the runtime's
 // copyBlob.
 //
 // Asserts that `src` is non-null and that both tensors have the same data
 // type and the same number of elements.
 void TensorObj::copyData(const TensorObj *src) {
    // The Tensor overload forwards src.get(), which may be null; fail with
    // an assertion rather than dereferencing it below.
    IT_ASSERT(src != nullptr);
    IT_ASSERT(dtype == src->getDType());
    IT_ASSERT(size() == src->size());
    runtime->copyBlob(this, src);
 }
 }; // namespace infini
--- a/src/kernels/cpu/conv.cc
+++ b/src/kernels/cpu/conv.cc
@ -7,9 +7,9 @@ template <typename T> class NaiveConv : public Kernel {
    void compute(const Operator &_op, const PerfRecord &record,
                 const RuntimeObj *context) const override {
        auto op = as<ConvObj>(_op);
-        T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
+        T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
-        T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
+        T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
-        T *optr = op->getOutput()->getDataRawPtr<T *>();
+        T *optr = op->getOutput()->getRawDataPtr<T *>();
        auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        int cpg = op->getChannelPerGroup();
--- a/src/kernels/cpu/matmul.cc
+++ b/src/kernels/cpu/matmul.cc
@ -7,9 +7,10 @@ template <typename T> class NaiveMatmul : public Kernel {
    void compute(const Operator &_op, const PerfRecord &record,
                 const RuntimeObj *context) const override {
        auto op = as<MatmulObj>(_op);
-        T *A = op->getInputs(0)->getDataRawPtr<T *>();
+        IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
-        T *B = op->getInputs(1)->getDataRawPtr<T *>();
+        T *A = op->getInputs(0)->getRawDataPtr<T *>();
-        T *C = op->getOutput()->getDataRawPtr<T *>();
+        T *B = op->getInputs(1)->getRawDataPtr<T *>();
        T *C = op->getOutput()->getRawDataPtr<T *>();
        IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
        IT_ASSERT(op->getAct() == ActType::None);
        IT_ASSERT(op->getB() == 1);
--- a/src/kernels/cuda/conv.cc
+++ b/src/kernels/cuda/conv.cc
@ -26,12 +26,12 @@ class convCudnn : public Kernel {
    bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
                      const CudaRuntimeObj *context) const {
        cudnnStatus_t stat;
-        void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
+        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
-        void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
+        void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
-        if (op->getInputs(2) != nullptr)
+        if (op->getInputs().size() > 2) // Bias is not supported yet
            IT_TODO_HALT();
-        // void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
+        // void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
-        void *const outData = (op->getOutput()->getDataRawPtr<void *>());
+        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
--- a/src/operators/conv.cc
+++ b/src/operators/conv.cc
@ -3,20 +3,19 @@
 namespace infini {
 ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
-                 int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
+                 int ph, int pw, int sh, int sw, int dh, int dw,
-                 ActType act)
+                 [[maybe_unused]] Tensor bias, ActType act)
-    : OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph),
+    : OperatorObj(OpType::Conv, {input, weight}, {output}), ph(ph), pw(pw),
-      pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act),
+      sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(PaddingMode::Other) {
      padding(PaddingMode::Other) {
    setAuxilaryAttributes(PaddingMode::Other);
    IT_ASSERT(checkValid(graph));
 }
 ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
-                 PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
+                 PaddingMode mode, int sh, int sw, int dh, int dw,
-                 ActType act)
+                 [[maybe_unused]] Tensor bias, ActType act)
-    : OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1),
+    : OperatorObj(OpType::Conv, {input, weight}, {output}), ph(-1), pw(-1),
-      pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
+      sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
    IT_ASSERT(mode != PaddingMode::Other);
    setAuxilaryAttributes(mode);
    IT_ASSERT(checkValid(graph));
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@ -3,9 +3,9 @@
 namespace infini {
 MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
-                     bool transB, Tensor bias, ActType act)
+                     bool transB, [[maybe_unused]] Tensor bias, ActType act)
-    : OperatorObj(OpType::Matmul, {A, B, bias}, {C}), transA(transA),
+    : OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
-      transB(transB), act(act), b(A->getDims()[0]),
+      act(act), b(A->getDims()[0]),
      m(transA ? A->getDims()[2] : A->getDims()[1]),
      n(transB ? B->getDims()[1] : B->getDims()[2]),
      k(transA ? A->getDims()[1] : A->getDims()[2]) {
--- a/test/core/test_graph.cc
+++ b/test/core/test_graph.cc
@ -19,7 +19,7 @@ TEST(Graph, build_and_run) {
    runtime->run(g);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
-    ans->dataMalloc(runtime);
+    ans->dataMalloc();
    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(o0->equalData(ans));
 }
@ -41,7 +41,7 @@ TEST(Graph, perf_engine) {
    EXPECT_LT(perfTime, 0.01);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
-    ans->dataMalloc(runtime);
+    ans->dataMalloc();
    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(matmul->getOutput()->equalData(ans));
 }
--- a/test/operators/test_conv.cc
+++ b/test/operators/test_conv.cc
@ -60,7 +60,7 @@ TEST(Conv, NaiveCPU) {
    // check answer
    auto ans =
        make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
-    ans->dataMalloc(runtime);
+    ans->dataMalloc();
    ans->copyData(
        vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
    EXPECT_TRUE(conv->getOutput()->equalData(ans));
@ -69,52 +69,35 @@ TEST(Conv, NaiveCPU) {
 void testConvCudnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {
-    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    // Construct Runtime and graph for CPU and CUDA
-    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cuda);
    // Set input data on CPU in a CPU Graph
    Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);
    // Copy input tensors from CPU to CUDA
    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
    // Build CUDA graph
-    Graph g = make_ref<GraphObj>(cudaRuntime);
+    auto conv =
-    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
+        gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
    // allocate CUDA memory
-    g->dataMalloc();
+    gCuda->dataMalloc();
    // Build input and output data on CPU
    auto cpui0 =
        make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
    cpui0->dataMalloc(cpuRuntime);
    cpui0->setData(generator);
    auto cpuw0 =
        make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
    cpuw0->dataMalloc(cpuRuntime);
    cpuw0->setData(generator);
    auto ans =
        make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
    ans->dataMalloc(cpuRuntime);
    ans->copyData(ansVec);
    // Copy inputs from CPU to CUDA
    i0->copyData(cpui0);
    w0->copyData(cpuw0);
    // Execute on CUDA
-    cudaRuntime->run(g);
+    cuda->run(gCuda);
-    // double perfTime = cudaRuntime->getPerfTime(g);
+    // copy output from CUDA to CPU
-    // // The example Conv takes 0.015ms with one core
+    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    // EXPECT_GT(perfTime, 0);
    // EXPECT_LT(perfTime, 0.1);
    // copy CUDA output to CPU
    auto o0 = conv->getOutput();
    auto cpuo0 =
        make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
    cpuo0->dataMalloc(cpuRuntime);
    cpuo0->copyData(o0);
    // check results on CPU
-    EXPECT_TRUE(cpuo0->equalData(ans));
+    EXPECT_TRUE(o0Cpu->equalData(ansVec));
    // print a tensor/operator/graph by print()
    gCuda->print();
 }
 TEST(Conv, cuDNN) {