ADD:pad/slice operator and cuda kernel. (#39)

fix compile error refector clang format split test. fix compile error. ADD slice cuda kernel. ADD slice operator. ADD:pad operator and cuda kernel.
2022-09-29 10:29:24 +08:00 · 2022-09-29 10:29:24 +08:00 · 5560d0f2fb
parent 1aefc1b27e
commit 5560d0f2fb
13 changed files with 451 additions and 1 deletions
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -20,7 +20,7 @@ class TensorObj : public TensorBaseObj {
    size_t getBytes() const;

    Shape getDims() const { return shape; }
-
+    vector<size_t> getStride() const;
    size_t getOffset(const Shape &ds) const;
    using TensorBaseObj::getData;
    VType getData(const Shape &pos) const;
--- a/include/cuda/cuda_pad_slice.h
+++ b/include/cuda/cuda_pad_slice.h
@ -0,0 +1,19 @@
+#pragma once
+
+const int MAX_DIM = 4;
+
+// Pad operator acts like padding small(part) tensor into a big(whole) tensor.
+// Slice operator acts like spling a big(whole) tensor into a small(part)
+// tensor.
+typedef struct {
+    int begNum[MAX_DIM];     // pad or slice number at beginning
+    int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
+    int partNDim[MAX_DIM];   // dim size before padding or after slicing
+    int partStride[MAX_DIM]; // stride before padding or after slicing
+} TransMetaData;
+
+namespace infini {
+void pad_slice_kernel(float *partData, float *wholeData,
+                      const TransMetaData &metadata, int nDims, int num,
+                      bool isPad);
+} // namespace infini
--- a/include/operators/pad.h
+++ b/include/operators/pad.h
@ -0,0 +1,24 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class PadObj : public OperatorObj {
+    // the number of start and end pad values for all dims.
+    vector<int> pads;
+
+  public:
+    // pad for appointed axises,if axis is empty,then pad for all axises.
+    PadObj(GraphObj *graph, Tensor input, Tensor output,
+           const vector<int> &pads, const optional<const vector<int>> &axis);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    Shape PadObj::getPads() const { return pads; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
--- a/include/operators/slice.h
+++ b/include/operators/slice.h
@ -0,0 +1,24 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+class SliceObj : public OperatorObj {
+    vector<int> starts, ends; // the start no. and end no. for all dims.
+
+  public:
+    SliceObj(GraphObj *graph, Tensor input, Tensor output,
+             const vector<int> &starts, const vector<int> &ends,
+             const optional<vector<int>> &axis,
+             const optional<vector<int>> &steps);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    Shape getStart() const { return starts; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@ -30,6 +30,17 @@ size_t TensorObj::getOffset(const Shape &pos) const {
    return idx;
 }

+vector<size_t> TensorObj::getStride() const {
+    vector<size_t> ret;
+    size_t stride = 1;
+    for (int i = shape.size() - 1; i >= 1; i--) {
+        ret.emplace(ret.begin(), stride);
+        stride *= shape.at(i);
+    }
+    ret.emplace(ret.begin(), stride);
+    return ret;
+}
+
 size_t TensorObj::size() const {
    size_t ret = 1;
    for (const auto &d : shape)
--- a/src/kernels/cuda/pad_slice.cc
+++ b/src/kernels/cuda/pad_slice.cc
@ -0,0 +1,45 @@
+#include "cuda/cuda_kernel_wihtout_config.h"
+#include "cuda/cuda_pad_slice.h"
+#include "operators/pad.h"
+#include "operators/slice.h"
+namespace infini {
+class PadSliceCudaCompute {
+  public:
+    void do_compute(Tensor partTensor, Tensor wholeTensor, const Shape &begNos,
+                    bool isPad) const {
+        int nDims = partTensor->getDims().size();
+        IT_ASSERT(MAX_DIM >= nDims);
+        TransMetaData metadata;
+        for (int i = 0; i < nDims; i++) {
+            metadata.begNum[i] = begNos[i];
+            metadata.wholeNDim[i] = wholeTensor->getDims()[i];
+            metadata.partNDim[i] = partTensor->getDims()[i];
+            metadata.partStride[i] = partTensor->getStride()[i];
+        }
+        pad_slice_kernel(partTensor->getRawDataPtr<float *>(),
+                         wholeTensor->getRawDataPtr<float *>(), metadata, nDims,
+                         wholeTensor->size(), isPad);
+    }
+};
+
+class PadCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
+    void compute(const Operator &op,
+                 const RuntimeObj *_context) const override {
+        do_compute(op->getInputs(0), op->getOutput(), as<PadObj>(op)->getPads(),
+                   true);
+    }
+};
+
+class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
+    void compute(const Operator &op,
+                 const RuntimeObj *_context) const override {
+        do_compute(op->getOutput(), op->getInputs(0),
+                   as<SliceObj>(op)->getStart(), false);
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Slice, DataType::Float32, SliceCuda,
+                "Slice__CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::Pad, DataType::Float32, PadCuda,
+                "Pad__CUDA_Float32");
+} // namespace infini
--- a/src/kernels/cuda/pad_slice.cu
+++ b/src/kernels/cuda/pad_slice.cu
@ -0,0 +1,52 @@
+#include "cuda/cuda_common.h"
+#include "cuda/cuda_pad_slice.h"
+
+__device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
+                                                  TransMetaData metaData,
+                                                  int nDims) {
+    int offset = 0;
+    for (int i = nDims - 1; i >= 0; --i) {
+        auto wholePos = wholeOffset % metaData.wholeNDim[i];
+        auto pos = wholePos - metaData.begNum[i];
+        // if pos belongs to pad range, then return -1
+        if (pos < 0 || pos >= metaData.partNDim[i])
+            return -1;
+        wholeOffset = wholeOffset / metaData.wholeNDim[i];
+
+        offset += pos * metaData.partStride[i];
+    }
+
+    return offset;
+}
+
+__global__ void _pad_slice_kernel(float *part, float *whole,
+                                  TransMetaData metaData, int nDims, int num,
+                                  bool isPad) {
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if (tid >= num)
+        return;
+
+    int stride = blockDim.x * gridDim.x;
+    while (tid < num) {
+        int offset = WholeTensorOffset2PartTensorOffset(tid, metaData, nDims);
+        if (isPad)
+            if (offset < 0)
+                whole[tid] = 0;
+            else
+                whole[tid] = part[offset];
+        else
+            part[offset] = whole[tid];
+        tid += stride;
+    }
+}
+
+namespace infini {
+void pad_slice_kernel(float *partData, float *wholeData,
+                      const TransMetaData &metadata, int nDims, int num,
+                      bool isPad) {
+    int blockSize = 32 * 16;
+    int gridSize = (num + blockSize - 1) / blockSize;
+    _pad_slice_kernel<<<gridSize, blockSize>>>(partData, wholeData, metadata,
+                                               nDims, num, isPad);
+}
+} // namespace infini
--- a/src/operators/pad.cc
+++ b/src/operators/pad.cc
@ -0,0 +1,63 @@
+#include "operators/pad.h"
+
+namespace infini {
+PadObj::PadObj(GraphObj *graph, Tensor input, Tensor output,
+               const vector<int> &_pads,
+               const optional<const vector<int>> &axis)
+    : OperatorObj(OpType::Pad, {input}, {output}) {
+    if (axis == std::nullopt)
+        pads = _pads;
+    else {
+        int nAxis = (*axis).size();
+        IT_ASSERT((int)_pads.size() == nAxis * 2);
+        int nDims = input->getDims().size();
+        vector<int> tmp(nDims * 2, 0);
+
+        for (int i = 0; i < nAxis; ++i) {
+            tmp[(*axis)[i]] = _pads[i];
+            tmp[(*axis)[i] + nDims] = _pads[i + nAxis];
+        }
+        pads = tmp;
+    }
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> PadObj::inferShape(const TensorVec &inputs) const {
+    auto dims = inputs[0]->getDims();
+    int nDims = dims.size();
+    if (nDims * 2 != (int)pads.size())
+        return {};
+    for (int i = 0; i < nDims; ++i) {
+        if (pads[i] < 0 || pads[i + nDims] < 0)
+            return {};
+        dims[i] += pads[i] + pads[i + nDims];
+    }
+
+    return {{dims}};
+}
+std::string PadObj::toString() const {
+    std::ostringstream os;
+    os << "Pad"
+       << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "pads=" << vecToString(pads) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> PadObj::getWorkloadVector() const {
+    vector<int> ret = inputs[0]->getDims();
+    ret.insert(ret.end(), pads.begin(), pads.end());
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+vector<int> PadObj::getOpAttrVector() const {
+    vector<int> ret = pads;
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+} // namespace infini
--- a/src/operators/slice.cc
+++ b/src/operators/slice.cc
@ -0,0 +1,80 @@
+#include "operators/slice.h"
+
+namespace infini {
+SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output,
+                   const vector<int> &starts, const vector<int> &ends,
+                   const optional<vector<int>> &axis,
+                   const optional<vector<int>> &steps)
+    : OperatorObj(OpType::Slice, {input}, {output}) {
+    if (steps != std::nullopt)
+        IT_TODO_HALT();
+    IT_ASSERT(starts.size() == ends.size());
+
+    if (axis == std::nullopt) {
+        this->starts = starts;
+        this->ends = ends;
+    } else {
+        int nAxis = (*axis).size();
+        IT_ASSERT((int)starts.size() == nAxis);
+
+        int nDims = input->getDims().size();
+        vector<int> tmpS(nDims, 0), tmpE;
+        for (int i = 0; i < nDims; ++i) {
+            tmpE.emplace_back(input->getDims()[i] - 1);
+        }
+
+        for (int i = 0; i < nAxis; ++i) {
+            if ((*axis)[i] < 0)
+                IT_TODO_HALT();
+            tmpS[(*axis)[i]] = starts[i];
+            tmpE[(*axis)[i]] = ends[i];
+        }
+        this->starts = tmpS;
+        this->ends = tmpE;
+    }
+    IT_ASSERT(checkValid(graph));
+}
+
+optional<vector<Shape>> SliceObj::inferShape(const TensorVec &inputs) const {
+    auto dims = inputs[0]->getDims();
+    int nDims = dims.size();
+    if (nDims != (int)starts.size())
+        return {};
+    for (int i = 0; i < nDims; ++i) {
+        if (starts[i] < 0 || ends[i] >= dims[i] || starts[i] > ends[i])
+            return {};
+        dims[i] = ends[i] - starts[i] + 1;
+    }
+
+    return {{dims}};
+}
+
+std::string SliceObj::toString() const {
+    std::ostringstream os;
+    os << "Slice"
+       << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "starts=" << vecToString(starts) << ",";
+    os << "ends=" << vecToString(ends) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> SliceObj::getWorkloadVector() const {
+    vector<int> ret = inputs[0]->getDims();
+    ret.insert(ret.end(), starts.begin(), starts.end());
+    ret.insert(ret.end(), ends.begin(), ends.end());
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+vector<int> SliceObj::getOpAttrVector() const {
+    vector<int> ret = starts;
+    ret.insert(ret.end(), ends.begin(), ends.end());
+    ret.emplace(ret.begin(), enum_to_underlying(type));
+    return ret;
+}
+
+} // namespace infini
--- a/test/kernels/cuda/test_cuda_pad.cc
+++ b/test/kernels/cuda/test_cuda_pad.cc
@ -0,0 +1,41 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/pad.h"
+#include "test.h"
+
+namespace infini {
+TEST(Pad, Cuda) {
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor icpu =
+        make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
+    icpu->dataMalloc();
+    icpu->setData(IncrementalGenerator());
+
+    // Build CUDA graph;
+    Graph g = make_ref<GraphObj>(cudaRuntime);
+    auto i = g->cloneTensor(icpu);
+    auto op = g->addOp<PadObj>(i, nullptr, vector<int>{1, 0, 1, 1},
+                               vector<int>{0, 3});
+
+    // allocate CUDA memory
+    g->dataMalloc();
+
+    // Execute on CUDA
+    cudaRuntime->run(g);
+
+    // clone CUDA output to CPU
+    auto o = op->getOutput();
+    auto cpuo = o->clone(cpuRuntime);
+    // cudaPrintTensor(o);
+    //  check results on CPU
+    EXPECT_TRUE(cpuo->equalData(
+        vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,
+                      0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0}));
+}
+} // namespace infini
--- a/test/kernels/cuda/test_cuda_slice.cc
+++ b/test/kernels/cuda/test_cuda_slice.cc
@ -0,0 +1,39 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/slice.h"
+#include "test.h"
+
+namespace infini {
+TEST(CUDA_Slice, run) {
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor icpu =
+        make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, cpuRuntime);
+    icpu->dataMalloc();
+    icpu->setData(IncrementalGenerator());
+
+    // Build CUDA graph;
+    Graph g = make_ref<GraphObj>(cudaRuntime);
+    auto i = g->cloneTensor(icpu);
+    auto op =
+        g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{1, 4},
+                           vector<int>{0, 3}, std::nullopt);
+
+    // allocate CUDA memory
+    g->dataMalloc();
+
+    // Execute on CUDA
+    cudaRuntime->run(g);
+
+    // clone CUDA output to CPU
+    auto o = op->getOutput();
+    auto cpuo = o->clone(cpuRuntime);
+    // cudaPrintTensor(o);
+    //  check results on CPU
+    EXPECT_TRUE(cpuo->equalData(vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
+}
+} // namespace infini
--- a/test/operators/test_pad.cc
+++ b/test/operators/test_pad.cc
@ -0,0 +1,25 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/pad.h"
+#include "test.h"
+
+namespace infini {
+TEST(Pad, ShapeInference) {
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(cpuRuntime);
+        Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
+        auto op = g->addOp<PadObj>(
+            i, nullptr, vector<int>{2, 10, 1, 5, 0, 10, 1, 5}, std::nullopt);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(cpuRuntime);
+        Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
+        auto op = g->addOp<PadObj>(i, nullptr, vector<int>{2, 10, 1, 5},
+                                   vector<int>{0, 3});
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{4, 64, 162, 177}));
+    }
+}
+
+} // namespace infini
--- a/test/operators/test_slice.cc
+++ b/test/operators/test_slice.cc
@ -0,0 +1,27 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/slice.h"
+#include "test.h"
+
+namespace infini {
+TEST(Slice, ShapeInference) {
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(cpuRuntime);
+        Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32);
+        auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 10, 1, 5},
+                                     vector<int>{3, 10, 100, 100}, std::nullopt,
+                                     std::nullopt);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(cpuRuntime);
+        Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32);
+        auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 5},
+                                     vector<int>{3, 100}, vector<int>{1, 3},
+                                     std::nullopt);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 2, 162, 96}));
+    }
+}
+
+} // namespace infini