ADD:pad/slice operator and cuda kernel. (#39)

fix compile error, refactor, clang format, split test. fix compile error. ADD slice cuda kernel. ADD slice operator. ADD:pad operator and cuda kernel.
2022-09-29 10:29:24 +08:00 · 2022-09-29 10:29:24 +08:00 · 5560d0f2fb
parent 1aefc1b27e
commit 5560d0f2fb
13 changed files with 451 additions and 1 deletions
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -20,7 +20,7 @@ class TensorObj : public TensorBaseObj {
    size_t getBytes() const;
    Shape getDims() const { return shape; }
-
+    vector<size_t> getStride() const;
    size_t getOffset(const Shape &ds) const;
    using TensorBaseObj::getData;
    VType getData(const Shape &pos) const;
--- a/include/cuda/cuda_pad_slice.h
+++ b/include/cuda/cuda_pad_slice.h
@ -0,0 +1,19 @@
 #pragma once
 const int MAX_DIM = 4;
 // Pad operator acts like padding a small (part) tensor into a big (whole)
 // tensor. Slice operator acts like slicing a big (whole) tensor into a small
 // (part) tensor.
 // Per-dimension geometry passed by value to the pad/slice kernel. Every array
 // has MAX_DIM slots; the host-side caller populates entries [0, nDims) and the
 // device code reads only those entries.
 typedef struct {
    int begNum[MAX_DIM];     // pad or slice number at beginning
    int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
    int partNDim[MAX_DIM];   // dim size before padding or after slicing
    int partStride[MAX_DIM]; // stride before padding or after slicing
 } TransMetaData;
 namespace infini {
 // Host-side launcher of the kernel shared by Pad and Slice.
 //   partData  - buffer of the small (part) tensor
 //   wholeData - buffer of the big (whole) tensor
 //   metadata  - per-dimension geometry, see TransMetaData
 //   nDims     - number of valid entries in each metadata array
 //   num       - number of elements of the whole tensor to visit
 //   isPad     - true: write the whole tensor from the part tensor;
 //               false: write the part tensor from the whole tensor
 // NOTE(review): both pointers are forwarded to a <<<...>>> launch, so they
 // are presumably device pointers - confirm against the runtime allocator.
 void pad_slice_kernel(float *partData, float *wholeData,
                       const TransMetaData &metadata, int nDims, int num,
                       bool isPad);
 } // namespace infini
--- a/include/operators/pad.h
+++ b/include/operators/pad.h
@ -0,0 +1,24 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 // Pad operator: produces an output whose every dimension is the input
 // dimension plus a begin pad and an end pad.
 class PadObj : public OperatorObj {
    // Begin pads for all dims followed by end pads for all dims, i.e.
    // pads[i] is the begin pad and pads[i + rank] the end pad of dim i.
    vector<int> pads;
  public:
    // Pads the listed axes only; if axis is empty (std::nullopt), pads is
    // taken as-is and must describe all axes.
    PadObj(GraphObj *graph, Tensor input, Tensor output,
           const vector<int> &pads, const optional<const vector<int>> &axis);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    // Returns the stored pads (begin values first, then end values).
    // Declared without a `PadObj::` qualifier: a qualified name on an
    // in-class member declaration is ill-formed and rejected by GCC/Clang.
    Shape getPads() const { return pads; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/include/operators/slice.h
+++ b/include/operators/slice.h
@ -0,0 +1,24 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 // Slice operator: keeps, for every dimension, the index range
 // [starts[i], ends[i]] of the input.
 class SliceObj : public OperatorObj {
    // First kept index of every dimension.
    vector<int> starts;
    // Last kept index of every dimension.
    vector<int> ends;
  public:
    // starts/ends apply to the dimensions named in axis; when axis is
    // std::nullopt they must cover all dimensions. steps is not supported yet.
    SliceObj(GraphObj *graph, Tensor input, Tensor output,
             const vector<int> &starts, const vector<int> &ends,
             const optional<vector<int>> &axis,
             const optional<vector<int>> &steps);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    // Returns the per-dimension start indices.
    Shape getStart() const { return starts; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@ -30,6 +30,17 @@ size_t TensorObj::getOffset(const Shape &pos) const {
    return idx;
 }
 // Returns the element stride of every dimension for a densely packed
 // tensor: the last dimension has stride 1 and each earlier dimension's
 // stride is the product of all dimension sizes after it.
 // The result always has exactly shape.size() entries, so a rank-0 shape
 // yields an empty vector (previously a single 1 was returned).
 vector<size_t> TensorObj::getStride() const {
    const size_t rank = shape.size();
    vector<size_t> ret(rank, 1);
    // Fill back-to-front so each entry is written once; unsigned countdown
    // avoids the size_t -> int conversion of `shape.size() - 1`.
    for (size_t i = rank; i-- > 1;)
        ret[i - 1] = ret[i] * shape[i];
    return ret;
 }
 size_t TensorObj::size() const {
    size_t ret = 1;
    for (const auto &d : shape)
--- a/src/kernels/cuda/pad_slice.cc
+++ b/src/kernels/cuda/pad_slice.cc
@ -0,0 +1,45 @@
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_pad_slice.h"
 #include "operators/pad.h"
 #include "operators/slice.h"
 namespace infini {
 class PadSliceCudaCompute {
  public:
    void do_compute(Tensor partTensor, Tensor wholeTensor, const Shape &begNos,
                    bool isPad) const {
        int nDims = partTensor->getDims().size();
        IT_ASSERT(MAX_DIM >= nDims);
        TransMetaData metadata;
        for (int i = 0; i < nDims; i++) {
            metadata.begNum[i] = begNos[i];
            metadata.wholeNDim[i] = wholeTensor->getDims()[i];
            metadata.partNDim[i] = partTensor->getDims()[i];
            metadata.partStride[i] = partTensor->getStride()[i];
        }
        pad_slice_kernel(partTensor->getRawDataPtr<float *>(),
                         wholeTensor->getRawDataPtr<float *>(), metadata, nDims,
                         wholeTensor->size(), isPad);
    }
 };
 // CUDA kernel binding for Pad: the operator input is the part tensor, the
 // operator output is the whole tensor, and the pads supply the begin offsets.
 class PadCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
    void compute(const Operator &op,
                 const RuntimeObj *_context) const override {
        const auto padOp = as<PadObj>(op);
        do_compute(op->getInputs(0), op->getOutput(), padOp->getPads(),
                   /*isPad=*/true);
    }
 };
 // CUDA kernel binding for Slice: the operator output is the part tensor, the
 // operator input is the whole tensor, and the starts supply the begin offsets.
 class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
    void compute(const Operator &op,
                 const RuntimeObj *_context) const override {
        const auto sliceOp = as<SliceObj>(op);
        do_compute(op->getOutput(), op->getInputs(0), sliceOp->getStart(),
                   /*isPad=*/false);
    }
 };
 // Register the Float32 CUDA implementations of Slice and Pad under the
 // names given as the last macro argument.
 REGISTER_KERNEL(Device::CUDA, OpType::Slice, DataType::Float32, SliceCuda,
                "Slice__CUDA_Float32");
 REGISTER_KERNEL(Device::CUDA, OpType::Pad, DataType::Float32, PadCuda,
                "Pad__CUDA_Float32");
 } // namespace infini
--- a/src/kernels/cuda/pad_slice.cu
+++ b/src/kernels/cuda/pad_slice.cu
@ -0,0 +1,52 @@
 #include "cuda/cuda_common.h"
 #include "cuda/cuda_pad_slice.h"
 // Maps a flat element index of the whole tensor to the flat element index of
 // the part tensor. The index is decomposed dimension by dimension, starting
 // with the last one; returns -1 as soon as a coordinate falls outside the
 // part tensor's range in some dimension.
 __device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
                                                   TransMetaData metaData,
                                                   int nDims) {
    int partOffset = 0;
    int remaining = wholeOffset;
    int dim = nDims;
    while (dim-- > 0) {
        const int extent = metaData.wholeNDim[dim];
        // Coordinate relative to where the part begins inside the whole.
        const int coord = remaining % extent - metaData.begNum[dim];
        const bool inside = coord >= 0 && coord < metaData.partNDim[dim];
        if (!inside)
            return -1;
        partOffset += coord * metaData.partStride[dim];
        remaining /= extent;
    }
    return partOffset;
 }
 // Copies between the part and the whole tensor, one whole-tensor element per
 // loop iteration (grid-stride loop over [0, num)).
 //   isPad == true : whole[tid] receives the matching part element, or 0 when
 //                   the position lies in the padded border.
 //   isPad == false: the matching part element receives whole[tid]; positions
 //                   of the whole tensor outside the slice are skipped.
 // Launch layout: 1-D grid of 1-D blocks; any grid size is valid.
 __global__ void _pad_slice_kernel(float *part, float *whole,
                                   TransMetaData metaData, int nDims, int num,
                                   bool isPad) {
    const int stride = blockDim.x * gridDim.x;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < num;
         tid += stride) {
        const int offset =
            WholeTensorOffset2PartTensorOffset(tid, metaData, nDims);
        if (isPad) {
            whole[tid] = (offset < 0) ? 0.0f : part[offset];
        } else if (offset >= 0) {
            // offset is -1 for elements outside the slice; writing
            // part[-1] would be an out-of-bounds store.
            part[offset] = whole[tid];
        }
    }
 }
 namespace infini {
 // Launches _pad_slice_kernel with 512-thread blocks and enough blocks to
 // cover num elements of the whole tensor. The launch uses the default stream
 // and is not synchronised here.
 void pad_slice_kernel(float *partData, float *wholeData,
                       const TransMetaData &metadata, int nDims, int num,
                       bool isPad) {
    // An empty tensor needs no work, and a grid of zero blocks is an
    // invalid launch configuration.
    if (num <= 0)
        return;
    const int blockSize = 32 * 16;
    const int gridSize = (num + blockSize - 1) / blockSize;
    _pad_slice_kernel<<<gridSize, blockSize>>>(partData, wholeData, metadata,
                                               nDims, num, isPad);
 }
 } // namespace infini
--- a/src/operators/pad.cc
+++ b/src/operators/pad.cc
@ -0,0 +1,63 @@
 #include "operators/pad.h"
 namespace infini {
 // Builds a Pad operator.
 //   _pads - without axis: begin pads for all dims followed by end pads for
 //           all dims. With axis: begin pads for the listed axes followed by
 //           end pads for the listed axes (2 * axis->size() values).
 //   axis  - optional list of dimensions to pad; unlisted dimensions get 0.
 // Asserts that the pads/axis sizes agree, that every axis is a valid
 // dimension index of input, and that the operator is valid in graph.
 PadObj::PadObj(GraphObj *graph, Tensor input, Tensor output,
                const vector<int> &_pads,
                const optional<const vector<int>> &axis)
    : OperatorObj(OpType::Pad, {input}, {output}) {
    if (axis == std::nullopt)
        pads = _pads;
    else {
        const auto &axes = *axis;
        int nAxis = axes.size();
        IT_ASSERT((int)_pads.size() == nAxis * 2);
        int nDims = input->getDims().size();
        vector<int> tmp(nDims * 2, 0);
        for (int i = 0; i < nAxis; ++i) {
            // An axis outside [0, nDims) would index past the ends of tmp.
            IT_ASSERT(axes[i] >= 0 && axes[i] < nDims);
            tmp[axes[i]] = _pads[i];
            tmp[axes[i] + nDims] = _pads[i + nAxis];
        }
        pads = tmp;
    }
    IT_ASSERT(checkValid(graph));
 }
 // Computes the output shape: every input dimension grows by its begin pad
 // plus its end pad. Yields an empty optional when pads does not hold exactly
 // two values per dimension or when any pad is negative.
 optional<vector<Shape>> PadObj::inferShape(const TensorVec &inputs) const {
    auto outDims = inputs[0]->getDims();
    const int rank = outDims.size();
    if ((int)pads.size() != 2 * rank)
        return std::nullopt;
    for (int d = 0; d < rank; ++d) {
        const int before = pads[d];
        const int after = pads[d + rank];
        if (before < 0 || after < 0)
            return std::nullopt;
        outDims[d] += before + after;
    }
    return {{outDims}};
 }
 // Renders the operator as
 // Pad[guid](<input dims>,pads=<pads>,input=<guid>,output=<guid>).
 std::string PadObj::toString() const {
    std::ostringstream text;
    text << "Pad" << "[" << getGuid() << "]" << "(";
    text << vecToString(inputs[0]->getDims()) << ",";
    text << "pads=" << vecToString(pads) << ",";
    text << "input=" << inputs[0]->getGuid() << ",";
    text << "output=" << outputs[0]->getGuid() << ")";
    return text.str();
 }
 // Workload key: operator type, then the input dims, then the pads.
 vector<int> PadObj::getWorkloadVector() const {
    const auto inDims = inputs[0]->getDims();
    vector<int> key;
    key.reserve(1 + inDims.size() + pads.size());
    key.push_back(enum_to_underlying(type));
    key.insert(key.end(), inDims.begin(), inDims.end());
    key.insert(key.end(), pads.begin(), pads.end());
    return key;
 }
 // Attribute key: operator type followed by the pads.
 vector<int> PadObj::getOpAttrVector() const {
    vector<int> key;
    key.reserve(1 + pads.size());
    key.push_back(enum_to_underlying(type));
    key.insert(key.end(), pads.begin(), pads.end());
    return key;
 }
 } // namespace infini
--- a/src/operators/slice.cc
+++ b/src/operators/slice.cc
@ -0,0 +1,80 @@
 #include "operators/slice.h"
 namespace infini {
 // Builds a Slice operator.
 //   starts/ends - first and last kept index (ends is inclusive, see
 //                 inferShape) for the dimensions named in axis, or for all
 //                 dimensions when axis is std::nullopt.
 //   axis        - optional list of dimensions to slice; unlisted dimensions
 //                 keep their full range [0, dim - 1]. Negative axes are not
 //                 supported yet.
 //   steps       - not supported yet; must be std::nullopt.
 // Asserts matching starts/ends/axis sizes, that every axis is below the
 // input rank, and that the operator is valid in graph.
 SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output,
                    const vector<int> &starts, const vector<int> &ends,
                    const optional<vector<int>> &axis,
                    const optional<vector<int>> &steps)
    : OperatorObj(OpType::Slice, {input}, {output}) {
    if (steps != std::nullopt)
        IT_TODO_HALT();
    IT_ASSERT(starts.size() == ends.size());
    if (axis == std::nullopt) {
        this->starts = starts;
        this->ends = ends;
    } else {
        const auto &axes = *axis;
        int nAxis = axes.size();
        IT_ASSERT((int)starts.size() == nAxis);
        // getDims() returns by value; fetch it once.
        const auto inDims = input->getDims();
        int nDims = inDims.size();
        vector<int> tmpS(nDims, 0), tmpE;
        tmpE.reserve(nDims);
        for (int i = 0; i < nDims; ++i) {
            tmpE.emplace_back(inDims[i] - 1);
        }
        for (int i = 0; i < nAxis; ++i) {
            if (axes[i] < 0)
                IT_TODO_HALT();
            // An axis >= nDims would index past the end of tmpS/tmpE.
            IT_ASSERT(axes[i] < nDims);
            tmpS[axes[i]] = starts[i];
            tmpE[axes[i]] = ends[i];
        }
        this->starts = tmpS;
        this->ends = tmpE;
    }
    IT_ASSERT(checkValid(graph));
 }
 // Computes the output shape: dimension d becomes ends[d] - starts[d] + 1.
 // Yields an empty optional when starts does not have one entry per
 // dimension, or when a range is negative, reversed, or reaches past the
 // input dimension.
 optional<vector<Shape>> SliceObj::inferShape(const TensorVec &inputs) const {
    auto outDims = inputs[0]->getDims();
    const int rank = outDims.size();
    if ((int)starts.size() != rank)
        return std::nullopt;
    for (int d = 0; d < rank; ++d) {
        const int first = starts[d];
        const int last = ends[d];
        const bool valid = first >= 0 && last < outDims[d] && first <= last;
        if (!valid)
            return std::nullopt;
        outDims[d] = last - first + 1;
    }
    return {{outDims}};
 }
 // Renders the operator as
 // Slice[guid](<input dims>,starts=<..>,ends=<..>,input=<guid>,output=<guid>).
 std::string SliceObj::toString() const {
    std::ostringstream text;
    text << "Slice" << "[" << getGuid() << "]" << "(";
    text << vecToString(inputs[0]->getDims()) << ",";
    text << "starts=" << vecToString(starts) << ",";
    text << "ends=" << vecToString(ends) << ",";
    text << "input=" << inputs[0]->getGuid() << ",";
    text << "output=" << outputs[0]->getGuid() << ")";
    return text.str();
 }
 // Workload key: operator type, then the input dims, the starts and the ends.
 vector<int> SliceObj::getWorkloadVector() const {
    const auto inDims = inputs[0]->getDims();
    vector<int> key;
    key.reserve(1 + inDims.size() + starts.size() + ends.size());
    key.push_back(enum_to_underlying(type));
    key.insert(key.end(), inDims.begin(), inDims.end());
    key.insert(key.end(), starts.begin(), starts.end());
    key.insert(key.end(), ends.begin(), ends.end());
    return key;
 }
 // Attribute key: operator type followed by the starts and the ends.
 vector<int> SliceObj::getOpAttrVector() const {
    vector<int> key;
    key.reserve(1 + starts.size() + ends.size());
    key.push_back(enum_to_underlying(type));
    key.insert(key.end(), starts.begin(), starts.end());
    key.insert(key.end(), ends.begin(), ends.end());
    return key;
 }
 } // namespace infini
--- a/test/kernels/cuda/test_cuda_pad.cc
+++ b/test/kernels/cuda/test_cuda_pad.cc
@ -0,0 +1,41 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/pad.h"
 #include "test.h"
 namespace infini {
 // Pads a {1, 2, 3, 2} tensor on axes 0 and 3 on the CUDA runtime and compares
 // the result, copied back to the CPU, with the expected 54 values.
 TEST(Pad, Cuda) {
    Runtime hostRuntime = CpuRuntimeObj::getInstance();
    auto deviceRuntime = make_ref<CudaRuntimeObj>();

    // Input 0, 1, 2, ... generated on the CPU.
    Tensor hostInput =
        make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, hostRuntime);
    hostInput->dataMalloc();
    hostInput->setData(IncrementalGenerator());

    // Graph on the CUDA runtime holding a copy of the input.
    Graph graph = make_ref<GraphObj>(deviceRuntime);
    auto deviceInput = graph->cloneTensor(hostInput);
    auto padOp = graph->addOp<PadObj>(deviceInput, nullptr,
                                      vector<int>{1, 0, 1, 1},
                                      vector<int>{0, 3});

    // Allocate the graph tensors, then run the graph.
    graph->dataMalloc();
    deviceRuntime->run(graph);

    // Bring the output back to the CPU for checking.
    auto deviceOutput = padOp->getOutput();
    auto hostOutput = deviceOutput->clone(hostRuntime);
    // cudaPrintTensor(deviceOutput);
    EXPECT_TRUE(hostOutput->equalData(
        vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,
                      0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0}));
 }
 } // namespace infini
--- a/test/kernels/cuda/test_cuda_slice.cc
+++ b/test/kernels/cuda/test_cuda_slice.cc
@ -0,0 +1,39 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/slice.h"
 #include "test.h"
 namespace infini {
 // Slices a {3, 2, 1, 5} tensor on axes 0 and 3 on the CUDA runtime and
 // compares the result, copied back to the CPU, with the expected 8 values.
 TEST(CUDA_Slice, run) {
    Runtime hostRuntime = CpuRuntimeObj::getInstance();
    auto deviceRuntime = make_ref<CudaRuntimeObj>();

    // Input 0, 1, 2, ... generated on the CPU.
    Tensor hostInput =
        make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, hostRuntime);
    hostInput->dataMalloc();
    hostInput->setData(IncrementalGenerator());

    // Graph on the CUDA runtime holding a copy of the input.
    Graph graph = make_ref<GraphObj>(deviceRuntime);
    auto deviceInput = graph->cloneTensor(hostInput);
    auto sliceOp = graph->addOp<SliceObj>(deviceInput, nullptr,
                                          vector<int>{1, 1}, vector<int>{1, 4},
                                          vector<int>{0, 3}, std::nullopt);

    // Allocate the graph tensors, then run the graph.
    graph->dataMalloc();
    deviceRuntime->run(graph);

    // Bring the output back to the CPU for checking.
    auto deviceOutput = sliceOp->getOutput();
    auto hostOutput = deviceOutput->clone(hostRuntime);
    // cudaPrintTensor(deviceOutput);
    EXPECT_TRUE(
        hostOutput->equalData(vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
 }
 } // namespace infini
--- a/test/operators/test_pad.cc
+++ b/test/operators/test_pad.cc
@ -0,0 +1,25 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "operators/pad.h"
 #include "test.h"
 namespace infini {
 // Checks the output shape inferred by PadObj, first with pads given for all
 // axes and then with pads given for selected axes only.
 TEST(Pad, ShapeInference) {
    Runtime hostRuntime = CpuRuntimeObj::getInstance();
    {
        // Pads for every axis, no axis list.
        Graph graph = make_ref<GraphObj>(hostRuntime);
        Tensor input = graph->addTensor({1, 64, 162, 162}, DataType::UInt32);
        auto padOp = graph->addOp<PadObj>(
            input, nullptr, vector<int>{2, 10, 1, 5, 0, 10, 1, 5},
            std::nullopt);
        EXPECT_EQ(padOp->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
    }
    {
        // Pads for axes 0 and 3 only.
        Graph graph = make_ref<GraphObj>(hostRuntime);
        Tensor input = graph->addTensor({1, 64, 162, 162}, DataType::UInt32);
        auto padOp = graph->addOp<PadObj>(input, nullptr,
                                          vector<int>{2, 10, 1, 5},
                                          vector<int>{0, 3});
        EXPECT_EQ(padOp->getOutput()->getDims(), (Shape{4, 64, 162, 177}));
    }
 }
 } // namespace infini
--- a/test/operators/test_slice.cc
+++ b/test/operators/test_slice.cc
@ -0,0 +1,27 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "operators/slice.h"
 #include "test.h"
 namespace infini {
 // Checks the output shape inferred by SliceObj, first with ranges given for
 // all axes and then with ranges given for selected axes only.
 TEST(Slice, ShapeInference) {
    Runtime hostRuntime = CpuRuntimeObj::getInstance();
    {
        // Ranges for every axis, no axis list.
        Graph graph = make_ref<GraphObj>(hostRuntime);
        Tensor input = graph->addTensor({10, 64, 162, 162}, DataType::UInt32);
        auto sliceOp = graph->addOp<SliceObj>(
            input, nullptr, vector<int>{2, 10, 1, 5},
            vector<int>{3, 10, 100, 100}, std::nullopt, std::nullopt);
        EXPECT_EQ(sliceOp->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
    }
    {
        // Ranges for axes 1 and 3 only.
        Graph graph = make_ref<GraphObj>(hostRuntime);
        Tensor input = graph->addTensor({10, 64, 162, 162}, DataType::UInt32);
        auto sliceOp = graph->addOp<SliceObj>(
            input, nullptr, vector<int>{2, 5}, vector<int>{3, 100},
            vector<int>{1, 3}, std::nullopt);
        EXPECT_EQ(sliceOp->getOutput()->getDims(), (Shape{10, 2, 162, 96}));
    }
 }
 } // namespace infini