ADD:pad/slice operator and cuda kernel. (#39)

fix compile error

refector

clang format

split test.

fix compile error.

ADD slice cuda kernel.

ADD slice operator.

ADD:pad operator and cuda kernel.
This commit is contained in:
wendy12022 2022-09-29 10:29:24 +08:00 committed by GitHub
parent 1aefc1b27e
commit 5560d0f2fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 451 additions and 1 deletions

View File

@ -20,7 +20,7 @@ class TensorObj : public TensorBaseObj {
size_t getBytes() const;
Shape getDims() const { return shape; }
vector<size_t> getStride() const;
size_t getOffset(const Shape &ds) const;
using TensorBaseObj::getData;
VType getData(const Shape &pos) const;

View File

@ -0,0 +1,19 @@
#pragma once
const int MAX_DIM = 4;
// Pad operator acts like padding small(part) tensor into a big(whole) tensor.
// Slice operator acts like spling a big(whole) tensor into a small(part)
// tensor.
typedef struct {
int begNum[MAX_DIM]; // pad or slice number at beginning
int wholeNDim[MAX_DIM]; // dim size after padding or before slicing
int partNDim[MAX_DIM]; // dim size before padding or after slicing
int partStride[MAX_DIM]; // stride before padding or after slicing
} TransMetaData;
namespace infini {
void pad_slice_kernel(float *partData, float *wholeData,
const TransMetaData &metadata, int nDims, int num,
bool isPad);
} // namespace infini

24
include/operators/pad.h Normal file
View File

@ -0,0 +1,24 @@
#pragma once
#include "core/operator.h"
namespace infini {
class PadObj : public OperatorObj {
// the number of start and end pad values for all dims.
vector<int> pads;
public:
// pad for appointed axises,if axis is empty,then pad for all axises.
PadObj(GraphObj *graph, Tensor input, Tensor output,
const vector<int> &pads, const optional<const vector<int>> &axis);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape PadObj::getPads() const { return pads; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

24
include/operators/slice.h Normal file
View File

@ -0,0 +1,24 @@
#pragma once
#include "core/operator.h"
namespace infini {
class SliceObj : public OperatorObj {
vector<int> starts, ends; // the start no. and end no. for all dims.
public:
SliceObj(GraphObj *graph, Tensor input, Tensor output,
const vector<int> &starts, const vector<int> &ends,
const optional<vector<int>> &axis,
const optional<vector<int>> &steps);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape getStart() const { return starts; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -30,6 +30,17 @@ size_t TensorObj::getOffset(const Shape &pos) const {
return idx;
}
vector<size_t> TensorObj::getStride() const {
vector<size_t> ret;
size_t stride = 1;
for (int i = shape.size() - 1; i >= 1; i--) {
ret.emplace(ret.begin(), stride);
stride *= shape.at(i);
}
ret.emplace(ret.begin(), stride);
return ret;
}
size_t TensorObj::size() const {
size_t ret = 1;
for (const auto &d : shape)

View File

@ -0,0 +1,45 @@
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_pad_slice.h"
#include "operators/pad.h"
#include "operators/slice.h"
namespace infini {
class PadSliceCudaCompute {
public:
void do_compute(Tensor partTensor, Tensor wholeTensor, const Shape &begNos,
bool isPad) const {
int nDims = partTensor->getDims().size();
IT_ASSERT(MAX_DIM >= nDims);
TransMetaData metadata;
for (int i = 0; i < nDims; i++) {
metadata.begNum[i] = begNos[i];
metadata.wholeNDim[i] = wholeTensor->getDims()[i];
metadata.partNDim[i] = partTensor->getDims()[i];
metadata.partStride[i] = partTensor->getStride()[i];
}
pad_slice_kernel(partTensor->getRawDataPtr<float *>(),
wholeTensor->getRawDataPtr<float *>(), metadata, nDims,
wholeTensor->size(), isPad);
}
};
class PadCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
do_compute(op->getInputs(0), op->getOutput(), as<PadObj>(op)->getPads(),
true);
}
};
class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
do_compute(op->getOutput(), op->getInputs(0),
as<SliceObj>(op)->getStart(), false);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Slice, DataType::Float32, SliceCuda,
"Slice__CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Pad, DataType::Float32, PadCuda,
"Pad__CUDA_Float32");
} // namespace infini

View File

@ -0,0 +1,52 @@
#include "cuda/cuda_common.h"
#include "cuda/cuda_pad_slice.h"
__device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
TransMetaData metaData,
int nDims) {
int offset = 0;
for (int i = nDims - 1; i >= 0; --i) {
auto wholePos = wholeOffset % metaData.wholeNDim[i];
auto pos = wholePos - metaData.begNum[i];
// if pos belongs to pad range, then return -1
if (pos < 0 || pos >= metaData.partNDim[i])
return -1;
wholeOffset = wholeOffset / metaData.wholeNDim[i];
offset += pos * metaData.partStride[i];
}
return offset;
}
__global__ void _pad_slice_kernel(float *part, float *whole,
TransMetaData metaData, int nDims, int num,
bool isPad) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= num)
return;
int stride = blockDim.x * gridDim.x;
while (tid < num) {
int offset = WholeTensorOffset2PartTensorOffset(tid, metaData, nDims);
if (isPad)
if (offset < 0)
whole[tid] = 0;
else
whole[tid] = part[offset];
else
part[offset] = whole[tid];
tid += stride;
}
}
namespace infini {
void pad_slice_kernel(float *partData, float *wholeData,
const TransMetaData &metadata, int nDims, int num,
bool isPad) {
int blockSize = 32 * 16;
int gridSize = (num + blockSize - 1) / blockSize;
_pad_slice_kernel<<<gridSize, blockSize>>>(partData, wholeData, metadata,
nDims, num, isPad);
}
} // namespace infini

63
src/operators/pad.cc Normal file
View File

@ -0,0 +1,63 @@
#include "operators/pad.h"
namespace infini {
PadObj::PadObj(GraphObj *graph, Tensor input, Tensor output,
const vector<int> &_pads,
const optional<const vector<int>> &axis)
: OperatorObj(OpType::Pad, {input}, {output}) {
if (axis == std::nullopt)
pads = _pads;
else {
int nAxis = (*axis).size();
IT_ASSERT((int)_pads.size() == nAxis * 2);
int nDims = input->getDims().size();
vector<int> tmp(nDims * 2, 0);
for (int i = 0; i < nAxis; ++i) {
tmp[(*axis)[i]] = _pads[i];
tmp[(*axis)[i] + nDims] = _pads[i + nAxis];
}
pads = tmp;
}
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> PadObj::inferShape(const TensorVec &inputs) const {
auto dims = inputs[0]->getDims();
int nDims = dims.size();
if (nDims * 2 != (int)pads.size())
return {};
for (int i = 0; i < nDims; ++i) {
if (pads[i] < 0 || pads[i + nDims] < 0)
return {};
dims[i] += pads[i] + pads[i + nDims];
}
return {{dims}};
}
std::string PadObj::toString() const {
std::ostringstream os;
os << "Pad"
<< "[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "pads=" << vecToString(pads) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> PadObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.insert(ret.end(), pads.begin(), pads.end());
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
vector<int> PadObj::getOpAttrVector() const {
vector<int> ret = pads;
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
} // namespace infini

80
src/operators/slice.cc Normal file
View File

@ -0,0 +1,80 @@
#include "operators/slice.h"
namespace infini {
SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output,
const vector<int> &starts, const vector<int> &ends,
const optional<vector<int>> &axis,
const optional<vector<int>> &steps)
: OperatorObj(OpType::Slice, {input}, {output}) {
if (steps != std::nullopt)
IT_TODO_HALT();
IT_ASSERT(starts.size() == ends.size());
if (axis == std::nullopt) {
this->starts = starts;
this->ends = ends;
} else {
int nAxis = (*axis).size();
IT_ASSERT((int)starts.size() == nAxis);
int nDims = input->getDims().size();
vector<int> tmpS(nDims, 0), tmpE;
for (int i = 0; i < nDims; ++i) {
tmpE.emplace_back(input->getDims()[i] - 1);
}
for (int i = 0; i < nAxis; ++i) {
if ((*axis)[i] < 0)
IT_TODO_HALT();
tmpS[(*axis)[i]] = starts[i];
tmpE[(*axis)[i]] = ends[i];
}
this->starts = tmpS;
this->ends = tmpE;
}
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> SliceObj::inferShape(const TensorVec &inputs) const {
auto dims = inputs[0]->getDims();
int nDims = dims.size();
if (nDims != (int)starts.size())
return {};
for (int i = 0; i < nDims; ++i) {
if (starts[i] < 0 || ends[i] >= dims[i] || starts[i] > ends[i])
return {};
dims[i] = ends[i] - starts[i] + 1;
}
return {{dims}};
}
std::string SliceObj::toString() const {
std::ostringstream os;
os << "Slice"
<< "[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "starts=" << vecToString(starts) << ",";
os << "ends=" << vecToString(ends) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> SliceObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.insert(ret.end(), starts.begin(), starts.end());
ret.insert(ret.end(), ends.begin(), ends.end());
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
vector<int> SliceObj::getOpAttrVector() const {
vector<int> ret = starts;
ret.insert(ret.end(), ends.begin(), ends.end());
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
} // namespace infini

View File

@ -0,0 +1,41 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/pad.h"
#include "test.h"
namespace infini {
TEST(Pad, Cuda) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
icpu->dataMalloc();
icpu->setData(IncrementalGenerator());
// Build CUDA graph;
Graph g = make_ref<GraphObj>(cudaRuntime);
auto i = g->cloneTensor(icpu);
auto op = g->addOp<PadObj>(i, nullptr, vector<int>{1, 0, 1, 1},
vector<int>{0, 3});
// allocate CUDA memory
g->dataMalloc();
// Execute on CUDA
cudaRuntime->run(g);
// clone CUDA output to CPU
auto o = op->getOutput();
auto cpuo = o->clone(cpuRuntime);
// cudaPrintTensor(o);
// check results on CPU
EXPECT_TRUE(cpuo->equalData(
vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
}
} // namespace infini

View File

@ -0,0 +1,39 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/slice.h"
#include "test.h"
namespace infini {
TEST(CUDA_Slice, run) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, cpuRuntime);
icpu->dataMalloc();
icpu->setData(IncrementalGenerator());
// Build CUDA graph;
Graph g = make_ref<GraphObj>(cudaRuntime);
auto i = g->cloneTensor(icpu);
auto op =
g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{1, 4},
vector<int>{0, 3}, std::nullopt);
// allocate CUDA memory
g->dataMalloc();
// Execute on CUDA
cudaRuntime->run(g);
// clone CUDA output to CPU
auto o = op->getOutput();
auto cpuo = o->clone(cpuRuntime);
// cudaPrintTensor(o);
// check results on CPU
EXPECT_TRUE(cpuo->equalData(vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
}
} // namespace infini

View File

@ -0,0 +1,25 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/pad.h"
#include "test.h"
namespace infini {
TEST(Pad, ShapeInference) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
auto op = g->addOp<PadObj>(
i, nullptr, vector<int>{2, 10, 1, 5, 0, 10, 1, 5}, std::nullopt);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
}
{
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
auto op = g->addOp<PadObj>(i, nullptr, vector<int>{2, 10, 1, 5},
vector<int>{0, 3});
EXPECT_EQ(op->getOutput()->getDims(), (Shape{4, 64, 162, 177}));
}
}
} // namespace infini

View File

@ -0,0 +1,27 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/slice.h"
#include "test.h"
namespace infini {
TEST(Slice, ShapeInference) {
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32);
auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 10, 1, 5},
vector<int>{3, 10, 100, 100}, std::nullopt,
std::nullopt);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
}
{
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32);
auto op = g->addOp<SliceObj>(i, nullptr, vector<int>{2, 5},
vector<int>{3, 100}, vector<int>{1, 3},
std::nullopt);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 2, 162, 96}));
}
}
} // namespace infini