Merge branch 'master' into xpu

This commit is contained in:
Haojie Wang 2023-10-10 15:32:37 +08:00 committed by GitHub
commit c82d5fdc60
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 264 additions and 118 deletions

View File

@ -32,9 +32,9 @@ class GraphHandlerObj {
float momentum, float eps, bool training);
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw);
int ph, int pw, int sh, int sw, int ceilMode);
Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw);
int ph, int pw, int sh, int sw, int ceilMode);
Tensor add(Tensor a, Tensor b, Tensor c);
Tensor sub(Tensor a, Tensor b, Tensor c);
@ -45,12 +45,14 @@ class GraphHandlerObj {
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor tanh(Tensor x, Tensor y);
Tensor erf(Tensor x, Tensor y);
Tensor softmax(Tensor x, Tensor y, int axis);
Tensor abs(Tensor x, Tensor y);
Tensor sqrt(Tensor x, Tensor y);
Tensor neg(Tensor x, Tensor y);
Tensor shape(Tensor x, Tensor y);
Tensor identity(Tensor x, Tensor y);
Tensor flatten(Tensor s, Tensor y, int axis);

View File

@ -73,6 +73,7 @@ struct OpType {
GatherElements,
GatherND,
Gemm,
Gelu, // Unary
GlobalAveragePool, // GlobalPool
GlobalLpPool, // GlobalPool
GlobalMaxPool, // GlobalPool

View File

@ -3,14 +3,15 @@
#include "operators/unary.h"
namespace infini {
// TODO(constroy): num should be size_t.
void softmax_kernel(float *input, float *output, int num);
void relu_kernel(float *input, float *output, int num);
void sigmoid_kernel(float *input, float *output, int num);
void tanh_kernel(float *input, float *output, int num);
void abs_kernel(float *input, float *output, int num);
void sqrt_kernel(float *input, float *output, int num);
void erf_kernel(float *input, float *output, int num);
void softmax_kernel(float *input, float *output, size_t num);
void relu_kernel(float *input, float *output, size_t num);
void sigmoid_kernel(float *input, float *output, size_t num);
void tanh_kernel(float *input, float *output, size_t num);
void abs_kernel(float *input, float *output, size_t num);
void sqrt_kernel(float *input, float *output, size_t num);
void neg_kernel(float *input, float *output, size_t num);
void gelu_kernel(float *input, float *output, size_t num);
void erf_kernel(float *input, float *output, size_t num);
void unary_kernel(const Operator &_op) {
auto op = as<UnaryObj>(_op);
@ -30,6 +31,10 @@ void unary_kernel(const Operator &_op) {
abs_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sqrt)
sqrt_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Gelu)
gelu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Neg)
neg_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Erf)
erf_kernel(inputData, outputData, num);
else

View File

@ -12,6 +12,7 @@ class PoolingObj : public OperatorObj {
int dh, dw;
int ph, pw;
int sh, sw;
int ceilMode;
int n, c, h, w;
public:
@ -32,9 +33,12 @@ class PoolingObj : public OperatorObj {
* @param pw Padding at the width dimension.
* @param sh Stride at the height dimension.
* @param sw Stride at the width dimension.
* @param ceilMode Whether to use ceil(1) or floor(0) to compute the output
* shape.
*/
PoolingObj(GraphObj *graph, OpType optype, Tensor input, Tensor output,
int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw);
int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode);
OP_CLONE(PoolingObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
@ -50,6 +54,7 @@ class PoolingObj : public OperatorObj {
int getPw() const { return pw; }
int getSh() const { return sh; }
int getSw() const { return sw; }
int getCeilMode() const { return ceilMode; }
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); }
@ -62,15 +67,15 @@ class PoolingObj : public OperatorObj {
class MaxPoolObj : public PoolingObj {
public:
MaxPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw)
int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode)
: PoolingObj(graph, OpType::MaxPool, input, output, kh, kw, dh, dw, ph,
pw, sh, sw) {}
pw, sh, sw, ceilMode) {}
};
class AvgPoolObj : public PoolingObj {
public:
AvgPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh, int sw)
int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode)
: PoolingObj(graph, OpType::AveragePool, input, output, kh, kw, dh, dw,
ph, pw, sh, sw) {}
ph, pw, sh, sw, ceilMode) {}
};
}; // namespace infini

View File

@ -258,6 +258,7 @@ class LogObj : public OperatorObj {
};
DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
// DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)

View File

@ -228,11 +228,12 @@ class OnnxStub:
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"ceil_mode": 0,
},
)
(k, d, p, s) = (
(k, d, p, s, ceil_mode) = (
attributes[name]
for name in ["kernel_shape", "dilations", "pads", "strides"]
for name in ["kernel_shape", "dilations", "pads", "strides", "ceil_mode"]
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
@ -250,6 +251,7 @@ class OnnxStub:
0,
s[0],
s[1],
ceil_mode,
)
else:
tensors[node.output[0]] = self.handler.maxPool(
@ -263,6 +265,7 @@ class OnnxStub:
p[1],
s[0],
s[1],
ceil_mode,
)
elif node.op_type == "AveragePool":
attributes = _parse_attribute(
@ -271,10 +274,11 @@ class OnnxStub:
"kernel_shape": None,
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"ceil_mode": 0,
},
)
(k, p, s) = (
attributes[name] for name in ["kernel_shape", "pads", "strides"]
(k, p, s, ceil_mode) = (
attributes[name] for name in ["kernel_shape", "pads", "strides", "ceil_mode"]
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
@ -292,6 +296,7 @@ class OnnxStub:
0,
s[0],
s[1],
ceil_mode,
)
else:
tensors[node.output[0]] = self.handler.avgPool(
@ -305,6 +310,7 @@ class OnnxStub:
p[1],
s[0],
s[1],
ceil_mode,
)
elif node.op_type == "GlobalAveragePool":
[_, _, h, w] = _search_shape(model, node.input[0])
@ -319,6 +325,7 @@ class OnnxStub:
0,
1,
1,
0,
)
elif node.op_type == "Add":
tensors[node.output[0]] = self.handler.add(
@ -367,6 +374,11 @@ class OnnxStub:
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Gelu":
tensors[node.output[0]] = self.handler.gelu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sigmoid":
tensors[node.output[0]] = self.handler.sigmoid(
tensors[node.input[0]],
@ -396,6 +408,11 @@ class OnnxStub:
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Neg":
tensors[node.output[0]] = self.handler.neg(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Shape":
tensors[node.output[0]] = self.handler.shape(
tensors[node.input[0]],
@ -866,7 +883,7 @@ class OnnxStub:
)
)
elif ty == backend.OpTypeId.MaxPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
@ -877,10 +894,11 @@ class OnnxStub:
pads=[ph, pw, ph, pw],
dilations=[dh, dw],
strides=[sh, sw],
ceil_mode=ceil_mode,
)
)
elif ty == backend.OpTypeId.AveragePool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
"AveragePool",
@ -890,6 +908,7 @@ class OnnxStub:
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
strides=[sh, sw],
ceil_mode=ceil_mode,
)
)
elif ty in [
@ -899,6 +918,7 @@ class OnnxStub:
backend.OpTypeId.Div,
backend.OpTypeId.Pow,
backend.OpTypeId.Relu,
backend.OpTypeId.Gelu,
backend.OpTypeId.Sigmoid,
backend.OpTypeId.Tanh,
backend.OpTypeId.Softmax,
@ -907,6 +927,7 @@ class OnnxStub:
backend.OpTypeId.PRelu,
backend.OpTypeId.Sqrt,
backend.OpTypeId.Erf,
backend.OpTypeId.Neg,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpTypeId.Flatten:

View File

@ -208,6 +208,14 @@ class TestStringMethods(unittest.TestCase):
relu = make_node("Relu", ["x"], ["y"], name="relu")
make_and_import_model(make_graph([relu], "relu", [x], [y]))
'''Gelu operator is not supported by onnx 14.1 currently.'''
def test_gelu(self):
pass
# x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
# y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
# gelu = make_node("Gelu", ["x"], ["y"], name="gelu")
# make_and_import_model(make_graph([gelu], "gelu", [x], [y]))
def test_erf(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
@ -243,6 +251,12 @@ class TestStringMethods(unittest.TestCase):
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
abs = make_node("Abs", ["x"], ["y"], name="abs")
make_and_import_model(make_graph([abs], "abs", [x], [y]))
def test_neg(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
neg = make_node("Neg", ["x"], ["y"], name="neg")
make_and_import_model(make_graph([neg], "neg", [x], [y]))
def test_identity(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])

View File

@ -210,10 +210,6 @@ void GraphObj::dataMalloc() {
tensorToOffset[tensor.get()]));
}
}
#ifdef DEBUG_MODE
allocator.info();
#endif
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {

View File

@ -95,30 +95,30 @@ Tensor GraphHandlerObj::batchNormalization(Tensor input, Tensor output,
}
Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) {
if (output) {
g->addOpWithOutputs<MaxPoolObj>(std::move(input), output, kh, kw, dh,
dw, ph, pw, sh, sw);
dw, ph, pw, sh, sw, ceilMode);
return output;
} else {
return g
->addOp<MaxPoolObj>(std::move(input), output, kh, kw, dh, dw, ph,
pw, sh, sw)
pw, sh, sw, ceilMode)
->getOutput();
}
}
Tensor GraphHandlerObj::avgPool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
int dh, int dw, int ph, int pw, int sh, int sw,
int ceilMode) {
if (output) {
g->addOpWithOutputs<AvgPoolObj>(std::move(input), output, kh, kw, dh,
dw, ph, pw, sh, sw);
dw, ph, pw, sh, sw, ceilMode);
return output;
} else {
return g
->addOp<AvgPoolObj>(std::move(input), output, kh, kw, dh, dw, ph,
pw, sh, sw)
pw, sh, sw, ceilMode)
->getOutput();
}
}
@ -155,10 +155,12 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
}
DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
DEFINE_UNARY_METHOD(tanh, Tanh)
DEFINE_UNARY_METHOD(abs, Abs)
DEFINE_UNARY_METHOD(sqrt, Sqrt)
DEFINE_UNARY_METHOD(neg, Neg)
DEFINE_UNARY_METHOD(shape, Shape)
DEFINE_UNARY_METHOD(erf, Erf)

View File

@ -145,10 +145,10 @@ void LazyAllocator::free(size_t addr, size_t size) {
void *LazyAllocator::getPtr() {
if (this->ptr == nullptr) {
this->ptr = runtime->alloc(this->peak);
#ifdef DEBUG_MODE
printf("LazyAllocator really alloc non-weight: %p %lu bytes\n",
this->ptr, peak);
#endif
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc non-weight: %p %lu
// bytes\n", this->ptr, peak);
// #endif
}
return this->ptr;
}
@ -156,10 +156,10 @@ void *LazyAllocator::getPtr() {
void *LazyAllocator::getWeightPtr() {
if (this->weightPtr == nullptr) {
this->weightPtr = runtime->alloc(this->weightPeak);
#ifdef DEBUG_MODE
printf("LazyAllocator really alloc weight: %p %lu bytes\n",
this->weightPtr, weightPeak);
#endif
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc weight: %p %lu bytes\n",
// this->weightPtr, weightPeak);
// #endif
}
return this->weightPtr;
}

View File

@ -142,6 +142,7 @@ const char *OpType::toString() const {
CASE(ReduceSum);
CASE(ReduceSumSquare);
CASE(Relu);
CASE(Gelu);
CASE(Reshape);
CASE(Resize);
CASE(ReverseSequence);
@ -232,9 +233,9 @@ const char *OpType::toString() const {
bool OpType::isUnary() const {
static const std::unordered_set<decltype(type)> set{
Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil,
Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not,
Relu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh,
Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil,
Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not,
Relu, Gelu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh,
};
return set.find(type) != set.end();

View File

@ -95,6 +95,7 @@ void export_values(py::module &m) {
.VALUE(OpType, BatchNormalization)
.VALUE(OpType, Softmax)
.VALUE(OpType, Relu)
.VALUE(OpType, Gelu)
.VALUE(OpType, PRelu)
.VALUE(OpType, Sigmoid)
.VALUE(OpType, Tanh)
@ -103,6 +104,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Dropout)
.VALUE(OpType, Cast)
.VALUE(OpType, Sqrt)
.VALUE(OpType, Neg)
.VALUE(OpType, Expand)
.VALUE(OpType, Erf)
.VALUE(OpType, Where)
@ -194,14 +196,14 @@ static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
batchnorm->getTrainingMode());
}
static std::tuple<int, int, int, int, int, int, int, int>
static std::tuple<int, int, int, int, int, int, int, int, int>
pool_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::MaxPool ||
op->getOpType() == OpType::AveragePool);
auto pool = dynamic_cast<const PoolingObj *>(op.get());
return std::make_tuple(pool->getKh(), pool->getKw(), pool->getDh(),
pool->getDw(), pool->getPh(), pool->getPw(),
pool->getSh(), pool->getSw());
pool->getSh(), pool->getSw(), pool->getCeilMode());
}
static std::tuple<std::optional<float>, std::optional<float>>
@ -454,11 +456,13 @@ void init_graph_builder(py::module &m) {
.def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move)
.def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move)
.def("softmax", &Handler::softmax, policy::move)
.def("abs", &Handler::abs, policy::move)
.def("sqrt", &Handler::sqrt, policy::move)
.def("neg", &Handler::neg, policy::move)
.def("shape", &Handler::shape, policy::move)
.def("identity", &Handler::identity, policy::move)
.def("flatten", &Handler::flatten, policy::move)

View File

@ -30,6 +30,7 @@ class PoolingCnnl : public BangKernelWithoutConfig {
ph, pw, pw, sh, sw, dh, dw, false));
// get outputs
// TODO: verify ceiling mode
auto outVec = op->getOutput()->getDims();
int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
cnnlTensorDescriptor_t outDesc;

View File

@ -21,6 +21,7 @@ template <typename T> class NativePooling : public CpuKernelWithoutConfig {
auto inoffset = i * (c * ih * iw) + j * ih * iw;
for (auto h = 0; h < oh; h++) {
for (auto w = 0; w < ow; w++) {
// TODO: verify ceil mode
T val =
getPoolingValue(kh, kw, h * sh - ph, w * sw - pw,
ih, iw, inptr + inoffset);

View File

@ -78,6 +78,11 @@ template <typename T> class NaiveSinh : public NativeUnary<T> {
template <typename T> class NaiveCosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::cosh(val); }
template <typename T> class NaiveGelu : public NativeUnary<T> {
T doCompute(T val) const override {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
};
template <typename T> class NaiveErf : public NativeUnary<T> {
@ -102,6 +107,10 @@ template <typename T> class NaiveASinh : public NativeUnary<T> {
template <typename T> class NaiveATanh : public NativeUnary<T> {
T doCompute(T val) const override { return std::atanh(val); }
template <typename T> class NaiveNeg : public NativeUnary<T> {
T doCompute(T val) const override { return -val; }
};
template <typename T> class Clip : public CpuKernelWithoutConfig {
@ -164,6 +173,10 @@ REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
"reluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::UInt32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::Float32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32,
NaiveSigmoid<uint32_t>, "sigmoidNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32,
@ -180,6 +193,8 @@ REGISTER_KERNEL(Device::CPU, OpType::Sqrt, DataType::Float32, NaiveSqrt<float>,
"sqrtNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Erf, DataType::Float32, NaiveErf<float>,
"erfNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Neg, DataType::Float32, NaiveNeg<float>,
"negNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32,
NaiveSoftmax<uint32_t>, "softmaxNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,

View File

@ -29,17 +29,27 @@ class poolingCudnn : public CudaKernelWithoutConfig {
pw, sh, sw));
// get outputs
int outn, outc, outh, outw;
checkCudnnError(cudnnGetPooling2dForwardOutputDim(
poolingDesc, inDesc, &outn, &outc, &outh, &outw));
auto outDims = op->getOutput()->getDims();
int outn = outDims[0], outc = outDims[1], outh = outDims[2],
outw = outDims[3];
// NOTICE: cudnn pooling does not support ceil mode, so the shape
// inference of cudnn pooling is not consistant with our framework. Ceil
// mode is also supported in Pytorch and ONNX. See
// https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
// and https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
// for reference.
// TODO: Make sure the result after considering ceil mode is correct.
// int outn, outc, outh, outw;
// checkCudnnError(cudnnGetPooling2dForwardOutputDim(poolingDesc,
// inDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outn, outc,
outh, outw));
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
// IT_ASSERT((vector{outn, outc, outh, outw}) ==
// op->getOutput()->getDims(),
// "cuDNN output shape mismatches with OP output shape");
float alpha = 1.f, beta = 0.f;
checkCudnnError(cudnnPoolingForward(context->cudnnHandle(), poolingDesc,

View File

@ -140,6 +140,10 @@ REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda,
"Abs_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda,
"Sqrt_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Gelu, DataType::Float32, UnaryCuda,
"Gelu_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda,
"Neg_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda,
"Erf_CUDA_Float32");

View File

@ -8,7 +8,7 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _softmax_kernel1(float *input, float *output, int n) {
__global__ void _softmax_kernel1(float *input, float *output, size_t n) {
float sum = 0.0f;
for (size_t i = 0; i < n; ++i) {
sum += pow(E_CONSTANT, input[i]);
@ -16,106 +16,136 @@ __global__ void _softmax_kernel1(float *input, float *output, int n) {
*output = sum;
}
__global__ void _softmax_kernel2(float *input, float *output, int n) {
__global__ void _softmax_kernel2(float *input, float *output, size_t n) {
float sum = *output;
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = pow(E_CONSTANT, input[i]) / sum;
}
}
__global__ void _relu_kernel(float *input, float *output, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
__global__ void _relu_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = max(input[i], float(0));
}
}
__global__ void _sigmoid_kernel(float *input, float *output, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
__global__ void _sigmoid_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = 1 / (1 + pow(E_CONSTANT, -input[i]));
}
}
__global__ void _tanh_kernel(float *input, float *output, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
__global__ void _tanh_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = (pow(E_CONSTANT, input[i]) - pow(E_CONSTANT, -input[i])) /
(pow(E_CONSTANT, input[i]) + pow(E_CONSTANT, -input[i]));
}
}
__global__ void _abs_kernel(float *input, float *output, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
__global__ void _abs_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = input[i] < 0 ? -input[i] : input[i];
}
}
__global__ void _sqrt_kernel(float *input, float *output, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
__global__ void _sqrt_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = sqrt(input[i]);
}
}
__global__ void _erf_kernel(float *input, float *output, int n) {
__global__ void _gelu_kernel(float *input, float *output, size_t n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
float x = input[i];
output[i] = 0.5 * x * (1 + erf(x / sqrt(2.0f)));
}
}
__global__ void _erf_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
output[i] = erf(input[i]);
}
}
template <typename T>
__global__ void _neg_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = -input[i];
}
}
namespace infini {
void softmax_kernel(float *input, float *output, int num) {
void softmax_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_softmax_kernel1<<<1, 1>>>(input, output, num);
_softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
}
void relu_kernel(float *input, float *output, int num) {
void relu_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_relu_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sigmoid_kernel(float *input, float *output, int num) {
void sigmoid_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void tanh_kernel(float *input, float *output, int num) {
void tanh_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void abs_kernel(float *input, float *output, int num) {
void abs_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_abs_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sqrt_kernel(float *input, float *output, int num) {
void sqrt_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void erf_kernel(float *input, float *output, int num) {
void gelu_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_gelu_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void erf_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_erf_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void neg_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_neg_kernel<<<gridsize, blocksize>>>(input, output, num);
}
}; // namespace infini

View File

@ -4,11 +4,9 @@ namespace infini {
PoolingObj::PoolingObj(GraphObj *graph, OpType optype, Tensor input,
Tensor output, int kh, int kw, int dh, int dw, int ph,
int pw, int sh, int sw)
: OperatorObj(optype, {input}, {output}),
kh(kh), kw(kw), dh(dh), dw(dw), ph(ph), pw(pw), sh(sh), sw(sw),
int pw, int sh, int sw, int ceilMode)
: OperatorObj(optype, {input}, {output}), kh(kh), kw(kw), dh(dh), dw(dw),
ph(ph), pw(pw), sh(sh), sw(sw), ceilMode(ceilMode),
n(input->getDims()[0]), c(input->getDims()[1]), h(input->getDims()[2]),
w(input->getDims()[3]) {
IT_ASSERT(checkValid(graph));
@ -18,8 +16,14 @@ optional<vector<Shape>> PoolingObj::inferShape(const TensorVec &inputs) const {
const auto &input = inputs[0];
auto h = input->getDims()[input->getRank() - 2],
w = input->getDims()[input->getRank() - 1];
int oh = (h - (kh - sh) + ph * 2) / sh;
int ow = (w - (kw - sw) + pw * 2) / sw;
int oh, ow;
if (ceilMode) {
oh = ceil(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1);
ow = ceil(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1);
} else {
oh = floor(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1);
ow = floor(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1);
}
auto ret = input->getDims();
ret[input->getRank() - 2] = oh;
ret[input->getRank() - 1] = ow;
@ -34,17 +38,19 @@ std::string PoolingObj::toString() const {
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
os << "d=[" << dh << "," << dw << "],";
os << "ceil mode=" << ceilMode << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> PoolingObj::getWorkloadVector() const {
return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw};
return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw,
ceilMode};
}
vector<int> PoolingObj::getOpAttrVector() const {
return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw};
return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw, ceilMode};
}
}; // namespace infini

View File

@ -208,16 +208,13 @@ TEST(MatchGraph, multi_output) {
SubGraph subg0 = make_ref<SubGraphObj>(runtime, TensorVec{i});
{
auto maxpool =
subg0->addOp<MaxPoolObj>(i, nullptr, 3, 3, 0, 0, 0, 0, 2, 2);
subg0->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 0);
Tensor w0 = subg0->addTensor(Shape{64, 192, 1, 1}, DataType::UInt32);
auto conv0 = subg0->addOp<ConvObj>(maxpool->getOutput(0), w0, nullptr);
auto relu0 = subg0->addOp<ReluObj>(conv0->getOutput(0), nullptr);
auto pad = subg0->addOp<PadObj>(maxpool->getOutput(0), nullptr,
vector<int>{0, 0, 1, 1, 0, 0, 1, 1},
std::nullopt);
auto avgpool = subg0->addOp<AvgPoolObj>(pad->getOutput(0), nullptr, 3,
3, 0, 0, 0, 0, 1, 1);
auto avgpool = subg0->addOp<AvgPoolObj>(maxpool->getOutput(0), nullptr,
3, 3, 0, 0, 0, 0, 1, 1, 0);
subg0->setOutputs(
TensorVec{relu0->getOutput(0), avgpool->getOutput(0)});
}
@ -225,8 +222,9 @@ TEST(MatchGraph, multi_output) {
SubGraph subg1 =
make_ref<SubGraphObj>(runtime, TensorVec{i->clone(runtime)});
{
auto avgpool = subg1->addOp<AvgPoolObj>(
subg1->getInputsFromOutside()[0], nullptr, 3, 3, 0, 0, 0, 0, 2, 2);
auto avgpool =
subg1->addOp<AvgPoolObj>(subg1->getInputsFromOutside()[0], nullptr,
3, 3, 1, 1, 0, 0, 2, 2, 0);
auto relu0 = subg1->addOp<ReluObj>(avgpool->getOutput(0), nullptr);
@ -295,7 +293,7 @@ TEST(MatchGraph, multi_input_output) {
Tensor w2 = subg0->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
auto conv2 = subg0->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
auto maxpool = subg0->addOp<MaxPoolObj>(relu1->getOutput(0), nullptr, 3,
3, 0, 0, 0, 0, 2, 2);
3, 1, 1, 0, 0, 2, 2, 0);
subg0->setOutputs(
TensorVec{conv2->getOutput(0), maxpool->getOutput(0)});
}
@ -317,7 +315,7 @@ TEST(MatchGraph, multi_input_output) {
Tensor w2 = subg1->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
auto conv2 = subg1->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
auto maxpool = subg1->addOp<MaxPoolObj>(relu1->getOutput(0), nullptr, 3,
3, 0, 0, 0, 0, 2, 2);
3, 1, 1, 0, 0, 2, 2, 0);
subg1->setOutputs(
TensorVec{maxpool->getOutput(0), conv2->getOutput(0)});
}
@ -338,7 +336,7 @@ TEST(MatchGraph, multi_input_output) {
Tensor w2 = subg2->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
auto conv2 = subg2->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
auto avgpool = subg2->addOp<AvgPoolObj>(relu1->getOutput(0), nullptr, 3,
3, 0, 0, 0, 0, 2, 2);
3, 1, 1, 0, 0, 2, 2, 0);
subg2->setOutputs(
TensorVec{conv2->getOutput(0), avgpool->getOutput(0)});
}
@ -349,7 +347,7 @@ TEST(MatchGraph, multi_input_output) {
auto i = g->addTensor(Shape{1, 64, 112, 112}, DataType::UInt32);
auto relu = g->addOp<ReluObj>(i, nullptr);
auto maxPool = g->addOp<MaxPoolObj>(relu->getOutput(0), nullptr, 3, 3,
0, 0, 1, 1, 2, 2);
1, 1, 1, 1, 2, 2, 0);
auto out0 =
v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
auto out1 =

View File

@ -8,7 +8,8 @@
namespace infini {
template <class T>
template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
int>::type = 0>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
@ -23,7 +24,8 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
auto gpuOp =
bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();

View File

@ -29,7 +29,7 @@ TEST(CUDA_Inception_v3_block, run) {
TensorVec outputs;
vector<OpVec> ops;
auto maxpool =
g->addOp<MaxPoolObj>(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1);
g->addOp<MaxPoolObj>(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1, 0);
auto chainInput = maxpool->getOutput();
for (auto &pathConfig : configs) {
int inputChannels = initialChannels;
@ -52,7 +52,7 @@ TEST(CUDA_Inception_v3_block, run) {
inputChannels = f;
} else { // Add AveragePool
auto pool = g->addOp<AvgPoolObj>(input, nullptr, r, r, 1, 1,
r / 2, r / 2, 1, 1);
r / 2, r / 2, 1, 1, 0);
input = pool->getOutput();
ops.back().emplace_back(pool);
}

View File

@ -9,7 +9,8 @@ namespace infini {
using KDPS = vector<int>;
using ExpectOutput = vector<float>;
template <class T>
template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
int>::type = 0>
void testPoolCudnn(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, const KDPS &kdps, const ExpectOutput &ansVec) {
@ -24,7 +25,7 @@ void testPoolCudnn(
Graph g = make_ref<GraphObj>(cudaRuntime);
auto i0 = g->cloneTensor(i0cpu);
auto pool = g->addOp<T>(i0, nullptr, kdps[0], kdps[1], kdps[2], kdps[3],
kdps[4], kdps[5], kdps[6], kdps[7]);
kdps[4], kdps[5], kdps[6], kdps[7], 0);
// allocate CUDA memory
g->dataMalloc();

View File

@ -46,11 +46,16 @@ TEST(cuDNN_Unary, run) {
testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
// more shapes
testUnary<SqrtObj>(IncrementalGenerator(), Shape{13});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{4, 3});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{2, 3, 4, 5, 6});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

View File

@ -12,16 +12,16 @@ TEST(MaxPool, ShapeInference) {
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
sw = 2;
auto op =
g->addOp<MaxPoolObj>(i, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
sw = 2, ceilMode = 0;
auto op = g->addOp<MaxPoolObj>(i, nullptr, kh, kw, dh, dw, ph, pw, sh,
sw, ceilMode);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 80, 80}));
}
{ // dilation & stride
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
auto op = g->addOp<MaxPoolObj>(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2);
auto op = g->addOp<MaxPoolObj>(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2, 0);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 163, 81}));
}
}
@ -30,7 +30,7 @@ TEST(MaxPool, NaiveCPU) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 2, 5, 5}, DataType::UInt32);
auto op = g->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
auto op = g->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
g->dataMalloc();
i->setData(IncrementalGenerator());
@ -49,7 +49,7 @@ TEST(AvgPool, NaiveCPU) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(cpuRuntime);
Tensor i = g->addTensor({1, 2, 5, 5}, DataType::Float32);
auto op = g->addOp<AvgPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
auto op = g->addOp<AvgPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
g->dataMalloc();
i->setData(IncrementalGenerator());

View File

@ -0,0 +1,21 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "test.h"
namespace infini {
using ExpectOutput = vector<float>;
TEST(Unary, ShapeInference) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({2}, DataType::Float32);
auto op = g->addOp<GeluObj>(i0, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2}));
}
}
} // namespace infini