【Hackathon No.108】Add Gelu operator, ffi, kernel for cpu and gpu. (#148)

feat: Add Gelu kernel, operator, ffi.
This commit is contained in:
ChengXiang Qi 2023-10-10 15:21:13 +08:00 committed by GitHub
parent 7600fe688c
commit 7f16fa353e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 79 additions and 3 deletions

View File

@ -45,6 +45,7 @@ class GraphHandlerObj {
Tensor max(Tensor a, Tensor b, Tensor c); Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y); Tensor relu(Tensor x, Tensor y);
Tensor gelu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y); Tensor sigmoid(Tensor x, Tensor y);
Tensor tanh(Tensor x, Tensor y); Tensor tanh(Tensor x, Tensor y);
Tensor erf(Tensor x, Tensor y); Tensor erf(Tensor x, Tensor y);

View File

@ -73,6 +73,7 @@ struct OpType {
GatherElements, GatherElements,
GatherND, GatherND,
Gemm, Gemm,
Gelu, // Unary
GlobalAveragePool, // GlobalPool GlobalAveragePool, // GlobalPool
GlobalLpPool, // GlobalPool GlobalLpPool, // GlobalPool
GlobalMaxPool, // GlobalPool GlobalMaxPool, // GlobalPool

View File

@ -10,6 +10,7 @@ void tanh_kernel(float *input, float *output, size_t num);
void abs_kernel(float *input, float *output, size_t num); void abs_kernel(float *input, float *output, size_t num);
void sqrt_kernel(float *input, float *output, size_t num); void sqrt_kernel(float *input, float *output, size_t num);
void neg_kernel(float *input, float *output, size_t num); void neg_kernel(float *input, float *output, size_t num);
void gelu_kernel(float *input, float *output, size_t num);
void erf_kernel(float *input, float *output, size_t num); void erf_kernel(float *input, float *output, size_t num);
void unary_kernel(const Operator &_op) { void unary_kernel(const Operator &_op) {
@ -30,6 +31,8 @@ void unary_kernel(const Operator &_op) {
abs_kernel(inputData, outputData, num); abs_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sqrt) else if (op->getOpType() == OpType::Sqrt)
sqrt_kernel(inputData, outputData, num); sqrt_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Gelu)
gelu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Neg) else if (op->getOpType() == OpType::Neg)
neg_kernel(inputData, outputData, num); neg_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Erf) else if (op->getOpType() == OpType::Erf)

View File

@ -258,6 +258,7 @@ class LogObj : public OperatorObj {
}; };
DEFINE_UNARY_OBJ(Relu, OpType::Relu) DEFINE_UNARY_OBJ(Relu, OpType::Relu)
DEFINE_UNARY_OBJ(Gelu, OpType::Gelu)
DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid) DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh) DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
// DEFINE_UNARY_OBJ(Softmax, OpType::Softmax) // DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)

View File

@ -374,6 +374,11 @@ class OnnxStub:
tensors[node.input[0]], tensors[node.input[0]],
tensors.get(node.output[0]), tensors.get(node.output[0]),
) )
elif node.op_type == "Gelu":
tensors[node.output[0]] = self.handler.gelu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sigmoid": elif node.op_type == "Sigmoid":
tensors[node.output[0]] = self.handler.sigmoid( tensors[node.output[0]] = self.handler.sigmoid(
tensors[node.input[0]], tensors[node.input[0]],
@ -913,6 +918,7 @@ class OnnxStub:
backend.OpTypeId.Div, backend.OpTypeId.Div,
backend.OpTypeId.Pow, backend.OpTypeId.Pow,
backend.OpTypeId.Relu, backend.OpTypeId.Relu,
backend.OpTypeId.Gelu,
backend.OpTypeId.Sigmoid, backend.OpTypeId.Sigmoid,
backend.OpTypeId.Tanh, backend.OpTypeId.Tanh,
backend.OpTypeId.Softmax, backend.OpTypeId.Softmax,

View File

@ -208,6 +208,14 @@ class TestStringMethods(unittest.TestCase):
relu = make_node("Relu", ["x"], ["y"], name="relu") relu = make_node("Relu", ["x"], ["y"], name="relu")
make_and_import_model(make_graph([relu], "relu", [x], [y])) make_and_import_model(make_graph([relu], "relu", [x], [y]))
'''Gelu operator is not supported by onnx 14.1 currently.'''
def test_gelu(self):
pass
# x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
# y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
# gelu = make_node("Gelu", ["x"], ["y"], name="gelu")
# make_and_import_model(make_graph([gelu], "gelu", [x], [y]))
def test_erf(self): def test_erf(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7]) y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])

View File

@ -155,6 +155,7 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
} }
DEFINE_UNARY_METHOD(relu, Relu) DEFINE_UNARY_METHOD(relu, Relu)
DEFINE_UNARY_METHOD(gelu, Gelu)
DEFINE_UNARY_METHOD(sigmoid, Sigmoid) DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
DEFINE_UNARY_METHOD(tanh, Tanh) DEFINE_UNARY_METHOD(tanh, Tanh)
DEFINE_UNARY_METHOD(abs, Abs) DEFINE_UNARY_METHOD(abs, Abs)

View File

@ -142,6 +142,7 @@ const char *OpType::toString() const {
CASE(ReduceSum); CASE(ReduceSum);
CASE(ReduceSumSquare); CASE(ReduceSumSquare);
CASE(Relu); CASE(Relu);
CASE(Gelu);
CASE(Reshape); CASE(Reshape);
CASE(Resize); CASE(Resize);
CASE(ReverseSequence); CASE(ReverseSequence);
@ -234,7 +235,7 @@ bool OpType::isUnary() const {
static const std::unordered_set<decltype(type)> set{ static const std::unordered_set<decltype(type)> set{
Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil, Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil,
Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not, Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not,
Relu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh, Relu, Gelu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh,
}; };
return set.find(type) != set.end(); return set.find(type) != set.end();

View File

@ -92,6 +92,7 @@ void export_values(py::module &m) {
.VALUE(OpType, BatchNormalization) .VALUE(OpType, BatchNormalization)
.VALUE(OpType, Softmax) .VALUE(OpType, Softmax)
.VALUE(OpType, Relu) .VALUE(OpType, Relu)
.VALUE(OpType, Gelu)
.VALUE(OpType, PRelu) .VALUE(OpType, PRelu)
.VALUE(OpType, Sigmoid) .VALUE(OpType, Sigmoid)
.VALUE(OpType, Tanh) .VALUE(OpType, Tanh)
@ -440,6 +441,7 @@ void init_graph_builder(py::module &m) {
.def("min", &Handler::min, policy::move) .def("min", &Handler::min, policy::move)
.def("max", &Handler::max, policy::move) .def("max", &Handler::max, policy::move)
.def("relu", &Handler::relu, policy::move) .def("relu", &Handler::relu, policy::move)
.def("gelu", &Handler::gelu, policy::move)
.def("sigmoid", &Handler::sigmoid, policy::move) .def("sigmoid", &Handler::sigmoid, policy::move)
.def("tanh", &Handler::tanh, policy::move) .def("tanh", &Handler::tanh, policy::move)
.def("softmax", &Handler::softmax, policy::move) .def("softmax", &Handler::softmax, policy::move)

View File

@ -60,6 +60,12 @@ template <typename T> class NaiveSqrt : public NativeUnary<T> {
T doCompute(T val) const override { return std::sqrt(val); } T doCompute(T val) const override { return std::sqrt(val); }
}; };
template <typename T> class NaiveGelu : public NativeUnary<T> {
T doCompute(T val) const override {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
};
template <typename T> class NaiveErf : public NativeUnary<T> { template <typename T> class NaiveErf : public NativeUnary<T> {
T doCompute(T val) const override { return std::erf(val); } T doCompute(T val) const override { return std::erf(val); }
}; };
@ -91,6 +97,10 @@ REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
NaiveRelu<uint32_t>, "reluNaive_CPU_uint32"); NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>, REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
"reluNaive_CPU_float32"); "reluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::UInt32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::Float32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32, REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32,
NaiveSigmoid<uint32_t>, "sigmoidNaive_CPU_uint32"); NaiveSigmoid<uint32_t>, "sigmoidNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32, REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32,

View File

@ -140,6 +140,8 @@ REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda,
"Abs_CUDA_Float32"); "Abs_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda, REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda,
"Sqrt_CUDA_Float32"); "Sqrt_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Gelu, DataType::Float32, UnaryCuda,
"Gelu_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda, REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda,
"Neg_CUDA_Float32"); "Neg_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda, REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda,

View File

@ -66,6 +66,15 @@ __global__ void _sqrt_kernel(float *input, float *output, size_t n) {
} }
} }
__global__ void _gelu_kernel(float *input, float *output, size_t n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
float x = input[i];
output[i] = 0.5 * x * (1 + erf(x / sqrt(2.0f)));
}
}
__global__ void _erf_kernel(float *input, float *output, size_t n) { __global__ void _erf_kernel(float *input, float *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x; size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x; size_t stride = blockDim.x * gridDim.x;
@ -121,6 +130,12 @@ void sqrt_kernel(float *input, float *output, size_t num) {
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num); _sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
void gelu_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_gelu_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void erf_kernel(float *input, float *output, size_t num) { void erf_kernel(float *input, float *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();

View File

@ -52,6 +52,10 @@ TEST(cuDNN_Unary, run) {
testUnary<SqrtObj>(IncrementalGenerator(), Shape{13}); testUnary<SqrtObj>(IncrementalGenerator(), Shape{13});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{4, 3}); testUnary<SqrtObj>(IncrementalGenerator(), Shape{4, 3});
testUnary<SqrtObj>(IncrementalGenerator(), Shape{2, 3, 4, 5, 6}); testUnary<SqrtObj>(IncrementalGenerator(), Shape{2, 3, 4, 5, 6});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2});
testUnary<GeluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
} }
} // namespace infini } // namespace infini

View File

@ -0,0 +1,21 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "test.h"
namespace infini {
using ExpectOutput = vector<float>;
TEST(Unary, ShapeInference) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({2}, DataType::Float32);
auto op = g->addOp<GeluObj>(i0, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2}));
}
}
} // namespace infini