* add: unary kernel for xpu

* formatting

* format

* format

* format

* fix: pointer jump

* fix optype comments
zhangyue207 2023-08-28 10:15:07 +08:00 committed by GitHub
parent a4c6214529
commit 85b96d8997
6 changed files with 485 additions and 8 deletions


@@ -21,10 +21,10 @@ struct OpType {
Add, // Binary
And, // Binary
ArgMax, //
-Asin, // Binary
-Asinh, // Binary
-Atan, // Binary
-Atanh, // Binary
+Asin, // Unary
+Asinh, // Unary
+Atan, // Unary
+Atanh, // Unary
AveragePool, // Pool
BatchNormalization, //
Bernoulli, //


@@ -151,14 +151,15 @@ class TensorObj : public TensorBaseObj {
}
template <typename T>
-bool equalDataImpl(const T *a, const T *b, size_t size) const {
+bool equalDataImpl(const T *a, const T *b, size_t size,
+                   double relativeError = 1e-6) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
-    1e-6) {
+    relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}
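For reference, the comparison this hunk parameterizes is a standard relative-error check. Below is a minimal, self-contained sketch of the same logic; the helper name nearlyEqual is illustrative and not part of this commit:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirrors the check above: two values are "equal" when their difference,
// scaled by the larger magnitude, stays within tol.
static bool nearlyEqual(double a, double b, double tol = 1e-6) {
    if (a == b) // covers the both-zero case, avoiding 0/0 below
        return true;
    return std::fabs(a - b) / std::max(std::fabs(a), std::fabs(b)) <= tol;
}

int main() {
    std::printf("%d\n", nearlyEqual(1.0, 1.0000001));   // 1: off by ~1e-7
    std::printf("%d\n", nearlyEqual(1.0, 1.001));       // 0: off by ~1e-3
    std::printf("%d\n", nearlyEqual(1.0, 1.001, 1e-2)); // 1: looser tolerance
}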


@@ -103,7 +103,8 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
#define TEST_EQUAL(N) \
if (dtype == DataType(N)) \
return equalDataImpl(getRawDataPtr<DT<N>::t *>(), \
-rhs->getRawDataPtr<DT<N>::t *>(), size());
+rhs->getRawDataPtr<DT<N>::t *>(), size(), \
+relativeError);
TEST_EQUAL(0) // fmt: new line
else TEST_EQUAL(1) //


@ -60,6 +60,50 @@ template <typename T> class NaiveSqrt : public NativeUnary<T> {
T doCompute(T val) const override { return std::sqrt(val); }
};
template <typename T> class NaiveCos : public NativeUnary<T> {
T doCompute(T val) const override { return std::cos(val); }
};
template <typename T> class NaiveSin : public NativeUnary<T> {
T doCompute(T val) const override { return std::sin(val); }
};
template <typename T> class NaiveTan : public NativeUnary<T> {
T doCompute(T val) const override { return std::tan(val); }
};
template <typename T> class NaiveSinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::sinh(val); }
};
template <typename T> class NaiveCosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::cosh(val); }
};
template <typename T> class NaiveErf : public NativeUnary<T> {
T doCompute(T val) const override { return std::erf(val); }
};
template <typename T> class NaiveACos : public NativeUnary<T> {
T doCompute(T val) const override { return std::acos(val); }
};
template <typename T> class NaiveACosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::acosh(val); }
};
template <typename T> class NaiveASin : public NativeUnary<T> {
T doCompute(T val) const override { return std::asin(val); }
};
template <typename T> class NaiveASinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::asinh(val); }
};
template <typename T> class NaiveATanh : public NativeUnary<T> {
T doCompute(T val) const override { return std::atanh(val); }
};
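All of the one-liners above plug into NativeUnary<T>, whose definition predates this commit and is not part of the hunk. As an assumption inferred from the Log kernel below, a minimal version of that base would just loop the scalar op over the tensor:

// Hedged sketch of NativeUnary<T>; the repo's actual class may differ.
template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
    // Each subclass supplies the scalar op; the base runs the element loop.
    virtual T doCompute(T val) const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
        auto op = as<UnaryObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
        auto len = op->getOutput()->size();
        for (size_t offset = 0; offset < len; offset++)
            outptr[offset] = doCompute(inptr[offset]);
    }
};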
template <typename T> class Clip : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
@@ -79,6 +123,43 @@ template <typename T> class Clip : public CpuKernelWithoutConfig {
}
};
template <typename T> class Log : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto op = as<LogObj>(_op);
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
T *outptr = op->getOutput()->getRawDataPtr<T *>();
auto logType = op->getType(); // get log type
auto len = op->getOutput()->size();
for (size_t offset = 0; offset < len; offset++) {
T res;
auto val = *inptr++;
switch (logType) {
case LogObj::LogE:
res = std::log(val);
*outptr++ = res;
break;
case LogObj::Log2:
res = std::log2(val);
*outptr++ = res;
break;
case LogObj::Log10:
res = std::log10(val);
*outptr++ = res;
break;
default:
printf("LogType not Defined");
break;
}
}
}
};
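A quick worked check of the three branches: for an input of 2.0 the kernel produces ln 2 ≈ 0.6931 (LogE), log2 2 = 1.0 (Log2), and log10 2 ≈ 0.3010 (Log10), which is exactly what the ValGenerator<2> cases in the new tests below exercise.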
template <typename T> class NaiveATan : public NativeUnary<T> {
T doCompute(T val) const override { return std::atan(val); }
};
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
@@ -103,4 +184,30 @@ REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
"Clip_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atan, DataType::Float32, NaiveATan<float>,
"Atan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Log, DataType::Float32, Log<float>,
"Log_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cos, DataType::Float32, NaiveCos<float>,
"Cos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sin, DataType::Float32, NaiveSin<float>,
"Sin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Tan, DataType::Float32, NaiveTan<float>,
"Tan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sinh, DataType::Float32, NaiveSinh<float>,
"Sinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cosh, DataType::Float32, NaiveCosh<float>,
"Cosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Erf, DataType::Float32, NaiveErf<float>,
"Erf_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acos, DataType::Float32, NaiveACos<float>,
"ACos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acosh, DataType::Float32,
NaiveACosh<float>, "ACosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asin, DataType::Float32, NaiveASin<float>,
"ASin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asinh, DataType::Float32,
NaiveASinh<float>, "ASinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atanh, DataType::Float32,
NaiveATanh<float>, "ATanh_CPU_float32");
}; // namespace infini
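REGISTER_KERNEL is defined elsewhere in the repo. Purely as an illustration of the usual shape of such macros, and not the project's actual implementation, a registration line typically expands to a static object that files the kernel under a (device, op type, data type) key:

#include <map>
#include <string>
#include <tuple>

// Hypothetical registry sketch; key and record types are assumptions.
using KernelKey = std::tuple<int /*device*/, int /*opType*/, int /*dtype*/>;

static std::map<KernelKey, std::string> &kernelRegistry() {
    static std::map<KernelKey, std::string> registry; // lazy init
    return registry;
}

struct KernelRegistrar {
    KernelRegistrar(int dev, int op, int dtype, const char *name) {
        kernelRegistry()[{dev, op, dtype}] = name; // runs before main()
    }
};

// REGISTER_KERNEL(Device::CPU, OpType::Cos, DataType::Float32,
// NaiveCos<float>, "Cos_CPU_float32") would then boil down to roughly:
// static KernelRegistrar regCos(kCPU, kCos, kFloat32, "Cos_CPU_float32");
// (a real registry would also store a factory for the kernel class)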


@@ -226,6 +226,268 @@ class ReciprocalXdnn : public XPUKernelWithoutConfig {
}
};
class AbsXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::abs<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ATanXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<UnaryObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arctan<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class LogXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<LogObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto aDim = op->getInputs(0)->getDims();
std::vector<int> divDim = {
1,
};
auto len = op->getInputs(0)->size();
// get ptr of tempspace
XPUPtr temp = context->getWorkspace(len * sizeof(float));
LogObj::LogType type = op->getType();
// get output of xpu::api::loge(x)
auto ret = baidu::xpu::api::log<float>(
context->XPUHandle(), (float *)aData, (float *)temp, len);
// get ptr of divider
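// (dd lands len floats past the workspace base, i.e. just after the
// temp buffer above; this offset is the "pointer jump" the commit fixes)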
XPUPtr dd =
(float *)(context->getWorkspace((1 + len) * sizeof(float))) + len;
// choose from logE, log2, log10
switch (type) {
float constant;
case LogObj::LogE:
// if use loge, copy from temp to cData
ret = baidu::xpu::api::copy<float>(
context->XPUHandle(), (float *)temp, (float *)cData, len);
break;
case LogObj::Log2:
constant = std::log(2);
context->copyBlobFromCPU(dd, &constant, sizeof(float));
ret = baidu::xpu::api::broadcast_div<float>(
context->XPUHandle(), (float *)temp, (float *)dd,
(float *)cData, aDim, divDim);
break;
case LogObj::Log10:
constant = std::log(10);
context->copyBlobFromCPU(dd, &constant, sizeof(float));
ret = baidu::xpu::api::broadcast_div<float>(
context->XPUHandle(), (float *)temp, (float *)dd,
(float *)cData, aDim, divDim);
break;
default:
printf("LogType not support!");
break;
}
assert(ret == 0);
return;
}
};
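The Log2 and Log10 branches lean on the change-of-base identity,

\log_b x = \frac{\ln x}{\ln b} \quad\Longrightarrow\quad \log_2 x = \frac{\ln x}{\ln 2}, \qquad \log_{10} x = \frac{\ln x}{\ln 10}

which is why temp holds the natural log of the input and dd a single broadcast divisor (ln 2 or ln 10).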
class CosXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CosObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::cos<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SinXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SinObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sin<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class TanXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TanObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::tan<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class SinhXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<SinHObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sinh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class CoshXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CosHObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::cosh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ErfXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ErfObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::erf<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ACosXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ACosObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arccos<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ACoshXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ACosHObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::acosh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ASinXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ASinObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::arcsin<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ASinhXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ASinHObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::asinh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class ATanhXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ATanHObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::atanh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
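The thirteen classes from AbsXdnn through ATanhXdnn differ only in the operator type they cast to and the xdnn entry point they call. As an illustrative condensation only (the commit itself spells the classes out), a macro could stamp them out:

// Hypothetical helper, not part of this commit; it reuses the same
// includes and types as the classes above.
#define DEFINE_UNARY_XDNN(KERNEL, OBJ, API)                                   \
    class KERNEL : public XPUKernelWithoutConfig {                           \
        void compute(const Operator &_op,                                    \
                     const RuntimeObj *_context) const override {            \
            auto op = as<OBJ>(_op);                                          \
            auto context = dynamic_cast<const XPURuntimeObj *>(_context);    \
            void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
            void *const cData = (op->getOutput()->getRawDataPtr<void *>());  \
            auto len = op->getInputs(0)->size();                             \
            auto ret = baidu::xpu::api::API<float>(                          \
                context->XPUHandle(), (float *)aData, (float *)cData, len);  \
            assert(ret == 0);                                                \
        }                                                                    \
    };

// e.g. DEFINE_UNARY_XDNN(SinXdnn, SinObj, sin) reproduces the SinXdnn class.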
REGISTER_KERNEL(Device::XPU, OpType::Relu, DataType::Float32, ReluXdnn,
"Relu_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Sigmoid, DataType::Float32, SigmoidXdnn,
@@ -257,4 +519,32 @@ REGISTER_KERNEL(Device::XPU, OpType::Flatten, DataType::Float32, CopyXdnn,
"Flatten_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Identity, DataType::Float32, CopyXdnn,
"Identity_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Abs, DataType::Float32, AbsXdnn,
"Abs_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Atan, DataType::Float32, ATanXdnn,
"Atan_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Log, DataType::Float32, LogXdnn,
"Log_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Cos, DataType::Float32, CosXdnn,
"Cos_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Sin, DataType::Float32, SinXdnn,
"Sin_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Tan, DataType::Float32, TanXdnn,
"Tan_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Sinh, DataType::Float32, SinhXdnn,
"Sinh_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Cosh, DataType::Float32, CoshXdnn,
"Cosh_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Erf, DataType::Float32, ErfXdnn,
"Erf_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Acos, DataType::Float32, ACosXdnn,
"ACos_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Acosh, DataType::Float32, ACoshXdnn,
"ACosh_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Asin, DataType::Float32, ASinXdnn,
"ASin_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Asinh, DataType::Float32, ASinhXdnn,
"ASinh_xdnn_Float3 2");
REGISTER_KERNEL(Device::XPU, OpType::Atanh, DataType::Float32, ATanhXdnn,
"ATanh_xdnn_Float32");
}; // namespace infini


@@ -36,7 +36,7 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
-EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
+EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu, 1e-6));
}
void testClip(const std::function<void(void *, size_t, DataType)> &generator,
@@ -103,10 +103,88 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
template <LogObj::LogType T>
void testLog(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// XPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<LogObj>(inputGpu, nullptr, T);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<LogObj>(inputCpu, nullptr, T);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}
template <class T>
void testTrigon(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
// XPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<T>(inputGpu, nullptr);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check
EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu, 1e-3));
}
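Presumably the trig comparisons use the looser 1e-3 bound because the XPU's transcendental approximations can drift further from the CPU libm results than the simpler element-wise ops do.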
TEST(xdnn_Unary, run) {
testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<AbsObj>(ValGenerator<-1>(), Shape{1, 2, 2, 3});
testUnary<ATanObj>(OneGenerator(), Shape{1, 2, 2, 3});
testLog<LogObj::Log10>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testLog<LogObj::Log2>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testLog<LogObj::LogE>(ValGenerator<2>(), Shape{1, 2, 2, 3});
testTrigon<CosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<SinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<TanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<SinHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<CosHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testUnary<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ACosObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ACosHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ASinObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ASinHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
testTrigon<ATanHObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini