Add XPU operators (#120)

* add floordiv for xpu

* add batchnorm for xpu

* add more cast types for xpu

* add conv_trans for xpu

* add pad for xpu

* add logical ops for xpu

* fix format for xpu src and include

* fix format for xpu test

* fix format for xpu src

---------

Co-authored-by: Bolun <bolunz@u.nus.edu>
Bolun Zhang 2023-08-24 16:47:50 +08:00 committed by GitHub
parent d18d40a2e9
commit a4c6214529
29 changed files with 952 additions and 266 deletions
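
Every kernel added in this commit follows the same structure: subclass XPUKernelWithoutConfig, fetch raw tensor pointers from the operator, forward to a single baidu::xpu::api (xdnn) call through the runtime's handle, check the return code, and register the class with REGISTER_KERNEL. A minimal sketch of that pattern, condensed from the Relu kernel in the unary diff further down (the as<UnaryObj> cast is an assumption; that line sits above the visible hunk):

#include "operators/unary.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"

namespace infini {
class ReluXdnn : public XPUKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op); // assumed; not shown in the hunk below
        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        auto len = op->getInputs(0)->size();
        // One xdnn call on the runtime's context handle; 0 means success.
        auto ret = baidu::xpu::api::relu<float>(
            context->XPUHandle(), (float *)aData, (float *)cData, len);
        assert(ret == 0);
    }
};
REGISTER_KERNEL(Device::XPU, OpType::Relu, DataType::Float32, ReluXdnn,
                "Relu_xdnn_XPU_Float32");
} // namespace infini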


@ -1,7 +1,7 @@
#pragma once
#include "core/common.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
#include "core/common.h"
#define checkXPUError(call) \
{ \


@ -1,6 +1,6 @@
#pragma once
#include "xpu/xpu_runtime.h"
#include "core/kernel.h"
#include "xpu/xpu_runtime.h"
namespace infini {


@ -1,28 +1,28 @@
#pragma once
#include "xpu/xpu_common.h"
#include "core/runtime.h"
#include "xpu/xpu_common.h"
namespace infini {
class XPURuntimeObj : public RuntimeObj {
private:
baidu::xpu::api::Context* xdnn;
baidu::xpu::api::Context *xdnn;
XPUPtr workspace;
size_t workspaceSize;
public:
XPURuntimeObj() : RuntimeObj(Device::XPU) {
xdnn = baidu::xpu::api::create_context();
xdnn = baidu::xpu::api::create_context();
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 3ll << 30; // 3 GB
//std::cout<<workspaceSize/1024/1024/1024<< std::endl;
//std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
// std::cout<<workspaceSize/1024/1024/1024<< std::endl;
// std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
workspace = alloc(workspaceSize);
}
virtual ~XPURuntimeObj() {
dealloc(workspace);
baidu::xpu::api::destroy_context(xdnn);
baidu::xpu::api::destroy_context(xdnn);
}
string toString() const override;
@ -33,11 +33,12 @@ class XPURuntimeObj : public RuntimeObj {
void sync() const;
XPUPtr alloc(size_t size) override {
void *ptr;
checkXPUError(xpu_malloc_ex((void**)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
checkXPUError(
xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
return ptr;
}
void dealloc(void *ptr) override { xpu_free(ptr); }
baidu::xpu::api::Context* XPUHandle() const { return xdnn; }
baidu::xpu::api::Context *XPUHandle() const { return xdnn; }
XPUPtr getWorkspace(size_t size) const {
IT_ASSERT(size <= workspaceSize);
return workspace;
@ -45,17 +46,20 @@ class XPURuntimeObj : public RuntimeObj {
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
}
void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
}
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override {
xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
xpu_memcpy(dst, const_cast<void *>(src), bytes,
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
}
private:
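
The interface above is enough to move buffers on and off the device by hand; a minimal usage sketch (the buffer size and contents are illustrative, not taken from this commit):

auto xpu = make_ref<XPURuntimeObj>();
std::vector<float> host(1024, 1.0f);
size_t bytes = host.size() * sizeof(float);
XPUPtr dev = xpu->alloc(bytes);                // xpu_malloc_ex under the hood
xpu->copyBlobFromCPU(dev, host.data(), bytes); // XPU_HOST_TO_DEVICE
// ... run kernels through xpu->XPUHandle() ...
xpu->copyBlobToCPU(host.data(), dev, bytes);   // XPU_DEVICE_TO_HOST
xpu->dealloc(dev);                             // xpu_free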


@ -0,0 +1,41 @@
#include "operators/batch_norm.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
namespace infini {
class BatchNormXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<BatchNormObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
void *const mean = (op->getInputs(1)->getRawDataPtr<void *>());
void *const var = (op->getInputs(2)->getRawDataPtr<void *>());
void *const scale = (op->getInputs(3)->getRawDataPtr<void *>());
void *const bias = (op->getInputs(4)->getRawDataPtr<void *>());
void *const output = (op->getOutput()->getRawDataPtr<void *>());
auto dims = op->getInputs(0)->getDims();
if (dims.size() != 4)
IT_TODO_HALT();
int w = dims[3];
int h = dims[2];
int c = dims[1];
int n = dims[0];
auto ret = baidu::xpu::api::batch_norm_infer<float>(
context->XPUHandle(), (float *)input, (float *)output, n, c, h, w,
op->getEps(), (float *)scale, (float *)bias, (float *)mean,
(float *)var, true);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::BatchNormalization, DataType::Float32,
BatchNormXdnn, "BatchNorm_xdnn_XPU_Float32");
}; // namespace infini
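
The xdnn call above applies y = (x - mean) / sqrt(var + eps) * scale + bias per channel. A standalone check of that arithmetic against the channel-0 values used in the XPU_BatchNorm test near the end of this commit (inputs 0..3, mean 1, var 4, scale 1, bias 0; eps is taken as 0, which is what the expected numbers imply):

#include <cassert>
#include <cmath>

int main() {
    float mean = 1.0f, var = 4.0f, eps = 0.0f, scale = 1.0f, bias = 0.0f;
    float x[4] = {0.0f, 1.0f, 2.0f, 3.0f};       // channel-0 inputs
    float expect[4] = {-0.5f, 0.0f, 0.5f, 1.0f}; // expected output in the test
    for (int i = 0; i < 4; ++i) {
        float y = (x[i] - mean) / std::sqrt(var + eps) * scale + bias;
        assert(std::fabs(y - expect[i]) < 1e-6f);
    }
    return 0;
}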


@ -12,23 +12,83 @@ class CastXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
CastType type = op->getType();
int ret = 0;
switch (type) {
case CastType::Float2Int32:
ret = baidu::xpu::api::cast<float,int>(context->XPUHandle(), (float*)aData, (int*)cData, len);
break;
case CastType::Int322Int8:
ret = baidu::xpu::api::cast<int,float>(context->XPUHandle(), (int*)aData, (float*)cData, len);
break;
default:
IT_TODO_HALT();
}
assert(ret == 0);
return;
CastType type = op->getType();
int ret = 0;
switch (type) {
case CastType::Float2Float16:
ret = baidu::xpu::api::cast<float, float16>(
context->XPUHandle(), (float *)aData, (float16 *)cData, len);
break;
case CastType::Float2Int64:
ret = baidu::xpu::api::cast<float, int64_t>(
context->XPUHandle(), (float *)aData, (int64_t *)cData, len);
break;
case CastType::Float2Int32:
ret = baidu::xpu::api::cast<float, int>(
context->XPUHandle(), (float *)aData, (int *)cData, len);
break;
case CastType::Float2Int16:
ret = baidu::xpu::api::cast<float, int16_t>(
context->XPUHandle(), (float *)aData, (int16_t *)cData, len);
break;
case CastType::Float2Int8:
ret = baidu::xpu::api::cast<float, int8_t>(
context->XPUHandle(), (float *)aData, (int8_t *)cData, len);
break;
case CastType::Int322Float:
ret = baidu::xpu::api::cast<int, float>(
context->XPUHandle(), (int *)aData, (float *)cData, len);
break;
case CastType::Int322Int8:
ret = baidu::xpu::api::cast<int, int8_t>(
context->XPUHandle(), (int *)aData, (int8_t *)cData, len);
break;
case CastType::Int322Int16:
ret = baidu::xpu::api::cast<int, int16_t>(
context->XPUHandle(), (int *)aData, (int16_t *)cData, len);
break;
case CastType::Int162Float:
ret = baidu::xpu::api::cast<int16_t, float>(
context->XPUHandle(), (int16_t *)aData, (float *)cData, len);
break;
case CastType::Int162Int32:
ret = baidu::xpu::api::cast<int16_t, int>(
context->XPUHandle(), (int16_t *)aData, (int *)cData, len);
break;
case CastType::Int82Float:
ret = baidu::xpu::api::cast<int8_t, float>(
context->XPUHandle(), (int8_t *)aData, (float *)cData, len);
break;
case CastType::Int82Int16:
ret = baidu::xpu::api::cast<int8_t, int16_t>(
context->XPUHandle(), (int8_t *)aData, (int16_t *)cData, len);
break;
case CastType::Int82Int32:
ret = baidu::xpu::api::cast<int8_t, int>(
context->XPUHandle(), (int8_t *)aData, (int *)cData, len);
break;
case CastType::Int322Int64:
ret = baidu::xpu::api::cast<int, int64_t>(
context->XPUHandle(), (int *)aData, (int64_t *)cData, len);
break;
case CastType::Int642Int32:
ret = baidu::xpu::api::cast<int64_t, int>(
context->XPUHandle(), (int64_t *)aData, (int *)cData, len);
break;
case CastType::Int642Float:
ret = baidu::xpu::api::cast<int64_t, float>(
context->XPUHandle(), (int64_t *)aData, (float *)cData, len);
break;
case CastType::Float162Float:
ret = baidu::xpu::api::cast<float16, float>(
context->XPUHandle(), (float16 *)aData, (float *)cData, len);
break;
default:
IT_TODO_HALT();
}
assert(ret == 0);
return;
}
};


@ -8,26 +8,27 @@ class ConcatXdnn : public XPUKernelWithoutConfig {
const RuntimeObj *_context) const override {
auto op = as<ConcatObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
int axis = op->getDim();
int num = op->numInputs();
std::vector<const float*> inputsData;
for (int i = 0; i < num; ++i) {
inputsData.push_back((float*)(op->getInputs(i)->getRawDataPtr<void *>()));
}
int axis = op->getDim();
int num = op->numInputs();
std::vector<const float *> inputsData;
for (int i = 0; i < num; ++i) {
inputsData.push_back(
(float *)(op->getInputs(i)->getRawDataPtr<void *>()));
}
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<std::vector<int>> dims;
for(int i = 0; i < num; ++i){
auto dim = op->getInputs(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
dims.push_back(dim);
}
auto ret = baidu::xpu::api::concat<float>(context->XPUHandle(), inputsData, (float*)cData, dims, axis);
assert(ret == 0);
return;
std::vector<std::vector<int>> dims;
for (int i = 0; i < num; ++i) {
auto dim = op->getInputs(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
dims.push_back(dim);
}
auto ret = baidu::xpu::api::concat<float>(
context->XPUHandle(), inputsData, (float *)cData, dims, axis);
assert(ret == 0);
return;
}
};


@ -9,25 +9,26 @@ class ConvXdnn : public XPUKernelWithoutConfig {
auto op = as<ConvObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<int> pads = {ph, pw};
std::vector<int> ksize = {r, s};
std::vector<int> stride = {sh, sw};
std::vector<int> dilation = {dh, dw};
auto ret = baidu::xpu::api::conv2d<float,float,float,float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData,
n,c,h,w,f,ksize,stride,pads,dilation,g,nullptr,nullptr,nullptr,true);
assert(ret == 0);
return;
std::vector<int> pads = {ph, pw};
std::vector<int> ksize = {r, s};
std::vector<int> stride = {sh, sw};
std::vector<int> dilation = {dh, dw};
auto ret = baidu::xpu::api::conv2d<float, float, float, float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
nullptr, nullptr, nullptr, true);
assert(ret == 0);
return;
}
};


@ -0,0 +1,54 @@
#include "operators/conv.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
namespace infini {
class ConvTransXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ConvBaseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
const bool isNCHW =
(op->getOpType() == OpType::ConvTransNHWC) ? false : true;
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
std::vector<int> pads = {ph, pw};
std::vector<int> ksize = {r, s};
std::vector<int> stride = {sh, sw};
std::vector<int> dilation = {dh, dw};
auto dimInputs0 = op->getInputs(0)->getDims();
auto dimInputs1 = op->getInputs(1)->getDims();
auto dimOutput = op->getOutput()->getDims();
if (dimInputs0.size() != 4)
IT_TODO_HALT();
if (dimInputs1.size() != 4)
IT_TODO_HALT();
if (dimOutput.size() != 4)
IT_TODO_HALT();
auto ret =
baidu::xpu::api::conv2d_transpose<float, float, float, float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
nullptr, nullptr, nullptr, isNCHW);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::ConvTranspose, DataType::Float32,
ConvTransXdnn, "ConvTrans_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::ConvTransNHWC, DataType::Float32,
ConvTransXdnn, "ConvTranposedNHWC_xdnn_XPU_Float32");
}; // namespace infini


@ -17,10 +17,11 @@ class AddXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_add<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_add<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -38,10 +39,11 @@ class SubXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_sub<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_sub<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -59,10 +61,11 @@ class MulXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_mul<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_mul<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -80,10 +83,11 @@ class DivXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_div<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_div<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -101,10 +105,11 @@ class PowXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_pow<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_pow<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -122,10 +127,11 @@ class MaxXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_max<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_max<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -143,10 +149,11 @@ class MinXdnn : public XPUKernelWithoutConfig {
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_min<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_min<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, aDim, bDim);
assert(ret == 0);
return;
}
};
@ -159,18 +166,20 @@ class EqualXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_equal<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -183,18 +192,20 @@ class GreaterEqualXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_greater_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_greater_equal<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -207,18 +218,20 @@ class GreaterThanXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_greater_than<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_greater_than<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -231,18 +244,20 @@ class LessEqualXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_less_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_less_equal<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -255,18 +270,170 @@ class LessThanXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_less_than<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::broadcast_less_than<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(bool *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class FloorDivXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::broadcast_floordiv<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)wsData, aDim, bDim);
ret = baidu::xpu::api::cast<int, float>(
context->XPUHandle(), (int *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class MSELossXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<MSELossObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
auto dim = op->getInputs(0)->getDims();
if (dim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::mse_loss<float>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, len);
assert(ret == 0);
return;
}
};
class AndXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_and<bool>(
context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
len);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class OrXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_or<bool>(
context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
len);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class XorXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
auto bDim = op->getInputs(1)->getDims();
if (aDim.size() != 4 || bDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_xor<bool>(
context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
len);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
class NotXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ElementWiseObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
size_t len = op->getOutput()->size();
XPUPtr wsData = context->getWorkspace(len);
auto aDim = op->getInputs(0)->getDims();
if (aDim.size() != 4)
IT_TODO_HALT();
auto ret = baidu::xpu::api::logical_not<bool>(
context->XPUHandle(), (bool *)aData, (bool *)wsData, len);
ret = baidu::xpu::api::cast<bool, float>(
context->XPUHandle(), (bool *)wsData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -286,12 +453,24 @@ REGISTER_KERNEL(Device::XPU, OpType::Min, DataType::Float32, MinXdnn,
"Min_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Equal, DataType::Float32, EqualXdnn,
"Equal_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::GreaterOrEqual, DataType::Float32, GreaterEqualXdnn,
"GreaterEqual_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Greater, DataType::Float32, GreaterThanXdnn,
"GreaterThan_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::LessOrEqual, DataType::Float32, LessEqualXdnn,
"LessEqual_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::GreaterOrEqual, DataType::Float32,
GreaterEqualXdnn, "GreaterEqual_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Greater, DataType::Float32,
GreaterThanXdnn, "GreaterThan_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::LessOrEqual, DataType::Float32,
LessEqualXdnn, "LessEqual_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Less, DataType::Float32, LessThanXdnn,
"LessThan_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::FloorDiv, DataType::Float32, FloorDivXdnn,
"FloorDiv_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::MSELoss, DataType::Float32, MSELossXdnn,
"MSELoss_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::And, DataType::Float32, AndXdnn,
"And_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Or, DataType::Float32, OrXdnn,
"Or_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Xor, DataType::Float32, XorXdnn,
"Xor_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Not, DataType::Float32, NotXdnn,
"Not_xdnn_XPU_Float32");
}; // namespace infini


@ -8,25 +8,28 @@ class MatmulXdnn : public XPUKernelWithoutConfig {
const RuntimeObj *_context) const override {
auto op = as<MatmulObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
bool transA = op->getTransA();
bool transA = op->getTransA();
bool transB = op->getTransB();
if(op->getInputs(0)->getDims().size() != 2 || op->getInputs(1)->getDims().size() != 2) {
IT_TODO_HALT();
}
if (op->getInputs(0)->getDims().size() != 2 ||
op->getInputs(1)->getDims().size() != 2) {
IT_TODO_HALT();
}
auto m = transA ? op->getInputs(0)->getDims()[1] : op->getInputs(0)->getDims()[0];
auto n = transB ? op->getInputs(1)->getDims()[0] : op->getInputs(1)->getDims()[1];
auto k = transA ? op->getInputs(0)->getDims()[0] : op->getInputs(0)->getDims()[1];
auto ret = baidu::xpu::api::fc<float,float,float,int>(context->XPUHandle(),
(float*)aData, (float*)bData, (float*)cData,
m,n,k, transA, transB, nullptr, nullptr, nullptr);
assert(ret == 0);
return;
auto m = transA ? op->getInputs(0)->getDims()[1]
: op->getInputs(0)->getDims()[0];
auto n = transB ? op->getInputs(1)->getDims()[0]
: op->getInputs(1)->getDims()[1];
auto k = transA ? op->getInputs(0)->getDims()[0]
: op->getInputs(0)->getDims()[1];
auto ret = baidu::xpu::api::fc<float, float, float, int>(
context->XPUHandle(), (float *)aData, (float *)bData,
(float *)cData, m, n, k, transA, transB, nullptr, nullptr, nullptr);
assert(ret == 0);
return;
}
};
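
For reference, the m/n/k chosen above describe op(A) as m x k and op(B) as k x n for the fc call; a tiny standalone illustration of the same selection logic with made-up shapes:

#include <array>
#include <cstdio>

int main() {
    // Example only: A stored as [5, 3] with transA = true, B stored as [5, 7].
    std::array<int, 2> aDims{5, 3}, bDims{5, 7};
    bool transA = true, transB = false;
    int m = transA ? aDims[1] : aDims[0];
    int n = transB ? bDims[0] : bDims[1];
    int k = transA ? aDims[0] : aDims[1];
    std::printf("m=%d n=%d k=%d\n", m, n, k); // prints m=3 n=7 k=5
    return 0;
}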

src/kernels/xpu/pad.cc (new file)

@ -0,0 +1,37 @@
#include "operators/pad.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
namespace infini {
class PadXdnn : public XPUKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<PadObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto dim = op->getInputs(0)->getDims();
int dim_size = dim.size();
std::vector<int> pads = op->getPads();
std::cout << std::endl;
std::vector<int> paddings_left(pads.begin(), pads.begin() + dim_size);
std::vector<int> paddings_right(pads.begin() + dim_size, pads.end());
float paddingValue = 0.0;
auto ret = baidu::xpu::api::pad<float>(
context->XPUHandle(), (float *)aData, (float *)cData, dim,
paddings_left, paddings_right, paddingValue);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::Pad, DataType::Float32, PadXdnn,
"Pad_xdnn_XPU_Float32");
}; // namespace infini


@ -14,15 +14,15 @@ class AvgPooling : public XPUKernelWithoutConfig {
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
auto ret = baidu::xpu::api::avg_pool2d<float>(context->XPUHandle(), (float*)aData, (float*)cData,
n,c,h,w,ksize,stride,pad,true,true,nullptr,nullptr);
auto ret = baidu::xpu::api::avg_pool2d<float>(
context->XPUHandle(), (float *)aData, (float *)cData, n, c, h, w,
ksize, stride, pad, true, true, nullptr, nullptr);
assert(ret == 0);
return;
}
};
@ -37,24 +37,24 @@ class MaxPooling : public XPUKernelWithoutConfig {
auto [n, c, h, w, kh, kw] = op->getNCHWRS();
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
std::vector<int> ksize = {kh, kw};
std::vector<int> stride = {sh, sw};
std::vector<int> pad = {ph, pw};
int yh = (h + ph*2 -kh) /sh + 1;
int yw = (w + pw*2 -kw) /sw + 1;
int yh = (h + ph * 2 - kh) / sh + 1;
int yw = (w + pw * 2 - kw) / sw + 1;
XPUPtr indices = context->getWorkspace(yh * yw * 4);
XPUPtr indices = context->getWorkspace(yh * yw * 4);
auto ret = baidu::xpu::api::max_pool2d<float>(context->XPUHandle(), (float*)aData, (float*)cData,
(int*)indices, n,c,h,w,ksize,stride,pad,true,nullptr,nullptr,false);
auto ret = baidu::xpu::api::max_pool2d<float>(
context->XPUHandle(), (float *)aData, (float *)cData,
(int *)indices, n, c, h, w, ksize, stride, pad, true, nullptr,
nullptr, false);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::MaxPool, DataType::Float32, MaxPooling,
"MaxPool_xdnn_Float32");
REGISTER_KERNEL(Device::XPU, OpType::AveragePool, DataType::Float32, AvgPooling,


@ -8,29 +8,31 @@ class SplitXdnn : public XPUKernelWithoutConfig {
const RuntimeObj *_context) const override {
auto op = as<SplitObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
int axis = op->getDim();
int num = op->numOutputs();
int axis = op->getDim();
int num = op->numOutputs();
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
auto inputDim = op->getInputs(0)->getDims();
auto inputDim = op->getInputs(0)->getDims();
std::vector<float*> outputsData;
for (int i = 0; i < num; ++i) {
outputsData.push_back((float*)(op->getOutput(i)->getRawDataPtr<void*>()));
}
std::vector<float *> outputsData;
for (int i = 0; i < num; ++i) {
outputsData.push_back(
(float *)(op->getOutput(i)->getRawDataPtr<void *>()));
}
std::vector<int> splitList;
for(int i = 0; i < num; ++i){
auto dim = op->getOutput(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
splitList.push_back(dim[axis]);
}
auto ret = baidu::xpu::api::split<float>(context->XPUHandle(), (float*)inputData, outputsData, inputDim, splitList, axis);
assert(ret == 0);
return;
std::vector<int> splitList;
for (int i = 0; i < num; ++i) {
auto dim = op->getOutput(i)->getDims();
if (dim.size() != 4) {
IT_TODO_HALT();
}
splitList.push_back(dim[axis]);
}
auto ret = baidu::xpu::api::split<float>(
context->XPUHandle(), (float *)inputData, outputsData, inputDim,
splitList, axis);
assert(ret == 0);
return;
}
};


@ -9,23 +9,24 @@ class TransposeXdnn : public XPUKernelWithoutConfig {
auto op = as<TransposeObj>(_op);
auto context = dynamic_cast<const XPURuntimeObj *>(_context);
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto dimin = op->getInputs(0)->getDims();
auto permute = op->getPermute();
auto dimin = op->getInputs(0)->getDims();
auto permute = op->getPermute();
if ( dimin.size() != 4 ) {
if (dimin.size() != 4) {
IT_TODO_HALT();
}
auto ret = baidu::xpu::api::transpose<float>(context->XPUHandle(), (float*)aData, (float*)cData, dimin, permute);
assert(ret == 0);
return;
}
auto ret = baidu::xpu::api::transpose<float>(
context->XPUHandle(), (float *)aData, (float *)cData, dimin,
permute);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::Transpose, DataType::Float32, TransposeXdnn,
"Transpose_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Transpose, DataType::Float32,
TransposeXdnn, "Transpose_xdnn_XPU_Float32");
}; // namespace infini


@ -13,10 +13,10 @@ class ReluXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::relu<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::relu<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -30,10 +30,10 @@ class SigmoidXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sigmoid<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::sigmoid<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -47,10 +47,10 @@ class TanhXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::tanh<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::tanh<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -64,10 +64,10 @@ class SquareXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::square<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::square<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -81,10 +81,10 @@ class SqrtXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::sqrt<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::sqrt<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -98,10 +98,10 @@ class RsqrtXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::rsqrt<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::rsqrt<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -115,10 +115,10 @@ class ExpXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::exp<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::exp<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -132,10 +132,10 @@ class CeilXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::ceil<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::ceil<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -148,13 +148,14 @@ class ClipXdnn : public XPUKernelWithoutConfig {
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
float min = op->getMin().value();
float max = op->getMax().value();
auto ret = baidu::xpu::api::clip<float>(context->XPUHandle(), (float*)aData, (float*)cData, len, min, max);
assert(ret == 0);
return;
float min = op->getMin().value();
float max = op->getMax().value();
auto ret =
baidu::xpu::api::clip<float>(context->XPUHandle(), (float *)aData,
(float *)cData, len, min, max);
assert(ret == 0);
return;
}
};
@ -168,10 +169,10 @@ class FloorXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::floor<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::floor<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -185,10 +186,10 @@ class NegXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::neg<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::neg<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -201,10 +202,10 @@ class CopyXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::copy<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::copy<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
@ -218,14 +219,13 @@ class ReciprocalXdnn : public XPUKernelWithoutConfig {
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
auto len = op->getInputs(0)->size();
auto ret = baidu::xpu::api::reciprocal<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
assert(ret == 0);
return;
auto ret = baidu::xpu::api::reciprocal<float>(
context->XPUHandle(), (float *)aData, (float *)cData, len);
assert(ret == 0);
return;
}
};
REGISTER_KERNEL(Device::XPU, OpType::Relu, DataType::Float32, ReluXdnn,
"Relu_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Sigmoid, DataType::Float32, SigmoidXdnn,
@ -248,8 +248,8 @@ REGISTER_KERNEL(Device::XPU, OpType::Floor, DataType::Float32, FloorXdnn,
"Floor_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Neg, DataType::Float32, NegXdnn,
"Neg_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Reciprocal, DataType::Float32, ReciprocalXdnn,
"Reciprocal_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Reciprocal, DataType::Float32,
ReciprocalXdnn, "Reciprocal_xdnn_XPU_Float32");
REGISTER_KERNEL(Device::XPU, OpType::Reshape, DataType::Float32, CopyXdnn,
"Reshape_xdnn_Float32");


@ -1,19 +1,18 @@
#include "xpu/operator_timer.h"
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "utils/data_generator.h"
#include "xpu/xpu_runtime.h"
namespace infini {
namespace opTimer {
double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s, int padh,
int padw, int strideh, int stridew, int dilationh,
int dilationw, int group, const char *name) {
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<XPURuntimeObj>();
@ -31,8 +30,8 @@ double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s,
Tensor i0XPU = gXpu->cloneTensor(i0Cpu);
Tensor w0XPU = gXpu->cloneTensor(w0Cpu);
// Build Xpu graph
auto conv = gXpu->addOp<ConvObj>(i0XPU, w0XPU, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
auto conv = gXpu->addOp<ConvObj>(i0XPU, w0XPU, nullptr, padh, padw, strideh,
stridew, dilationh, dilationw);
// allocate Xpu memory
gXpu->dataMalloc();
// Execute on Xpu


@ -5,7 +5,7 @@
namespace infini {
void XPURuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
bool profiling = false) const {
bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
double totalTime = 0;


@ -1,17 +1,16 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "xpu/xpu_runtime.h"
#include "test.h"
namespace infini {
template <class T>
void testAdd(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
void testAdd(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();


@ -0,0 +1,61 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/batch_norm.h"
#include "test.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
namespace infini {
TEST(XPU_BatchNorm, run) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
// Build cpu graph
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
// Build input data on CPU
gCpu->dataMalloc();
iCpu->setData(IncrementalGenerator());
meanCpu->copyin(vector<float>{1, 6, 9});
varCpu->copyin(vector<float>{4, 1, 9});
scaleCpu->setData(OneGenerator());
biasCpu->setData(ZeroGenerator());
// Build XPU graph
Graph g = make_ref<GraphObj>(xpuRuntime);
auto i = g->cloneTensor(iCpu);
auto mean = g->cloneTensor(meanCpu);
auto var = g->cloneTensor(varCpu);
auto scale = g->cloneTensor(scaleCpu);
auto bias = g->cloneTensor(biasCpu);
auto op =
g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
// allocate XPU memory
g->dataMalloc();
i->setData(IncrementalGenerator());
mean->copyin(vector<float>{1, 6, 9});
var->copyin(vector<float>{4, 1, 9});
scale->setData(OneGenerator());
bias->setData(ZeroGenerator());
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto o = op->getOutput();
auto ocpu = o->clone(cpuRuntime);
// check results on CPU
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));
EXPECT_TRUE(ocpu->equalData(vector<float>{
-0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.3333333, 0.6666667}));
}
} // namespace infini


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/concat.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -0,0 +1,136 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
#include "test.h"
namespace infini {
void testConvTransposedXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
const int stride = 1, padding = 0, dilation = 1;
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<XPURuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({N, F, H, H}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({F, C, R, S}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, padding,
padding, stride, stride,
dilation, dilation);
gXpu->dataMalloc();
i0Xpu->setData(generator);
w0Xpu->setData(generator);
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
}
void testConvTransposedNHWCXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
const int stride = 1, padding = 0, dilation = 1;
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<XPURuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({N, H, W, F}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({F, R, S, C}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dNHWCObj>(
i0Xpu, w0Xpu, nullptr, padding, padding, stride, stride, dilation,
dilation);
gXpu->dataMalloc();
i0Xpu->setData(generator);
w0Xpu->setData(generator);
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
}
TEST(XPU_ConvTransposed, run) {
testConvTransposedXdnn(IncrementalGenerator(),
vector<float>{0., 0., 1., 2., 3., 0., 6.,
12., 18., 16., 8., 30., 36., 42.,
32., 16., 54., 60., 66., 48., 24.,
62., 67., 72., 45.});
}
TEST(XPU_ConvTransposedNHWC, run) {
testConvTransposedNHWCXdnn(IncrementalGenerator(),
vector<float>{0., 0., 1., 2., 3., 0., 6.,
12., 18., 16., 8., 30., 36., 42.,
32., 16., 54., 60., 66., 48., 24.,
62., 67., 72., 45.});
}
TEST(XPU_ConvTransposed, run1) {
// Construct Runtime and graph for CPU and XPU
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime xpu = make_ref<XPURuntimeObj>();
Graph gXpu = make_ref<GraphObj>(xpu);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 2, 3, 3}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 2, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to XPU
Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
// Build XPU graph
auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, 0, 0);
gXpu->dataMalloc();
i0Xpu->setData(IncrementalGenerator());
w0Xpu->setData(IncrementalGenerator());
// Execute on XPU
xpu->run(gXpu);
// copy output from XPU to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(vector<float>{
162, 351, 569, 413, 224, 405, 876, 1417, 1024, 553,
747, 1611, 2598, 1869, 1005, 639, 1368, 2191, 1564, 835,
396, 843, 1343, 953, 506, 243, 531, 866, 629, 341,
621, 1344, 2173, 1564, 841, 1152, 2475, 3975, 2841, 1518,
963, 2052, 3271, 2320, 1231, 585, 1239, 1964, 1385, 731}));
}
} // namespace infini


@ -0,0 +1,66 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "xpu/xpu_runtime.h"
#include "test.h"
namespace infini {
using ExpectOutput = vector<float>;
template <class T>
void testElementWiseXdnn(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape, const ExpectOutput &ansVec) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
// Build input data on CPU
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
acpu->dataMalloc();
acpu->setData(generator);
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
bcpu->dataMalloc();
bcpu->setData(generator);
// Build XPU graph
Graph g = make_ref<GraphObj>(xpuRuntime);
auto a = g->cloneTensor(acpu);
auto b = g->cloneTensor(bcpu);
auto op = g->addOp<T>(a, b, nullptr);
// allocate XPU memory
g->dataMalloc();
a->setData(generator);
b->setData(generator);
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto c = op->getOutput();
auto ccpu = c->clone(cpuRuntime);
// check results on CPU
EXPECT_TRUE(ccpu->equalData(ansVec));
}
TEST(xdnn_ElementWise, run) {
testElementWiseXdnn<AddObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
testElementWiseXdnn<SubObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
testElementWiseXdnn<MulObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
testElementWiseXdnn<DivObj>(
OneGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
testElementWiseXdnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
ExpectOutput{1, 1, 4, 27});
}
} // namespace infini


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -0,0 +1,40 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/pad.h"
#include "test.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"
namespace infini {
TEST(xpu_Pad, run) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
// Build XPU graph;
Graph g = make_ref<GraphObj>(xpuRuntime);
auto i = g->cloneTensor(icpu);
auto op = g->addOp<PadObj>(i, nullptr, vector<int>{1, 0, 1, 1},
vector<int>{0, 3});
// allocate XPU memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on XPU
xpuRuntime->run(g);
// clone XPU output to CPU
auto o = op->getOutput();
auto cpuo = o->clone(cpuRuntime);
cpuo->printData();
// check results on CPU
EXPECT_TRUE(cpuo->equalData(
vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
}
} // namespace infini


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/split.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/transpose.h"
#include "xpu/xpu_runtime.h"
#include "test.h"


@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "xpu/xpu_runtime.h"
#include "test.h"
@ -40,7 +40,7 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
}
void testClip(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
@ -72,7 +72,7 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
}
void testCast(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto xpuRuntime = make_ref<XPURuntimeObj>();
@ -83,7 +83,8 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
// GPU
Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
auto inputGpu = xpuGraph->cloneTensor(inputCpu);
auto gpuOp = xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
auto gpuOp =
xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
xpuGraph->dataMalloc();
inputGpu->setData(generator);
xpuRuntime->run(xpuGraph);
@ -91,7 +92,8 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
auto cpuOp =
cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);