forked from jiuyuan/InfiniTensor
Add XPU operators (#120)
* add floordiv for xpu
* add batchnorm for xpu
* add more cast types for xpu
* add conv_trans for xpu
* add pad for xpu
* add logical ops for xpu
* fix format for xpu src and include
* fix format for xpu test
* fix format for xpu src

Co-authored-by: Bolun <bolunz@u.nus.edu>
parent d18d40a2e9
commit a4c6214529
@@ -1,7 +1,7 @@
 #pragma once
-#include "xpu/runtime_ex.h"
-#include "xpu/xdnn.h"
 #include "core/common.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
 
 #define checkXPUError(call) \
     { \
@@ -1,6 +1,6 @@
 #pragma once
-#include "xpu/xpu_runtime.h"
 #include "core/kernel.h"
+#include "xpu/xpu_runtime.h"
 
 namespace infini {
 
@@ -1,28 +1,28 @@
 #pragma once
-#include "xpu/xpu_common.h"
 #include "core/runtime.h"
+#include "xpu/xpu_common.h"
 
 namespace infini {
 
 class XPURuntimeObj : public RuntimeObj {
   private:
-    baidu::xpu::api::Context* xdnn;
+    baidu::xpu::api::Context *xdnn;
     XPUPtr workspace;
     size_t workspaceSize;
 
   public:
     XPURuntimeObj() : RuntimeObj(Device::XPU) {
        xdnn = baidu::xpu::api::create_context();
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 3ll << 30; // 3 GB
-        //std::cout<<workspaceSize/1024/1024/1024<< std::endl;
-        //std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
+        // std::cout<<workspaceSize/1024/1024/1024<< std::endl;
+        // std::cout<<std::bitset<64>(workspaceSize)<< std::endl;
        workspace = alloc(workspaceSize);
     }
     virtual ~XPURuntimeObj() {
        dealloc(workspace);
        baidu::xpu::api::destroy_context(xdnn);
     }
     string toString() const override;
@@ -33,11 +33,12 @@ class XPURuntimeObj : public RuntimeObj {
     void sync() const;
     XPUPtr alloc(size_t size) override {
         void *ptr;
-        checkXPUError(xpu_malloc_ex((void**)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
+        checkXPUError(
+            xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
         return ptr;
     }
     void dealloc(void *ptr) override { xpu_free(ptr); }
-    baidu::xpu::api::Context* XPUHandle() const { return xdnn; }
+    baidu::xpu::api::Context *XPUHandle() const { return xdnn; }
     XPUPtr getWorkspace(size_t size) const {
         IT_ASSERT(size <= workspaceSize);
         return workspace;
@@ -45,17 +46,20 @@ class XPURuntimeObj : public RuntimeObj {
 
     void copyBlobFromCPU(void *dst, const void *src,
                          size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+        xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                   XPUMemcpyKind::XPU_HOST_TO_DEVICE);
     }
 
     void copyBlobToCPU(void *dst, const void *src,
                        size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+        xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
     }
 
     void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override {
-        xpu_memcpy(dst, const_cast<void *>(src), bytes, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
+        xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                   XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
     }
 
   private:
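For orientation (not part of the commit): a minimal sketch of driving the runtime declared above, assuming a working XPU device and the usual InfiniTensor helpers (make_ref, XPUPtr).

    // Hedged sketch: round-trip a host buffer through XPU memory.
    auto rt = make_ref<XPURuntimeObj>(); // creates the xdnn context and workspace
    std::vector<float> host(1024, 1.0f), back(1024, 0.0f);
    XPUPtr dev = rt->alloc(host.size() * sizeof(float));
    rt->copyBlobFromCPU(dev, host.data(), host.size() * sizeof(float));
    rt->copyBlobToCPU(back.data(), dev, back.size() * sizeof(float));
    assert(back[0] == 1.0f); // survived HOST_TO_DEVICE then DEVICE_TO_HOST
    rt->dealloc(dev);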
@@ -0,0 +1,41 @@
+#include "operators/batch_norm.h"
+#include "xpu/xpu_kernel_without_config.h"
+#include "xpu/xpu_runtime.h"
+
+namespace infini {
+class BatchNormXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<BatchNormObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const mean = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const var = (op->getInputs(2)->getRawDataPtr<void *>());
+        void *const scale = (op->getInputs(3)->getRawDataPtr<void *>());
+        void *const bias = (op->getInputs(4)->getRawDataPtr<void *>());
+        void *const output = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto dims = op->getInputs(0)->getDims();
+
+        if (dims.size() != 4)
+            IT_TODO_HALT();
+
+        int w = dims[3];
+        int h = dims[2];
+        int c = dims[1];
+        int n = dims[0];
+        auto ret = baidu::xpu::api::batch_norm_infer<float>(
+            context->XPUHandle(), (float *)input, (float *)output, n, c, h, w,
+            op->getEps(), (float *)scale, (float *)bias, (float *)mean,
+            (float *)var, true);
+
+        assert(ret == 0);
+        return;
+    }
+};
+
+REGISTER_KERNEL(Device::XPU, OpType::BatchNormalization, DataType::Float32,
+                BatchNormXdnn, "BatchNorm_xdnn_XPU_Float32");
+
+}; // namespace infini
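For reference, batch_norm_infer applies the standard inference-mode normalization per channel; a quick worked check (plain arithmetic, not from the commit):

    // y = scale * (x - mean) / sqrt(var + eps) + bias, per channel.
    // With scale = 1, bias = 0, eps = 0: x = 0, mean = 1, var = 4
    //   y = (0 - 1) / sqrt(4) = -0.5
    // which matches the first expected value in the BatchNorm test later in this commit.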
@@ -12,23 +12,83 @@ class CastXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
         CastType type = op->getType();
 
         int ret = 0;
         switch (type) {
-        case CastType::Float2Int32:
-            ret = baidu::xpu::api::cast<float,int>(context->XPUHandle(), (float*)aData, (int*)cData, len);
-            break;
-        case CastType::Int322Int8:
-            ret = baidu::xpu::api::cast<int,float>(context->XPUHandle(), (int*)aData, (float*)cData, len);
-            break;
+        case CastType::Float2Float16:
+            ret = baidu::xpu::api::cast<float, float16>(
+                context->XPUHandle(), (float *)aData, (float16 *)cData, len);
+            break;
+        case CastType::Float2Int64:
+            ret = baidu::xpu::api::cast<float, int64_t>(
+                context->XPUHandle(), (float *)aData, (int64_t *)cData, len);
+            break;
+        case CastType::Float2Int32:
+            ret = baidu::xpu::api::cast<float, int>(
+                context->XPUHandle(), (float *)aData, (int *)cData, len);
+            break;
+        case CastType::Float2Int16:
+            ret = baidu::xpu::api::cast<float, int16_t>(
+                context->XPUHandle(), (float *)aData, (int16_t *)cData, len);
+            break;
+        case CastType::Float2Int8:
+            ret = baidu::xpu::api::cast<float, int8_t>(
+                context->XPUHandle(), (float *)aData, (int8_t *)cData, len);
+            break;
+        case CastType::Int322Float:
+            ret = baidu::xpu::api::cast<int, float>(
+                context->XPUHandle(), (int *)aData, (float *)cData, len);
+            break;
+        case CastType::Int322Int8:
+            ret = baidu::xpu::api::cast<int, int8_t>(
+                context->XPUHandle(), (int *)aData, (int8_t *)cData, len);
+            break;
+        case CastType::Int322Int16:
+            ret = baidu::xpu::api::cast<int, int16_t>(
+                context->XPUHandle(), (int *)aData, (int16_t *)cData, len);
+            break;
+        case CastType::Int162Float:
+            ret = baidu::xpu::api::cast<int16_t, float>(
+                context->XPUHandle(), (int16_t *)aData, (float *)cData, len);
+            break;
+        case CastType::Int162Int32:
+            ret = baidu::xpu::api::cast<int16_t, int>(
+                context->XPUHandle(), (int16_t *)aData, (int *)cData, len);
+            break;
+        case CastType::Int82Float:
+            ret = baidu::xpu::api::cast<int8_t, float>(
+                context->XPUHandle(), (int8_t *)aData, (float *)cData, len);
+            break;
+        case CastType::Int82Int16:
+            ret = baidu::xpu::api::cast<int8_t, int16_t>(
+                context->XPUHandle(), (int8_t *)aData, (int16_t *)cData, len);
+            break;
+        case CastType::Int82Int32:
+            ret = baidu::xpu::api::cast<int8_t, int>(
+                context->XPUHandle(), (int8_t *)aData, (int *)cData, len);
+            break;
+        case CastType::Int322Int64:
+            ret = baidu::xpu::api::cast<int, int64_t>(
+                context->XPUHandle(), (int *)aData, (int64_t *)cData, len);
+            break;
+        case CastType::Int642Int32:
+            ret = baidu::xpu::api::cast<int64_t, int>(
+                context->XPUHandle(), (int64_t *)aData, (int *)cData, len);
+            break;
+        case CastType::Int642Float:
+            ret = baidu::xpu::api::cast<int64_t, float>(
+                context->XPUHandle(), (int64_t *)aData, (float *)cData, len);
+            break;
+        case CastType::Float162Float:
+            ret = baidu::xpu::api::cast<float16, float>(
+                context->XPUHandle(), (float16 *)aData, (float *)cData, len);
+            break;
         default:
             IT_TODO_HALT();
         }
         assert(ret == 0);
         return;
     }
 };
 
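Each case above instantiates xdnn's templated element cast over a flat buffer of len elements; in isolation the Float2Int32 path reduces to the following (illustrative sketch; handle, src, and dst are hypothetical names):

    int ret = baidu::xpu::api::cast<float, int>(handle, src, dst, len);
    assert(ret == 0); // xdnn returns 0 on success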
@@ -8,26 +8,27 @@ class ConcatXdnn : public XPUKernelWithoutConfig {
                  const RuntimeObj *_context) const override {
         auto op = as<ConcatObj>(_op);
         auto context = dynamic_cast<const XPURuntimeObj *>(_context);
         int axis = op->getDim();
         int num = op->numInputs();
-        std::vector<const float*> inputsData;
+        std::vector<const float *> inputsData;
         for (int i = 0; i < num; ++i) {
-            inputsData.push_back((float*)(op->getInputs(i)->getRawDataPtr<void *>()));
+            inputsData.push_back(
+                (float *)(op->getInputs(i)->getRawDataPtr<void *>()));
         }
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
 
         std::vector<std::vector<int>> dims;
-        for(int i = 0; i < num; ++i){
+        for (int i = 0; i < num; ++i) {
             auto dim = op->getInputs(i)->getDims();
             if (dim.size() != 4) {
                 IT_TODO_HALT();
             }
             dims.push_back(dim);
         }
-        auto ret = baidu::xpu::api::concat<float>(context->XPUHandle(), inputsData, (float*)cData, dims, axis);
+        auto ret = baidu::xpu::api::concat<float>(
+            context->XPUHandle(), inputsData, (float *)cData, dims, axis);
         assert(ret == 0);
         return;
     }
 };
 
@@ -9,25 +9,26 @@ class ConvXdnn : public XPUKernelWithoutConfig {
         auto op = as<ConvObj>(_op);
         auto context = dynamic_cast<const XPURuntimeObj *>(_context);
 
         const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
         const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
         const int cpg = op->getChannelPerGroup();
         const int g = c / cpg;
 
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
 
         std::vector<int> pads = {ph, pw};
         std::vector<int> ksize = {r, s};
         std::vector<int> stride = {sh, sw};
         std::vector<int> dilation = {dh, dw};
 
-        auto ret = baidu::xpu::api::conv2d<float,float,float,float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData,
-            n,c,h,w,f,ksize,stride,pads,dilation,g,nullptr,nullptr,nullptr,true);
+        auto ret = baidu::xpu::api::conv2d<float, float, float, float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
+            nullptr, nullptr, nullptr, true);
         assert(ret == 0);
         return;
     }
 };
 
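The output spatial size conv2d produces follows the usual convolution arithmetic; a worked example (illustrative, not from the commit):

    // oh = (h + 2*ph - (dh*(r - 1) + 1)) / sh + 1, and likewise for ow.
    // e.g. h = 32, ph = 1, r = 3, dh = 1, sh = 1  ->  oh = (32 + 2 - 3) / 1 + 1 = 32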
@@ -0,0 +1,54 @@
+#include "operators/conv.h"
+#include "xpu/xpu_kernel_without_config.h"
+#include "xpu/xpu_runtime.h"
+
+namespace infini {
+class ConvTransXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConvBaseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        const int cpg = op->getChannelPerGroup();
+        const int g = c / cpg;
+        const bool isNCHW =
+            (op->getOpType() == OpType::ConvTransNHWC) ? false : true;
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        std::vector<int> pads = {ph, pw};
+        std::vector<int> ksize = {r, s};
+        std::vector<int> stride = {sh, sw};
+        std::vector<int> dilation = {dh, dw};
+
+        auto dimInputs0 = op->getInputs(0)->getDims();
+        auto dimInputs1 = op->getInputs(1)->getDims();
+        auto dimOutput = op->getOutput()->getDims();
+
+        if (dimInputs0.size() != 4)
+            IT_TODO_HALT();
+        if (dimInputs1.size() != 4)
+            IT_TODO_HALT();
+        if (dimOutput.size() != 4)
+            IT_TODO_HALT();
+
+        auto ret =
+            baidu::xpu::api::conv2d_transpose<float, float, float, float>(
+                context->XPUHandle(), (float *)aData, (float *)bData,
+                (float *)cData, n, c, h, w, f, ksize, stride, pads, dilation, g,
+                nullptr, nullptr, nullptr, isNCHW);
+        assert(ret == 0);
+        return;
+    }
+};
+
+REGISTER_KERNEL(Device::XPU, OpType::ConvTranspose, DataType::Float32,
+                ConvTransXdnn, "ConvTrans_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::ConvTransNHWC, DataType::Float32,
+                ConvTransXdnn, "ConvTranposedNHWC_xdnn_XPU_Float32");
+
+}; // namespace infini
@@ -17,10 +17,11 @@ class AddXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_add<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_add<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
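The seven broadcast kernels in this file share one shape contract: both 4-D dim vectors are handed to xdnn, which is assumed to broadcast NumPy-style across mismatched axes. A shape example (illustrative):

    // aDim = {2, 3, 1, 4}, bDim = {1, 3, 5, 1}  ->  output is {2, 3, 5, 4};
    // broadcast_add<float>(handle, a, b, c, aDim, bDim) writes 2*3*5*4 floats to c.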
@@ -38,10 +39,11 @@ class SubXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_sub<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_sub<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -59,10 +61,11 @@ class MulXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_mul<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_mul<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -80,10 +83,11 @@ class DivXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_div<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_div<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -101,10 +105,11 @@ class PowXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_pow<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_pow<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -122,10 +127,11 @@ class MaxXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_max<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_max<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -143,10 +149,11 @@ class MinXdnn : public XPUKernelWithoutConfig {
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_min<float>(context->XPUHandle(), (float*)aData, (float*)bData, (float*)cData, aDim, bDim);
+
+        auto ret = baidu::xpu::api::broadcast_min<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, aDim, bDim);
         assert(ret == 0);
         return;
     }
 };
 
@@ -159,18 +166,20 @@ class EqualXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         size_t len = op->getOutput()->size();
         XPUPtr wsData = context->getWorkspace(len);
 
         auto aDim = op->getInputs(0)->getDims();
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
+
+        auto ret = baidu::xpu::api::broadcast_equal<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (bool *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -183,18 +192,20 @@ class GreaterEqualXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         size_t len = op->getOutput()->size();
         XPUPtr wsData = context->getWorkspace(len);
 
         auto aDim = op->getInputs(0)->getDims();
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_greater_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
+
+        auto ret = baidu::xpu::api::broadcast_greater_equal<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (bool *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -207,18 +218,20 @@ class GreaterThanXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         size_t len = op->getOutput()->size();
         XPUPtr wsData = context->getWorkspace(len);
 
         auto aDim = op->getInputs(0)->getDims();
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_greater_than<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
+
+        auto ret = baidu::xpu::api::broadcast_greater_than<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (bool *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -231,18 +244,20 @@ class LessEqualXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         size_t len = op->getOutput()->size();
         XPUPtr wsData = context->getWorkspace(len);
 
         auto aDim = op->getInputs(0)->getDims();
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_less_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
+
+        auto ret = baidu::xpu::api::broadcast_less_equal<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (bool *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -255,18 +270,170 @@ class LessThanXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         size_t len = op->getOutput()->size();
         XPUPtr wsData = context->getWorkspace(len);
 
         auto aDim = op->getInputs(0)->getDims();
         auto bDim = op->getInputs(1)->getDims();
         if (aDim.size() != 4 || bDim.size() != 4)
             IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_less_than<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
+        auto ret = baidu::xpu::api::broadcast_less_than<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (bool *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
+class FloorDivXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+        XPUPtr wsData = context->getWorkspace(len);
+
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        if (aDim.size() != 4 || bDim.size() != 4)
+            IT_TODO_HALT();
+        auto ret = baidu::xpu::api::broadcast_floordiv<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)wsData, aDim, bDim);
+        ret = baidu::xpu::api::cast<int, float>(
+            context->XPUHandle(), (int *)wsData, (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
+class MSELossXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<MSELossObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        auto ret = baidu::xpu::api::mse_loss<float>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
+class AndXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+        XPUPtr wsData = context->getWorkspace(len);
+
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        if (aDim.size() != 4 || bDim.size() != 4)
+            IT_TODO_HALT();
+        auto ret = baidu::xpu::api::logical_and<bool>(
+            context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
+            len);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
+class OrXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+        XPUPtr wsData = context->getWorkspace(len);
+
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        if (aDim.size() != 4 || bDim.size() != 4)
+            IT_TODO_HALT();
+        auto ret = baidu::xpu::api::logical_or<bool>(
+            context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
+            len);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
+class XorXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+        XPUPtr wsData = context->getWorkspace(len);
+
+        auto aDim = op->getInputs(0)->getDims();
+        auto bDim = op->getInputs(1)->getDims();
+        if (aDim.size() != 4 || bDim.size() != 4)
+            IT_TODO_HALT();
+        auto ret = baidu::xpu::api::logical_xor<bool>(
+            context->XPUHandle(), (bool *)aData, (bool *)bData, (bool *)wsData,
+            len);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
+class NotXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+        size_t len = op->getOutput()->size();
+        XPUPtr wsData = context->getWorkspace(len);
+
+        auto aDim = op->getInputs(0)->getDims();
+        if (aDim.size() != 4)
+            IT_TODO_HALT();
+        auto ret = baidu::xpu::api::logical_not<bool>(
+            context->XPUHandle(), (bool *)aData, (bool *)wsData, len);
+        ret = baidu::xpu::api::cast<bool, float>(
+            context->XPUHandle(), (bool *)wsData, (float *)cData, len);
+        assert(ret == 0);
+        return;
+    }
+};
+
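The comparison and logical kernels above share a two-step pattern: xdnn emits bool results, while the framework tensors are Float32, so each kernel writes bools into the runtime workspace and then widens them into the output. Schematically (illustrative):

    // 1) compare into scratch:  broadcast_equal<float>(h, a, b, (bool *)ws, aDim, bDim);
    // 2) widen bool -> float:   cast<bool, float>(h, (bool *)ws, (float *)out, len);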
@@ -286,12 +453,24 @@ REGISTER_KERNEL(Device::XPU, OpType::Min, DataType::Float32, MinXdnn,
                 "Min_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Equal, DataType::Float32, EqualXdnn,
                 "Equal_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::GreaterOrEqual, DataType::Float32, GreaterEqualXdnn,
-                "GreaterEqual_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::Greater, DataType::Float32, GreaterThanXdnn,
-                "GreaterThan_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::LessOrEqual, DataType::Float32, LessEqualXdnn,
-                "LessEqual_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::GreaterOrEqual, DataType::Float32,
+                GreaterEqualXdnn, "GreaterEqual_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Greater, DataType::Float32,
+                GreaterThanXdnn, "GreaterThan_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::LessOrEqual, DataType::Float32,
+                LessEqualXdnn, "LessEqual_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Less, DataType::Float32, LessThanXdnn,
                 "LessThan_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::FloorDiv, DataType::Float32, FloorDivXdnn,
+                "FloorDiv_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::MSELoss, DataType::Float32, MSELossXdnn,
+                "MSELoss_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::And, DataType::Float32, AndXdnn,
+                "And_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Or, DataType::Float32, OrXdnn,
+                "Or_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Xor, DataType::Float32, XorXdnn,
+                "Xor_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Not, DataType::Float32, NotXdnn,
+                "Not_xdnn_XPU_Float32");
 }; // namespace infini
@@ -8,25 +8,28 @@ class MatmulXdnn : public XPUKernelWithoutConfig {
                  const RuntimeObj *_context) const override {
         auto op = as<MatmulObj>(_op);
         auto context = dynamic_cast<const XPURuntimeObj *>(_context);
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         bool transA = op->getTransA();
         bool transB = op->getTransB();
-        if(op->getInputs(0)->getDims().size() != 2 || op->getInputs(1)->getDims().size() != 2) {
+        if (op->getInputs(0)->getDims().size() != 2 ||
+            op->getInputs(1)->getDims().size() != 2) {
             IT_TODO_HALT();
         }
 
-        auto m = transA ? op->getInputs(0)->getDims()[1] : op->getInputs(0)->getDims()[0];
-        auto n = transB ? op->getInputs(1)->getDims()[0] : op->getInputs(1)->getDims()[1];
-        auto k = transA ? op->getInputs(0)->getDims()[0] : op->getInputs(0)->getDims()[1];
-
-        auto ret = baidu::xpu::api::fc<float,float,float,int>(context->XPUHandle(),
-            (float*)aData, (float*)bData, (float*)cData,
-            m,n,k, transA, transB, nullptr, nullptr, nullptr);
+        auto m = transA ? op->getInputs(0)->getDims()[1]
+                        : op->getInputs(0)->getDims()[0];
+        auto n = transB ? op->getInputs(1)->getDims()[0]
+                        : op->getInputs(1)->getDims()[1];
+        auto k = transA ? op->getInputs(0)->getDims()[0]
+                        : op->getInputs(0)->getDims()[1];
+
+        auto ret = baidu::xpu::api::fc<float, float, float, int>(
+            context->XPUHandle(), (float *)aData, (float *)bData,
+            (float *)cData, m, n, k, transA, transB, nullptr, nullptr, nullptr);
         assert(ret == 0);
         return;
     }
 };
 
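fc computes C(m x n) = op(A) * op(B), with op() controlled by the transpose flags; a worked shape example (illustrative, not from the commit):

    // A is 4 x 8 with transA = true  -> m = dims[1] = 8, k = dims[0] = 4
    // B is 4 x 5 with transB = false -> n = dims[1] = 5
    // fc(..., m = 8, n = 5, k = 4, ...) then writes an 8 x 5 output.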
@@ -0,0 +1,37 @@
+#include "operators/pad.h"
+#include "xpu/xpu_kernel_without_config.h"
+#include "xpu/xpu_runtime.h"
+
+namespace infini {
+class PadXdnn : public XPUKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PadObj>(_op);
+        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        auto dim = op->getInputs(0)->getDims();
+        int dim_size = dim.size();
+
+        std::vector<int> pads = op->getPads();
+
+        std::cout << std::endl;
+        std::vector<int> paddings_left(pads.begin(), pads.begin() + dim_size);
+        std::vector<int> paddings_right(pads.begin() + dim_size, pads.end());
+
+        float paddingValue = 0.0;
+        auto ret = baidu::xpu::api::pad<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, dim,
+            paddings_left, paddings_right, paddingValue);
+
+        assert(ret == 0);
+        return;
+    }
+};
+
+REGISTER_KERNEL(Device::XPU, OpType::Pad, DataType::Float32, PadXdnn,
+                "Pad_xdnn_XPU_Float32");
+
+}; // namespace infini
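getPads() returns one flat vector of length 2 * rank, which the kernel splits into leading and trailing pad counts per dimension; a small example (illustrative):

    // rank-4 input, pads = {0, 0, 1, 1, 0, 0, 1, 1}
    // paddings_left  = {0, 0, 1, 1}  (before n, c, h, w)
    // paddings_right = {0, 0, 1, 1}  (after n, c, h, w)
    // so a 1 x 2 x 3 x 3 tensor pads out to 1 x 2 x 5 x 5.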
@@ -14,15 +14,15 @@ class AvgPooling : public XPUKernelWithoutConfig {
         auto [n, c, h, w, kh, kw] = op->getNCHWRS();
         auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
 
         std::vector<int> ksize = {kh, kw};
         std::vector<int> stride = {sh, sw};
         std::vector<int> pad = {ph, pw};
 
-        auto ret = baidu::xpu::api::avg_pool2d<float>(context->XPUHandle(), (float*)aData, (float*)cData,
-            n,c,h,w,ksize,stride,pad,true,true,nullptr,nullptr);
+        auto ret = baidu::xpu::api::avg_pool2d<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, n, c, h, w,
+            ksize, stride, pad, true, true, nullptr, nullptr);
         assert(ret == 0);
         return;
     }
 };
 
@@ -37,24 +37,24 @@ class MaxPooling : public XPUKernelWithoutConfig {
         auto [n, c, h, w, kh, kw] = op->getNCHWRS();
         auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
 
         std::vector<int> ksize = {kh, kw};
         std::vector<int> stride = {sh, sw};
         std::vector<int> pad = {ph, pw};
 
-        int yh = (h + ph*2 -kh) /sh + 1;
-        int yw = (w + pw*2 -kw) /sw + 1;
+        int yh = (h + ph * 2 - kh) / sh + 1;
+        int yw = (w + pw * 2 - kw) / sw + 1;
 
         XPUPtr indices = context->getWorkspace(yh * yw * 4);
 
-        auto ret = baidu::xpu::api::max_pool2d<float>(context->XPUHandle(), (float*)aData, (float*)cData,
-            (int*)indices, n,c,h,w,ksize,stride,pad,true,nullptr,nullptr,false);
+        auto ret = baidu::xpu::api::max_pool2d<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData,
+            (int *)indices, n, c, h, w, ksize, stride, pad, true, nullptr,
+            nullptr, false);
         assert(ret == 0);
         return;
     }
 };
 
 REGISTER_KERNEL(Device::XPU, OpType::MaxPool, DataType::Float32, MaxPooling,
                 "MaxPool_xdnn_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::AveragePool, DataType::Float32, AvgPooling,
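MaxPooling sizes its argmax workspace from the pooled output; the output arithmetic, worked through (illustrative):

    // yh = (h + 2*ph - kh) / sh + 1, and yw likewise.
    // e.g. h = w = 4, kh = kw = 2, ph = pw = 0, sh = sw = 2 -> yh = yw = (4 - 2)/2 + 1 = 2,
    // so the int workspace above holds yh * yw = 4 indices (4 bytes each).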
@@ -8,29 +8,31 @@ class SplitXdnn : public XPUKernelWithoutConfig {
                  const RuntimeObj *_context) const override {
         auto op = as<SplitObj>(_op);
         auto context = dynamic_cast<const XPURuntimeObj *>(_context);
         int axis = op->getDim();
         int num = op->numOutputs();
         void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
         auto inputDim = op->getInputs(0)->getDims();
 
-        std::vector<float*> outputsData;
+        std::vector<float *> outputsData;
         for (int i = 0; i < num; ++i) {
-            outputsData.push_back((float*)(op->getOutput(i)->getRawDataPtr<void*>()));
+            outputsData.push_back(
+                (float *)(op->getOutput(i)->getRawDataPtr<void *>()));
         }
 
         std::vector<int> splitList;
-        for(int i = 0; i < num; ++i){
+        for (int i = 0; i < num; ++i) {
             auto dim = op->getOutput(i)->getDims();
             if (dim.size() != 4) {
                 IT_TODO_HALT();
             }
             splitList.push_back(dim[axis]);
         }
 
-        auto ret = baidu::xpu::api::split<float>(context->XPUHandle(), (float*)inputData, outputsData, inputDim, splitList, axis);
+        auto ret = baidu::xpu::api::split<float>(
+            context->XPUHandle(), (float *)inputData, outputsData, inputDim,
+            splitList, axis);
         assert(ret == 0);
         return;
     }
 };
 
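splitList carries the extent of each output slice along the split axis; a shape example (illustrative):

    // input dims {2, 10, 4, 4}, axis = 1, outputs {2, 3, 4, 4}, {2, 3, 4, 4}, {2, 4, 4, 4}
    // -> splitList = {3, 3, 4}, which must sum to inputDim[axis] = 10.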
@@ -9,23 +9,24 @@ class TransposeXdnn : public XPUKernelWithoutConfig {
         auto op = as<TransposeObj>(_op);
         auto context = dynamic_cast<const XPURuntimeObj *>(_context);
 
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
 
         auto dimin = op->getInputs(0)->getDims();
         auto permute = op->getPermute();
 
-        if ( dimin.size() != 4 ) {
+        if (dimin.size() != 4) {
             IT_TODO_HALT();
         }
 
-        auto ret = baidu::xpu::api::transpose<float>(context->XPUHandle(), (float*)aData, (float*)cData, dimin, permute);
+        auto ret = baidu::xpu::api::transpose<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, dimin,
+            permute);
         assert(ret == 0);
         return;
     }
 };
 
-REGISTER_KERNEL(Device::XPU, OpType::Transpose, DataType::Float32, TransposeXdnn,
-                "Transpose_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Transpose, DataType::Float32,
+                TransposeXdnn, "Transpose_xdnn_XPU_Float32");
 }; // namespace infini
@@ -13,10 +13,10 @@ class ReluXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::relu<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::relu<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -30,10 +30,10 @@ class SigmoidXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::sigmoid<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::sigmoid<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -47,10 +47,10 @@ class TanhXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::tanh<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::tanh<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -64,10 +64,10 @@ class SquareXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::square<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::square<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -81,10 +81,10 @@ class SqrtXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::sqrt<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::sqrt<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -98,10 +98,10 @@ class RsqrtXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::rsqrt<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::rsqrt<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -115,10 +115,10 @@ class ExpXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::exp<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::exp<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -132,10 +132,10 @@ class CeilXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::ceil<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::ceil<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -148,13 +148,14 @@ class ClipXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
         float min = op->getMin().value();
         float max = op->getMax().value();
 
-        auto ret = baidu::xpu::api::clip<float>(context->XPUHandle(), (float*)aData, (float*)cData, len, min, max);
+        auto ret =
+            baidu::xpu::api::clip<float>(context->XPUHandle(), (float *)aData,
+                                         (float *)cData, len, min, max);
         assert(ret == 0);
         return;
     }
 };
 
@@ -168,10 +169,10 @@ class FloorXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::floor<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::floor<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -185,10 +186,10 @@ class NegXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::neg<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::neg<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -201,10 +202,10 @@ class CopyXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::copy<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::copy<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
@@ -218,14 +219,13 @@ class ReciprocalXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
 
-        auto ret = baidu::xpu::api::reciprocal<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
+        auto ret = baidu::xpu::api::reciprocal<float>(
+            context->XPUHandle(), (float *)aData, (float *)cData, len);
         assert(ret == 0);
         return;
     }
 };
 
 REGISTER_KERNEL(Device::XPU, OpType::Relu, DataType::Float32, ReluXdnn,
                 "Relu_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Sigmoid, DataType::Float32, SigmoidXdnn,
@@ -248,8 +248,8 @@ REGISTER_KERNEL(Device::XPU, OpType::Floor, DataType::Float32, FloorXdnn,
                 "Floor_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Neg, DataType::Float32, NegXdnn,
                 "Neg_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::Reciprocal, DataType::Float32, ReciprocalXdnn,
-                "Reciprocal_xdnn_XPU_Float32");
+REGISTER_KERNEL(Device::XPU, OpType::Reciprocal, DataType::Float32,
+                ReciprocalXdnn, "Reciprocal_xdnn_XPU_Float32");
 
 REGISTER_KERNEL(Device::XPU, OpType::Reshape, DataType::Float32, CopyXdnn,
                 "Reshape_xdnn_Float32");
@@ -1,19 +1,18 @@
 #include "xpu/operator_timer.h"
-#include "xpu/xpu_runtime.h"
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
 #include "utils/data_generator.h"
+#include "xpu/xpu_runtime.h"
 
 namespace infini {
 namespace opTimer {
 
-double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s,
-                      int padh, int padw, int strideh, int stridew,
-                      int dilationh, int dilationw, int group,
-                      const char *name) {
+double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s, int padh,
+                      int padw, int strideh, int stridew, int dilationh,
+                      int dilationw, int group, const char *name) {
     Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
     Graph gCpu = make_ref<GraphObj>(cpu);
     Runtime xpu = make_ref<XPURuntimeObj>();
@@ -31,8 +30,8 @@ double getPerfConvXPU(int n, int c, int h, int w, int f, int r, int s,
     Tensor i0XPU = gXpu->cloneTensor(i0Cpu);
     Tensor w0XPU = gXpu->cloneTensor(w0Cpu);
     // Build Xpu graph
-    auto conv = gXpu->addOp<ConvObj>(i0XPU, w0XPU, nullptr, padh, padw,
-                                     strideh, stridew, dilationh, dilationw);
+    auto conv = gXpu->addOp<ConvObj>(i0XPU, w0XPU, nullptr, padh, padw, strideh,
+                                     stridew, dilationh, dilationw);
    // allocate Xpu memory
    gXpu->dataMalloc();
    // Execute on Xpu
@@ -5,7 +5,7 @@
 namespace infini {
 
 void XPURuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+                                   bool profiling = false) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto &perfEngine = PerfEngine::getInstance();
     double totalTime = 0;
@@ -1,17 +1,16 @@
-#include "xpu/xpu_runtime.h"
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
 #include "operators/element_wise.h"
+#include "xpu/xpu_runtime.h"
 
 #include "test.h"
 
 namespace infini {
 
 template <class T>
-void testAdd(
-    const std::function<void(void *, size_t, DataType)> &generator,
-    const Shape &shape) {
+void testAdd(const std::function<void(void *, size_t, DataType)> &generator,
+             const Shape &shape) {
     // Runtime
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto xpuRuntime = make_ref<XPURuntimeObj>();
@@ -0,0 +1,61 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/batch_norm.h"
+#include "test.h"
+#include "xpu/xpu_kernel_without_config.h"
+#include "xpu/xpu_runtime.h"
+
+namespace infini {
+
+TEST(XPU_BatchNorm, run) {
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto xpuRuntime = make_ref<XPURuntimeObj>();
+
+    // Build cpu graph
+    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
+    auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
+    auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
+    auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
+    auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
+    auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
+
+    // Build input data on CPU
+    gCpu->dataMalloc();
+    iCpu->setData(IncrementalGenerator());
+    meanCpu->copyin(vector<float>{1, 6, 9});
+    varCpu->copyin(vector<float>{4, 1, 9});
+    scaleCpu->setData(OneGenerator());
+    biasCpu->setData(ZeroGenerator());
+
+    // Build XPU graph
+    Graph g = make_ref<GraphObj>(xpuRuntime);
+
+    auto i = g->cloneTensor(iCpu);
+    auto mean = g->cloneTensor(meanCpu);
+    auto var = g->cloneTensor(varCpu);
+    auto scale = g->cloneTensor(scaleCpu);
+    auto bias = g->cloneTensor(biasCpu);
+    auto op =
+        g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
+
+    // allocate XPU memory
+    g->dataMalloc();
+    i->setData(IncrementalGenerator());
+    mean->copyin(vector<float>{1, 6, 9});
+    var->copyin(vector<float>{4, 1, 9});
+    scale->setData(OneGenerator());
+    bias->setData(ZeroGenerator());
+
+    // Execute on XPU
+    xpuRuntime->run(g);
+
+    // clone XPU output to CPU
+    auto o = op->getOutput();
+    auto ocpu = o->clone(cpuRuntime);
+
+    // check results on CPU
+    EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));
+    EXPECT_TRUE(ocpu->equalData(vector<float>{
+        -0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.3333333, 0.6666667}));
+}
+} // namespace infini
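The expected vector can be checked by hand from the inputs above (eps = 0, scale = 1, bias = 0, so y = (x - mean)/sqrt(var) per channel):

    // channel 0: x = {0, 1, 2, 3},   mean = 1, var = 4 -> {-0.5, 0, 0.5, 1}
    // channel 1: x = {4, 5, 6, 7},   mean = 6, var = 1 -> {-2, -1, 0, 1}
    // channel 2: x = {8, 9, 10, 11}, mean = 9, var = 9 -> {-1/3, 0, 1/3, 2/3}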
@@ -1,8 +1,8 @@
-#include "xpu/xpu_runtime.h"
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
 #include "operators/concat.h"
+#include "xpu/xpu_runtime.h"
 
 #include "test.h"
 
@@ -1,8 +1,8 @@
-#include "xpu/xpu_runtime.h"
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
 #include "operators/conv.h"
+#include "xpu/xpu_runtime.h"
 
 #include "test.h"
 
@ -0,0 +1,136 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "operators/conv.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"

#include "test.h"

namespace infini {

void testConvTransposedXdnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {
    const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
    const int stride = 1, padding = 0, dilation = 1;
    // Construct Runtime and graph for CPU and XPU
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime xpu = make_ref<XPURuntimeObj>();
    Graph gXpu = make_ref<GraphObj>(xpu);
    // Set input data on CPU in a CPU Graph
    Tensor i0Cpu = gCpu->addTensor({N, F, H, W}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({F, C, R, S}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);

    // Copy input tensors from CPU to XPU
    Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
    Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
    // Build XPU graph
    auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, padding,
                                                 padding, stride, stride,
                                                 dilation, dilation);
    gXpu->dataMalloc();
    i0Xpu->setData(generator);
    w0Xpu->setData(generator);
    // Execute on XPU
    xpu->run(gXpu);
    // copy output from XPU to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    // check results on CPU
    EXPECT_TRUE(o0Cpu->equalData(ansVec));
}

void testConvTransposedNHWCXdnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {
    const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
    const int stride = 1, padding = 0, dilation = 1;
    // Construct Runtime and graph for CPU and XPU
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime xpu = make_ref<XPURuntimeObj>();
    Graph gXpu = make_ref<GraphObj>(xpu);
    // Set input data on CPU in a CPU Graph
    Tensor i0Cpu = gCpu->addTensor({N, H, W, F}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({F, R, S, C}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);

    // Copy input tensors from CPU to XPU
    Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
    Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
    // Build XPU graph
    auto conv = gXpu->addOp<ConvTransposed2dNHWCObj>(
        i0Xpu, w0Xpu, nullptr, padding, padding, stride, stride, dilation,
        dilation);
    gXpu->dataMalloc();
    i0Xpu->setData(generator);
    w0Xpu->setData(generator);
    // Execute on XPU
    xpu->run(gXpu);
    // copy output from XPU to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    // check results on CPU
    EXPECT_TRUE(o0Cpu->equalData(ansVec));
}

TEST(XPU_ConvTransposed, run) {
    testConvTransposedXdnn(IncrementalGenerator(),
                           vector<float>{0., 0., 1., 2., 3., 0., 6.,
                                         12., 18., 16., 8., 30., 36., 42.,
                                         32., 16., 54., 60., 66., 48., 24.,
                                         62., 67., 72., 45.});
}

TEST(XPU_ConvTransposedNHWC, run) {
    testConvTransposedNHWCXdnn(IncrementalGenerator(),
                               vector<float>{0., 0., 1., 2., 3., 0., 6.,
                                             12., 18., 16., 8., 30., 36., 42.,
                                             32., 16., 54., 60., 66., 48., 24.,
                                             62., 67., 72., 45.});
}

TEST(XPU_ConvTransposed, run1) {
    // Construct Runtime and graph for CPU and XPU
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime xpu = make_ref<XPURuntimeObj>();
    Graph gXpu = make_ref<GraphObj>(xpu);
    // Set input data on CPU in a CPU Graph
    Tensor i0Cpu = gCpu->addTensor({1, 2, 3, 3}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({2, 2, 3, 3}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(IncrementalGenerator());
    w0Cpu->setData(IncrementalGenerator());

    // Copy input tensors from CPU to XPU
    Tensor i0Xpu = gXpu->cloneTensor(i0Cpu);
    Tensor w0Xpu = gXpu->cloneTensor(w0Cpu);
    // Build XPU graph
    auto conv = gXpu->addOp<ConvTransposed2dObj>(i0Xpu, w0Xpu, nullptr, 0, 0);
    gXpu->dataMalloc();
    i0Xpu->setData(IncrementalGenerator());
    w0Xpu->setData(IncrementalGenerator());
    // Execute on XPU
    xpu->run(gXpu);
    // copy output from XPU to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    // check results on CPU
    EXPECT_TRUE(o0Cpu->equalData(vector<float>{
        162, 351, 569, 413, 224, 405, 876, 1417, 1024, 553,
        747, 1611, 2598, 1869, 1005, 639, 1368, 2191, 1564, 835,
        396, 843, 1343, 953, 506, 243, 531, 866, 629, 341,
        621, 1344, 2173, 1564, 841, 1152, 2475, 3975, 2841, 1518,
        963, 2052, 3271, 2320, 1231, 585, 1239, 1964, 1385, 731}));
}

} // namespace infini
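As a sanity check on the answer-vector sizes above: with zero output padding, transposed convolution follows the usual size formula out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1 (the standard ONNX convention; stated here as an assumption about ConvTransposed2dObj's defaults, not read from its implementation). A small sketch:

#include <cstdio>

// Standard transposed-conv output size (output_padding assumed 0).
int convTransposedOut(int in, int kernel, int stride, int padding, int dilation) {
    return (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1;
}

int main() {
    // run / NHWC run: 2x2 input, 4x4 kernel, stride 1, pad 0 -> 5x5,
    // i.e. the 25-element answer vectors above.
    std::printf("%d\n", convTransposedOut(2, 4, 1, 0, 1)); // 5
    // run1: 3x3 input, 3x3 kernel -> 5x5 per output channel; with 2
    // output channels that matches the 50-element expected vector.
    std::printf("%d\n", convTransposedOut(3, 3, 1, 0, 1)); // 5
    return 0;
}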
@ -0,0 +1,66 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "xpu/xpu_runtime.h"

#include "test.h"

namespace infini {

using ExpectOutput = vector<float>;
template <class T>
void testElementWiseXdnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape, const ExpectOutput &ansVec) {
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto xpuRuntime = make_ref<XPURuntimeObj>();

    // Build input data on CPU
    Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    acpu->dataMalloc();
    acpu->setData(generator);

    Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    bcpu->dataMalloc();
    bcpu->setData(generator);

    // Build XPU graph
    Graph g = make_ref<GraphObj>(xpuRuntime);
    auto a = g->cloneTensor(acpu);
    auto b = g->cloneTensor(bcpu);
    auto op = g->addOp<T>(a, b, nullptr);

    // allocate XPU memory
    g->dataMalloc();
    a->setData(generator);
    b->setData(generator);

    // Execute on XPU
    xpuRuntime->run(g);

    // clone XPU output to CPU
    auto c = op->getOutput();
    auto ccpu = c->clone(cpuRuntime);
    // check results on CPU
    EXPECT_TRUE(ccpu->equalData(ansVec));
}

TEST(xdnn_ElementWise, run) {
    testElementWiseXdnn<AddObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
    testElementWiseXdnn<SubObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
    testElementWiseXdnn<MulObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
    testElementWiseXdnn<DivObj>(
        OneGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
    testElementWiseXdnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
                                ExpectOutput{1, 1, 4, 27});
}

} // namespace infini
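Since both operands are filled by the same generator, each expected vector above is simply f(i, i) over the flattened index. A host-side sketch of how those vectors can be derived (plain C++, independent of the kernels under test):

#include <cmath>
#include <cstdio>

int main() {
    for (int i = 0; i < 12; ++i) {
        float a = i, b = i; // both tensors use IncrementalGenerator
        std::printf("add=%g sub=%g mul=%g\n", a + b, a - b, a * b);
    }
    // DivObj uses OneGenerator, so every quotient is 1 / 1 = 1.
    // PowObj runs on 4 elements: pow(i, i) = 1, 1, 4, 27.
    for (int i = 0; i < 4; ++i)
        std::printf("pow=%g\n", std::pow((float)i, (float)i));
    return 0;
}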
@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "xpu/xpu_runtime.h"

#include "test.h"
@ -0,0 +1,40 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/pad.h"
#include "test.h"
#include "xpu/xpu_kernel_without_config.h"
#include "xpu/xpu_runtime.h"

namespace infini {
TEST(xpu_Pad, run) {
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto xpuRuntime = make_ref<XPURuntimeObj>();

    // Build input data on CPU
    Tensor icpu =
        make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);

    // Build XPU graph
    Graph g = make_ref<GraphObj>(xpuRuntime);
    auto i = g->cloneTensor(icpu);
    auto op = g->addOp<PadObj>(i, nullptr, vector<int>{1, 0, 1, 1},
                               vector<int>{0, 3});

    // allocate XPU memory
    g->dataMalloc();
    i->setData(IncrementalGenerator());

    // Execute on XPU
    xpuRuntime->run(g);

    // clone XPU output to CPU
    auto o = op->getOutput();
    auto cpuo = o->clone(cpuRuntime);
    cpuo->printData();
    // check results on CPU
    EXPECT_TRUE(cpuo->equalData(
        vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
}
} // namespace infini
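Reading the PadObj arguments above as ONNX-style begin/end pads restricted to the listed axes (an assumption based on the values checked, not on the operator's documentation): {1, 0, 1, 1} over axes {0, 3} pads axis 0 by 1 on each side and axis 3 by 0 before and 1 after, so the {1, 2, 3, 2} input becomes {3, 2, 3, 3}, i.e. the 54 zeros-and-values passed to equalData. A minimal shape computation:

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> dims = {1, 2, 3, 2}; // input shape from the test
    std::vector<int> axes = {0, 3};       // axes being padded
    std::vector<int> pads = {1, 0, 1, 1}; // begins for axes, then ends
    for (size_t k = 0; k < axes.size(); ++k)
        dims[axes[k]] += pads[k] + pads[k + axes.size()];
    int total = 1;
    for (int d : dims) {
        std::printf("%d ", d); // 3 2 3 3
        total *= d;
    }
    std::printf("-> %d elements\n", total); // 54
    return 0;
}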
@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"
#include "xpu/xpu_runtime.h"

#include "test.h"
@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/split.h"
#include "xpu/xpu_runtime.h"

#include "test.h"
@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/transpose.h"
#include "xpu/xpu_runtime.h"

#include "test.h"
@ -1,8 +1,8 @@
#include "xpu/xpu_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "xpu/xpu_runtime.h"

#include "test.h"

@ -40,7 +40,7 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
}

void testClip(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto xpuRuntime = make_ref<XPURuntimeObj>();

@ -72,7 +72,7 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
}

void testCast(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto xpuRuntime = make_ref<XPURuntimeObj>();

@ -83,7 +83,8 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
    // GPU
    Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
    auto inputGpu = xpuGraph->cloneTensor(inputCpu);
    auto gpuOp = xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
    auto gpuOp =
        xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
    xpuGraph->dataMalloc();
    inputGpu->setData(generator);
    xpuRuntime->run(xpuGraph);

@ -91,7 +92,8 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // CPU
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
    auto cpuOp = cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
    auto cpuOp =
        cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
    cpuGraph->addTensor(inputCpu);
    cpuGraph->dataMalloc();
    inputCpu->setData(generator);
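On the host, the cast path compared here amounts to a truncating float-to-int conversion; a small reference sketch of what CastType::Float2Int32 is expected to produce (an assumption from the enum name, not from the XDNN kernel):

#include <cstdint>
#include <cstdio>

int main() {
    const float in[4] = {0.f, 1.5f, 2.9f, 3.f}; // sample inputs
    for (float v : in) {
        int32_t out = static_cast<int32_t>(v); // truncates toward zero
        std::printf("%g -> %d\n", v, out);     // 0, 1, 2, 3
    }
    return 0;
}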