forked from jiuyuan/InfiniTensor

Compare commits: master...activation (57 commits by wanghailu)
@@ -13,7 +13,7 @@ void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
     auto dim = op->getInputs(0)->getDims();
     auto context = dynamic_cast<const BangRuntimeObj *>(obj);
     int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    if (op->getOpType() == OpType::Div)
+    if (op->getOpType() == OpType::DivDemo)
         div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
     else
         IT_TODO_HALT();
@@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
         dealloc(workspace);
         checkCnnlError(cnnlDestroy(cnnl));
     }
+    string toString() const override;

     void run(const Graph &graph, bool tune = false,
              bool profiling = false) const;
@@ -6,8 +6,10 @@ class DataType {
   public:
     static const DataType Float32;
     static const DataType UInt32;
-    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
-    static constexpr std::string_view names[]{"Float32", "UInt32"};
+    static const DataType Int32;
+    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t),
+                                             sizeof(int32_t)};
+    static constexpr std::string_view names[]{"Float32", "UInt32", "Int32"};

   private:
     int index;

@@ -29,9 +31,11 @@ class DataType {

 inline const DataType DataType::Float32(0);
 inline const DataType DataType::UInt32(1);
+inline const DataType DataType::Int32(2);
 // Method definitions are out of the declaration due to GCC bug:
 // https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
 template <> inline DataType DataType::get<float>() { return Float32; }
 template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
+template <> inline DataType DataType::get<int32_t>() { return Int32; }

 } // namespace infini
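A minimal usage sketch for the extended type table (not part of the diff; it relies only on the members visible in the hunk above, and on the index convention implied by the constructors):

// Reads the new Int32 entry through the accessors shown above.
#include <cstdint>
#include <iostream>
#include "core/data_type.h"

int main() {
    infini::DataType dt = infini::DataType::get<int32_t>(); // DataType::Int32
    // sizePerElement and names are indexed by the constructor argument:
    // Float32 = 0, UInt32 = 1, Int32 = 2.
    std::cout << infini::DataType::names[2] << " uses "
              << infini::DataType::sizePerElement[2] << " bytes\n";
    (void)dt;
    return 0;
}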
@@ -7,6 +7,8 @@ enum class OpType {
     Unknown = 0,
     // linear
     Conv = 100,
+    ConvBackwardFilter,
+    ConvBackwardData,
     Matmul,
     ConvTrans,
     G2BMM,

@@ -23,6 +25,8 @@ enum class OpType {
     Sub,
     Mul,
     Div,
+    DivDemo,
+    DivNoNan,
     Pow,
     Gather,
     ReduceMean,

@@ -34,10 +38,82 @@ enum class OpType {
     Softmax,
     Activation,
     Relu,
+    ReluBackward,
     Sigmoid,
+    SigmoidBackward,
     Tanh,
+    TanhBackward,
     Abs,
+    Sin,
+    Cos,
+    Tan,
+    ASin,
+    ACos,
+    ATan,
+    SinH,
+    CosH,
+    TanH,
+    ASinH,
+    ACosH,
+    ATanH,
+    Resize,
+    Arange,
+    Copy,
+    Ceil,
+    Floor,
+    Clip,
+    Erf,
+    Exp,
+    Fill,
+    Log_e,
+    Log_2,
+    Log_10,
+    Log1p,
+    L2Loss,
+    Maximum,
+    Minimum,
+    MSELoss,
+    NegTensor,
+    Power,
+    Reciprocal,
+    Sqrt,
+    Rsqrt,
+    Transform,
+    AddN,
+    MulN,
+    Cast,
+    FloorDiv,
+    FloorDivTrunc,
+    FloorMod,
+    FloorModTrunc,
+    Cumsum,
+    Cumprod,
+    Det,
+    Round,
+    Square,
+    SquaredDifference,
+    Flip,
+    Hardtanh,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    GreaterEqual,
+    LessThan,
+    LessEqual,
+    And,
+    Or,
+    Xor,
+    Not,
+    Addcdiv,
+    Addcmul,
+    BitAnd,
+    BitOr,
+    BitXor,
+    BitNot,
+    BitLeftShift,
+    BitRightShift,
+    Dropout,
+    Lrn,
     //
     MemBound = 300,
 };
@@ -55,6 +131,8 @@ class OpRegistry {
         FOP(Unknown);
         // linear
         FOP(Conv);
+        FOP(ConvBackwardFilter);
+        FOP(ConvBackwardData);
         FOP(Matmul);
         FOP(ConvTrans);
         FOP(G2BMM);

@@ -71,6 +149,8 @@ class OpRegistry {
         FOP(Sub);
         FOP(Mul);
         FOP(Div);
+        FOP(DivDemo);
+        FOP(DivNoNan);
         FOP(Pow);
         FOP(Gather);
         FOP(ReduceMean);

@@ -81,9 +161,81 @@ class OpRegistry {
         FOP(Softmax);
         FOP(Activation);
         FOP(Relu);
+        FOP(ReluBackward);
         FOP(Sigmoid);
+        FOP(SigmoidBackward);
         FOP(Tanh);
+        FOP(TanhBackward);
         FOP(Abs);
+        FOP(Sin);
+        FOP(Cos);
+        FOP(Tan);
+        FOP(ASin);
+        FOP(ACos);
+        FOP(ATan);
+        FOP(SinH);
+        FOP(CosH);
+        FOP(TanH);
+        FOP(ASinH);
+        FOP(ACosH);
+        FOP(ATanH);
+        FOP(Arange);
+        FOP(Copy);
+        FOP(Ceil);
+        FOP(Floor);
+        FOP(Clip);
+        FOP(Erf);
+        FOP(Exp);
+        FOP(Fill);
+        FOP(Log_e);
+        FOP(Log_2);
+        FOP(Log_10);
+        FOP(Log1p);
+        FOP(L2Loss);
+        FOP(Maximum);
+        FOP(Minimum);
+        FOP(MSELoss);
+        FOP(NegTensor);
+        FOP(Power);
+        FOP(Reciprocal);
+        FOP(Sqrt);
+        FOP(Rsqrt);
+        FOP(Transform);
+        FOP(AddN);
+        FOP(MulN);
+        FOP(Cast);
+        FOP(FloorDiv);
+        FOP(FloorDivTrunc);
+        FOP(FloorMod);
+        FOP(FloorModTrunc);
+        FOP(Cumsum);
+        FOP(Cumprod);
+        FOP(Det);
+        FOP(Round);
+        FOP(Square);
+        FOP(SquaredDifference);
+        FOP(Flip);
+        FOP(Hardtanh);
+        FOP(Equal);
+        FOP(NotEqual);
+        FOP(GreaterThan);
+        FOP(GreaterEqual);
+        FOP(LessThan);
+        FOP(LessEqual);
+        FOP(And);
+        FOP(Or);
+        FOP(Xor);
+        FOP(Not);
+        FOP(Addcdiv);
+        FOP(Addcmul);
+        FOP(BitAnd);
+        FOP(BitOr);
+        FOP(BitXor);
+        FOP(BitNot);
+        FOP(BitLeftShift);
+        FOP(BitRightShift);
+        FOP(Dropout);
+        FOP(Lrn);
         //
         FOP(MemBound);
     default:
@@ -147,6 +299,13 @@ class OperatorObj : public Object {

   public:
     OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
+    OperatorObj(OpType opType);
+    void setInputs(TensorVec inputsTensor) {
+        inputs = inputsTensor;
+        for (auto &t : inputs)
+            IT_ASSERT(t != nullptr);
+    }
+    void setOutputs(TensorVec outputsTensor) { outputs = outputsTensor; }
     virtual optional<vector<Shape>>
     inferShape(const TensorVec &inputs) const = 0;
     virtual vector<DataType> inferDataType(const TensorVec &inputs) const;

@@ -158,6 +317,7 @@ class OperatorObj : public Object {
      * function.
      */
     bool checkValid(GraphObj *graph);
+    bool checkValid(GraphObj *graph, DataType type);
     OpPerfKey getOpPerfKey() const;
     /**
      * @brief Hash operator attributes. Input and output shapes are not
@@ -72,6 +72,7 @@ class TensorObj : public TensorBaseObj {
   private:
     void printDataFloat() const;
     void printDataUint32_t() const;
+    void printDataInt32_t() const;

     template <typename T>
     bool equalDataImpl(const T *a, const T *b, size_t size) const {
@@ -0,0 +1,5 @@
#pragma once

namespace infini {
void transpose_kernel(float *a, float *c, int dim0, int dim1, int dim2, int dim3, int p0, int p1, int p2, int p3);
}; // namespace infini
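For reference, a plain-C++ sketch of the semantics this declaration implies. The BANG device implementation is not part of this diff, and reading p0..p3 as "output axis k takes input axis pk" is an assumption based on the transpose usage later in this compare:

// Hypothetical host-side reference for transpose_kernel's intended behavior.
void transpose_reference(const float *a, float *c, int dim0, int dim1,
                         int dim2, int dim3, int p0, int p1, int p2, int p3) {
    const int inDims[4] = {dim0, dim1, dim2, dim3};
    const int perm[4] = {p0, p1, p2, p3};
    int outDims[4];
    for (int k = 0; k < 4; ++k)
        outDims[k] = inDims[perm[k]]; // output dim k mirrors input dim perm[k]
    for (int o0 = 0; o0 < outDims[0]; ++o0)
        for (int o1 = 0; o1 < outDims[1]; ++o1)
            for (int o2 = 0; o2 < outDims[2]; ++o2)
                for (int o3 = 0; o3 < outDims[3]; ++o3) {
                    int out[4] = {o0, o1, o2, o3};
                    int in[4];
                    for (int k = 0; k < 4; ++k)
                        in[perm[k]] = out[k];
                    size_t src = ((in[0] * (size_t)inDims[1] + in[1]) *
                                      inDims[2] + in[2]) * inDims[3] + in[3];
                    size_t dst = ((out[0] * (size_t)outDims[1] + out[1]) *
                                      outDims[2] + out[2]) * outDims[3] + out[3];
                    c[dst] = a[src];
                }
}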
@@ -0,0 +1,31 @@
#pragma once
#include "core/operator.h"

namespace infini {
class ActivationBackwardObj : public OperatorObj {
  public:
    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
                          Tensor x, Tensor diff_x);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 3; }
    int numOutputs() const override { return 1; }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};

#define DEFINE_ACTIVATION_BACKWARD_OBJ(prefix, type)                           \
    class prefix##Obj : public ActivationBackwardObj {                         \
      public:                                                                  \
        prefix##Obj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,        \
                    Tensor diff_x)                                             \
            : ActivationBackwardObj(type, graph, y, diff_y, x, diff_x) {}      \
    };

DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward)
DEFINE_ACTIVATION_BACKWARD_OBJ(SigmoidBackward, OpType::SigmoidBackward)
DEFINE_ACTIVATION_BACKWARD_OBJ(TanhBackward, OpType::TanhBackward)
}; // namespace infini
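For readers unfamiliar with the token-pasting idiom: each DEFINE_ACTIVATION_BACKWARD_OBJ line expands to a thin subclass. Written out by hand, the ReluBackward instantiation is exactly:

// Expansion of DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward);
// this is what the preprocessor generates from the macro above.
class ReluBackwardObj : public ActivationBackwardObj {
  public:
    ReluBackwardObj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,
                    Tensor diff_x)
        : ActivationBackwardObj(OpType::ReluBackward, graph, y, diff_y, x,
                                diff_x) {}
};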
@@ -86,6 +86,29 @@ class ConvObj : public ConvBaseObj {
     void setAuxilaryAttributes(PaddingMode mode) override;
 };

+class ConvBackwardFilterObj : public ConvBaseObj {
+  private:
+    ActType act;
+
+  public:
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, int ph, int pw, int sh = 1, int sw = 1,
+                          int dh = 1, int dw = 1, Tensor bias = nullptr,
+                          ActType act = ActType::None);
+    // Constructors for setting padding mode
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, PaddingMode mode = PaddingMode::Same,
+                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
+                          Tensor bias = nullptr, ActType act = ActType::None);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    ActType getAct() const { return act; }
+    int getNumGroups() const override { return c / getChannelPerGroup(); }
+
+  private:
+    void setAuxilaryAttributes(PaddingMode mode) override;
+};
+
 class ConvTransposed2dObj : public ConvBaseObj {
   private:
     int oph, opw;
@@ -0,0 +1,21 @@
#pragma once
#include "core/operator.h"

namespace infini {
class DetObj : public OperatorObj {
  public:
    enum Mode { NormalDet = 0, LogDet };
    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    Mode getMode() const { return modeValue; }

  private:
    Mode modeValue;
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
}; // namespace infini
@@ -17,6 +17,88 @@ class ElementWiseObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };

+class MSELossObj : public OperatorObj {
+  public:
+    enum Reduction { None = 0, Sum, Mean };
+    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
+               Reduction reduction, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    Reduction getReduction() const { return reductionMode; }
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    Reduction reductionMode;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddNObj : public OperatorObj {
+  public:
+    AddNObj(GraphObj *graph, int tensorNum, Tensor output, ...);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return num; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int num;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class MulNObj : public OperatorObj {
+  public:
+    MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return num; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int num;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddcdivObj : public OperatorObj {
+  public:
+    AddcdivObj(GraphObj *graph, float alpha, Tensor input0,
+               Tensor input1, Tensor input2, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+    float getAlpha() { return alphaValue; }
+
+  private:
+    float alphaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddcmulObj : public OperatorObj {
+  public:
+    AddcmulObj(GraphObj *graph, float alpha, Tensor input0,
+               Tensor input1, Tensor input2, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+    float getAlpha() { return alphaValue; }
+
+  private:
+    float alphaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                                  \
     class prefix##Obj : public ElementWiseObj {                                \
       public:                                                                  \

@@ -28,6 +110,32 @@ class ElementWiseObj : public OperatorObj {
 DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add)
 DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
 DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
+DEFINE_ELEMENT_WISE_OBJ(DivDemo, OpType::DivDemo)
+DEFINE_ELEMENT_WISE_OBJ(DivNoNan, OpType::DivNoNan)
 DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
 DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
+DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
+DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
+DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
+DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
+DEFINE_ELEMENT_WISE_OBJ(FloorDivTrunc, OpType::FloorDivTrunc)
+DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
+DEFINE_ELEMENT_WISE_OBJ(FloorModTrunc, OpType::FloorModTrunc)
+DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
+DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
+DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
+DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
+DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
+DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
+DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
+DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
+DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
+DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
+DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
+DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
+DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
+DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
+DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
+DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
+DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
 }; // namespace infini
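The DEFINE_ELEMENT_WISE_OBJ body is cut off in the first hunk, but by analogy with the activation-backward macro earlier in this compare, each line above presumably expands to a two-input wrapper. A sketch for the DivDemo line (the constructor signature is an assumption):

// Presumed expansion of DEFINE_ELEMENT_WISE_OBJ(DivDemo, OpType::DivDemo).
class DivDemoObj : public ElementWiseObj {
  public:
    DivDemoObj(GraphObj *graph, Tensor input0, Tensor input1, Tensor output)
        : ElementWiseObj(OpType::DivDemo, graph, input0, input1, output) {}
};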
@@ -0,0 +1,20 @@
#pragma once
#include "core/operator.h"

namespace infini {
class TransposeObj : public OperatorObj {
  public:
    TransposeObj(GraphObj *graph, Tensor input, Tensor output, int permute[4]);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    auto getPermute() { return transposePermute; }

  private:
    int transposePermute[4];
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
}; // namespace infini
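A minimal sketch of what TransposeObj::inferShape presumably computes (the .cc implementation is not in this diff; Shape is assumed to be a vector of ints, as elsewhere in the codebase): output dim k is the input dim selected by transposePermute[k].

optional<vector<Shape>> TransposeObj::inferShape(const TensorVec &inputs) const {
    const Shape in = inputs[0]->getDims();
    if (in.size() != 4) // only 4-D tensors, matching the fixed-size permute
        return {};
    Shape out(4);
    for (int k = 0; k < 4; ++k)
        out[k] = in[transposePermute[k]];
    return {{out}};
}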
@@ -16,6 +16,246 @@ class UnaryObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };

+class ClipObj : public OperatorObj {
+  public:
+    ClipObj(GraphObj *graph, Tensor input, Tensor output, float min, float max);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class HardtanhObj : public OperatorObj {
+  public:
+    HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min, float max);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FlipObj : public OperatorObj {
+  public:
+    FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    vector<int> getAxis() const { return axisValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> axisValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FillObj : public OperatorObj {
+  public:
+    FillObj(GraphObj *graph, Tensor input, Tensor output, float value);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getValue() const { return setValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float setValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class L2LossObj : public OperatorObj {
+  public:
+    L2LossObj(GraphObj *graph, Tensor input, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class TransformObj : public OperatorObj {
+  public:
+    TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
+                 float beta);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float alphaValue, betaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class LrnObj : public OperatorObj {
+  public:
+    LrnObj(GraphObj *graph, Tensor input, Tensor output, int feature_num, float alpha, float beta, float bias);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    std::string toString() const override;
+    int getFeatureNum() const { return featureNumValue; }
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    float getBias() const { return biasValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int featureNumValue;
+    float alphaValue;
+    float betaValue;
+    float biasValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CastObj : public OperatorObj {
+  public:
+    enum CastType {
+        Float2Half = 0,
+        Float2HalfIEEE754,
+        Float2Double,
+        Float2Int64,
+        Float2Int32,
+        Float2Int16,
+        Float2Int8,
+        Float2Bool,
+        Half2Float,
+        Half2Int32,
+        Half2Int64,
+        Half2Int16,
+        Half2Int8,
+        Half2Uint8,
+        Half2Bool,
+        Half2FloatInf,
+        Int322Float,
+        Int322Half,
+        Int322Int8,
+        Int322Int16,
+        Int162Float,
+        Int162Half,
+        Int162Int32,
+        Int82Float,
+        Int82Half,
+        Int82Int16,
+        Int82Int32,
+        Uint82Float,
+        Uint82Half,
+        Uint82Int32,
+        Uint82Int64,
+        Bool2Float,
+        Bool2Half,
+        Bool2Int32,
+        Int322Int64,
+        Int322Bool,
+        Int642Int32,
+        Int642Uint32,
+        Int642Float,
+        Int642Half,
+        Uint642Uint32,
+        Uint322Int64,
+        Uint322Uint64,
+        Double2Float
+    };
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CumsumObj : public OperatorObj {
+  public:
+    CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
+              bool exclusive, bool reverse);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int getAxis() const { return axisValue; }
+    bool getExclusive() const { return exclusiveValue; }
+    bool getReverse() const { return reverseValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int axisValue;
+    bool exclusiveValue, reverseValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ArangeObj : public OperatorObj {
+  public:
+    ArangeObj(GraphObj *graph, float start, float step, int length, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 0; }
+    int numOutputs() const override { return 1; }
+    float getStartValue() { return startValue; }
+    float getStepValue() { return stepValue; }
+    int getLength() { return lengthValue; }
+
+  private:
+    float startValue, stepValue;
+    int lengthValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+// class CumprodObj : public OperatorObj {
+//   public:
+//     CumprodObj(GraphObj *graph, Tensor input, Tensor output, int axis, bool
+//     exclusive, bool reverse); optional<vector<Shape>> inferShape(const
+//     TensorVec &inputs) const override;
+//
+//     std::string toString() const override;
+//     int getAxis() const { return axisValue; }
+//     float getExclusive() const { return exclusiveValue; }
+//     float getReverse() const { return reverseValue; }
+//     int numInputs() const override { return 1; }
+//     int numOutputs() const override { return 1; }
+//
+//   private:
+//     int axisValue;
+//     bool exclusiveValue, reverseValue;
+//     vector<int> getWorkloadVector() const override;
+//     vector<int> getOpAttrVector() const override;
+// };
+
 #define DEFINE_UNARY_OBJ(prefix, type)                                         \
     class prefix##Obj : public UnaryObj {                                      \
       public:                                                                  \

@@ -28,4 +268,33 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
 DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
 DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
 DEFINE_UNARY_OBJ(Abs, OpType::Abs)
+
+DEFINE_UNARY_OBJ(Sin, OpType::Sin)
+DEFINE_UNARY_OBJ(Cos, OpType::Cos)
+DEFINE_UNARY_OBJ(Tan, OpType::Tan)
+DEFINE_UNARY_OBJ(ASin, OpType::ASin)
+DEFINE_UNARY_OBJ(ACos, OpType::ACos)
+DEFINE_UNARY_OBJ(ATan, OpType::ATan)
+DEFINE_UNARY_OBJ(SinH, OpType::SinH)
+DEFINE_UNARY_OBJ(CosH, OpType::CosH)
+DEFINE_UNARY_OBJ(TanH, OpType::TanH)
+DEFINE_UNARY_OBJ(ASinH, OpType::ASinH)
+DEFINE_UNARY_OBJ(ACosH, OpType::ACosH)
+DEFINE_UNARY_OBJ(ATanH, OpType::ATanH)
+
+DEFINE_UNARY_OBJ(Copy, OpType::Copy)
+DEFINE_UNARY_OBJ(Ceil, OpType::Ceil)
+DEFINE_UNARY_OBJ(Floor, OpType::Floor)
+DEFINE_UNARY_OBJ(Erf, OpType::Erf)
+DEFINE_UNARY_OBJ(Exp, OpType::Exp)
+DEFINE_UNARY_OBJ(Log_e, OpType::Log_e)
+DEFINE_UNARY_OBJ(Log_2, OpType::Log_2)
+DEFINE_UNARY_OBJ(Log_10, OpType::Log_10)
+DEFINE_UNARY_OBJ(Log1p, OpType::Log1p)
+DEFINE_UNARY_OBJ(NegTensor, OpType::NegTensor)
+DEFINE_UNARY_OBJ(Reciprocal, OpType::Reciprocal)
+DEFINE_UNARY_OBJ(Sqrt, OpType::Sqrt)
+DEFINE_UNARY_OBJ(Rsqrt, OpType::Rsqrt)
+DEFINE_UNARY_OBJ(Round, OpType::Round)
+DEFINE_UNARY_OBJ(Square, OpType::Square)
 }; // namespace infini
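Cumsum's two flags are easy to misread, so here is a host-side reference for the semantics they are assumed to carry (the common convention, e.g. TensorFlow's cumsum; the CNNL kernel is expected to match):

#include <vector>
// Reference cumsum over a 1-D sequence. exclusive omits the current element;
// reverse accumulates from the back.
std::vector<float> cumsumRef(const std::vector<float> &x, bool exclusive,
                             bool reverse) {
    size_t n = x.size();
    std::vector<float> y(n, 0.0f);
    float acc = 0.0f;
    for (size_t k = 0; k < n; ++k) {
        size_t i = reverse ? n - 1 - k : k;   // walk backwards if reverse
        y[i] = exclusive ? acc : acc + x[i];  // exclusive drops x[i] itself
        acc += x[i];
    }
    return y; // {1,2,3,4} -> {1,3,6,10}; exclusive -> {0,1,3,6}
}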
@@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {

 void BangRuntimeObj::sync() const { cnrtSyncDevice(); }

+string BangRuntimeObj::toString() const { return "BANG Runtime"; }
+
 } // namespace infini
@@ -61,4 +61,4 @@ OpVec GraphObj::getComputeOps() const {
     return opList;
 };

-} // namespace infini
\ No newline at end of file
+} // namespace infini
@@ -10,6 +10,8 @@ OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs)
         IT_ASSERT(t != nullptr);
 }

+OperatorObj::OperatorObj(OpType opType) : type(opType) {}
+
 bool OperatorObj::isLinearOp() const {
     return enum_to_underlying(type) >= 100 && enum_to_underlying(type) < 200;
 }

@@ -78,6 +80,30 @@ bool OperatorObj::checkValid(GraphObj *graph) {
     return true;
 }

+bool OperatorObj::checkValid(GraphObj *graph, DataType type) {
+    auto optShapes = inferShape();
+    if (!optShapes) // shape inference failed
+        return false;
+
+    const vector<Shape> &shapes = *optShapes;
+    if (shapes.size() != outputs.size())
+        return false;
+    if (graph) { // if graph != nullptr, outputs should be created
+        auto dataTypes = vector(numOutputs(), type);
+        for (size_t i = 0; i < outputs.size(); i++) {
+            IT_ASSERT(!outputs[i]);
+            outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
+        }
+    } else { // if outputs have been created, check their shapes
+        for (size_t i = 0; i < shapes.size(); ++i) {
+            if (shapes[i] != outputs[i]->getDims())
+                return false;
+        }
+    }
+    return true;
+}
+
 optional<vector<Shape>> OperatorObj::inferShape() const {
     return inferShape(inputs);
 }
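A sketch of the intended call pattern for the new overload (the operator below is hypothetical, not part of this diff): an op whose output dtype is fixed, such as a comparison producing an Int32 mask, passes that dtype so checkValid creates the graph outputs with it instead of inheriting the input type.

// Hypothetical operator inside namespace infini; overrides omitted for brevity.
class CompareMaskObj : public OperatorObj {
  public:
    CompareMaskObj(GraphObj *graph, Tensor a, Tensor b, Tensor out)
        : OperatorObj(OpType::Equal, {a, b}, {out}) {
        IT_ASSERT(checkValid(graph, DataType::Int32)); // outputs become Int32
    }
    // ... inferShape / toString / numInputs / numOutputs as usual ...
};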
@@ -69,6 +69,8 @@ void TensorObj::printData() const {
         printDataFloat();
     else if (dtype == DataType::UInt32)
         printDataUint32_t();
+    else if (dtype == DataType::Int32)
+        printDataInt32_t();
     else
         IT_TODO_HALT();
 }
@@ -87,7 +89,7 @@ void TensorObj::printDataFloat() const {
                 std::cout << "[";
             }
         }
-        printf("%.1f", ptr[i]);
+        printf("%.6f", ptr[i]);
         for (size_t j = 0; j < numDims; ++j) {
             if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                 std::cout << "]";
@@ -128,6 +130,34 @@ void TensorObj::printDataUint32_t() const {
     }
 }

+void TensorObj::printDataInt32_t() const {
+    IT_ASSERT(data != nullptr);
+    std::cout << "Tensor: " << guid << std::endl;
+    auto numDims = shape.size();
+    auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<int32_t *>();
+    dimSzVec[numDims - 1] = shape[numDims - 1];
+    for (int i = numDims - 1; i != 0; --i)
+        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+        for (size_t j = 0; j < numDims; ++j) {
+            if (i % dimSzVec[j] == 0) {
+                std::cout << "[";
+            }
+        }
+        std::cout << ptr[i];
+        for (size_t j = 0; j < numDims; ++j) {
+            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
+                std::cout << "]";
+            }
+        }
+        if (i != size() - 1)
+            std::cout << ", ";
+        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
+            std::cout << std::endl;
+    }
+}
+
 bool TensorObj::equalData(const Tensor &rhs) const {
     IT_ASSERT(data != nullptr);
     IT_ASSERT(rhs->data != nullptr);
@@ -142,6 +172,9 @@ bool TensorObj::equalData(const Tensor &rhs) const {
     else if (getDType() == DataType::Float32)
         return equalDataImpl(getRawDataPtr<float *>(),
                              rhs->getRawDataPtr<float *>(), size());
+    else if (getDType() == DataType::Int32)
+        return equalDataImpl(getRawDataPtr<int32_t *>(),
+                             rhs->getRawDataPtr<int32_t *>(), size());
     else
         IT_TODO_HALT();
 }
@@ -155,6 +188,8 @@ void TensorObj::dataMalloc() {
         bytesPerElement = sizeof(float);
     else if (getDType() == DataType::UInt32)
         bytesPerElement = sizeof(uint32_t);
+    else if (getDType() == DataType::Int32)
+        bytesPerElement = sizeof(int32_t);
     data = runtime->allocBlob(size() * bytesPerElement);
 }
@@ -0,0 +1,161 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat =
            cnnlActivationForward(context->cnnlHandle(), opDesc, &alpha, aDesc,
                                  aData, &beta, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class RoundCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class SquareCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSquare(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class ReluCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
                "Relu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
                "Sigmoid_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tanh, DataType::Float32, TanhCnnl,
                "Tanh_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
                "Round_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Square, DataType::Float32, SquareCnnl,
                "Square_cnnl_BANG_Float32");

}; // namespace infini
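The UnaryCnnl base makes new activations cheap to wire up: a subclass supplies the CNNL mode and coefficient, and a registration line binds it to an OpType. An illustrative sketch only ("Elu" is hypothetical here, and a real port would also need an OpType entry plus an operator definition; CNNL_ACTIVATION_ELU is assumed to exist in the CNNL build used):

class EluCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_ELU; // assumption: ELU mode available in CNNL
    }
    float getCoef() const override { return 1.0; } // ELU alpha
};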
@@ -0,0 +1,94 @@
#include "operators/activation_backward.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ActivationBackwardCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ActivationBackwardObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const diffYData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const xData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const diffXData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
        checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
        checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat = cnnlActivationBackward(
            context->cnnlHandle(), opDesc, &alpha, yDesc, yData, diffYDesc,
            diffYData, xDesc, xData, &beta, diffXDesc, diffXData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(yDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(xDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffXDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class ReluBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
                ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
                SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
                TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,51 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/element_wise.h"

namespace infini {
class AddNCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddNObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array));
        }

        cnnlStatus_t stat =
            cnnlAddN(context->cnnlHandle(), descArray, argv, num, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::AddN, DataType::Float32, AddNCnnl,
                "AddN_cnnl_BANG_Float32");
}; // namespace infini
@@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ArangeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ArangeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        float start = op->getStartValue();
        float step = op->getStepValue();
        int length = op->getLength();

        cnnlTensorDescriptor_t cDesc;
        int dim_array[1] = {length};
        // get output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, 1, dim_array));

        cnnlStatus_t stat =
            cnnlArange_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                          &start, &step, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Arange, DataType::Float32, ArangeCnnl,
                "Arange_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,120 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CastCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<CastObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        cnnlCastDataType_t NlCastType;
        CastObj::CastType type = op->getType();
        switch (type) {
        case CastObj::Float2Half:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_HALF, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
            break;
        case CastObj::Float2HalfIEEE754:
        case CastObj::Float2Double:
        case CastObj::Float2Int64:
        case CastObj::Float2Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT32;
            break;
        case CastObj::Float2Int16:
        case CastObj::Float2Int8:
        case CastObj::Float2Bool:
            // Todo
            break;
        case CastObj::Half2Float:
        case CastObj::Half2Int32:
        case CastObj::Half2Int64:
        case CastObj::Half2Int16:
        case CastObj::Half2Int8:
        case CastObj::Half2Uint8:
        case CastObj::Half2Bool:
        case CastObj::Half2FloatInf:
            // todo
            break;
        case CastObj::Int322Float:
        case CastObj::Int322Half:
        case CastObj::Int322Int8:
        case CastObj::Int322Int16:
            // todo
            break;
        case CastObj::Int162Float:
        case CastObj::Int162Half:
        case CastObj::Int162Int32:
            // todo
            break;
        case CastObj::Int82Float:
        case CastObj::Int82Half:
        case CastObj::Int82Int16:
        case CastObj::Int82Int32:
            // todo
            break;
        case CastObj::Uint82Float:
        case CastObj::Uint82Half:
        case CastObj::Uint82Int32:
        case CastObj::Uint82Int64:
            // todo
            break;
        case CastObj::Bool2Float:
        case CastObj::Bool2Half:
        case CastObj::Bool2Int32:
            // todo
            break;
        case CastObj::Int322Int64:
        case CastObj::Int322Bool:
            // todo
            break;
        case CastObj::Int642Int32:
        case CastObj::Int642Uint32:
        case CastObj::Int642Float:
        case CastObj::Int642Half:
            // todo
            break;
        case CastObj::Uint642Uint32:
        case CastObj::Uint322Int64:
        case CastObj::Uint322Uint64:
            // todo
            break;
        case CastObj::Double2Float:
            // todo
            break;
        }
        // Note: for the unimplemented cases above, NlCastType is left unset
        // and the call below is not meaningful yet.
        cnnlStatus_t stat = cnnlCastDataType(context->cnnlHandle(), aDesc,
                                             aData, NlCastType, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
                "Cast_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CeilCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
                "Ceil_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ClipCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ClipObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin();
        float max = op->getMax();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
                "Clip_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,68 @@
#include "operators/concat.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ConcatCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConcatObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dim_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getInputs(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dim_array[i][0] = dim[0];
            dim_array[i][1] = dim[1];
            dim_array[i][2] = dim[2];
            dim_array[i][3] = dim[3];
        }

        auto dim = op->getOutput()->getDims();
        int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]};

        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(
            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array[i]));
        }

        size_t wsSize;
        cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlConcat(context->cnnlHandle(), num, axis, descArray, argv,
                       wsData, wsSize, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
                "Concat_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,159 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/conv.h"

namespace infini {
class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvBackwardFilterObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        int pad[4] = {ph, ph, pw, pw};
        int stride[2] = {sh, sw};
        int dilation[2] = {dh, dw};

        cnnlConvolutionDescriptor_t convDesc;
        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
        checkCnnlError(cnnlSetConvolutionDescriptor(
            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, aDescTrans, bDescTrans,
            cDescTrans;
        auto dimInputs0 = op->getInputs(0)->getDims();
        auto dimInputs1 = op->getInputs(1)->getDims();
        auto dimOutput = op->getOutput()->getDims();

        if (dimInputs0.size() != 4)
            IT_TODO_HALT();
        if (dimInputs1.size() != 4)
            IT_TODO_HALT();
        if (dimOutput.size() != 4)
            IT_TODO_HALT();

        int inputs0Array[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
                               dimInputs0[3]};
        int inputs1Array[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
                               dimInputs1[3]};
        int outputArray[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
                              dimOutput[3]};

        int inputs0ArrayTrans[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
                                    dimInputs0[1]};
        int inputs1ArrayTrans[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
                                    dimInputs1[1]};
        int outputArrayTrans[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
                                   dimOutput[1]};

        int transMode[4] = {0, 2, 3, 1};
        cnnlTransposeDescriptor_t transDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(transDesc, 4, transMode));

        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs0ArrayTrans));

        size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
                              dimInputs0[3] * sizeof(float);
        BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);

        cnnlStatus_t stat =
            cnnlTranspose(context->cnnlHandle(), transDesc, aDesc, aData,
                          aDescTrans, wsTrans1Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs1ArrayTrans));

        size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
                              dimInputs1[3] * sizeof(float);
        BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);

        stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
                             bDescTrans, wsTrans2Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));

        checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               outputArrayTrans));

        size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
                              dimOutput[3] * sizeof(float);
        BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);

        cnnlConvolutionBwdFilterAlgo_t algo;
        cnnlGetConvolutionBackwardFilterAlgorithm(
            context->cnnlHandle(), convDesc, aDescTrans, bDescTrans, cDescTrans,
            CNNL_CONVOLUTION_BWD_FILTER_FASTEST, &algo);

        size_t wsSize;
        cnnlGetConvolutionBackwardFilterWorkspaceSize(
            context->cnnlHandle(), aDescTrans, bDescTrans, cDescTrans, convDesc,
            algo, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        stat = cnnlConvolutionBackwardFilter(
            context->cnnlHandle(), NULL, aDescTrans, wsTrans1Data, bDescTrans,
            wsTrans2Data, convDesc, algo, wsData, wsSize, NULL, cDescTrans,
            wsTrans3Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        int transMode2[4] = {0, 3, 1, 2};
        cnnlTransposeDescriptor_t transOutputDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transOutputDesc));
        checkCnnlError(
            cnnlSetTransposeDescriptor(transOutputDesc, 4, transMode2));

        stat = cnnlTranspose(context->cnnlHandle(), transOutputDesc, cDescTrans,
                             wsTrans3Data, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(aDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDescTrans));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transOutputDesc));
        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
                ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
}; // namespace infini
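The kernel above shuttles every tensor NCHW -> NHWC with permutation {0, 2, 3, 1} before the cnnl call and brings the result back with {0, 3, 1, 2}. A quick standalone check (plain C++, no CNNL involved) that these two permutations really are inverses of each other:

#include <array>
#include <cassert>

int main() {
    std::array<int, 4> toNHWC{0, 2, 3, 1}; // transMode in the kernel
    std::array<int, 4> toNCHW{0, 3, 1, 2}; // transMode2 in the kernel
    for (int axis = 0; axis < 4; ++axis)
        // Composing the two maps in either order must be the identity.
        assert(toNHWC[toNCHW[axis]] == axis && toNCHW[toNHWC[axis]] == axis);
    return 0;
}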
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CopyCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCopy(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Copy, DataType::Float32, CopyCnnl,
                "Copy_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,50 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CumsumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<CumsumObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int axis = op->getAxis();
        bool exclusive = op->getExclusive();
        bool reverse = op->getReverse();

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCumsum(context->cnnlHandle(), aDesc, aData, axis, exclusive,
                       reverse, CNNL_NOT_PROPAGATE_NAN, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Cumsum, DataType::Float32, CumsumCnnl,
                "Cumsum_cnnl_BANG_Float32");

}; // namespace infini
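The exclusive/reverse flags forwarded to cnnlCumsum are assumed here to follow the usual scan conventions; a plain-C++ reference for a single 1-D slice (a semantics sketch, not the CNNL call):

#include <vector>

// exclusive shifts the running sum by one element; reverse scans from the end.
std::vector<float> cumsumRef(const std::vector<float> &x, bool exclusive,
                             bool reverse) {
    int n = (int)x.size();
    std::vector<float> out(n);
    float acc = 0.0f;
    for (int k = 0; k < n; ++k) {
        int i = reverse ? n - 1 - k : k;
        if (exclusive) {
            out[i] = acc;
            acc += x[i];
        } else {
            acc += x[i];
            out[i] = acc;
        }
    }
    return out;
}
// e.g. x = {1,2,3}: inclusive forward -> {1,3,6}; exclusive forward -> {0,1,3}.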
@ -0,0 +1,53 @@
#include "operators/det.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class DetCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<DetObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        DetObj::Mode mode = op->getMode();
        cnnlDetMode_t nlMode;
        if (mode == DetObj::LogDet) {
            nlMode = CNNL_DET_MODE_LOGDET;
        } else {
            nlMode = CNNL_DET_MODE_DET;
        }
        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 2)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[2] = {dimout[0], dimout[1]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array));

        cnnlStatus_t stat =
            cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
                "Det_cnnl_BANG_Float32");
}; // namespace infini
@ -66,6 +66,772 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
    }
};

class LogicOpCnnl : public BangKernelWithoutConfig {
    virtual cnnlLogicOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlLogicOp(context->cnnlHandle(), getOpType(),
                                        aDesc, aData, bDesc, bData,
                                        wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class BitComputeCnnl : public BangKernelWithoutConfig {
    virtual cnnlBitComputeOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        size_t wsSize;
        cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                       cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlBitCompute_v2(context->cnnlHandle(),
                                              getOpType(), aDesc, aData, bDesc,
                                              bData, cDesc, cData, wsData,
                                              wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class DivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class DivNoNanCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetDivNoNanWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlDivNoNan_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MaximumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMaximum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MinimumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMinimum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MSELossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MSELossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        MSELossObj::Reduction reduction = op->getReduction();
        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        int dim_out[4] = {1, 1, 1, 1};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        if (reduction == MSELossObj::None) {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
        } else {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out));
        }
        cnnlStatus_t stat;
        if (reduction == MSELossObj::None) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else if (reduction == MSELossObj::Sum) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_SUM, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_MEAN, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        }

        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

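The three branches above map MSELossObj::{None, Sum, Mean} onto CNNL_MSE_LOSS_{NONE, SUM, MEAN}. The reduction semantics assumed for those modes, as a plain-C++ reference (sketch, not the CNNL API):

#include <cstddef>
#include <vector>

enum class Reduction { None, Sum, Mean };

// None: out[i] = (a[i]-b[i])^2; Sum/Mean: a single reduced value.
std::vector<float> mseLossRef(const std::vector<float> &a,
                              const std::vector<float> &b, Reduction r) {
    std::vector<float> out;
    float acc = 0.0f;
    for (std::size_t i = 0; i < a.size(); ++i) {
        float d = a[i] - b[i];
        if (r == Reduction::None)
            out.push_back(d * d);
        else
            acc += d * d;
    }
    if (r == Reduction::Sum)
        out.push_back(acc);
    if (r == Reduction::Mean)
        out.push_back(acc / a.size());
    return out;
}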
class PowerCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlPow(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                    aDesc, aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorDivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlFloorDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorDivTruncCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorDivTruncWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                          cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlFloorDivTrunc(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorModCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                         cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

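FloorDiv, FloorDivTrunc, and FloorMod only disagree for operands of mixed sign: floor rounds the quotient toward negative infinity, trunc toward zero, and floor-mod takes its sign from the divisor. A standalone illustration, assuming cnnl follows these usual conventions:

#include <cmath>
#include <cstdio>

int main() {
    float a = -7.0f, b = 2.0f; // quotient -3.5
    std::printf("floor: %g\n", std::floor(a / b)); // -4 (toward -inf)
    std::printf("trunc: %g\n", std::trunc(a / b)); // -3 (toward zero)
    // Floor-mod pairs with floor division: a - floor(a/b)*b -> 1.
    std::printf("floorMod: %g\n", a - std::floor(a / b) * b);
    return 0;
}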
class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
                                              bDesc, cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSquaredDifference(context->cnnlHandle(), aDesc, aData, bDesc,
                                  bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class AddcdivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcdivObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcdivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcdiv(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));
    }
};

class AddcmulCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcmulObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcmulWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcmul(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));
    }
};

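The semantics assumed for cnnlAddcdiv and cnnlAddcmul mirror the usual addcdiv/addcmul definitions, with `alpha` scaling the second term (a sketch of the assumed math, not taken from the CNNL docs):

#include <cstddef>
#include <vector>

// addcdiv: o[i] = a[i] + alpha * b[i] / c[i]
void addcdivRef(const std::vector<float> &a, const std::vector<float> &b,
                const std::vector<float> &c, float alpha,
                std::vector<float> &o) {
    for (std::size_t i = 0; i < a.size(); ++i)
        o[i] = a[i] + alpha * b[i] / c[i];
}

// addcmul: o[i] = a[i] + alpha * b[i] * c[i]
void addcmulRef(const std::vector<float> &a, const std::vector<float> &b,
                const std::vector<float> &c, float alpha,
                std::vector<float> &o) {
    for (std::size_t i = 0; i < a.size(); ++i)
        o[i] = a[i] + alpha * b[i] * c[i];
}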
// class FloorModTruncCnnl : public BangKernelWithoutConfig {
//     void compute(const Operator &_op,
//                  const RuntimeObj *_context) const override {
//         auto op = as<ElementWiseObj>(_op);
//         auto context = dynamic_cast<const BangRuntimeObj *>(_context);
//
//         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
//         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
//         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
//
//         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
//         auto dim = op->getInputs(0)->getDims();
//         if (dim.size() != 4)
//             IT_TODO_HALT();
//
//         int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
//         // get inputs
//         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         // get outputs
//         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         size_t wsSize;
//         cnnlGetFloorModTruncWorkspaceSize(context->cnnlHandle(), aDesc,
//                                           bDesc, cDesc, &wsSize);
//
//         BangPtr wsData = context->getWorkspace(wsSize);
//
//         cnnlStatus_t stat = cnnlFloorModTrunc(context->cnnlHandle(),
//                                               aDesc, aData, bDesc, bData,
//                                               cDesc, cData, wsData, wsSize);
//         if (stat != CNNL_STATUS_SUCCESS)
//             return;
//
//         // Destruction in BANG does not require sync. But cnnl does not
//         // state whether sync is required before destruction.
//         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
//         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
//         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
//     }
// };

class AddCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
};

@ -88,6 +854,56 @@ class ElementWiseBang : public BangKernelWithoutConfig {
    }
};

class EqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_EQ; }
};
class NotEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NE; }
};
class GreaterThanCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GT; }
};
class GreaterEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GE; }
};
class LessThanCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LT; }
};
class LessEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LE; }
};
class AndCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_AND; }
};
class OrCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_OR; }
};
class XorCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_XOR; }
};
class NotCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NOT; }
};

class BitAndCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BAND_OP; }
};
class BitOrCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BOR_OP; }
};
class BitXorCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BXOR_OP; }
};
class BitNotCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_BNOT_OP; }
};
// class BitLeftShiftCnnl : public BitComputeCnnl {
//     cnnlBitComputeOp_t getOpType() const override {
//         return CNNL_BLEFT_SHIFT_OP_V2;
//     }
// };
// class BitRightShiftCnnl : public BitComputeCnnl {
//     cnnlBitComputeOp_t getOpType() const override {
//         return CNNL_BLEFT_SHIFT_OP_V2;
//     }
// };

REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
@ -95,8 +911,69 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
                "Div_Bang_Float32");
REGISTER_KERNEL(Device::BANG, OpType::DivDemo, DataType::Float32,
                ElementWiseBang, "DivDemo_Bang_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
                "Div_cnnl_Float32");
REGISTER_KERNEL(Device::BANG, OpType::DivNoNan, DataType::Float32, DivNoNanCnnl,
                "DivNoNan_cnnl_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Maximum, DataType::Float32, MaximumCnnl,
                "Maximum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Minimum, DataType::Float32, MinimumCnnl,
                "Minimum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
                "MSELoss_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Power, DataType::Float32, PowerCnnl,
                "Power_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
                "FloorDiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorDivTrunc, DataType::Float32,
                FloorDivTruncCnnl, "FloorDivTrunc_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
                "FloorMod_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
                SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
                "Equal_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::NotEqual, DataType::Float32, NotEqualCnnl,
                "NotEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::GreaterThan, DataType::Float32,
                GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::GreaterEqual, DataType::Float32,
                GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::LessThan, DataType::Float32, LessThanCnnl,
                "LessThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::LessEqual, DataType::Float32,
                LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
                "And_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
                "Or_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
                "Xor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
                "Not_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::Addcdiv, DataType::Float32, AddcdivCnnl,
                "Addcdiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Addcmul, DataType::Float32, AddcmulCnnl,
                "Addcmul_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::BitAnd, DataType::Float32, BitAndCnnl,
                "BitAnd_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitOr, DataType::Float32, BitOrCnnl,
                "BitOr_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitXor, DataType::Float32, BitXorCnnl,
                "BitXor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitNot, DataType::Float32, BitNotCnnl,
                "BitNot_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
//                 BitLeftShiftCnnl, "BitLeftShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
//                 BitRightShiftCnnl, "BitRightShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::FloorModTrunc, DataType::Float32,
//                 FloorModTruncCnnl, "FloorModTrunc_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
//                 ElementWiseBang, "Pow_Bang_Float32");
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ErfCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
                "Erf_cnnl_BANG_Float32");

}; // namespace infini
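cnnlErf_v2 is assumed to apply the Gauss error function erf(x) = (2/sqrt(pi)) * integral from 0 to x of e^(-t^2) dt elementwise, i.e. to agree with std::erf on the host; a one-line reference check (sketch):

#include <cmath>
#include <cstdio>

int main() {
    // std::erf is the host-side reference point: erf(1.0) ~= 0.842701.
    std::printf("%f\n", std::erf(1.0));
    return 0;
}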
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ExpCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
                "Exp_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FillCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<FillObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float value = op->getValue();

        cnnlTensorDescriptor_t cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFill(context->cnnlHandle(), value, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Fill, DataType::Float32, FillCnnl,
                "Fill_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FlipCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<FlipObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        vector<int> axis = op->getAxis();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // Flip preserves the shape, so aDesc doubles as the output descriptor.
        cnnlStatus_t stat = cnnlFlip(context->cnnlHandle(), axis.data(),
                                     axis.size(), aDesc, aData, aDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Flip, DataType::Float32, FlipCnnl,
                "Flip_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FloorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Floor, DataType::Float32, FloorCnnl,
                "Floor_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class HardtanhCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<HardtanhObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin();
        float max = op->getMax();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
                                         max, min, aDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Hardtanh, DataType::Float32,
                HardtanhCnnl, "Hardtanh_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class L2LossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<L2LossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::L2Loss, DataType::Float32, L2LossCnnl,
                "L2Loss_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,62 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class LogCnnl : public BangKernelWithoutConfig {
    virtual cnnlLogBase_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       getOpType(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class LogECnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_E; }
};
class Log2Cnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_2; }
};
class Log10Cnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_10; }
};

REGISTER_KERNEL(Device::BANG, OpType::Log_e, DataType::Float32, LogECnnl,
                "Loge_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Log_2, DataType::Float32, Log2Cnnl,
                "Log2_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Log_10, DataType::Float32, Log10Cnnl,
                "Log10_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class Log1pCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlLog1p(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                      aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Log1p, DataType::Float32, Log1pCnnl,
                "Log1p_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,54 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class LrnCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<LrnObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int lrn_n = op->getFeatureNum();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        float bias = op->getBias();

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetLrnWorkspaceSize(context->cnnlHandle(), aDesc, cDesc, lrn_n,
                                &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlLrn(context->cnnlHandle(), CNNL_LRN_CROSS_CHANNEL,
                    (unsigned int)lrn_n, double(alpha), double(beta),
                    double(bias), wsData, wsSize, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Lrn, DataType::Float32, LrnCnnl,
                "Lrn_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,51 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/element_wise.h"

namespace infini {
class MulNCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MulNObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array));
        }

        cnnlStatus_t stat =
            cnnlMulN(context->cnnlHandle(), descArray, argv, num, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MulN, DataType::Float32, MulNCnnl,
                "MulN_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class NegTensorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::NegTensor, DataType::Float32,
                NegTensorCnnl, "NegTensor_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,65 @@
#include "operators/pad.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class PadCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PadObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        // The unpadded input dims seed aDesc; the padded dims computed below
        // seed cDesc.
        auto dim = op->getInputs(0)->getDims();
        int dim_size = dim.size();
        int dim_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dim_array[i] = dim[i];
        }
        int paddings[dim_size * 2];
        std::vector<int> pads = op->getPads();
        if (pads.size() == 2 && dim_size != 1) {
            // A two-element pad list is broadcast to every dimension.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[0];
                paddings[i + 1] = pads[1];
            }
        } else {
            // ONNX-style pads: all leading pads first, then all trailing pads.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[i / 2];
                paddings[i + 1] = pads[i / 2 + dim_size];
            }
        }
        int dimout_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1];
        }
        float paddingValue = 0.0;
        // input
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array));
        // output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, dim_size,
                                               dimout_array));

        cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
                                    paddings, &paddingValue, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Pad, DataType::Float32, PadCnnl,
                "Pad_cnnl_BANG_Float32");

}; // namespace infini
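The padding branch above is the subtle part: cnnlPad takes the pad list interleaved per dimension (entry 2i pads before dimension i, entry 2i + 1 after it), while the operator stores ONNX-style pads with all leading values first and all trailing values second. A standalone sketch of that mapping, assuming a full-length pad list (hypothetical helper, not part of this commit):

#include <vector>

// [b0, b1, ..., e0, e1, ...]  ->  [b0, e0, b1, e1, ...]
std::vector<int> interleavePads(const std::vector<int> &pads, int ndim) {
    std::vector<int> out(2 * ndim);
    for (int i = 0; i < ndim; ++i) {
        out[2 * i] = pads[i];            // pad before dimension i
        out[2 * i + 1] = pads[i + ndim]; // pad after dimension i
    }
    return out;
}

// interleavePads({1, 0, 2, 0, 1, 0, 2, 0}, 4) yields {1, 1, 0, 0, 2, 2, 0, 0}.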
@ -0,0 +1,73 @@
#include "operators/pooling.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class poolingCnnl : public BangKernelWithoutConfig {
    virtual cnnlPoolingMode_t getPoolingMode() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PoolingObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outData = (op->getOutput()->getRawDataPtr<void *>());

        const auto [n, c, h, w, kh, kw] = op->getNCHWRS();
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        // get inputs
        int inArray[4] = {n, c, h, w};
        cnnlTensorDescriptor_t inDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
        checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, inArray));

        // get pooling descriptor
        cnnlPoolingDescriptor_t poolingDesc;
        checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
        checkCnnlError(cnnlSetPooling2dDescriptor_v2(
            poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
            ph, pw, pw, sh, sw, dh, dw, false));

        // get outputs
        auto outVec = op->getOutput()->getDims();
        int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
        cnnlTensorDescriptor_t outDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
        checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, outArray));
        size_t wsSize;
        cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
                                    outVec[3], outVec[2], &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        float alpha = 1.f, beta = 0.f;
        checkCnnlError(cnnlPoolingForward(context->cnnlHandle(), poolingDesc,
                                          &alpha, inDesc, inData, &beta,
                                          outDesc, outData, wsData, wsSize));

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
        checkCnnlError(cnnlDestroyPoolingDescriptor(poolingDesc));
    }
};

class maxPoolCnnl : public poolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_MAX;
    }
};

class avgPoolCnnl : public poolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MaxPool, DataType::Float32, maxPoolCnnl,
                "MaxPool_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AvgPool, DataType::Float32, avgPoolCnnl,
                "AvgPool_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ReciprocalCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Reciprocal, DataType::Float32,
                ReciprocalCnnl, "Reciprocal_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class RsqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                         aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Rsqrt, DataType::Float32, RsqrtCnnl,
                "Rsqrt_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,69 @@
#include "operators/split.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class SplitCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<SplitObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numOutputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
        }
        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dimout_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getOutput(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dimout_array[i][0] = dim[0];
            dimout_array[i][1] = dim[1];
            dimout_array[i][2] = dim[2];
            dimout_array[i][3] = dim[3];
        }
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4) {
            IT_TODO_HALT();
        }
        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dimout_array[i]));
        }

        size_t wsSize;
        cnnlGetSplitWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSplit(context->cnnlHandle(), num, axis, desc, inputData,
                      wsData, wsSize, descArray, argv);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Split, DataType::Float32, SplitCnnl,
                "Split_cnnl_BANG_Float32");
}; // namespace infini
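The per-output descriptor array is what lets cnnlSplit emit chunks of different sizes; for the common even split the shape bookkeeping reduces to dividing one axis. A minimal sketch of that shape math under the kernel's 4-D restriction (hypothetical helper, assuming the split divides evenly; not part of this commit):

#include <array>
#include <vector>

// Shapes of `num` equal chunks of `in` along `axis`.
std::vector<std::array<int, 4>> splitShapes(std::array<int, 4> in, int axis,
                                            int num) {
    std::vector<std::array<int, 4>> out(num, in);
    for (auto &s : out)
        s[axis] = in[axis] / num;
    return out;
}

// splitShapes({2, 8, 4, 4}, 1, 2) yields two chunks of {2, 4, 4, 4}.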
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class SqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                        aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sqrt, DataType::Float32, SqrtCnnl,
                "Sqrt_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class TransformCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransformObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t cDesc;
        auto dim = op->getOutput()->getDims();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlTransform(context->cnnlHandle(), &alpha, cDesc,
                                          aData, &beta, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Transform, DataType::Float32,
                TransformCnnl, "Transform_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,60 @@
#include "operators/transpose.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class TransposeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 4)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array));

        // get op descriptor
        auto permute = op->getPermute();
        cnnlTransposeDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));

        size_t wsSize;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc,
                                      &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlTranspose_v2(context->cnnlHandle(), opDesc, aDesc, aData,
                             cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Transpose, DataType::Float32,
                TransposeCnnl, "Transpose_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,184 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class TrigonCnnl : public BangKernelWithoutConfig {
    virtual cnnlTrigonFunctionMode_t getOpType() const = 0;
    virtual cnnlComputationPreference_t getPrefer() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlTrigonDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
        checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));

        cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
                                              aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc));
    }
};

class SinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class SinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sin, DataType::Float32, SinCnnl,
                "Sin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Cos, DataType::Float32, CosCnnl,
                "Cos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tan, DataType::Float32, TanCnnl,
                "Tan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASin, DataType::Float32, ASinCnnl,
                "ASin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACos, DataType::Float32, ACosCnnl,
                "ACos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATan, DataType::Float32, ATanCnnl,
                "ATan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SinH, DataType::Float32, SinHCnnl,
                "SinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::CosH, DataType::Float32, CosHCnnl,
                "CosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanH, DataType::Float32, TanHCnnl,
                "TanH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASinH, DataType::Float32, ASinHCnnl,
                "ASinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACosH, DataType::Float32, ACosHCnnl,
                "ACosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATanH, DataType::Float32, ATanHCnnl,
                "ATanH_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,50 @@
#include "operators/unary.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {
class LrnCudnn : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<LrnObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int lrn_n = op->getFeatureNum();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        float bias = op->getBias();

        cudnnTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, dim[0],
                                                   dim[1], dim[2], dim[3]));
        checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, dim[0],
                                                   dim[1], dim[2], dim[3]));

        cudnnLRNDescriptor_t lrn_desc;
        checkCudnnError(cudnnCreateLRNDescriptor(&lrn_desc));
        checkCudnnError(cudnnSetLRNDescriptor(lrn_desc, (unsigned int)lrn_n,
                                              (double)alpha, (double)beta,
                                              (double)bias));
        // The scalars passed to cudnnLRNCrossChannelForward are cuDNN
        // blending factors (result = blendAlpha * LRN(x) + blendBeta * y),
        // not the LRN parameters; those are carried by lrn_desc.
        float blendAlpha = 1.f, blendBeta = 0.f;
        cudnnStatus_t stat = cudnnLRNCrossChannelForward(
            context->cudnnHandle(), lrn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1,
            &blendAlpha, aDesc, aData, &blendBeta, cDesc, cData);
        if (stat != CUDNN_STATUS_SUCCESS)
            return;

        // Destroying descriptors in CUDA does not require sync. But cuDNN
        // does not state whether a sync is required before the destroy calls.
        checkCudnnError(cudnnDestroyLRNDescriptor(lrn_desc));
        checkCudnnError(cudnnDestroyTensorDescriptor(aDesc));
        checkCudnnError(cudnnDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Lrn, DataType::Float32, LrnCudnn,
                "Lrn_cudnn_CUDA_Float32");

}; // namespace infini
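For reference on the blend scalars used above: cuDNN forward routines compute result = blendAlpha * op(x) + blendBeta * y, so blendAlpha = 1 and blendBeta = 0 simply overwrite the destination. The LRN parameters themselves live in lrn_desc; by the documented cuDNN convention (stated here from the library docs, not verified against this commit) the per-channel response is

    y_c = x_c / (bias + (alpha / n) * sum over the n-wide channel window of x^2)^beta

with alpha divided by the window width inside the library.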
@ -0,0 +1,24 @@
#include "operators/transpose.h"
#include "cuda/cuda_transpose.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {

class TransposeCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
        float *const cData = (op->getOutput()->getRawDataPtr<float *>());

        auto dim = op->getInputs(0)->getDims();
        int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
        auto permute = op->getPermute();
        transpose_kernel(aData, cData, n, c, h, w, permute[0], permute[1],
                         permute[2], permute[3]);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
                TransposeCuda, "Transpose_CUDA_Float32");
}; // namespace infini
@ -0,0 +1,43 @@
#include "cuda/cuda_transpose.h"
#include <stdio.h>
#include <math.h>

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

__global__ void _transpose_kernel(float *a, float *c, int dim_0, int dim_1,
                                  int dim_2, int dim_3, int p_0, int p_1,
                                  int p_2, int p_3) {

    int src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
    int stride_dim[4] = {dim_1 * dim_2 * dim_3, dim_2 * dim_3, dim_3, 1};
    int permute[4] = {p_0, p_1, p_2, p_3};
    int dst_dim[4] = {src_dim[p_0], src_dim[p_1], src_dim[p_2], src_dim[p_3]};
    int n = dim_0 * dim_1 * dim_2 * dim_3;

    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        // Decompose the linear destination index into 4-D destination
        // coordinates, then map each coordinate through the permutation onto
        // the source stride it walks over.
        int c0_index = i / (dst_dim[1] * dst_dim[2] * dst_dim[3]);
        int c1_index = (i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) /
                       (dst_dim[2] * dst_dim[3]);
        int c2_index = ((i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) %
                        (dst_dim[2] * dst_dim[3])) /
                       dst_dim[3];
        int c3_index = ((i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) %
                        (dst_dim[2] * dst_dim[3])) %
                       dst_dim[3];
        int new_0 = c0_index * stride_dim[permute[0]];
        int new_1 = c1_index * stride_dim[permute[1]];
        int new_2 = c2_index * stride_dim[permute[2]];
        int new_3 = c3_index * stride_dim[permute[3]];
        int src_address = new_0 + new_1 + new_2 + new_3;
        c[i] = a[src_address];
    }
}

namespace infini {
void transpose_kernel(float *a, float *c, int dim_0, int dim_1, int dim_2,
                      int dim_3, int p_0, int p_1, int p_2, int p_3) {
    int blocksize = block_work_size();
    int gridsize = (dim_0 * dim_1 * dim_2 * dim_3 + block_work_size() - 1) /
                   block_work_size();
    // CUDA launch syntax is <<<gridDim, blockDim>>>.
    _transpose_kernel<<<gridsize, blocksize>>>(a, c, dim_0, dim_1, dim_2,
                                               dim_3, p_0, p_1, p_2, p_3);
}

}; // namespace infini
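As a sanity check on the destination-index decomposition in _transpose_kernel, the same arithmetic can be run on the host. This is a minimal sketch (hypothetical harness, not part of this commit) that mirrors the kernel's coordinate math and checks one element of a 2x3x4x5 NCHW-to-NHWC permute:

#include <cassert>
#include <vector>

// Host-side reference for the kernel's index math (hypothetical helper).
std::vector<float> transpose_ref(const std::vector<float> &a, int d0, int d1,
                                 int d2, int d3, int p[4]) {
    int src_dim[4] = {d0, d1, d2, d3};
    int stride[4] = {d1 * d2 * d3, d2 * d3, d3, 1};
    int dst[4] = {src_dim[p[0]], src_dim[p[1]], src_dim[p[2]], src_dim[p[3]]};
    std::vector<float> c(a.size());
    for (int i = 0; i < (int)a.size(); ++i) {
        int i0 = i / (dst[1] * dst[2] * dst[3]);
        int i1 = i % (dst[1] * dst[2] * dst[3]) / (dst[2] * dst[3]);
        int i2 = i % (dst[2] * dst[3]) / dst[3];
        int i3 = i % dst[3];
        c[i] = a[i0 * stride[p[0]] + i1 * stride[p[1]] + i2 * stride[p[2]] +
                 i3 * stride[p[3]]];
    }
    return c;
}

int main() {
    int p[4] = {0, 2, 3, 1}; // NCHW -> NHWC
    std::vector<float> a(2 * 3 * 4 * 5);
    for (size_t i = 0; i < a.size(); ++i)
        a[i] = float(i);
    auto c = transpose_ref(a, 2, 3, 4, 5, p);
    // Output element (n=1, h=2, w=3, c=1) must equal input element
    // (n=1, c=1, h=2, w=3).
    assert(c[((1 * 4 + 2) * 5 + 3) * 3 + 1] ==
           a[((1 * 3 + 1) * 4 + 2) * 5 + 3]);
    return 0;
}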
@ -0,0 +1,38 @@
#include "operators/activation_backward.h"

namespace infini {
ActivationBackwardObj::ActivationBackwardObj(OpType type, GraphObj *graph,
                                             Tensor y, Tensor diff_y, Tensor x,
                                             Tensor diff_x)
    : OperatorObj(type, {y, diff_y, x}, {diff_x}) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
ActivationBackwardObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string ActivationBackwardObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ActivationBackwardObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ActivationBackwardObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

}; // namespace infini
@ -180,4 +180,79 @@ void ConvTransposed2dObj::setAuxilaryAttributes(PaddingMode mode) {
}
}

void ConvBackwardFilterObj::setAuxilaryAttributes(PaddingMode mode) {
    const Tensor &inputX = inputs[0];
    const Tensor &diffY = inputs[1];
    n = inputX->getDims()[0], c = inputX->getDims()[1],
    h = inputX->getDims()[2], w = inputX->getDims()[3],
    f = diffY->getDims()[0], r = diffY->getDims()[2],
    s = diffY->getDims()[3];
    if (mode == PaddingMode::Same) {
        int oh = h / sh;
        int ow = w / sw;
        ph = (h - oh * sh + (r - sh) * dh) / 2;
        pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (mode == PaddingMode::Valid) {
        ph = pw = 0;
    }
}

ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
                                             Tensor diffY, Tensor diffW,
                                             int ph, int pw, int sh, int sw,
                                             int dh, int dw, Tensor bias,
                                             ActType act)
    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, ph, pw, sh, sw, dh, dw,
                  inputX, diffY),
      act(act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(PaddingMode::Other);
    IT_ASSERT(checkValid(graph));
}

ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
                                             Tensor diffY, Tensor diffW,
                                             PaddingMode mode, int sh, int sw,
                                             int dh, int dw, Tensor bias,
                                             ActType act)
    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, mode, sh, sw, dh, dw,
                  inputX, diffY),
      act(act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(mode);
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
ConvBackwardFilterObj::inferShape(const TensorVec &inputs) const {
    const auto &inputX = inputs[0], &diffY = inputs[1];
    auto n = inputX->getDims()[0];
    auto h = inputX->getDims()[2];
    auto w = inputX->getDims()[3];
    auto f = diffY->getDims()[0];
    auto r = diffY->getDims()[2];
    auto s = diffY->getDims()[3];
    int on = n, oc = f;
    int oh = 0, ow = 0;
    // For NCHW+FCRS layout, C of input is divisible by C of weight
    if (inputX->getDims()[1] % diffY->getDims()[1] != 0)
        return {};
    // Set padding size
    if (padding == PaddingMode::Other) {
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    } else if (padding == PaddingMode::Same) {
        oh = h / sh;
        ow = w / sw;
        // ph = (h - oh * sh + (r - sh) * dh) / 2;
        // pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (padding == PaddingMode::Valid) {
        int ph = 0;
        int pw = 0;
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    }
    return {{{on, oc, oh, ow}}};
}

} // namespace infini
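As a concrete instance of the PaddingMode::Other branch above: with h = 32, r = 3, sh = dh = 1 and ph = 1, the code gives oh = (h - (r - sh) * dh + ph * 2) / sh = (32 - 2 + 2) / 1 = 32, i.e. a stride-1, dilation-1, pad-1 3x3 window preserves the spatial size. For sh = dh = 1 this coincides with the usual convolution shape rule oh = (h + 2 * ph - dh * (r - 1) - 1) / sh + 1 = (32 + 2 - 2 - 1) / 1 + 1 = 32.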
@ -0,0 +1,43 @@
#include "operators/det.h"

namespace infini {
DetObj::DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode)
    : OperatorObj(OpType::Det, {input}, {output}), modeValue(mode) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> DetObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    auto input = A->getDims();
    int length = input.size();
    if (length == 2) {
        std::vector<int> output = {1};
        return {{output}};
    } else {
        std::vector<int> output(input.begin(), input.end() - 2);
        return {{output}};
    }
}

std::string DetObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> DetObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> DetObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

}; // namespace infini
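DetObj::inferShape collapses the trailing two (matrix) dimensions: a plain [n, n] matrix maps to the scalar shape {1}, and any leading dimensions are kept as batch dimensions. A minimal host-side check of that rule (hypothetical harness, not part of this commit):

#include <cassert>
#include <vector>

std::vector<int> detShape(const std::vector<int> &in) {
    if (in.size() == 2)
        return {1}; // a single matrix reduces to a scalar
    return std::vector<int>(in.begin(), in.end() - 2); // keep batch dims
}

int main() {
    assert((detShape({3, 3}) == std::vector<int>{1}));
    assert((detShape({4, 3, 3}) == std::vector<int>{4}));
    return 0;
}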
@ -54,4 +54,220 @@ vector<int> ElementWiseObj::getOpAttrVector() const {
|
|||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
MSELossObj::MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
|
||||
Reduction reduction, Tensor output)
|
||||
: OperatorObj(OpType::MSELoss, {input0, input1}, {output}),
|
||||
reductionMode(reduction) {
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> MSELossObj::inferShape(const TensorVec &inputs) const {
|
||||
const auto A = inputs[0], B = inputs[1];
|
||||
if (A->getDims().size() != B->getDims().size() ||
|
||||
A->getDims() != B->getDims())
|
||||
return {};
|
||||
|
||||
if (reductionMode == None) {
|
||||
return {{A->getDims()}};
|
||||
} else {
|
||||
Shape temp = {1};
|
||||
return {{temp}};
|
||||
}
|
||||
}
|
||||
|
||||
std::string MSELossObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "input1=" << inputs[1]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> MSELossObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> MSELossObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
AddNObj::AddNObj(GraphObj *graph, int tensorNum, Tensor output, ...)
|
||||
: OperatorObj(OpType::AddN), num(tensorNum) {
|
||||
TensorVec temp;
|
||||
Tensor *start = &output;
|
||||
++start;
|
||||
for (int i = 0; i < num; ++i) {
|
||||
temp.push_back(*start);
|
||||
start++;
|
||||
}
|
||||
setOutputs({output});
|
||||
setInputs(temp);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> AddNObj::inferShape(const TensorVec &inputs) const {
|
||||
// For now,we only process the same dims here, broardcast will be considered
|
||||
// in the opt layer.
|
||||
const auto A = inputs[0];
|
||||
return {{A->getDims()}};
|
||||
}
|
||||
|
||||
std::string AddNObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> AddNObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> AddNObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
MulNObj::MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...)
|
||||
: OperatorObj(OpType::MulN), num(tensorNum) {
|
||||
TensorVec temp;
|
||||
Tensor *start = &output;
|
||||
++start;
|
||||
for (int i = 0; i < num; ++i) {
|
||||
temp.push_back(*start);
|
||||
start++;
|
||||
}
|
||||
setOutputs({output});
|
||||
setInputs(temp);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> MulNObj::inferShape(const TensorVec &inputs) const {
|
||||
// For now,we only process the same dims here, broardcast will be considered
|
||||
// in the opt layer.
|
||||
const auto A = inputs[0];
|
||||
return {{A->getDims()}};
|
||||
}
|
||||
|
||||
std::string MulNObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> MulNObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> MulNObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}

AddcdivObj::AddcdivObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcdiv, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
AddcdivObj::inferShape(const TensorVec &inputs) const {
    // For now we only handle identical dims; broadcasting will be handled
    // in the opt layer. All three inputs must share one shape.
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};

    return {{A->getDims()}};
}

std::string AddcdivObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

// use output dim or inputs dim?
vector<int> AddcdivObj::getWorkloadVector() const {
    vector<int> ret = outputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> AddcdivObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

AddcmulObj::AddcmulObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcmul, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
AddcmulObj::inferShape(const TensorVec &inputs) const {
    // For now we only handle identical dims; broadcasting will be handled
    // in the opt layer. All three inputs must share one shape.
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};

    return {{A->getDims()}};
}

std::string AddcmulObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

// use output dim or inputs dim?
vector<int> AddcmulObj::getWorkloadVector() const {
    vector<int> ret = outputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> AddcmulObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
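
// Semantics note (assuming the usual PyTorch-style definitions, which the
// CNNL kernels are expected to match):
//   addcdiv: out = input0 + alpha * (input1 / input2)
//   addcmul: out = input0 + alpha * (input1 * input2)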

} // namespace infini

@ -0,0 +1,47 @@
#include "operators/transpose.h"

namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
                           int permute[4])
    : OperatorObj(OpType::Transpose, {input}, {output}) {
    transposePermute[0] = permute[0];
    transposePermute[1] = permute[1];
    transposePermute[2] = permute[2];
    transposePermute[3] = permute[3];
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
TransposeObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    auto input = A->getDims();
    auto output = input;

    for (int i = 0; i < 4; ++i) {
        output[i] = input[transposePermute[i]];
    }
    return {{output}};
}
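
// Example: for an NCHW input of dims {1, 3, 32, 32}, permute {0, 2, 3, 1}
// yields output dims {1, 32, 32, 3}, since output[i] = input[permute[i]].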

std::string TransposeObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> TransposeObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> TransposeObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
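
// Usage sketch (`g` and `x` are hypothetical Graph/Tensor names):
//   int perm[4] = {0, 2, 3, 1};
//   auto tr = g->addOp<TransposeObj>(x, nullptr, perm);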

} // namespace infini

@ -32,4 +32,342 @@ vector<int> UnaryObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output, float min,
                 float max)
    : OperatorObj(OpType::Clip, {input}, {output}), minValue(min),
      maxValue(max) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> ClipObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string ClipObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ClipObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ClipObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
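
// Clip clamps element-wise: y = min(max(x, minValue), maxValue), so the
// output shape (and the workload vector above) depends only on the dims.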

HardtanhObj::HardtanhObj(GraphObj *graph, Tensor input, Tensor output,
                         float min, float max)
    : OperatorObj(OpType::Hardtanh, {input}, {output}), minValue(min),
      maxValue(max) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> HardtanhObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string HardtanhObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> HardtanhObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> HardtanhObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
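
// Hardtanh applies the same clamp as Clip, y = min(max(x, minValue),
// maxValue); it is kept as a separate OpType, presumably so it can be
// lowered to the dedicated CNNL activation kernel.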

FlipObj::FlipObj(GraphObj *graph, Tensor input, Tensor output,
                 vector<int> axis)
    : OperatorObj(OpType::Flip, {input}, {output}), axisValue(axis) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> FlipObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string FlipObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> FlipObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> FlipObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

FillObj::FillObj(GraphObj *graph, Tensor input, Tensor output, float value)
    : OperatorObj(OpType::Fill, {input}, {output}), setValue(value) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> FillObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string FillObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> FillObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> FillObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

L2LossObj::L2LossObj(GraphObj *graph, Tensor input, Tensor output)
    : OperatorObj(OpType::L2Loss, {input}, {output}) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> L2LossObj::inferShape(const TensorVec &inputs) const {
    Shape temp = {1};
    return {{temp}};
}

std::string L2LossObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> L2LossObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> L2LossObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
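
// L2Loss reduces the whole input to a single scalar, hence the fixed {1}
// output shape in inferShape. The usual definition (e.g. tf.nn.l2_loss) is
// sum(x^2) / 2, which the BANG kernel is assumed to follow.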

TransformObj::TransformObj(GraphObj *graph, Tensor input, Tensor output,
                           float alpha, float beta)
    : OperatorObj(OpType::Transform, {input}, {output}), alphaValue(alpha),
      betaValue(beta) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
TransformObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string TransformObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> TransformObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> TransformObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type)
    : OperatorObj(OpType::Cast, {input}, {output}), castType(type) {
    IT_ASSERT(checkValid(graph, DataType::Int32));
}

optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string CastObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> CastObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> CastObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
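
// Cast keeps the input shape and only changes the element type. The
// constructor validates against DataType::Int32 because the only CastType
// exercised so far (see test_cast.cc below) is CastObj::Float2Int32.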

CumsumObj::CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
                     bool exclusive, bool reverse)
    : OperatorObj(OpType::Cumsum, {input}, {output}), axisValue(axis),
      exclusiveValue(exclusive), reverseValue(reverse) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> CumsumObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string CumsumObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> CumsumObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> CumsumObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
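
// Flag semantics, assuming the usual cumsum convention: for input
// [1, 2, 3] along the chosen axis,
//   exclusive=false, reverse=false -> [1, 3, 6]
//   exclusive=true,  reverse=false -> [0, 1, 3]
//   exclusive=false, reverse=true  -> [6, 5, 3]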

// CumprodObj::CumprodObj(GraphObj *graph, Tensor input, Tensor output, int
// axis, bool exclusive, bool reverse)
//     : OperatorObj(OpType::Cumprod, {input}, {output}), axisValue(axis),
//     exclusiveValue(exclusive), reverseValue(reverse) {
//     IT_ASSERT(checkValid(graph));
// }
//
// optional<vector<Shape>> CumprodObj::inferShape(const TensorVec &inputs) const
// {
//     const auto A = inputs[0];
//     return {{A->getDims()}};
// }
//
// std::string CumprodObj::toString() const {
//     std::ostringstream os;
//     os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
//     os << "(";
//     os << "output=" << outputs[0]->getGuid() << ")";
//     return os.str();
// }
//
// vector<int> CumprodObj::getWorkloadVector() const {
//     vector<int> ret{enum_to_underlying(type)};
//     const Shape shape = outputs[0]->getDims();
//     ret.insert(ret.end(), shape.begin(), shape.end());
//     return ret;
// }
//
// vector<int> CumprodObj::getOpAttrVector() const {
//     return {enum_to_underlying(type)};
// }

ArangeObj::ArangeObj(GraphObj *graph, float start, float step, int length,
                     Tensor output)
    : OperatorObj(OpType::Arange, {}, {output}), startValue(start),
      stepValue(step), lengthValue(length) {
    IT_ASSERT(checkValid(graph, DataType::Float32));
}

optional<vector<Shape>> ArangeObj::inferShape(const TensorVec &inputs) const {
    Shape temp = {lengthValue};
    return {{temp}};
}

std::string ArangeObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(outputs[0]->getDims()) << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ArangeObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ArangeObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
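
// Example (matches test_arange.cc below): start=0.0, step=2.0, length=10
// produces a 1-D tensor of shape {10} holding 0, 2, 4, ..., 18.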

LrnObj::LrnObj(GraphObj *graph, Tensor input, Tensor output, int feature_num,
               float alpha, float beta, float bias)
    : OperatorObj(OpType::Lrn, {input}, {output}),
      featureNumValue(feature_num), alphaValue(alpha), betaValue(beta),
      biasValue(bias) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> LrnObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string LrnObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> LrnObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> LrnObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
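
// LRN presumably follows the standard local response normalization,
//   b[i] = a[i] / (bias + (alpha / n) * sum_{j in window} a[j]^2)^beta,
// with featureNumValue giving the window size n across channels.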

} // namespace infini

@ -0,0 +1,56 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/activation_backward.h"
#include "operators/element_wise.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T, class D>
void testActivationBackward(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor yCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    Tensor diffYCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    Tensor xCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);

    yCpu->dataMalloc();
    diffYCpu->dataMalloc();
    xCpu->dataMalloc();

    yCpu->setData(generator);
    diffYCpu->setData(generator);
    xCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto yGpu = bangGraph->cloneTensor(yCpu);
    auto diffYGpu = bangGraph->cloneTensor(diffYCpu);
    auto xGpu = bangGraph->cloneTensor(xCpu);
    auto gpuOp = bangGraph->addOp<T>(yGpu, diffYGpu, xGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto diffXGpu = gpuOp->getOutput();

    // Smoke test only: the forward op type D is not yet used to build a
    // reference result, so no numerical check happens here.
    EXPECT_TRUE(1);
}

TEST(cnnl_ActivationBackward, run) {
    testActivationBackward<ReluBackwardObj, ReluObj>(IncrementalGenerator(),
                                                     Shape{1, 2, 2, 3});
    testActivationBackward<SigmoidBackwardObj, SigmoidObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3});
    testActivationBackward<TanhBackwardObj, TanhObj>(IncrementalGenerator(),
                                                     Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testAddcdiv(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    Tensor inputCpu3 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu3->dataMalloc();
    inputCpu3->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1f;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addcdiv, run) {
    testAddcdiv<AddcdivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testAddcmul(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    Tensor inputCpu3 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu3->dataMalloc();
    inputCpu3->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1f;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addcmul, run) {
    testAddcmul<AddcmulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testaddN(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(2, nullptr, inputGpu1, inputGpu2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addN, run) {
    testaddN<AddNObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,35 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testArange() {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    float start = 0.0f;
    float step = 2.0f;
    int length = 10;
    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto gpuOp = bangGraph->addOp<T>(start, step, length, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Arange, run) {
    testArange<ArangeObj>();
}

} // namespace infini

@ -0,0 +1,51 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testBitCompute(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_BitCompute, run) {
    testBitCompute<BitAndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitOrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitXorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitNotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCast(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastObj::Float2Int32);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Cast, run) {
    testCast<CastObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCeil(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Ceil, run) {
    testCeil<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testClip(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float min = 1.0f;
    float max = 4.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Clip, run) {
    testClip<ClipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,52 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/concat.h"

#include "test.h"

namespace infini {

template <class T>
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
                const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp =
        bangGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->print();
    inputCpu1->printData();
    inputCpu2->print();
    inputCpu2->printData();
    outputGpu2Cpu->print();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Concat, run) {
    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCopy(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(outputGpu2Cpu->equalData(inputCpu));
}

TEST(cnnl_Copy, run) {
    testCopy<CopyObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCumsum(const std::function<void(void *, size_t, DataType)> &generator,
                int axis, const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, axis, false, false);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Cumsum, run) {
    testCumsum<CumsumObj>(IncrementalGenerator(), 1, Shape{1, 2, 2, 3});
    testCumsum<CumsumObj>(IncrementalGenerator(), 2, Shape{1, 2, 2, 3});
    testCumsum<CumsumObj>(IncrementalGenerator(), 3, Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/det.h"

#include "test.h"

namespace infini {

template <class T>
void testDet(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, DetObj::NormalDet);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Det, run) {
    testDet<DetObj>(IncrementalGenerator(), Shape{1, 1, 2, 2});
}

} // namespace infini

@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testDivDemo(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_DivDemo, run) {
    testDivDemo<DivDemoObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testDivNoNan(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_DivNoNan, run) {
    testDivNoNan<DivNoNanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testErf(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Erf, run) {
    testErf<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testExp(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Exp, run) {
    testExp<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFill(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float value = 1.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, value);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Fill, run) {
    testFill<FillObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFlip(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, vector<int>{2});
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Flip, run) {
    testFlip<FlipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFloor(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Floor, run) {
    testFloor<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorDiv(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorDiv, run) {
    testFloorDiv<FloorDivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,50 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorDivTrunc(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorDivTrunc, run) {
    testFloorDivTrunc<FloorDivTruncObj>(IncrementalGenerator(),
                                        Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorMod(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorMod, run) {
    testFloorMod<FloorModObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testHardtanh(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float min = 1.0f;
    float max = 4.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Hardtanh, run) {
    testHardtanh<HardtanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testL2Loss(const std::function<void(void *, size_t, DataType)> &generator,
                const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_L2Loss, run) {
    testL2Loss<L2LossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLog(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Log, run) {
    testLog<Log_eObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLog<Log_2Obj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLog<Log_10Obj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLog1p(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

// Renamed from cnnl_Log to avoid a duplicate TEST(cnnl_Log, run)
// registration with test_log.cc above.
TEST(cnnl_Log1p, run) {
    testLog1p<Log1pObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
|
|
@ -0,0 +1,57 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testLogicOp(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_LogicOp, run) {
    testLogicOp<EqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<NotEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<GreaterThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<GreaterEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<LessThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<LessEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<AndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<OrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<XorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<NotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
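Because inputCpu1 and inputCpu2 are filled by the same IncrementalGenerator, the expected masks for most of these comparisons are known without a reference kernel: Equal, GreaterEqual, and LessEqual should be all ones, while NotEqual, GreaterThan, and LessThan should be all zeros. A sketch of turning that into an assertion follows; it assumes the kernels emit Float32 0/1 masks, and OneGenerator is assumed to live in test.h next to IncrementalGenerator (if it does not, a two-line custom generator serves the same purpose).

// Fragment for the body of testLogicOp above, EqualObj case: with identical
// inputs the output should be an all-ones mask. OneGenerator and the 0/1
// mask encoding are assumptions, not shown in this diff.
Tensor allOnes = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
allOnes->dataMalloc();
allOnes->setData(OneGenerator());
EXPECT_TRUE(outputGpu2Cpu->equalData(allOnes));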
@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLrn(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);
    inputCpu->printData();

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 5, 0.0001, 0.75, 2.0);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Lrn, run) {
    testLrn<LrnObj>(IncrementalGenerator(), Shape{1, 10, 3, 3});
}

} // namespace infini
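The four literals passed to addOp above presumably map to the classic cross-channel LRN hyperparameters (window size 5, alpha 1e-4, beta 0.75, bias k 2.0); LrnObj's constructor is not part of this diff, so that ordering is an assumption. Under that reading, a plain CPU reference over a flat NCHW buffer looks like:

#include <algorithm>
#include <cmath>

// Cross-channel LRN reference under the assumed (size, alpha, beta, k)
// reading of the arguments above:
//   y = x / pow(k + alpha / size * sum(x'^2), beta)
// where the sum runs over the `size` channels centred on the current one.
void lrnReference(const float *x, float *y, int n, int c, int h, int w,
                  int size = 5, float alpha = 1e-4f, float beta = 0.75f,
                  float k = 2.0f) {
    for (int in = 0; in < n; ++in)
        for (int ic = 0; ic < c; ++ic)
            for (int ih = 0; ih < h; ++ih)
                for (int iw = 0; iw < w; ++iw) {
                    float sum = 0.0f;
                    int lo = std::max(0, ic - size / 2);
                    int hi = std::min(c - 1, ic + size / 2);
                    for (int j = lo; j <= hi; ++j) {
                        float v = x[((in * c + j) * h + ih) * w + iw];
                        sum += v * v;
                    }
                    int idx = ((in * c + ic) * h + ih) * w + iw;
                    y[idx] = x[idx] / std::pow(k + alpha / size * sum, beta);
                }
}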
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMaximum(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Maximum, run) {
    testMaximum<MaximumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMinimum(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Minimum, run) {
    testMinimum<MinimumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,57 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMSELoss(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp1 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::None, nullptr);
    auto gpuOp2 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Sum, nullptr);
    auto gpuOp3 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Mean, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu1 = gpuOp1->getOutput();
    auto outputGpu2 = gpuOp2->getOutput();
    auto outputGpu3 = gpuOp3->getOutput();
    auto outputGpu2Cpu1 = outputGpu1->clone(cpuRuntime);
    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
    auto outputGpu2Cpu3 = outputGpu3->clone(cpuRuntime);
    // Check
    outputGpu2Cpu1->printData();
    outputGpu2Cpu2->printData();
    outputGpu2Cpu3->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_MSELoss, run) {
    testMSELoss<MSELossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
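The three reduction modes are exercised but never defined in this diff. Assuming the usual None/Sum/Mean semantics, the references are short, and since both inputs here come from the same IncrementalGenerator the elementwise squared error is zero, so all three expected results are exactly zero.

#include <cstddef>
#include <numeric>
#include <vector>

// Assumed semantics of the three MSELoss reductions (not spelled out in the
// diff). With identical inputs, every value below comes out 0.
std::vector<float> mseNone(const std::vector<float> &a,
                           const std::vector<float> &b) {
    std::vector<float> out(a.size());
    for (std::size_t i = 0; i < a.size(); ++i)
        out[i] = (a[i] - b[i]) * (a[i] - b[i]); // per-element squared error
    return out;
}
float mseSum(const std::vector<float> &a, const std::vector<float> &b) {
    auto e = mseNone(a, b);
    return std::accumulate(e.begin(), e.end(), 0.0f);
}
float mseMean(const std::vector<float> &a, const std::vector<float> &b) {
    return mseSum(a, b) / a.size();
}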
@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMulN(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(2, nullptr, inputGpu1, inputGpu2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_MulN, run) {
    testMulN<MulNObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testNegTensor(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_NegTensor, run) {
    testNegTensor<NegTensorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

void testNet(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<MulNObj>(2, nullptr, inputGpu1, inputGpu2);
    auto outputGpu = gpuOp->getOutput();
    auto gpuOp2 = bangGraph->addOp<SigmoidObj>(outputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2 = gpuOp2->getOutput();
    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
    // Check
    inputCpu2->printData();
    outputGpu2Cpu2->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Net, run) {
    testNet(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
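testNet is the one case in this changeset that chains operators: the MulN output feeds a Sigmoid before dataMalloc is called, so it exercises graph-level allocation as well as the kernels. Assuming MulN with a count of 2 is the elementwise product of its two inputs (its constructor is not shown here), both tensors hold 0, 1, 2, ..., so the network computes sigmoid(x * x) and the first few outputs can be checked by hand:

#include <cmath>
#include <cstdio>

// Expected values under the assumed MulN semantics: sigmoid(x * x) for
// x = 0, 1, 2 is ~0.5000, ~0.7311, ~0.9820, which is what the printData
// output above should begin with.
float expectedNet(float x) { return 1.0f / (1.0f + std::exp(-x * x)); }

int main() {
    for (int i = 0; i < 3; ++i)
        std::printf("%.4f\n", expectedNet(float(i)));
}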
@ -42,6 +42,8 @@ void testOptensor(
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // Check
    outputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}

@ -49,6 +51,7 @@ TEST(cuDNN_OpTensor, run) {
    testOptensor<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,44 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pad.h"

#include "test.h"

namespace infini {

template <class T>
void testPad(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, vector<int>{1, 1, 1, 1},
                                     vector<int>{0, 3});
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu->print();
    inputCpu->printData();
    outputGpu2Cpu->print();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pad, run) {
    testPad<PadObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
}

} // namespace infini
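A reading of the Pad arguments, hedged because PadObj's constructor is not shown in this diff: pads {1, 1, 1, 1} applied to axes {0, 3} adds one element before and after dimensions 0 and 3, taking Shape{1, 1, 2, 3} to Shape{3, 1, 2, 5}. With all pad values equal to 1 it does not matter whether the vector is interleaved begin/end pairs or all-begins-then-all-ends; the sketch below assumes the interleaved layout.

#include <cassert>
#include <cstddef>
#include <vector>

using ShapeVec = std::vector<int>; // stand-in for infini::Shape

// Assumed shape rule: pads holds a (begin, end) pair per listed axis.
ShapeVec padShape(ShapeVec in, const std::vector<int> &pads,
                  const std::vector<int> &axes) {
    for (std::size_t i = 0; i < axes.size(); ++i)
        in[axes[i]] += pads[2 * i] + pads[2 * i + 1];
    return in;
}

int main() {
    auto out = padShape({1, 1, 2, 3}, {1, 1, 1, 1}, {0, 3});
    assert((out == ShapeVec{3, 1, 2, 5})); // matches the test case above
}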
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"

#include "test.h"

namespace infini {

template <class T>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pooling, run) {
    testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
}

} // namespace infini
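The eight integers passed to addOp read most naturally as (kh, kw, dh, dw, ph, pw, sh, sw), the ordering InfiniTensor's pooling operators use elsewhere; treat it as an assumption, since the constructor is not part of this hunk. Under that reading the test runs a 3x3 kernel with dilation 1, padding 1, and stride 2 over a 5x5 input, so the output should be {1, 1, 3, 3}:

#include <cstdio>

// Standard pooled-extent formula for dilation 1 (matching the args above):
// floor((in + 2 * pad - kernel) / stride) + 1.
int pooledExtent(int in, int k, int p, int s) {
    return (in + 2 * p - k) / s + 1;
}

int main() {
    // 5x5 input, 3x3 kernel, padding 1, stride 2 -> extent 3 per axis.
    std::printf("%d\n", pooledExtent(5, 3, 1, 2)); // prints 3
}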
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testPow(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pow, run) {
    testPow<PowerObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testReciprocal(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Reciprocal, run) {
    testReciprocal<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testRound(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Round, run) {
    testRound<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
Some files were not shown because too many files have changed in this diff.