forked from jiuyuan/InfiniTensor

Compare commits: master...activation (57 commits by wanghailu)
@@ -13,7 +13,7 @@ void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) {
     auto dim = op->getInputs(0)->getDims();
     auto context = dynamic_cast<const BangRuntimeObj *>(obj);
     int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    if (op->getOpType() == OpType::Div)
+    if (op->getOpType() == OpType::DivDemo)
         div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w);
     else
         IT_TODO_HALT();
@@ -30,6 +30,7 @@ class BangRuntimeObj : public RuntimeObj {
         dealloc(workspace);
         checkCnnlError(cnnlDestroy(cnnl));
     }
+    string toString() const override;

     void run(const Graph &graph, bool tune = false,
              bool profiling = false) const;
@@ -6,8 +6,10 @@ class DataType {
   public:
     static const DataType Float32;
     static const DataType UInt32;
-    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
-    static constexpr std::string_view names[]{"Float32", "UInt32"};
+    static const DataType Int32;
+    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t),
+                                             sizeof(int32_t)};
+    static constexpr std::string_view names[]{"Float32", "UInt32", "Int32"};

   private:
     int index;

@@ -29,9 +31,11 @@ class DataType {

 inline const DataType DataType::Float32(0);
 inline const DataType DataType::UInt32(1);
+inline const DataType DataType::Int32(2);
 // Method definitions are out of the declaration due to GCC bug:
 // https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
 template <> inline DataType DataType::get<float>() { return Float32; }
 template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
+template <> inline DataType DataType::get<int32_t>() { return Int32; }

 } // namespace infini
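A minimal usage sketch for the extended type table (not part of the diff; it relies only on the members visible in the hunk above, and on the index convention implied by the constructors):

// Reads the new Int32 entry through the accessors shown above.
#include <cstdint>
#include <iostream>
#include "core/data_type.h"

int main() {
    infini::DataType dt = infini::DataType::get<int32_t>(); // DataType::Int32
    // sizePerElement and names are indexed by the constructor argument:
    // Float32 = 0, UInt32 = 1, Int32 = 2.
    std::cout << infini::DataType::names[2] << " uses "
              << infini::DataType::sizePerElement[2] << " bytes\n";
    (void)dt;
    return 0;
}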
@@ -7,6 +7,8 @@ enum class OpType {
     Unknown = 0,
     // linear
     Conv = 100,
+    ConvBackwardFilter,
+    ConvBackwardData,
     Matmul,
     ConvTrans,
     G2BMM,

@@ -23,6 +25,8 @@ enum class OpType {
     Sub,
     Mul,
     Div,
+    DivDemo,
+    DivNoNan,
     Pow,
     Gather,
     ReduceMean,

@@ -34,10 +38,82 @@ enum class OpType {
     Softmax,
     Activation,
     Relu,
+    ReluBackward,
     Sigmoid,
+    SigmoidBackward,
     Tanh,
+    TanhBackward,
     Abs,
+    Sin,
+    Cos,
+    Tan,
+    ASin,
+    ACos,
+    ATan,
+    SinH,
+    CosH,
+    TanH,
+    ASinH,
+    ACosH,
+    ATanH,
+    Resize,
+    Arange,
+    Copy,
+    Ceil,
+    Floor,
+    Clip,
+    Erf,
+    Exp,
+    Fill,
+    Log_e,
+    Log_2,
+    Log_10,
+    Log1p,
+    L2Loss,
+    Maximum,
+    Minimum,
+    MSELoss,
+    NegTensor,
+    Power,
+    Reciprocal,
+    Sqrt,
+    Rsqrt,
+    Transform,
+    AddN,
+    MulN,
+    Cast,
+    FloorDiv,
+    FloorDivTrunc,
+    FloorMod,
+    FloorModTrunc,
+    Cumsum,
+    Cumprod,
+    Det,
+    Round,
+    Square,
+    SquaredDifference,
+    Flip,
+    Hardtanh,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    GreaterEqual,
+    LessThan,
+    LessEqual,
+    And,
+    Or,
+    Xor,
+    Not,
+    Addcdiv,
+    Addcmul,
+    BitAnd,
+    BitOr,
+    BitXor,
+    BitNot,
+    BitLeftShift,
+    BitRightShift,
+    Dropout,
+    Lrn,
     //
     MemBound = 300,
 };
@@ -55,6 +131,8 @@ class OpRegistry {
         FOP(Unknown);
         // linear
         FOP(Conv);
+        FOP(ConvBackwardFilter);
+        FOP(ConvBackwardData);
         FOP(Matmul);
         FOP(ConvTrans);
         FOP(G2BMM);

@@ -71,6 +149,8 @@ class OpRegistry {
         FOP(Sub);
         FOP(Mul);
         FOP(Div);
+        FOP(DivDemo);
+        FOP(DivNoNan);
         FOP(Pow);
         FOP(Gather);
         FOP(ReduceMean);

@@ -81,9 +161,81 @@ class OpRegistry {
         FOP(Softmax);
         FOP(Activation);
         FOP(Relu);
+        FOP(ReluBackward);
         FOP(Sigmoid);
+        FOP(SigmoidBackward);
         FOP(Tanh);
+        FOP(TanhBackward);
         FOP(Abs);
+        FOP(Sin);
+        FOP(Cos);
+        FOP(Tan);
+        FOP(ASin);
+        FOP(ACos);
+        FOP(ATan);
+        FOP(SinH);
+        FOP(CosH);
+        FOP(TanH);
+        FOP(ASinH);
+        FOP(ACosH);
+        FOP(ATanH);
+        FOP(Arange);
+        FOP(Copy);
+        FOP(Ceil);
+        FOP(Floor);
+        FOP(Clip);
+        FOP(Erf);
+        FOP(Exp);
+        FOP(Fill);
+        FOP(Log_e);
+        FOP(Log_2);
+        FOP(Log_10);
+        FOP(Log1p);
+        FOP(L2Loss);
+        FOP(Maximum);
+        FOP(Minimum);
+        FOP(MSELoss);
+        FOP(NegTensor);
+        FOP(Power);
+        FOP(Reciprocal);
+        FOP(Sqrt);
+        FOP(Rsqrt);
+        FOP(Transform);
+        FOP(AddN);
+        FOP(MulN);
+        FOP(Cast);
+        FOP(FloorDiv);
+        FOP(FloorDivTrunc);
+        FOP(FloorMod);
+        FOP(FloorModTrunc);
+        FOP(Cumsum);
+        FOP(Cumprod);
+        FOP(Det);
+        FOP(Round);
+        FOP(Square);
+        FOP(SquaredDifference);
+        FOP(Flip);
+        FOP(Hardtanh);
+        FOP(Equal);
+        FOP(NotEqual);
+        FOP(GreaterThan);
+        FOP(GreaterEqual);
+        FOP(LessThan);
+        FOP(LessEqual);
+        FOP(And);
+        FOP(Or);
+        FOP(Xor);
+        FOP(Not);
+        FOP(Addcdiv);
+        FOP(Addcmul);
+        FOP(BitAnd);
+        FOP(BitOr);
+        FOP(BitXor);
+        FOP(BitNot);
+        FOP(BitLeftShift);
+        FOP(BitRightShift);
+        FOP(Dropout);
+        FOP(Lrn);
         //
         FOP(MemBound);
     default:
@@ -147,6 +299,13 @@ class OperatorObj : public Object {

   public:
     OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
+    OperatorObj(OpType opType);
+    void setInputs(TensorVec inputsTensor) {
+        inputs = inputsTensor;
+        for (auto &t : inputs)
+            IT_ASSERT(t != nullptr);
+    }
+    void setOutputs(TensorVec outputsTensor) { outputs = outputsTensor; }
     virtual optional<vector<Shape>>
     inferShape(const TensorVec &inputs) const = 0;
     virtual vector<DataType> inferDataType(const TensorVec &inputs) const;

@@ -158,6 +317,7 @@ class OperatorObj : public Object {
      * function.
      */
     bool checkValid(GraphObj *graph);
+    bool checkValid(GraphObj *graph, DataType type);
     OpPerfKey getOpPerfKey() const;
     /**
      * @brief Hash operator attributes. Input and output shapes are not
@@ -72,6 +72,7 @@ class TensorObj : public TensorBaseObj {
   private:
     void printDataFloat() const;
     void printDataUint32_t() const;
+    void printDataInt32_t() const;

     template <typename T>
     bool equalDataImpl(const T *a, const T *b, size_t size) const {
@@ -0,0 +1,5 @@
#pragma once

namespace infini {
void transpose_kernel(float *a, float *c, int dim0, int dim1, int dim2, int dim3, int p0, int p1, int p2, int p3);
}; // namespace infini
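For reference, a plain-C++ sketch of the semantics this declaration implies. The BANG device implementation is not part of this diff, and reading p0..p3 as "output axis k takes input axis pk" is an assumption based on the transpose usage later in this compare:

// Hypothetical host-side reference for transpose_kernel's intended behavior.
void transpose_reference(const float *a, float *c, int dim0, int dim1,
                         int dim2, int dim3, int p0, int p1, int p2, int p3) {
    const int inDims[4] = {dim0, dim1, dim2, dim3};
    const int perm[4] = {p0, p1, p2, p3};
    int outDims[4];
    for (int k = 0; k < 4; ++k)
        outDims[k] = inDims[perm[k]]; // output dim k mirrors input dim perm[k]
    for (int o0 = 0; o0 < outDims[0]; ++o0)
        for (int o1 = 0; o1 < outDims[1]; ++o1)
            for (int o2 = 0; o2 < outDims[2]; ++o2)
                for (int o3 = 0; o3 < outDims[3]; ++o3) {
                    int out[4] = {o0, o1, o2, o3};
                    int in[4];
                    for (int k = 0; k < 4; ++k)
                        in[perm[k]] = out[k];
                    size_t src = ((in[0] * (size_t)inDims[1] + in[1]) *
                                      inDims[2] + in[2]) * inDims[3] + in[3];
                    size_t dst = ((out[0] * (size_t)outDims[1] + out[1]) *
                                      outDims[2] + out[2]) * outDims[3] + out[3];
                    c[dst] = a[src];
                }
}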
@@ -0,0 +1,31 @@
#pragma once
#include "core/operator.h"

namespace infini {
class ActivationBackwardObj : public OperatorObj {
  public:
    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
                          Tensor x, Tensor diff_x);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 3; }
    int numOutputs() const override { return 1; }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};

#define DEFINE_ACTIVATION_BACKWARD_OBJ(prefix, type)                           \
    class prefix##Obj : public ActivationBackwardObj {                         \
      public:                                                                  \
        prefix##Obj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,        \
                    Tensor diff_x)                                             \
            : ActivationBackwardObj(type, graph, y, diff_y, x, diff_x) {}      \
    };

DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward)
DEFINE_ACTIVATION_BACKWARD_OBJ(SigmoidBackward, OpType::SigmoidBackward)
DEFINE_ACTIVATION_BACKWARD_OBJ(TanhBackward, OpType::TanhBackward)
}; // namespace infini
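For readers unfamiliar with the token-pasting idiom: each DEFINE_ACTIVATION_BACKWARD_OBJ line expands to a thin subclass. Written out by hand, the ReluBackward instantiation is exactly:

// Expansion of DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward);
// this is what the preprocessor generates from the macro above.
class ReluBackwardObj : public ActivationBackwardObj {
  public:
    ReluBackwardObj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x,
                    Tensor diff_x)
        : ActivationBackwardObj(OpType::ReluBackward, graph, y, diff_y, x,
                                diff_x) {}
};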
@@ -86,6 +86,29 @@ class ConvObj : public ConvBaseObj {
     void setAuxilaryAttributes(PaddingMode mode) override;
 };

+class ConvBackwardFilterObj : public ConvBaseObj {
+  private:
+    ActType act;
+
+  public:
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, int ph, int pw, int sh = 1, int sw = 1,
+                          int dh = 1, int dw = 1, Tensor bias = nullptr,
+                          ActType act = ActType::None);
+    // Constructors for setting padding mode
+    ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY,
+                          Tensor diffW, PaddingMode mode = PaddingMode::Same,
+                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
+                          Tensor bias = nullptr, ActType act = ActType::None);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    ActType getAct() const { return act; }
+    int getNumGroups() const override { return c / getChannelPerGroup(); }
+
+  private:
+    void setAuxilaryAttributes(PaddingMode mode) override;
+};
+
 class ConvTransposed2dObj : public ConvBaseObj {
   private:
     int oph, opw;
@@ -0,0 +1,21 @@
#pragma once
#include "core/operator.h"

namespace infini {
class DetObj : public OperatorObj {
  public:
    enum Mode { NormalDet = 0, LogDet };
    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    Mode getMode() const { return modeValue; }

  private:
    Mode modeValue;
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
}; // namespace infini
@@ -17,6 +17,88 @@ class ElementWiseObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };

+class MSELossObj : public OperatorObj {
+  public:
+    enum Reduction { None = 0, Sum, Mean };
+    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
+               Reduction reduction, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    Reduction getReduction() const { return reductionMode; }
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    Reduction reductionMode;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddNObj : public OperatorObj {
+  public:
+    AddNObj(GraphObj *graph, int tensorNum, Tensor output, ...);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return num; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int num;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class MulNObj : public OperatorObj {
+  public:
+    MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return num; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int num;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddcdivObj : public OperatorObj {
+  public:
+    AddcdivObj(GraphObj *graph, float alpha, Tensor input0,
+               Tensor input1, Tensor input2, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+    float getAlpha() { return alphaValue; }
+
+  private:
+    float alphaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class AddcmulObj : public OperatorObj {
+  public:
+    AddcmulObj(GraphObj *graph, float alpha, Tensor input0,
+               Tensor input1, Tensor input2, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 3; }
+    int numOutputs() const override { return 1; }
+    float getAlpha() { return alphaValue; }
+
+  private:
+    float alphaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                                  \
     class prefix##Obj : public ElementWiseObj {                                \
       public:                                                                  \

@@ -28,6 +110,32 @@ class ElementWiseObj : public OperatorObj {
 DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add)
 DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
 DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
+DEFINE_ELEMENT_WISE_OBJ(DivDemo, OpType::DivDemo)
+DEFINE_ELEMENT_WISE_OBJ(DivNoNan, OpType::DivNoNan)
 DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
 DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
+DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
+DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
+DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
+DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
+DEFINE_ELEMENT_WISE_OBJ(FloorDivTrunc, OpType::FloorDivTrunc)
+DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
+DEFINE_ELEMENT_WISE_OBJ(FloorModTrunc, OpType::FloorModTrunc)
+DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
+DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
+DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
+DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
+DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
+DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
+DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
+DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
+DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
+DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
+DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
+DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
+DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
+DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
+DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
+DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
+DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
 }; // namespace infini
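The DEFINE_ELEMENT_WISE_OBJ body is cut off in the first hunk, but by analogy with the activation-backward macro earlier in this compare, each line above presumably expands to a two-input wrapper. A sketch for the DivDemo line (the constructor signature is an assumption):

// Presumed expansion of DEFINE_ELEMENT_WISE_OBJ(DivDemo, OpType::DivDemo).
class DivDemoObj : public ElementWiseObj {
  public:
    DivDemoObj(GraphObj *graph, Tensor input0, Tensor input1, Tensor output)
        : ElementWiseObj(OpType::DivDemo, graph, input0, input1, output) {}
};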
@@ -0,0 +1,20 @@
#pragma once
#include "core/operator.h"

namespace infini {
class TransposeObj : public OperatorObj {
  public:
    TransposeObj(GraphObj *graph, Tensor input, Tensor output, int permute[4]);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    auto getPermute() { return transposePermute; }

  private:
    int transposePermute[4];
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
}; // namespace infini
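A minimal sketch of what TransposeObj::inferShape presumably computes (the .cc implementation is not in this diff; Shape is assumed to be a vector of ints, as elsewhere in the codebase): output dim k is the input dim selected by transposePermute[k].

optional<vector<Shape>> TransposeObj::inferShape(const TensorVec &inputs) const {
    const Shape in = inputs[0]->getDims();
    if (in.size() != 4) // only 4-D tensors, matching the fixed-size permute
        return {};
    Shape out(4);
    for (int k = 0; k < 4; ++k)
        out[k] = in[transposePermute[k]];
    return {{out}};
}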
@@ -16,6 +16,246 @@ class UnaryObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };

+class ClipObj : public OperatorObj {
+  public:
+    ClipObj(GraphObj *graph, Tensor input, Tensor output, float min, float max);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class HardtanhObj : public OperatorObj {
+  public:
+    HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min, float max);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getMin() const { return minValue; };
+    float getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float minValue, maxValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FlipObj : public OperatorObj {
+  public:
+    FlipObj(GraphObj *graph, Tensor input, Tensor output, vector<int> axis);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    vector<int> getAxis() const { return axisValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> axisValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class FillObj : public OperatorObj {
+  public:
+    FillObj(GraphObj *graph, Tensor input, Tensor output, float value);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getValue() const { return setValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float setValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class L2LossObj : public OperatorObj {
+  public:
+    L2LossObj(GraphObj *graph, Tensor input, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class TransformObj : public OperatorObj {
+  public:
+    TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
+                 float beta);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    float alphaValue, betaValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class LrnObj : public OperatorObj {
+  public:
+    LrnObj(GraphObj *graph, Tensor input, Tensor output, int feature_num, float alpha, float beta, float bias);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    std::string toString() const override;
+    int getFeatureNum() const { return featureNumValue; }
+    float getAlpha() const { return alphaValue; }
+    float getBeta() const { return betaValue; }
+    float getBias() const { return biasValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int featureNumValue;
+    float alphaValue;
+    float betaValue;
+    float biasValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CastObj : public OperatorObj {
+  public:
+    enum CastType {
+        Float2Half = 0,
+        Float2HalfIEEE754,
+        Float2Double,
+        Float2Int64,
+        Float2Int32,
+        Float2Int16,
+        Float2Int8,
+        Float2Bool,
+        Half2Float,
+        Half2Int32,
+        Half2Int64,
+        Half2Int16,
+        Half2Int8,
+        Half2Uint8,
+        Half2Bool,
+        Half2FloatInf,
+        Int322Float,
+        Int322Half,
+        Int322Int8,
+        Int322Int16,
+        Int162Float,
+        Int162Half,
+        Int162Int32,
+        Int82Float,
+        Int82Half,
+        Int82Int16,
+        Int82Int32,
+        Uint82Float,
+        Uint82Half,
+        Uint82Int32,
+        Uint82Int64,
+        Bool2Float,
+        Bool2Half,
+        Bool2Int32,
+        Int322Int64,
+        Int322Bool,
+        Int642Int32,
+        Int642Uint32,
+        Int642Float,
+        Int642Half,
+        Uint642Uint32,
+        Uint322Int64,
+        Uint322Uint64,
+        Double2Float
+    };
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class CumsumObj : public OperatorObj {
+  public:
+    CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
+              bool exclusive, bool reverse);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int getAxis() const { return axisValue; }
+    bool getExclusive() const { return exclusiveValue; }
+    bool getReverse() const { return reverseValue; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    int axisValue;
+    bool exclusiveValue, reverseValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+class ArangeObj : public OperatorObj {
+  public:
+    ArangeObj(GraphObj *graph, float start, float step, int length, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 0; }
+    int numOutputs() const override { return 1; }
+    float getStartValue() { return startValue; }
+    float getStepValue() { return stepValue; }
+    int getLength() { return lengthValue; }
+
+  private:
+    float startValue, stepValue;
+    int lengthValue;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+// class CumprodObj : public OperatorObj {
+//   public:
+//     CumprodObj(GraphObj *graph, Tensor input, Tensor output, int axis, bool
+//     exclusive, bool reverse); optional<vector<Shape>> inferShape(const
+//     TensorVec &inputs) const override;
+//
+//     std::string toString() const override;
+//     int getAxis() const { return axisValue; }
+//     float getExclusive() const { return exclusiveValue; }
+//     float getReverse() const { return reverseValue; }
+//     int numInputs() const override { return 1; }
+//     int numOutputs() const override { return 1; }
+//
+//   private:
+//     int axisValue;
+//     bool exclusiveValue, reverseValue;
+//     vector<int> getWorkloadVector() const override;
+//     vector<int> getOpAttrVector() const override;
+// };
+
 #define DEFINE_UNARY_OBJ(prefix, type)                                         \
     class prefix##Obj : public UnaryObj {                                      \
       public:                                                                  \

@@ -28,4 +268,33 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
 DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
 DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
 DEFINE_UNARY_OBJ(Abs, OpType::Abs)
+
+DEFINE_UNARY_OBJ(Sin, OpType::Sin)
+DEFINE_UNARY_OBJ(Cos, OpType::Cos)
+DEFINE_UNARY_OBJ(Tan, OpType::Tan)
+DEFINE_UNARY_OBJ(ASin, OpType::ASin)
+DEFINE_UNARY_OBJ(ACos, OpType::ACos)
+DEFINE_UNARY_OBJ(ATan, OpType::ATan)
+DEFINE_UNARY_OBJ(SinH, OpType::SinH)
+DEFINE_UNARY_OBJ(CosH, OpType::CosH)
+DEFINE_UNARY_OBJ(TanH, OpType::TanH)
+DEFINE_UNARY_OBJ(ASinH, OpType::ASinH)
+DEFINE_UNARY_OBJ(ACosH, OpType::ACosH)
+DEFINE_UNARY_OBJ(ATanH, OpType::ATanH)
+
+DEFINE_UNARY_OBJ(Copy, OpType::Copy)
+DEFINE_UNARY_OBJ(Ceil, OpType::Ceil)
+DEFINE_UNARY_OBJ(Floor, OpType::Floor)
+DEFINE_UNARY_OBJ(Erf, OpType::Erf)
+DEFINE_UNARY_OBJ(Exp, OpType::Exp)
+DEFINE_UNARY_OBJ(Log_e, OpType::Log_e)
+DEFINE_UNARY_OBJ(Log_2, OpType::Log_2)
+DEFINE_UNARY_OBJ(Log_10, OpType::Log_10)
+DEFINE_UNARY_OBJ(Log1p, OpType::Log1p)
+DEFINE_UNARY_OBJ(NegTensor, OpType::NegTensor)
+DEFINE_UNARY_OBJ(Reciprocal, OpType::Reciprocal)
+DEFINE_UNARY_OBJ(Sqrt, OpType::Sqrt)
+DEFINE_UNARY_OBJ(Rsqrt, OpType::Rsqrt)
+DEFINE_UNARY_OBJ(Round, OpType::Round)
+DEFINE_UNARY_OBJ(Square, OpType::Square)
 }; // namespace infini
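Cumsum's two flags are easy to misread, so here is a host-side reference for the semantics they are assumed to carry (the common convention, e.g. TensorFlow's cumsum; the CNNL kernel is expected to match):

#include <vector>
// Reference cumsum over a 1-D sequence. exclusive omits the current element;
// reverse accumulates from the back.
std::vector<float> cumsumRef(const std::vector<float> &x, bool exclusive,
                             bool reverse) {
    size_t n = x.size();
    std::vector<float> y(n, 0.0f);
    float acc = 0.0f;
    for (size_t k = 0; k < n; ++k) {
        size_t i = reverse ? n - 1 - k : k;   // walk backwards if reverse
        y[i] = exclusive ? acc : acc + x[i];  // exclusive drops x[i] itself
        acc += x[i];
    }
    return y; // {1,2,3,4} -> {1,3,6,10}; exclusive -> {0,1,3,6}
}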
@@ -54,4 +54,6 @@ void BangRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {

 void BangRuntimeObj::sync() const { cnrtSyncDevice(); }

+string BangRuntimeObj::toString() const { return "BANG Runtime"; }
+
 } // namespace infini
@@ -61,4 +61,4 @@ OpVec GraphObj::getComputeOps() const {
     return opList;
 };

-} // namespace infini
\ No newline at end of file
+} // namespace infini
@@ -10,6 +10,8 @@ OperatorObj::OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs)
         IT_ASSERT(t != nullptr);
 }

+OperatorObj::OperatorObj(OpType opType) : type(opType) {}
+
 bool OperatorObj::isLinearOp() const {
     return enum_to_underlying(type) >= 100 && enum_to_underlying(type) < 200;
 }

@@ -78,6 +80,30 @@ bool OperatorObj::checkValid(GraphObj *graph) {
     return true;
 }

+bool OperatorObj::checkValid(GraphObj *graph, DataType type) {
+    auto optShapes = inferShape();
+    if (!optShapes) // shape inference failed
+        return false;
+
+    const vector<Shape> &shapes = *optShapes;
+    if (shapes.size() != outputs.size())
+        return false;
+    if (graph) { // if graph != nullptr, outputs should be created
+        auto dataTypes = vector(numOutputs(), type);
+        for (size_t i = 0; i < outputs.size(); i++) {
+            IT_ASSERT(!outputs[i]);
+            outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
+        }
+    } else { // if outputs have been created, check their shapes
+        for (size_t i = 0; i < shapes.size(); ++i) {
+            if (shapes[i] != outputs[i]->getDims())
+                return false;
+        }
+    }
+    return true;
+}
+
 optional<vector<Shape>> OperatorObj::inferShape() const {
     return inferShape(inputs);
 }
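A sketch of the intended call pattern for the new overload (the operator below is hypothetical, not part of this diff): an op whose output dtype is fixed, such as a comparison producing an Int32 mask, passes that dtype so checkValid creates the graph outputs with it instead of inheriting the input type.

// Hypothetical operator inside namespace infini; overrides omitted for brevity.
class CompareMaskObj : public OperatorObj {
  public:
    CompareMaskObj(GraphObj *graph, Tensor a, Tensor b, Tensor out)
        : OperatorObj(OpType::Equal, {a, b}, {out}) {
        IT_ASSERT(checkValid(graph, DataType::Int32)); // outputs become Int32
    }
    // ... inferShape / toString / numInputs / numOutputs as usual ...
};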
@@ -69,6 +69,8 @@ void TensorObj::printData() const {
         printDataFloat();
     else if (dtype == DataType::UInt32)
         printDataUint32_t();
+    else if (dtype == DataType::Int32)
+        printDataInt32_t();
     else
         IT_TODO_HALT();
 }
@@ -87,7 +89,7 @@ void TensorObj::printDataFloat() const {
                 std::cout << "[";
             }
         }
-        printf("%.1f", ptr[i]);
+        printf("%.6f", ptr[i]);
         for (size_t j = 0; j < numDims; ++j) {
             if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                 std::cout << "]";
@@ -128,6 +130,34 @@ void TensorObj::printDataUint32_t() const {
     }
 }

+void TensorObj::printDataInt32_t() const {
+    IT_ASSERT(data != nullptr);
+    std::cout << "Tensor: " << guid << std::endl;
+    auto numDims = shape.size();
+    auto dimSzVec = std::vector<int>(numDims, 1);
+    auto ptr = data->getPtr<int32_t *>();
+    dimSzVec[numDims - 1] = shape[numDims - 1];
+    for (int i = numDims - 1; i != 0; --i)
+        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+        for (size_t j = 0; j < numDims; ++j) {
+            if (i % dimSzVec[j] == 0) {
+                std::cout << "[";
+            }
+        }
+        std::cout << ptr[i];
+        for (size_t j = 0; j < numDims; ++j) {
+            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
+                std::cout << "]";
+            }
+        }
+        if (i != size() - 1)
+            std::cout << ", ";
+        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
+            std::cout << std::endl;
+    }
+}
+
 bool TensorObj::equalData(const Tensor &rhs) const {
     IT_ASSERT(data != nullptr);
     IT_ASSERT(rhs->data != nullptr);
@@ -142,6 +172,9 @@ bool TensorObj::equalData(const Tensor &rhs) const {
     else if (getDType() == DataType::Float32)
         return equalDataImpl(getRawDataPtr<float *>(),
                              rhs->getRawDataPtr<float *>(), size());
+    else if (getDType() == DataType::Int32)
+        return equalDataImpl(getRawDataPtr<int32_t *>(),
+                             rhs->getRawDataPtr<int32_t *>(), size());
     else
         IT_TODO_HALT();
 }
@@ -155,6 +188,8 @@ void TensorObj::dataMalloc() {
         bytesPerElement = sizeof(float);
     else if (getDType() == DataType::UInt32)
         bytesPerElement = sizeof(uint32_t);
+    else if (getDType() == DataType::Int32)
+        bytesPerElement = sizeof(int32_t);
     data = runtime->allocBlob(size() * bytesPerElement);
 }
@@ -0,0 +1,161 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class UnaryCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat =
            cnnlActivationForward(context->cnnlHandle(), opDesc, &alpha, aDesc,
                                  aData, &beta, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class RoundCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class SquareCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSquare(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class ReluCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl,
                "Relu_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl,
                "Sigmoid_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tanh, DataType::Float32, TanhCnnl,
                "Tanh_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl,
                "Round_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Square, DataType::Float32, SquareCnnl,
                "Square_cnnl_BANG_Float32");

}; // namespace infini
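The UnaryCnnl base makes new activations cheap to wire up: a subclass supplies the CNNL mode and coefficient, and a registration line binds it to an OpType. An illustrative sketch only ("Elu" is hypothetical here, and a real port would also need an OpType entry plus an operator definition; CNNL_ACTIVATION_ELU is assumed to exist in the CNNL build used):

class EluCnnl : public UnaryCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_ELU; // assumption: ELU mode available in CNNL
    }
    float getCoef() const override { return 1.0; } // ELU alpha
};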
@@ -0,0 +1,94 @@
#include "operators/activation_backward.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ActivationBackwardCnnl : public BangKernelWithoutConfig {
    virtual cnnlActivationMode_t getOpType() const = 0;
    virtual float getCoef() const = 0;
    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ActivationBackwardObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const yData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const diffYData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const xData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const diffXData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&yDesc));
        checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&xDesc));
        checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc));
        checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlActivationDescriptor_t opDesc;
        checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
        checkCnnlError(cnnlSetActivationDescriptor(
            opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));

        auto [alpha, beta] = getAlphBeta();
        cnnlStatus_t stat = cnnlActivationBackward(
            context->cnnlHandle(), opDesc, &alpha, yDesc, yData, diffYDesc,
            diffYData, xDesc, xData, &beta, diffXDesc, diffXData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(yDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(xDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(diffXDesc));
        checkCnnlError(cnnlDestroyActivationDescriptor(opDesc));
    }
};

class ReluBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_RELU;
    }
    float getCoef() const override { return 0.0; }
};

class SigmoidBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_SIGMOID;
    }
    float getCoef() const override { return 0.0; }
};

class TanhBackwardCnnl : public ActivationBackwardCnnl {
    cnnlActivationMode_t getOpType() const override {
        return CNNL_ACTIVATION_TANH;
    }
    float getCoef() const override { return 0.0; }
};

REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32,
                ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32,
                SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32,
                TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,51 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/element_wise.h"

namespace infini {
class AddNCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddNObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array));
        }

        cnnlStatus_t stat =
            cnnlAddN(context->cnnlHandle(), descArray, argv, num, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::AddN, DataType::Float32, AddNCnnl,
                "AddN_cnnl_BANG_Float32");
}; // namespace infini
@@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ArangeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ArangeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        float start = op->getStartValue();
        float step = op->getStepValue();
        int length = op->getLength();

        cnnlTensorDescriptor_t cDesc;
        int dim_array[1] = {length};
        // get output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, 1, dim_array));

        cnnlStatus_t stat =
            cnnlArange_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                          &start, &step, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Arange, DataType::Float32, ArangeCnnl,
                "Arange_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,120 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CastCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<CastObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        cnnlCastDataType_t NlCastType;
        CastObj::CastType type = op->getType();
        switch (type) {
        case CastObj::Float2Half:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_HALF, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
            break;
        case CastObj::Float2HalfIEEE754:
        case CastObj::Float2Double:
        case CastObj::Float2Int64:
        case CastObj::Float2Int32:
            checkCnnlError(cnnlSetTensorDescriptor(
                aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
            NlCastType = CNNL_CAST_FLOAT_TO_INT32;
            break;
        case CastObj::Float2Int16:
        case CastObj::Float2Int8:
        case CastObj::Float2Bool:
            // Todo
            break;
        case CastObj::Half2Float:
        case CastObj::Half2Int32:
        case CastObj::Half2Int64:
        case CastObj::Half2Int16:
        case CastObj::Half2Int8:
        case CastObj::Half2Uint8:
        case CastObj::Half2Bool:
        case CastObj::Half2FloatInf:
            // todo
            break;
        case CastObj::Int322Float:
        case CastObj::Int322Half:
        case CastObj::Int322Int8:
        case CastObj::Int322Int16:
            // todo
            break;
        case CastObj::Int162Float:
        case CastObj::Int162Half:
        case CastObj::Int162Int32:
            // todo
            break;
        case CastObj::Int82Float:
        case CastObj::Int82Half:
        case CastObj::Int82Int16:
        case CastObj::Int82Int32:
            // todo
            break;
        case CastObj::Uint82Float:
        case CastObj::Uint82Half:
        case CastObj::Uint82Int32:
        case CastObj::Uint82Int64:
            // todo
            break;
        case CastObj::Bool2Float:
        case CastObj::Bool2Half:
        case CastObj::Bool2Int32:
            // todo
            break;
        case CastObj::Int322Int64:
        case CastObj::Int322Bool:
            // todo
            break;
        case CastObj::Int642Int32:
        case CastObj::Int642Uint32:
        case CastObj::Int642Float:
        case CastObj::Int642Half:
            // todo
            break;
        case CastObj::Uint642Uint32:
        case CastObj::Uint322Int64:
        case CastObj::Uint322Uint64:
            // todo
            break;
        case CastObj::Double2Float:
            // todo
            break;
        }
        // Note: for the unimplemented cases above, NlCastType is left unset
        // and the call below is not meaningful yet.
        cnnlStatus_t stat = cnnlCastDataType(context->cnnlHandle(), aDesc,
                                             aData, NlCastType, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
                "Cast_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CeilCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl,
                "Ceil_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ClipCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ClipObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin();
        float max = op->getMax();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl,
                "Clip_cnnl_BANG_Float32");

}; // namespace infini
@@ -0,0 +1,68 @@
#include "operators/concat.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class ConcatCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConcatObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dim_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getInputs(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dim_array[i][0] = dim[0];
            dim_array[i][1] = dim[1];
            dim_array[i][2] = dim[2];
            dim_array[i][3] = dim[3];
        }

        auto dim = op->getOutput()->getDims();
        int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]};

        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(
            desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array[i]));
        }

        size_t wsSize;
        cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlConcat(context->cnnlHandle(), num, axis, descArray, argv,
                       wsData, wsSize, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroy calls in BANG do not require a sync, but CNNL does not
        // state whether a sync is required before destroying descriptors.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl,
                "Concat_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,159 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/conv.h"

namespace infini {
class ConvBackwardFilterCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvBackwardFilterObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        int pad[4] = {ph, ph, pw, pw};
        int stride[2] = {sh, sw};
        int dilation[2] = {dh, dw};

        cnnlConvolutionDescriptor_t convDesc;
        checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc));
        checkCnnlError(cnnlSetConvolutionDescriptor(
            convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT));

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, aDescTrans, bDescTrans,
            cDescTrans;
        auto dimInputs0 = op->getInputs(0)->getDims();
        auto dimInputs1 = op->getInputs(1)->getDims();
        auto dimOutput = op->getOutput()->getDims();

        if (dimInputs0.size() != 4)
            IT_TODO_HALT();
        if (dimInputs1.size() != 4)
            IT_TODO_HALT();
        if (dimOutput.size() != 4)
            IT_TODO_HALT();

        int inputs0Array[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2],
                               dimInputs0[3]};
        int inputs1Array[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2],
                               dimInputs1[3]};
        int outputArray[4] = {dimOutput[0], dimOutput[1], dimOutput[2],
                              dimOutput[3]};

        int inputs0ArrayTrans[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3],
                                    dimInputs0[1]};
        int inputs1ArrayTrans[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3],
                                    dimInputs1[1]};
        int outputArrayTrans[4] = {dimOutput[0], dimOutput[2], dimOutput[3],
                                   dimOutput[1]};

        int transMode[4] = {0, 2, 3, 1};
        cnnlTransposeDescriptor_t transDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(transDesc, 4, transMode));

        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs0ArrayTrans));

        size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] *
                              dimInputs0[3] * sizeof(float);
        BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size);

        cnnlStatus_t stat =
            cnnlTranspose(context->cnnlHandle(), transDesc, aDesc, aData,
                          aDescTrans, wsTrans1Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               inputs1ArrayTrans));

        size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] *
                              dimInputs1[3] * sizeof(float);
        BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size);

        stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData,
                             bDescTrans, wsTrans2Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray));

        checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans));
        checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC,
                                               CNNL_DTYPE_FLOAT, 4,
                                               outputArrayTrans));

        size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] *
                              dimOutput[3] * sizeof(float);
        BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size);

        cnnlConvolutionBwdFilterAlgo_t algo;
        cnnlGetConvolutionBackwardFilterAlgorithm(
            context->cnnlHandle(), convDesc, aDescTrans, bDescTrans, cDescTrans,
            CNNL_CONVOLUTION_BWD_FILTER_FASTEST, &algo);

        size_t wsSize;
        cnnlGetConvolutionBackwardFilterWorkspaceSize(
            context->cnnlHandle(), aDescTrans, bDescTrans, cDescTrans, convDesc,
            algo, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        stat = cnnlConvolutionBackwardFilter(
            context->cnnlHandle(), NULL, aDescTrans, wsTrans1Data, bDescTrans,
            wsTrans2Data, convDesc, algo, wsData, wsSize, NULL, cDescTrans,
            wsTrans3Data);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        int transMode2[4] = {0, 3, 1, 2};
        cnnlTransposeDescriptor_t transOutputDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&transOutputDesc));
        checkCnnlError(
            cnnlSetTransposeDescriptor(transOutputDesc, 4, transMode2));

        stat = cnnlTranspose(context->cnnlHandle(), transOutputDesc, cDescTrans,
                             wsTrans3Data, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(aDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDescTrans));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDescTrans));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(transOutputDesc));
        checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32,
                ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32");
}; // namespace infini
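The kernel above shuttles every tensor NCHW -> NHWC with permutation {0, 2, 3, 1} before the cnnl call and brings the result back with {0, 3, 1, 2}. A quick standalone check (plain C++, no CNNL involved) that these two permutations really are inverses of each other:

#include <array>
#include <cassert>

int main() {
    std::array<int, 4> toNHWC{0, 2, 3, 1}; // transMode in the kernel
    std::array<int, 4> toNCHW{0, 3, 1, 2}; // transMode2 in the kernel
    for (int axis = 0; axis < 4; ++axis)
        // Composing the two maps in either order must be the identity.
        assert(toNHWC[toNCHW[axis]] == axis && toNCHW[toNHWC[axis]] == axis);
    return 0;
}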
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CopyCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCopy(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Copy, DataType::Float32, CopyCnnl,
                "Copy_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,50 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class CumsumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<CumsumObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int axis = op->getAxis();
        bool exclusive = op->getExclusive();
        bool reverse = op->getReverse();

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlCumsum(context->cnnlHandle(), aDesc, aData, axis, exclusive,
                       reverse, CNNL_NOT_PROPAGATE_NAN, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Cumsum, DataType::Float32, CumsumCnnl,
                "Cumsum_cnnl_BANG_Float32");

}; // namespace infini
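The exclusive/reverse flags forwarded to cnnlCumsum are assumed here to follow the usual scan conventions; a plain-C++ reference for a single 1-D slice (a semantics sketch, not the CNNL call):

#include <vector>

// exclusive shifts the running sum by one element; reverse scans from the end.
std::vector<float> cumsumRef(const std::vector<float> &x, bool exclusive,
                             bool reverse) {
    int n = (int)x.size();
    std::vector<float> out(n);
    float acc = 0.0f;
    for (int k = 0; k < n; ++k) {
        int i = reverse ? n - 1 - k : k;
        if (exclusive) {
            out[i] = acc;
            acc += x[i];
        } else {
            acc += x[i];
            out[i] = acc;
        }
    }
    return out;
}
// e.g. x = {1,2,3}: inclusive forward -> {1,3,6}; exclusive forward -> {0,1,3}.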
@ -0,0 +1,53 @@
#include "operators/det.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class DetCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<DetObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        DetObj::Mode mode = op->getMode();
        cnnlDetMode_t nlMode;
        if (mode == DetObj::LogDet) {
            nlMode = CNNL_DET_MODE_LOGDET;
        } else {
            nlMode = CNNL_DET_MODE_DET;
        }
        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 2)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[2] = {dimout[0], dimout[1]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array));

        cnnlStatus_t stat =
            cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl,
                "Det_cnnl_BANG_Float32");
}; // namespace infini
@ -66,6 +66,772 @@ class ElementWiseCnnl : public BangKernelWithoutConfig {
    }
};

class LogicOpCnnl : public BangKernelWithoutConfig {
    virtual cnnlLogicOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlLogicOp(context->cnnlHandle(), getOpType(),
                                        aDesc, aData, bDesc, bData,
                                        wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class BitComputeCnnl : public BangKernelWithoutConfig {
    virtual cnnlBitComputeOp_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_INT32, 4, dim_array));

        size_t wsSize;
        cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                       cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlBitCompute_v2(context->cnnlHandle(),
                                              getOpType(), aDesc, aData, bDesc,
                                              bData, cDesc, cData, wsData,
                                              wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class DivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class DivNoNanCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetDivNoNanWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlDivNoNan_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MaximumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMaximum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MinimumCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlMinimum(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                        cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class MSELossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MSELossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        MSELossObj::Reduction reduction = op->getReduction();
        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        int dim_out[4] = {1, 1, 1, 1};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        if (reduction == MSELossObj::None) {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
        } else {
            checkCnnlError(cnnlSetTensorDescriptor(
                cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out));
        }
        cnnlStatus_t stat;
        if (reduction == MSELossObj::None) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else if (reduction == MSELossObj::Sum) {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_SUM, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        } else {
            stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_MEAN, aDesc,
                               aData, bDesc, bData, cDesc, cData);
        }

        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

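The three branches above map MSELossObj::{None, Sum, Mean} onto CNNL_MSE_LOSS_{NONE, SUM, MEAN}. The reduction semantics assumed for those modes, as a plain-C++ reference (sketch, not the CNNL API):

#include <cstddef>
#include <vector>

enum class Reduction { None, Sum, Mean };

// None: out[i] = (a[i]-b[i])^2; Sum/Mean: a single reduced value.
std::vector<float> mseLossRef(const std::vector<float> &a,
                              const std::vector<float> &b, Reduction r) {
    std::vector<float> out;
    float acc = 0.0f;
    for (std::size_t i = 0; i < a.size(); ++i) {
        float d = a[i] - b[i];
        if (r == Reduction::None)
            out.push_back(d * d);
        else
            acc += d * d;
    }
    if (r == Reduction::Sum)
        out.push_back(acc);
    if (r == Reduction::Mean)
        out.push_back(acc / a.size());
    return out;
}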
class PowerCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        size_t wsSize;
        cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlPow(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                    aDesc, aData, bDesc, bData, wsData, wsSize, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorDivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlFloorDiv_v2(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorDivTruncCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorDivTruncWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                          cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat = cnnlFloorDivTrunc(
            context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc,
            aData, bDesc, bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class FloorModCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc,
                                     cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData,
                         cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

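FloorDiv, FloorDivTrunc, and FloorMod only disagree for operands of mixed sign: floor rounds the quotient toward negative infinity, trunc toward zero, and floor-mod takes its sign from the divisor. A standalone illustration, assuming cnnl follows these usual conventions:

#include <cmath>
#include <cstdio>

int main() {
    float a = -7.0f, b = 2.0f; // quotient -3.5
    std::printf("floor: %g\n", std::floor(a / b)); // -4 (toward -inf)
    std::printf("trunc: %g\n", std::trunc(a / b)); // -3 (toward zero)
    // Floor-mod pairs with floor division: a - floor(a/b)*b -> 1.
    std::printf("floorMod: %g\n", a - std::floor(a / b) * b);
    return 0;
}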
class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ElementWiseObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc,
                                              bDesc, cDesc, &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSquaredDifference(context->cnnlHandle(), aDesc, aData, bDesc,
                                  bData, cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class AddcdivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcdivObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcdivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcdiv(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));
    }
};

class AddcmulCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcmulObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcmulWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcmul(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));
    }
};

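The semantics assumed for cnnlAddcdiv and cnnlAddcmul mirror the usual addcdiv/addcmul definitions, with `alpha` scaling the second term (a sketch of the assumed math, not taken from the CNNL docs):

#include <cstddef>
#include <vector>

// addcdiv: o[i] = a[i] + alpha * b[i] / c[i]
void addcdivRef(const std::vector<float> &a, const std::vector<float> &b,
                const std::vector<float> &c, float alpha,
                std::vector<float> &o) {
    for (std::size_t i = 0; i < a.size(); ++i)
        o[i] = a[i] + alpha * b[i] / c[i];
}

// addcmul: o[i] = a[i] + alpha * b[i] * c[i]
void addcmulRef(const std::vector<float> &a, const std::vector<float> &b,
                const std::vector<float> &c, float alpha,
                std::vector<float> &o) {
    for (std::size_t i = 0; i < a.size(); ++i)
        o[i] = a[i] + alpha * b[i] * c[i];
}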
// class FloorModTruncCnnl : public BangKernelWithoutConfig {
//     void compute(const Operator &_op,
//                  const RuntimeObj *_context) const override {
//         auto op = as<ElementWiseObj>(_op);
//         auto context = dynamic_cast<const BangRuntimeObj *>(_context);
//
//         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
//         void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
//         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
//
//         cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
//         auto dim = op->getInputs(0)->getDims();
//         if (dim.size() != 4)
//             IT_TODO_HALT();
//
//         int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
//         // get inputs
//         checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         // get outputs
//         checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
//         checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
//                                                CNNL_DTYPE_FLOAT, 4,
//                                                dim_array));
//
//         size_t wsSize;
//         cnnlGetFloorModTruncWorkspaceSize(context->cnnlHandle(), aDesc,
//                                           bDesc, cDesc, &wsSize);
//
//         BangPtr wsData = context->getWorkspace(wsSize);
//
//         cnnlStatus_t stat = cnnlFloorModTrunc(context->cnnlHandle(),
//                                               aDesc, aData, bDesc, bData,
//                                               cDesc, cData, wsData, wsSize);
//         if (stat != CNNL_STATUS_SUCCESS)
//             return;
//
//         // Destruction in BANG does not require sync. But cnnl does not
//         // state whether sync is required before destruction.
//         checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
//         checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
//         checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
//     }
// };

class AddCnnl : public ElementWiseCnnl {
    cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; }
};

@ -88,6 +854,56 @@ class ElementWiseBang : public BangKernelWithoutConfig {
    }
};

class EqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_EQ; }
};
class NotEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NE; }
};
class GreaterThanCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GT; }
};
class GreaterEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GE; }
};
class LessThanCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LT; }
};
class LessEqualCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LE; }
};
class AndCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_AND; }
};
class OrCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_OR; }
};
class XorCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_XOR; }
};
class NotCnnl : public LogicOpCnnl {
    cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NOT; }
};

class BitAndCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BAND_OP; }
};
class BitOrCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BOR_OP; }
};
class BitXorCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BXOR_OP; }
};
class BitNotCnnl : public BitComputeCnnl {
    cnnlBitComputeOp_t getOpType() const override { return CNNL_BNOT_OP; }
};
// class BitLeftShiftCnnl : public BitComputeCnnl {
//     cnnlBitComputeOp_t getOpType() const override {
//         return CNNL_BLEFT_SHIFT_OP_V2;
//     }
// };
// class BitRightShiftCnnl : public BitComputeCnnl {
//     cnnlBitComputeOp_t getOpType() const override {
//         return CNNL_BLEFT_SHIFT_OP_V2;
//     }
// };

REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl,
                "Add_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
@ -95,8 +911,69 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl,
REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl,
                "Mul_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang,
                "Div_Bang_Float32");
REGISTER_KERNEL(Device::BANG, OpType::DivDemo, DataType::Float32,
                ElementWiseBang, "DivDemo_Bang_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl,
                "Div_cnnl_Float32");
REGISTER_KERNEL(Device::BANG, OpType::DivNoNan, DataType::Float32, DivNoNanCnnl,
                "DivNoNan_cnnl_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Maximum, DataType::Float32, MaximumCnnl,
                "Maximum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Minimum, DataType::Float32, MinimumCnnl,
                "Minimum_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl,
                "MSELoss_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Power, DataType::Float32, PowerCnnl,
                "Power_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl,
                "FloorDiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorDivTrunc, DataType::Float32,
                FloorDivTruncCnnl, "FloorDivTrunc_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl,
                "FloorMod_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32,
                SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl,
                "Equal_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::NotEqual, DataType::Float32, NotEqualCnnl,
                "NotEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::GreaterThan, DataType::Float32,
                GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::GreaterEqual, DataType::Float32,
                GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::LessThan, DataType::Float32, LessThanCnnl,
                "LessThan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::LessEqual, DataType::Float32,
                LessEqualCnnl, "LessEqual_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl,
                "And_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl,
                "Or_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
                "Xor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
                "Not_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::Addcdiv, DataType::Float32, AddcdivCnnl,
                "Addcdiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Addcmul, DataType::Float32, AddcmulCnnl,
                "Addcmul_cnnl_BANG_Float32");

REGISTER_KERNEL(Device::BANG, OpType::BitAnd, DataType::Float32, BitAndCnnl,
                "BitAnd_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitOr, DataType::Float32, BitOrCnnl,
                "BitOr_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitXor, DataType::Float32, BitXorCnnl,
                "BitXor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::BitNot, DataType::Float32, BitNotCnnl,
                "BitNot_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32,
//                 BitLeftShiftCnnl, "BitLeftShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32,
//                 BitRightShiftCnnl, "BitRightShift_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::FloorModTrunc, DataType::Float32,
//                 FloorModTruncCnnl, "FloorModTrunc_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32,
//                 ElementWiseBang, "Pow_Bang_Float32");
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ErfCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl,
                "Erf_cnnl_BANG_Float32");

}; // namespace infini
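cnnlErf_v2 is assumed to apply the Gauss error function erf(x) = (2/sqrt(pi)) * integral from 0 to x of e^(-t^2) dt elementwise, i.e. to agree with std::erf on the host; a one-line reference check (sketch):

#include <cmath>
#include <cstdio>

int main() {
    // std::erf is the host-side reference point: erf(1.0) ~= 0.842701.
    std::printf("%f\n", std::erf(1.0));
    return 0;
}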
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ExpCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl,
                "Exp_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FillCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<FillObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float value = op->getValue();

        cnnlTensorDescriptor_t cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFill(context->cnnlHandle(), value, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Fill, DataType::Float32, FillCnnl,
                "Fill_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FlipCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<FlipObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        vector<int> axis = op->getAxis();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // Flip preserves the shape, so aDesc doubles as the output descriptor.
        cnnlStatus_t stat = cnnlFlip(context->cnnlHandle(), axis.data(),
                                     axis.size(), aDesc, aData, aDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destruction in BANG does not require sync. But cnnl does not state
        // whether sync is required before destruction.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Flip, DataType::Float32, FlipCnnl,
                "Flip_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class FloorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Floor, DataType::Float32, FloorCnnl,
                "Floor_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class HardtanhCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<HardtanhObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        float min = op->getMin();
        float max = op->getMax();

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData,
                                         max, min, aDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Hardtanh, DataType::Float32,
                HardtanhCnnl, "Hardtanh_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class L2LossCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<L2LossObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::L2Loss, DataType::Float32, L2LossCnnl,
                "L2Loss_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,62 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class LogCnnl : public BangKernelWithoutConfig {
    virtual cnnlLogBase_t getOpType() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                       getOpType(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

class LogECnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_E; }
};
class Log2Cnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_2; }
};
class Log10Cnnl : public LogCnnl {
    cnnlLogBase_t getOpType() const override { return CNNL_LOG_10; }
};

REGISTER_KERNEL(Device::BANG, OpType::Log_e, DataType::Float32, LogECnnl,
                "Loge_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Log_2, DataType::Float32, Log2Cnnl,
                "Log2_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Log_10, DataType::Float32, Log10Cnnl,
                "Log10_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class Log1pCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlLog1p(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                      aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Log1p, DataType::Float32, Log1pCnnl,
                "Log1p_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,54 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class LrnCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<LrnObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int lrn_n = op->getFeatureNum();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        float bias = op->getBias();

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetLrnWorkspaceSize(context->cnnlHandle(), aDesc, cDesc, lrn_n,
                                &wsSize);

        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlLrn(context->cnnlHandle(), CNNL_LRN_CROSS_CHANNEL,
                    (unsigned int)lrn_n, double(alpha), double(beta),
                    double(bias), wsData, wsSize, aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Lrn, DataType::Float32, LrnCnnl,
                "Lrn_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,51 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/element_wise.h"

namespace infini {
class MulNCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MulNObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numInputs();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dim_array));
        }

        cnnlStatus_t stat =
            cnnlMulN(context->cnnlHandle(), descArray, argv, num, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MulN, DataType::Float32, MulNCnnl,
                "MulN_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class NegTensorCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::NegTensor, DataType::Float32,
                NegTensorCnnl, "NegTensor_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,65 @@
#include "operators/pad.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class PadCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PadObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        // The unpadded input dims seed aDesc; the padded dims computed below
        // seed cDesc.
        auto dim = op->getInputs(0)->getDims();
        int dim_size = dim.size();
        int dim_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dim_array[i] = dim[i];
        }
        int paddings[dim_size * 2];
        std::vector<int> pads = op->getPads();
        if (pads.size() == 2 && dim_size != 1) {
            // A two-element pad list is broadcast to every dimension.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[0];
                paddings[i + 1] = pads[1];
            }
        } else {
            // ONNX-style pads: all leading pads first, then all trailing pads.
            for (int i = 0; i < dim_size * 2; i += 2) {
                paddings[i] = pads[i / 2];
                paddings[i + 1] = pads[i / 2 + dim_size];
            }
        }
        int dimout_array[dim_size];
        for (int i = 0; i < dim_size; ++i) {
            dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1];
        }
        float paddingValue = 0.0;
        // input
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array));
        // output
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
                                               CNNL_DTYPE_FLOAT, dim_size,
                                               dimout_array));

        cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
                                    paddings, &paddingValue, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Pad, DataType::Float32, PadCnnl,
                "Pad_cnnl_BANG_Float32");

}; // namespace infini
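The padding branch above is the subtle part: cnnlPad takes the pad list interleaved per dimension (entry 2i pads before dimension i, entry 2i + 1 after it), while the operator stores ONNX-style pads with all leading values first and all trailing values second. A standalone sketch of that mapping, assuming a full-length pad list (hypothetical helper, not part of this commit):

#include <vector>

// [b0, b1, ..., e0, e1, ...]  ->  [b0, e0, b1, e1, ...]
std::vector<int> interleavePads(const std::vector<int> &pads, int ndim) {
    std::vector<int> out(2 * ndim);
    for (int i = 0; i < ndim; ++i) {
        out[2 * i] = pads[i];            // pad before dimension i
        out[2 * i + 1] = pads[i + ndim]; // pad after dimension i
    }
    return out;
}

// interleavePads({1, 0, 2, 0, 1, 0, 2, 0}, 4) yields {1, 1, 0, 0, 2, 2, 0, 0}.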
@ -0,0 +1,73 @@
#include "operators/pooling.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class poolingCnnl : public BangKernelWithoutConfig {
    virtual cnnlPoolingMode_t getPoolingMode() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<PoolingObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outData = (op->getOutput()->getRawDataPtr<void *>());

        const auto [n, c, h, w, kh, kw] = op->getNCHWRS();
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();

        // get inputs
        int inArray[4] = {n, c, h, w};
        cnnlTensorDescriptor_t inDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
        checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, inArray));

        // get pooling descriptor
        cnnlPoolingDescriptor_t poolingDesc;
        checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
        checkCnnlError(cnnlSetPooling2dDescriptor_v2(
            poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
            ph, pw, pw, sh, sw, dh, dw, false));

        // get outputs
        auto outVec = op->getOutput()->getDims();
        int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
        cnnlTensorDescriptor_t outDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
        checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, outArray));
        size_t wsSize;
        cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(),
                                    outVec[3], outVec[2], &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        float alpha = 1.f, beta = 0.f;
        checkCnnlError(cnnlPoolingForward(context->cnnlHandle(), poolingDesc,
                                          &alpha, inDesc, inData, &beta,
                                          outDesc, outData, wsData, wsSize));

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
        checkCnnlError(cnnlDestroyPoolingDescriptor(poolingDesc));
    }
};

class maxPoolCnnl : public poolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_MAX;
    }
};

class avgPoolCnnl : public poolingCnnl {
    cnnlPoolingMode_t getPoolingMode() const override {
        return CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MaxPool, DataType::Float32, maxPoolCnnl,
                "MaxPool_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::AvgPool, DataType::Float32, avgPoolCnnl,
                "AvgPool_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class ReciprocalCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Reciprocal, DataType::Float32,
                ReciprocalCnnl, "Reciprocal_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class RsqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                         aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Rsqrt, DataType::Float32, RsqrtCnnl,
                "Rsqrt_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,69 @@
#include "operators/split.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class SplitCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<SplitObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
        int num = op->numOutputs();
        int axis = op->getDim();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getOutput(i)->getRawDataPtr<void *>();
        }
        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t desc;

        int dimout_array[num][4];
        for (int i = 0; i < num; ++i) {
            auto dim = op->getOutput(i)->getDims();
            if (dim.size() != 4) {
                IT_TODO_HALT();
            }
            dimout_array[i][0] = dim[0];
            dimout_array[i][1] = dim[1];
            dimout_array[i][2] = dim[2];
            dimout_array[i][3] = dim[3];
        }
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4) {
            IT_TODO_HALT();
        }
        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(
                cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW,
                                        CNNL_DTYPE_FLOAT, 4, dimout_array[i]));
        }

        size_t wsSize;
        cnnlGetSplitWorkspaceSize(context->cnnlHandle(), num, &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlSplit(context->cnnlHandle(), num, axis, desc, inputData,
                      wsData, wsSize, descArray, argv);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Split, DataType::Float32, SplitCnnl,
                "Split_cnnl_BANG_Float32");
}; // namespace infini
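The per-output descriptor array is what lets cnnlSplit emit chunks of different sizes; for the common even split the shape bookkeeping reduces to dividing one axis. A minimal sketch of that shape math under the kernel's 4-D restriction (hypothetical helper, assuming the split divides evenly; not part of this commit):

#include <array>
#include <vector>

// Shapes of `num` equal chunks of `in` along `axis`.
std::vector<std::array<int, 4>> splitShapes(std::array<int, 4> in, int axis,
                                            int num) {
    std::vector<std::array<int, 4>> out(num, in);
    for (auto &s : out)
        s[axis] = in[axis] / num;
    return out;
}

// splitShapes({2, 8, 4, 4}, 1, 2) yields two chunks of {2, 4, 4, 4}.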
@ -0,0 +1,47 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class SqrtCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat =
            cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION,
                        aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sqrt, DataType::Float32, SqrtCnnl,
                "Sqrt_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,42 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class TransformCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransformObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t cDesc;
        auto dim = op->getOutput()->getDims();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        cnnlStatus_t stat = cnnlTransform(context->cnnlHandle(), &alpha, cDesc,
                                          aData, &beta, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Transform, DataType::Float32,
                TransformCnnl, "Transform_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,60 @@
#include "operators/transpose.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class TransposeCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dimin = op->getInputs(0)->getDims();
        auto dimout = op->getOutput()->getDims();
        if (dimin.size() != 4 || dimout.size() != 4)
            IT_TODO_HALT();

        int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]};
        int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array));

        // get op descriptor
        auto permute = op->getPermute();
        cnnlTransposeDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
        checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));

        size_t wsSize;
        cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc,
                                      &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlTranspose_v2(context->cnnlHandle(), opDesc, aDesc, aData,
                             cDesc, cData, wsData, wsSize);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Transpose, DataType::Float32,
                TransposeCnnl, "Transpose_cnnl_BANG_Float32");
}; // namespace infini
@ -0,0 +1,184 @@
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"
#include "operators/unary.h"

namespace infini {
class TrigonCnnl : public BangKernelWithoutConfig {
    virtual cnnlTrigonFunctionMode_t getOpType() const = 0;
    virtual cnnlComputationPreference_t getPrefer() const = 0;
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<UnaryObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        cnnlTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // get inputs
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get outputs
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // get op descriptor
        cnnlTrigonDescriptor_t opDesc;
        checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc));
        checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType()));

        cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc,
                                              aDesc, aData, cDesc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return;

        // Destroying descriptors in BANG does not require sync. But CNNL does
        // not state whether a sync is required before the destroy calls.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc));
    }
};

class SinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASIN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOS;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATAN;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class SinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_SINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class CosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_COSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class TanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_TANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ASinHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ASINH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ACosHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ACOSH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

class ATanHCnnl : public TrigonCnnl {
    cnnlTrigonFunctionMode_t getOpType() const override {
        return CNNL_TRIGON_ATANH;
    }
    cnnlComputationPreference_t getPrefer() const override {
        return CNNL_COMPUTATION_HIGH_PRECISION;
    }
};

REGISTER_KERNEL(Device::BANG, OpType::Sin, DataType::Float32, SinCnnl,
                "Sin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Cos, DataType::Float32, CosCnnl,
                "Cos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Tan, DataType::Float32, TanCnnl,
                "Tan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASin, DataType::Float32, ASinCnnl,
                "ASin_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACos, DataType::Float32, ACosCnnl,
                "ACos_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATan, DataType::Float32, ATanCnnl,
                "ATan_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::SinH, DataType::Float32, SinHCnnl,
                "SinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::CosH, DataType::Float32, CosHCnnl,
                "CosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::TanH, DataType::Float32, TanHCnnl,
                "TanH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ASinH, DataType::Float32, ASinHCnnl,
                "ASinH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ACosH, DataType::Float32, ACosHCnnl,
                "ACosH_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::ATanH, DataType::Float32, ATanHCnnl,
                "ATanH_cnnl_BANG_Float32");

}; // namespace infini
@ -0,0 +1,50 @@
#include "operators/unary.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {
class LrnCudnn : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<LrnObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
        int lrn_n = op->getFeatureNum();
        float alpha = op->getAlpha();
        float beta = op->getBeta();
        float bias = op->getBias();

        cudnnTensorDescriptor_t aDesc, cDesc;
        auto dim = op->getOutput()->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, dim[0],
                                                   dim[1], dim[2], dim[3]));
        checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW,
                                                   CUDNN_DATA_FLOAT, dim[0],
                                                   dim[1], dim[2], dim[3]));

        cudnnLRNDescriptor_t lrn_desc;
        checkCudnnError(cudnnCreateLRNDescriptor(&lrn_desc));
        checkCudnnError(cudnnSetLRNDescriptor(lrn_desc, (unsigned int)lrn_n,
                                              (double)alpha, (double)beta,
                                              (double)bias));
        // The scalars passed to cudnnLRNCrossChannelForward are cuDNN
        // blending factors (result = blendAlpha * LRN(x) + blendBeta * y),
        // not the LRN parameters; those are carried by lrn_desc.
        float blendAlpha = 1.f, blendBeta = 0.f;
        cudnnStatus_t stat = cudnnLRNCrossChannelForward(
            context->cudnnHandle(), lrn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1,
            &blendAlpha, aDesc, aData, &blendBeta, cDesc, cData);
        if (stat != CUDNN_STATUS_SUCCESS)
            return;

        // Destroying descriptors in CUDA does not require sync. But cuDNN
        // does not state whether a sync is required before the destroy calls.
        checkCudnnError(cudnnDestroyLRNDescriptor(lrn_desc));
        checkCudnnError(cudnnDestroyTensorDescriptor(aDesc));
        checkCudnnError(cudnnDestroyTensorDescriptor(cDesc));
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Lrn, DataType::Float32, LrnCudnn,
                "Lrn_cudnn_CUDA_Float32");

}; // namespace infini
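For reference on the blend scalars used above: cuDNN forward routines compute result = blendAlpha * op(x) + blendBeta * y, so blendAlpha = 1 and blendBeta = 0 simply overwrite the destination. The LRN parameters themselves live in lrn_desc; by the documented cuDNN convention (stated here from the library docs, not verified against this commit) the per-channel response is

    y_c = x_c / (bias + (alpha / n) * sum over the n-wide channel window of x^2)^beta

with alpha divided by the window width inside the library.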
@ -0,0 +1,24 @@
#include "operators/transpose.h"
#include "cuda/cuda_transpose.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {

class TransposeCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<TransposeObj>(_op);
        float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
        float *const cData = (op->getOutput()->getRawDataPtr<float *>());

        auto dim = op->getInputs(0)->getDims();
        int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
        auto permute = op->getPermute();
        transpose_kernel(aData, cData, n, c, h, w, permute[0], permute[1],
                         permute[2], permute[3]);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
                TransposeCuda, "Transpose_CUDA_Float32");
}; // namespace infini
@ -0,0 +1,43 @@
#include "cuda/cuda_transpose.h"
#include <stdio.h>
#include <math.h>

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

__global__ void _transpose_kernel(float *a, float *c, int dim_0, int dim_1,
                                  int dim_2, int dim_3, int p_0, int p_1,
                                  int p_2, int p_3) {

    int src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
    int stride_dim[4] = {dim_1 * dim_2 * dim_3, dim_2 * dim_3, dim_3, 1};
    int permute[4] = {p_0, p_1, p_2, p_3};
    int dst_dim[4] = {src_dim[p_0], src_dim[p_1], src_dim[p_2], src_dim[p_3]};
    int n = dim_0 * dim_1 * dim_2 * dim_3;

    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        // Decompose the linear destination index into 4-D destination
        // coordinates, then map each coordinate through the permutation onto
        // the source stride it walks over.
        int c0_index = i / (dst_dim[1] * dst_dim[2] * dst_dim[3]);
        int c1_index = (i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) /
                       (dst_dim[2] * dst_dim[3]);
        int c2_index = ((i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) %
                        (dst_dim[2] * dst_dim[3])) /
                       dst_dim[3];
        int c3_index = ((i % (dst_dim[1] * dst_dim[2] * dst_dim[3])) %
                        (dst_dim[2] * dst_dim[3])) %
                       dst_dim[3];
        int new_0 = c0_index * stride_dim[permute[0]];
        int new_1 = c1_index * stride_dim[permute[1]];
        int new_2 = c2_index * stride_dim[permute[2]];
        int new_3 = c3_index * stride_dim[permute[3]];
        int src_address = new_0 + new_1 + new_2 + new_3;
        c[i] = a[src_address];
    }
}

namespace infini {
void transpose_kernel(float *a, float *c, int dim_0, int dim_1, int dim_2,
                      int dim_3, int p_0, int p_1, int p_2, int p_3) {
    int blocksize = block_work_size();
    int gridsize = (dim_0 * dim_1 * dim_2 * dim_3 + block_work_size() - 1) /
                   block_work_size();
    // CUDA launch syntax is <<<gridDim, blockDim>>>.
    _transpose_kernel<<<gridsize, blocksize>>>(a, c, dim_0, dim_1, dim_2,
                                               dim_3, p_0, p_1, p_2, p_3);
}

}; // namespace infini
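As a sanity check on the destination-index decomposition in _transpose_kernel, the same arithmetic can be run on the host. This is a minimal sketch (hypothetical harness, not part of this commit) that mirrors the kernel's coordinate math and checks one element of a 2x3x4x5 NCHW-to-NHWC permute:

#include <cassert>
#include <vector>

// Host-side reference for the kernel's index math (hypothetical helper).
std::vector<float> transpose_ref(const std::vector<float> &a, int d0, int d1,
                                 int d2, int d3, int p[4]) {
    int src_dim[4] = {d0, d1, d2, d3};
    int stride[4] = {d1 * d2 * d3, d2 * d3, d3, 1};
    int dst[4] = {src_dim[p[0]], src_dim[p[1]], src_dim[p[2]], src_dim[p[3]]};
    std::vector<float> c(a.size());
    for (int i = 0; i < (int)a.size(); ++i) {
        int i0 = i / (dst[1] * dst[2] * dst[3]);
        int i1 = i % (dst[1] * dst[2] * dst[3]) / (dst[2] * dst[3]);
        int i2 = i % (dst[2] * dst[3]) / dst[3];
        int i3 = i % dst[3];
        c[i] = a[i0 * stride[p[0]] + i1 * stride[p[1]] + i2 * stride[p[2]] +
                 i3 * stride[p[3]]];
    }
    return c;
}

int main() {
    int p[4] = {0, 2, 3, 1}; // NCHW -> NHWC
    std::vector<float> a(2 * 3 * 4 * 5);
    for (size_t i = 0; i < a.size(); ++i)
        a[i] = float(i);
    auto c = transpose_ref(a, 2, 3, 4, 5, p);
    // Output element (n=1, h=2, w=3, c=1) must equal input element
    // (n=1, c=1, h=2, w=3).
    assert(c[((1 * 4 + 2) * 5 + 3) * 3 + 1] ==
           a[((1 * 3 + 1) * 4 + 2) * 5 + 3]);
    return 0;
}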
@ -0,0 +1,38 @@
#include "operators/activation_backward.h"

namespace infini {
ActivationBackwardObj::ActivationBackwardObj(OpType type, GraphObj *graph,
                                             Tensor y, Tensor diff_y, Tensor x,
                                             Tensor diff_x)
    : OperatorObj(type, {y, diff_y, x}, {diff_x}) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
ActivationBackwardObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string ActivationBackwardObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ActivationBackwardObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ActivationBackwardObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

}; // namespace infini
@ -180,4 +180,79 @@ void ConvTransposed2dObj::setAuxilaryAttributes(PaddingMode mode) {
}
}

void ConvBackwardFilterObj::setAuxilaryAttributes(PaddingMode mode) {
    const Tensor &inputX = inputs[0];
    const Tensor &diffY = inputs[1];
    n = inputX->getDims()[0], c = inputX->getDims()[1],
    h = inputX->getDims()[2], w = inputX->getDims()[3],
    f = diffY->getDims()[0], r = diffY->getDims()[2],
    s = diffY->getDims()[3];
    if (mode == PaddingMode::Same) {
        int oh = h / sh;
        int ow = w / sw;
        ph = (h - oh * sh + (r - sh) * dh) / 2;
        pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (mode == PaddingMode::Valid) {
        ph = pw = 0;
    }
}

ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
                                             Tensor diffY, Tensor diffW,
                                             int ph, int pw, int sh, int sw,
                                             int dh, int dw, Tensor bias,
                                             ActType act)
    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, ph, pw, sh, sw, dh, dw,
                  inputX, diffY),
      act(act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(PaddingMode::Other);
    IT_ASSERT(checkValid(graph));
}

ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX,
                                             Tensor diffY, Tensor diffW,
                                             PaddingMode mode, int sh, int sw,
                                             int dh, int dw, Tensor bias,
                                             ActType act)
    : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, mode, sh, sw, dh, dw,
                  inputX, diffY),
      act(act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(mode);
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
ConvBackwardFilterObj::inferShape(const TensorVec &inputs) const {
    const auto &inputX = inputs[0], &diffY = inputs[1];
    auto n = inputX->getDims()[0];
    auto h = inputX->getDims()[2];
    auto w = inputX->getDims()[3];
    auto f = diffY->getDims()[0];
    auto r = diffY->getDims()[2];
    auto s = diffY->getDims()[3];
    int on = n, oc = f;
    int oh = 0, ow = 0;
    // For NCHW+FCRS layout, C of input is divisible by C of weight
    if (inputX->getDims()[1] % diffY->getDims()[1] != 0)
        return {};
    // Set padding size
    if (padding == PaddingMode::Other) {
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    } else if (padding == PaddingMode::Same) {
        oh = h / sh;
        ow = w / sw;
        // ph = (h - oh * sh + (r - sh) * dh) / 2;
        // pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (padding == PaddingMode::Valid) {
        int ph = 0;
        int pw = 0;
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    }
    return {{{on, oc, oh, ow}}};
}

} // namespace infini
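As a concrete instance of the PaddingMode::Other branch above: with h = 32, r = 3, sh = dh = 1 and ph = 1, the code gives oh = (h - (r - sh) * dh + ph * 2) / sh = (32 - 2 + 2) / 1 = 32, i.e. a stride-1, dilation-1, pad-1 3x3 window preserves the spatial size. For sh = dh = 1 this coincides with the usual convolution shape rule oh = (h + 2 * ph - dh * (r - 1) - 1) / sh + 1 = (32 + 2 - 2 - 1) / 1 + 1 = 32.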
@ -0,0 +1,43 @@
#include "operators/det.h"

namespace infini {
DetObj::DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode)
    : OperatorObj(OpType::Det, {input}, {output}), modeValue(mode) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> DetObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    auto input = A->getDims();
    int length = input.size();
    if (length == 2) {
        std::vector<int> output = {1};
        return {{output}};
    } else {
        std::vector<int> output(input.begin(), input.end() - 2);
        return {{output}};
    }
}

std::string DetObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> DetObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> DetObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

}; // namespace infini
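DetObj::inferShape collapses the trailing two (matrix) dimensions: a plain [n, n] matrix maps to the scalar shape {1}, and any leading dimensions are kept as batch dimensions. A minimal host-side check of that rule (hypothetical harness, not part of this commit):

#include <cassert>
#include <vector>

std::vector<int> detShape(const std::vector<int> &in) {
    if (in.size() == 2)
        return {1}; // a single matrix reduces to a scalar
    return std::vector<int>(in.begin(), in.end() - 2); // keep batch dims
}

int main() {
    assert((detShape({3, 3}) == std::vector<int>{1}));
    assert((detShape({4, 3, 3}) == std::vector<int>{4}));
    return 0;
}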
@ -54,4 +54,220 @@ vector<int> ElementWiseObj::getOpAttrVector() const {
|
|||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
MSELossObj::MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
|
||||
Reduction reduction, Tensor output)
|
||||
: OperatorObj(OpType::MSELoss, {input0, input1}, {output}),
|
||||
reductionMode(reduction) {
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> MSELossObj::inferShape(const TensorVec &inputs) const {
|
||||
const auto A = inputs[0], B = inputs[1];
|
||||
if (A->getDims().size() != B->getDims().size() ||
|
||||
A->getDims() != B->getDims())
|
||||
return {};
|
||||
|
||||
if (reductionMode == None) {
|
||||
return {{A->getDims()}};
|
||||
} else {
|
||||
Shape temp = {1};
|
||||
return {{temp}};
|
||||
}
|
||||
}
|
||||
|
||||
std::string MSELossObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "input1=" << inputs[1]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> MSELossObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> MSELossObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
AddNObj::AddNObj(GraphObj *graph, int tensorNum, Tensor output, ...)
|
||||
: OperatorObj(OpType::AddN), num(tensorNum) {
|
||||
TensorVec temp;
|
||||
Tensor *start = &output;
|
||||
++start;
|
||||
for (int i = 0; i < num; ++i) {
|
||||
temp.push_back(*start);
|
||||
start++;
|
||||
}
|
||||
setOutputs({output});
|
||||
setInputs(temp);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> AddNObj::inferShape(const TensorVec &inputs) const {
|
||||
// For now,we only process the same dims here, broardcast will be considered
|
||||
// in the opt layer.
|
||||
const auto A = inputs[0];
|
||||
return {{A->getDims()}};
|
||||
}
|
||||
|
||||
std::string AddNObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> AddNObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> AddNObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
MulNObj::MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...)
|
||||
: OperatorObj(OpType::MulN), num(tensorNum) {
|
||||
TensorVec temp;
|
||||
Tensor *start = &output;
|
||||
++start;
|
||||
for (int i = 0; i < num; ++i) {
|
||||
temp.push_back(*start);
|
||||
start++;
|
||||
}
|
||||
setOutputs({output});
|
||||
setInputs(temp);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>> MulNObj::inferShape(const TensorVec &inputs) const {
|
||||
// For now,we only process the same dims here, broardcast will be considered
|
||||
// in the opt layer.
|
||||
const auto A = inputs[0];
|
||||
return {{A->getDims()}};
|
||||
}
|
||||
|
||||
std::string MulNObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << "input0=" << inputs[0]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// use output dim or inputs dim?
|
||||
vector<int> MulNObj::getWorkloadVector() const {
|
||||
vector<int> ret = outputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
vector<int> MulNObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}

AddcdivObj::AddcdivObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcdiv, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
AddcdivObj::inferShape(const TensorVec &inputs) const {
    // For now we only handle identical dims; broadcasting will be handled
    // in the opt layer. All three inputs must share one shape.
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};

    return {{A->getDims()}};
}

std::string AddcdivObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

// use output dim or inputs dim?
vector<int> AddcdivObj::getWorkloadVector() const {
    vector<int> ret = outputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> AddcdivObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

AddcmulObj::AddcmulObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcmul, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
AddcmulObj::inferShape(const TensorVec &inputs) const {
    // For now we only handle identical dims; broadcasting will be handled
    // in the opt layer. All three inputs must share one shape.
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};

    return {{A->getDims()}};
}

std::string AddcmulObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

// use output dim or inputs dim?
vector<int> AddcmulObj::getWorkloadVector() const {
    vector<int> ret = outputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> AddcmulObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
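
// Semantics note (assuming the usual PyTorch-style definitions, which the
// CNNL kernels are expected to match):
//   addcdiv: out = input0 + alpha * (input1 / input2)
//   addcmul: out = input0 + alpha * (input1 * input2)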

} // namespace infini

@ -0,0 +1,47 @@
#include "operators/transpose.h"

namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
                           int permute[4])
    : OperatorObj(OpType::Transpose, {input}, {output}) {
    transposePermute[0] = permute[0];
    transposePermute[1] = permute[1];
    transposePermute[2] = permute[2];
    transposePermute[3] = permute[3];
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
TransposeObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    auto input = A->getDims();
    auto output = input;

    for (int i = 0; i < 4; ++i) {
        output[i] = input[transposePermute[i]];
    }
    return {{output}};
}
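
// Example: for an NCHW input of dims {1, 3, 32, 32}, permute {0, 2, 3, 1}
// yields output dims {1, 32, 32, 3}, since output[i] = input[permute[i]].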

std::string TransposeObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> TransposeObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> TransposeObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
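
// Usage sketch (`g` and `x` are hypothetical Graph/Tensor names):
//   int perm[4] = {0, 2, 3, 1};
//   auto tr = g->addOp<TransposeObj>(x, nullptr, perm);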

} // namespace infini

@ -32,4 +32,342 @@ vector<int> UnaryObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output, float min,
                 float max)
    : OperatorObj(OpType::Clip, {input}, {output}), minValue(min),
      maxValue(max) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> ClipObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string ClipObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ClipObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ClipObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
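
// Clip clamps element-wise: y = min(max(x, minValue), maxValue), so the
// output shape (and the workload vector above) depends only on the dims.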

HardtanhObj::HardtanhObj(GraphObj *graph, Tensor input, Tensor output,
                         float min, float max)
    : OperatorObj(OpType::Hardtanh, {input}, {output}), minValue(min),
      maxValue(max) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> HardtanhObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string HardtanhObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> HardtanhObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> HardtanhObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
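
// Hardtanh applies the same clamp as Clip, y = min(max(x, minValue),
// maxValue); it is kept as a separate OpType, presumably so it can be
// lowered to the dedicated CNNL activation kernel.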

FlipObj::FlipObj(GraphObj *graph, Tensor input, Tensor output,
                 vector<int> axis)
    : OperatorObj(OpType::Flip, {input}, {output}), axisValue(axis) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> FlipObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string FlipObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> FlipObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> FlipObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

FillObj::FillObj(GraphObj *graph, Tensor input, Tensor output, float value)
    : OperatorObj(OpType::Fill, {input}, {output}), setValue(value) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> FillObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string FillObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> FillObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> FillObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

L2LossObj::L2LossObj(GraphObj *graph, Tensor input, Tensor output)
    : OperatorObj(OpType::L2Loss, {input}, {output}) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> L2LossObj::inferShape(const TensorVec &inputs) const {
    Shape temp = {1};
    return {{temp}};
}

std::string L2LossObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> L2LossObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> L2LossObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
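
// L2Loss reduces the whole input to a single scalar, hence the fixed {1}
// output shape in inferShape. The usual definition (e.g. tf.nn.l2_loss) is
// sum(x^2) / 2, which the BANG kernel is assumed to follow.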

TransformObj::TransformObj(GraphObj *graph, Tensor input, Tensor output,
                           float alpha, float beta)
    : OperatorObj(OpType::Transform, {input}, {output}), alphaValue(alpha),
      betaValue(beta) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
TransformObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string TransformObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> TransformObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> TransformObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type)
    : OperatorObj(OpType::Cast, {input}, {output}), castType(type) {
    IT_ASSERT(checkValid(graph, DataType::Int32));
}

optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string CastObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> CastObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> CastObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
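
// Cast keeps the input shape and only changes the element type. The
// constructor validates against DataType::Int32 because the only CastType
// exercised so far (see test_cast.cc below) is CastObj::Float2Int32.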

CumsumObj::CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis,
                     bool exclusive, bool reverse)
    : OperatorObj(OpType::Cumsum, {input}, {output}), axisValue(axis),
      exclusiveValue(exclusive), reverseValue(reverse) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> CumsumObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string CumsumObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> CumsumObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> CumsumObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
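
// Flag semantics, assuming the usual cumsum convention: for input
// [1, 2, 3] along the chosen axis,
//   exclusive=false, reverse=false -> [1, 3, 6]
//   exclusive=true,  reverse=false -> [0, 1, 3]
//   exclusive=false, reverse=true  -> [6, 5, 3]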

// CumprodObj::CumprodObj(GraphObj *graph, Tensor input, Tensor output, int
// axis, bool exclusive, bool reverse)
//     : OperatorObj(OpType::Cumprod, {input}, {output}), axisValue(axis),
//     exclusiveValue(exclusive), reverseValue(reverse) {
//     IT_ASSERT(checkValid(graph));
// }
//
// optional<vector<Shape>> CumprodObj::inferShape(const TensorVec &inputs) const
// {
//     const auto A = inputs[0];
//     return {{A->getDims()}};
// }
//
// std::string CumprodObj::toString() const {
//     std::ostringstream os;
//     os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
//     os << "(";
//     os << "output=" << outputs[0]->getGuid() << ")";
//     return os.str();
// }
//
// vector<int> CumprodObj::getWorkloadVector() const {
//     vector<int> ret{enum_to_underlying(type)};
//     const Shape shape = outputs[0]->getDims();
//     ret.insert(ret.end(), shape.begin(), shape.end());
//     return ret;
// }
//
// vector<int> CumprodObj::getOpAttrVector() const {
//     return {enum_to_underlying(type)};
// }

ArangeObj::ArangeObj(GraphObj *graph, float start, float step, int length,
                     Tensor output)
    : OperatorObj(OpType::Arange, {}, {output}), startValue(start),
      stepValue(step), lengthValue(length) {
    IT_ASSERT(checkValid(graph, DataType::Float32));
}

optional<vector<Shape>> ArangeObj::inferShape(const TensorVec &inputs) const {
    Shape temp = {lengthValue};
    return {{temp}};
}

std::string ArangeObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(outputs[0]->getDims()) << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ArangeObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> ArangeObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
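
// Example (matches test_arange.cc below): start=0.0, step=2.0, length=10
// produces a 1-D tensor of shape {10} holding 0, 2, 4, ..., 18.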

LrnObj::LrnObj(GraphObj *graph, Tensor input, Tensor output, int feature_num,
               float alpha, float beta, float bias)
    : OperatorObj(OpType::Lrn, {input}, {output}),
      featureNumValue(feature_num), alphaValue(alpha), betaValue(beta),
      biasValue(bias) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> LrnObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string LrnObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> LrnObj::getWorkloadVector() const {
    vector<int> ret{enum_to_underlying(type)};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> LrnObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
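
// LRN presumably follows the standard local response normalization,
//   b[i] = a[i] / (bias + (alpha / n) * sum_{j in window} a[j]^2)^beta,
// with featureNumValue giving the window size n across channels.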

} // namespace infini

@ -0,0 +1,56 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/activation_backward.h"
#include "operators/element_wise.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T, class D>
void testActivationBackward(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor yCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    Tensor diffYCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    Tensor xCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);

    yCpu->dataMalloc();
    diffYCpu->dataMalloc();
    xCpu->dataMalloc();

    yCpu->setData(generator);
    diffYCpu->setData(generator);
    xCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto yGpu = bangGraph->cloneTensor(yCpu);
    auto diffYGpu = bangGraph->cloneTensor(diffYCpu);
    auto xGpu = bangGraph->cloneTensor(xCpu);
    auto gpuOp = bangGraph->addOp<T>(yGpu, diffYGpu, xGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto diffXGpu = gpuOp->getOutput();

    // Smoke test only: the forward op type D is not yet used to build a
    // reference result, so no numerical check happens here.
    EXPECT_TRUE(1);
}

TEST(cnnl_ActivationBackward, run) {
    testActivationBackward<ReluBackwardObj, ReluObj>(IncrementalGenerator(),
                                                     Shape{1, 2, 2, 3});
    testActivationBackward<SigmoidBackwardObj, SigmoidObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3});
    testActivationBackward<TanhBackwardObj, TanhObj>(IncrementalGenerator(),
                                                     Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testAddcdiv(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    Tensor inputCpu3 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu3->dataMalloc();
    inputCpu3->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1f;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addcdiv, run) {
    testAddcdiv<AddcdivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testAddcmul(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);
    Tensor inputCpu3 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu3->dataMalloc();
    inputCpu3->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1f;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addcmul, run) {
    testAddcmul<AddcmulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testaddN(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(2, nullptr, inputGpu1, inputGpu2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_addN, run) {
    testaddN<AddNObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,35 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testArange() {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    float start = 0.0f;
    float step = 2.0f;
    int length = 10;
    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto gpuOp = bangGraph->addOp<T>(start, step, length, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Arange, run) {
    testArange<ArangeObj>();
}

} // namespace infini

@ -0,0 +1,51 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testBitCompute(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_BitCompute, run) {
    testBitCompute<BitAndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitOrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitXorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testBitCompute<BitNotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCast(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastObj::Float2Int32);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Cast, run) {
    testCast<CastObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCeil(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Ceil, run) {
    testCeil<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testClip(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float min = 1.0f;
    float max = 4.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Clip, run) {
    testClip<ClipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,52 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/concat.h"

#include "test.h"

namespace infini {

template <class T>
void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
                const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp =
        bangGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->print();
    inputCpu1->printData();
    inputCpu2->print();
    inputCpu2->printData();
    outputGpu2Cpu->print();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Concat, run) {
    testConcat<ConcatObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCopy(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(outputGpu2Cpu->equalData(inputCpu));
}

TEST(cnnl_Copy, run) {
    testCopy<CopyObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testCumsum(const std::function<void(void *, size_t, DataType)> &generator,
                int axis, const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, axis, false, false);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Cumsum, run) {
    testCumsum<CumsumObj>(IncrementalGenerator(), 1, Shape{1, 2, 2, 3});
    testCumsum<CumsumObj>(IncrementalGenerator(), 2, Shape{1, 2, 2, 3});
    testCumsum<CumsumObj>(IncrementalGenerator(), 3, Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/det.h"

#include "test.h"

namespace infini {

template <class T>
void testDet(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, DetObj::NormalDet);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Det, run) {
    testDet<DetObj>(IncrementalGenerator(), Shape{1, 1, 2, 2});
}

} // namespace infini

@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testDivDemo(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_DivDemo, run) {
    testDivDemo<DivDemoObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testDivNoNan(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_DivNoNan, run) {
    testDivNoNan<DivNoNanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testErf(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Erf, run) {
    testErf<ErfObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testExp(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Exp, run) {
    testExp<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFill(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float value = 1.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, value);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Fill, run) {
    testFill<FillObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFlip(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, vector<int>{2});
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Flip, run) {
    testFlip<FlipObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testFloor(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Floor, run) {
    testFloor<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorDiv(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorDiv, run) {
    testFloorDiv<FloorDivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,50 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorDivTrunc(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorDivTrunc, run) {
    testFloorDivTrunc<FloorDivTruncObj>(IncrementalGenerator(),
                                        Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testFloorMod(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_FloorMod, run) {
    testFloorMod<FloorModObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testHardtanh(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    float min = 1.0f;
    float max = 4.0f;
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, min, max);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Hardtanh, run) {
    testHardtanh<HardtanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testL2Loss(const std::function<void(void *, size_t, DataType)> &generator,
                const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_L2Loss, run) {
    testL2Loss<L2LossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,42 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLog(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Log, run) {
    testLog<Log_eObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLog<Log_2Obj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLog<Log_10Obj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini

@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLog1p(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

// Renamed from cnnl_Log to avoid a duplicate TEST(cnnl_Log, run)
// registration with test_log.cc above.
TEST(cnnl_Log1p, run) {
    testLog1p<Log1pObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
|
|
@ -0,0 +1,57 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testLogicOp(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_LogicOp, run) {
    testLogicOp<EqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<NotEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<GreaterThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<GreaterEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<LessThanObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<LessEqualObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<AndObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<OrObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<XorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testLogicOp<NotObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
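Because inputCpu1 and inputCpu2 are filled by the same IncrementalGenerator, the expected masks for most of these comparisons are known without a reference kernel: Equal, GreaterEqual, and LessEqual should be all ones, while NotEqual, GreaterThan, and LessThan should be all zeros. A sketch of turning that into an assertion follows; it assumes the kernels emit Float32 0/1 masks, and OneGenerator is assumed to live in test.h next to IncrementalGenerator (if it does not, a two-line custom generator serves the same purpose).

// Fragment for the body of testLogicOp above, EqualObj case: with identical
// inputs the output should be an all-ones mask. OneGenerator and the 0/1
// mask encoding are assumptions, not shown in this diff.
Tensor allOnes = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
allOnes->dataMalloc();
allOnes->setData(OneGenerator());
EXPECT_TRUE(outputGpu2Cpu->equalData(allOnes));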
@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testLrn(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);
    inputCpu->printData();

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 5, 0.0001, 0.75, 2.0);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Lrn, run) {
    testLrn<LrnObj>(IncrementalGenerator(), Shape{1, 10, 3, 3});
}

} // namespace infini
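The four literals passed to addOp above presumably map to the classic cross-channel LRN hyperparameters (window size 5, alpha 1e-4, beta 0.75, bias k 2.0); LrnObj's constructor is not part of this diff, so that ordering is an assumption. Under that reading, a plain CPU reference over a flat NCHW buffer looks like:

#include <algorithm>
#include <cmath>

// Cross-channel LRN reference under the assumed (size, alpha, beta, k)
// reading of the arguments above:
//   y = x / pow(k + alpha / size * sum(x'^2), beta)
// where the sum runs over the `size` channels centred on the current one.
void lrnReference(const float *x, float *y, int n, int c, int h, int w,
                  int size = 5, float alpha = 1e-4f, float beta = 0.75f,
                  float k = 2.0f) {
    for (int in = 0; in < n; ++in)
        for (int ic = 0; ic < c; ++ic)
            for (int ih = 0; ih < h; ++ih)
                for (int iw = 0; iw < w; ++iw) {
                    float sum = 0.0f;
                    int lo = std::max(0, ic - size / 2);
                    int hi = std::min(c - 1, ic + size / 2);
                    for (int j = lo; j <= hi; ++j) {
                        float v = x[((in * c + j) * h + ih) * w + iw];
                        sum += v * v;
                    }
                    int idx = ((in * c + ic) * h + ih) * w + iw;
                    y[idx] = x[idx] / std::pow(k + alpha / size * sum, beta);
                }
}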
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMaximum(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Maximum, run) {
    testMaximum<MaximumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMinimum(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Minimum, run) {
    testMinimum<MinimumObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,57 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMSELoss(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp1 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::None, nullptr);
    auto gpuOp2 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Sum, nullptr);
    auto gpuOp3 =
        bangGraph->addOp<T>(inputGpu1, inputGpu2, MSELossObj::Mean, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu1 = gpuOp1->getOutput();
    auto outputGpu2 = gpuOp2->getOutput();
    auto outputGpu3 = gpuOp3->getOutput();
    auto outputGpu2Cpu1 = outputGpu1->clone(cpuRuntime);
    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
    auto outputGpu2Cpu3 = outputGpu3->clone(cpuRuntime);
    // Check
    outputGpu2Cpu1->printData();
    outputGpu2Cpu2->printData();
    outputGpu2Cpu3->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_MSELoss, run) {
    testMSELoss<MSELossObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
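The three reduction modes are exercised but never defined in this diff. Assuming the usual None/Sum/Mean semantics, the references are short, and since both inputs here come from the same IncrementalGenerator the elementwise squared error is zero, so all three expected results are exactly zero.

#include <cstddef>
#include <numeric>
#include <vector>

// Assumed semantics of the three MSELoss reductions (not spelled out in the
// diff). With identical inputs, every value below comes out 0.
std::vector<float> mseNone(const std::vector<float> &a,
                           const std::vector<float> &b) {
    std::vector<float> out(a.size());
    for (std::size_t i = 0; i < a.size(); ++i)
        out[i] = (a[i] - b[i]) * (a[i] - b[i]); // per-element squared error
    return out;
}
float mseSum(const std::vector<float> &a, const std::vector<float> &b) {
    auto e = mseNone(a, b);
    return std::accumulate(e.begin(), e.end(), 0.0f);
}
float mseMean(const std::vector<float> &a, const std::vector<float> &b) {
    return mseSum(a, b) / a.size();
}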
@ -0,0 +1,48 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testMulN(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(2, nullptr, inputGpu1, inputGpu2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_MulN, run) {
    testMulN<MulNObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testNegTensor(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_NegTensor, run) {
    testNegTensor<NegTensorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

void testNet(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<MulNObj>(2, nullptr, inputGpu1, inputGpu2);
    auto outputGpu = gpuOp->getOutput();
    auto gpuOp2 = bangGraph->addOp<SigmoidObj>(outputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu2 = gpuOp2->getOutput();
    auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime);
    // Check
    inputCpu2->printData();
    outputGpu2Cpu2->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Net, run) {
    testNet(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
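testNet is the one case in this changeset that chains operators: the MulN output feeds a Sigmoid before dataMalloc is called, so it exercises graph-level allocation as well as the kernels. Assuming MulN with a count of 2 is the elementwise product of its two inputs (its constructor is not shown here), both tensors hold 0, 1, 2, ..., so the network computes sigmoid(x * x) and the first few outputs can be checked by hand:

#include <cmath>
#include <cstdio>

// Expected values under the assumed MulN semantics: sigmoid(x * x) for
// x = 0, 1, 2 is ~0.5000, ~0.7311, ~0.9820, which is what the printData
// output above should begin with.
float expectedNet(float x) { return 1.0f / (1.0f + std::exp(-x * x)); }

int main() {
    for (int i = 0; i < 3; ++i)
        std::printf("%.4f\n", expectedNet(float(i)));
}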
@ -42,6 +42,8 @@ void testOptensor(
    cpuRuntime->run(cpuGraph);
    auto outputCpu = cpuOp->getOutput();
    // Check
    outputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
}

@ -49,6 +51,7 @@ TEST(cuDNN_OpTensor, run) {
    testOptensor<AddObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<SubObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<MulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testOptensor<DivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,44 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pad.h"

#include "test.h"

namespace infini {

template <class T>
void testPad(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, vector<int>{1, 1, 1, 1},
                                     vector<int>{0, 3});
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    inputCpu->print();
    inputCpu->printData();
    outputGpu2Cpu->print();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pad, run) {
    testPad<PadObj>(IncrementalGenerator(), Shape{1, 1, 2, 3});
}

} // namespace infini
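A reading of the Pad arguments, hedged because PadObj's constructor is not shown in this diff: pads {1, 1, 1, 1} applied to axes {0, 3} adds one element before and after dimensions 0 and 3, taking Shape{1, 1, 2, 3} to Shape{3, 1, 2, 5}. With all pad values equal to 1 it does not matter whether the vector is interleaved begin/end pairs or all-begins-then-all-ends; the sketch below assumes the interleaved layout.

#include <cassert>
#include <cstddef>
#include <vector>

using ShapeVec = std::vector<int>; // stand-in for infini::Shape

// Assumed shape rule: pads holds a (begin, end) pair per listed axis.
ShapeVec padShape(ShapeVec in, const std::vector<int> &pads,
                  const std::vector<int> &axes) {
    for (std::size_t i = 0; i < axes.size(); ++i)
        in[axes[i]] += pads[2 * i] + pads[2 * i + 1];
    return in;
}

int main() {
    auto out = padShape({1, 1, 2, 3}, {1, 1, 1, 1}, {0, 3});
    assert((out == ShapeVec{3, 1, 2, 5})); // matches the test case above
}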
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/pooling.h"

#include "test.h"

namespace infini {

template <class T>
void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
                 const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pooling, run) {
    testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
    testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
}

} // namespace infini
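The eight integers passed to addOp read most naturally as (kh, kw, dh, dw, ph, pw, sh, sw), the ordering InfiniTensor's pooling operators use elsewhere; treat it as an assumption, since the constructor is not part of this hunk. Under that reading the test runs a 3x3 kernel with dilation 1, padding 1, and stride 2 over a 5x5 input, so the output should be {1, 1, 3, 3}:

#include <cstdio>

// Standard pooled-extent formula for dilation 1 (matching the args above):
// floor((in + 2 * pad - kernel) / stride) + 1.
int pooledExtent(int in, int k, int p, int s) {
    return (in + 2 * p - k) / s + 1;
}

int main() {
    // 5x5 input, 3x3 kernel, padding 1, stride 2 -> extent 3 per axis.
    std::printf("%d\n", pooledExtent(5, 3, 1, 2)); // prints 3
}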
@ -0,0 +1,46 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"

#include "test.h"

namespace infini {

template <class T>
void testPow(const std::function<void(void *, size_t, DataType)> &generator,
             const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    // Check
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Pow, run) {
    testPow<PowerObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,41 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testReciprocal(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Reciprocal, run) {
    testReciprocal<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
@ -0,0 +1,40 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"

#include "test.h"

namespace infini {

template <class T>
void testRound(const std::function<void(void *, size_t, DataType)> &generator,
               const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}

TEST(cnnl_Round, run) {
    testRound<RoundObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}

} // namespace infini
Some files were not shown because too many files have changed in this diff.