forked from jiuyuan/InfiniTensor
Add Conv operator and naive CPU implementation (#5)

* Add: Conv definition
* Add: tensor copy data from vector
* Add: CPU conv kernel
* Fix: replace Int32 with UInt32 in DataType

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
parent a26890abce, commit 9303ddda8e
@@ -81,6 +81,7 @@ if(BUILD_TEST)
  enable_testing()
  if(BUILD_TEST_CORE)
    build_test(test/core/*.cc)
    build_test(test/operators/*.cc)
  endif()
  if(BUILD_TEST_PET)
    build_test(test/pet/*.cc)
@@ -57,6 +57,18 @@ template <typename T> auto enum_to_underlying(T e) {
    return static_cast<std::underlying_type_t<T>>(e);
}

template <typename T> std::string vecToString(const std::vector<T> &vec) {
    std::string ret;
    ret.append("[");
    for (auto d : vec) {
        ret.append(std::to_string(d));
        ret.append(", ");
    }
    ret.pop_back();
    ret.append("]");
    return ret;
}

double timeit(const std::function<void()> &func);

} // namespace infini
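
For reference, timeit (declared above) is what the naive kernels' tune() methods below use to measure a kernel invocation. A minimal, hedged sketch of calling it directly; it assumes a built Graph g, and that the returned value is in milliseconds, as the perf comments in the tests suggest:

// Hedged sketch, not part of this commit: time an arbitrary callable,
// as tune() does in the CPU kernels added below.
double t = timeit([&]() { RunEngine(Device::CPU).run(g); });
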
@@ -16,7 +16,7 @@ class GraphObj : public Object {
    // Graph(OpVec oplist);
    string toString() const override;

-   Tensor addTensor(Shape dim, DataType dtype = DataType::Int32);
+   Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);

    /**
     * @brief Add an operator and create its outputs. Output tensor arguments
@@ -24,8 +24,13 @@ class TensorObj : public TensorBaseObj {
    using TensorBaseObj::getData;
    VType getData(const Shape &pos) const;
    void copyData(VType *dptr);
    void copyData(vector<VType> dataVector);
    void printData() const;
    bool equalData(const Tensor &rhs) const;
    void
    setData(std::function<void(void *, size_t, DataType)> generator) const {
        generator((void *)(data.get()), size(), dtype);
    }
    // void setDims(const Dim &dms) { dims = dms; }

    // bool dataRand(int seed = 0) {
@@ -22,7 +22,7 @@ using VType = uint32_t;

enum class DataType {
    Float32,
-   Int32,
+   UInt32,
};

class TensorBaseObj : public Object {
@@ -0,0 +1,68 @@
#pragma once
#include "core/operator.h"

namespace infini {

class ConvObj : public OperatorObj {
  public:
    // When PaddingMode is Other, ConvObj uses the explicit padding size
    // (ph, pw); otherwise, the padding size is computed from the padding mode.
    enum class PaddingMode {
        Other,
        Same,
        Valid,
    };

  private:
    int ph, pw;
    int sh, sw;
    int dh, dw;
    ActType act;
    PaddingMode padding;
    // auxiliary attributes
    int n, c, h, w, f, r, s;

  public:
    // Constructor for explicitly setting the padding size
    ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph,
            int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
            Tensor bias = nullptr, ActType act = ActType::None);
    // Constructor for setting a padding mode
    ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
            PaddingMode mode = PaddingMode::Same, int sh = 1, int sw = 1,
            int dh = 1, int dw = 1, Tensor bias = nullptr,
            ActType act = ActType::None);

    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return 3; }
    int numOutputs() const override { return 1; }

    Tensor getBias() const { return inputs[2]; }
    ActType getAct() const { return act; }
    PaddingMode getPaddingMode() const { return padding; }
    pair<int, int> inferPaddingSize() const;

    int getDh() const { return dh; }
    int getDw() const { return dw; }
    int getPh() const { return ph; }
    int getPw() const { return pw; }
    int getSh() const { return sh; }
    int getSw() const { return sw; }
    auto getNCHWFRS() const { return tuple(n, c, h, w, f, r, s); }
    auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
    int getChannelPerGroup() const { return inputs[1]->getDims()[1]; }
    int getNumGroups() const { return c / getChannelPerGroup(); }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    /**
     * @brief Set the auxiliary attributes (n, c, h, w, f, r, s) and the
     * padding size (ph, pw) if a padding mode is set. This function should be
     * called in the constructor.
     */
    void setAuxilaryAttributes(PaddingMode mode);
};

} // namespace infini
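
A minimal usage sketch of the class above, mirroring the Conv tests added later in this commit (it assumes the Graph, RunEngine, and IncrementalGenerator helpers that appear elsewhere in this diff):

// Hedged sketch, not part of this commit; taken from the Conv tests below.
Graph g = make_ref<GraphObj>();
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32); // NCHW input
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32); // FCRS weight
// Explicit padding ph = pw = 1; stride and dilation default to 1.
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
// conv->getOutput()->getDims() is Shape{1, 2, 4, 4}, per TEST(Conv, ShapeInference).
g->dataMalloc();
i0->setData(IncrementalGenerator());
w0->setData(IncrementalGenerator());
RunEngine(Device::CPU).run(g);
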
@@ -1,3 +1,35 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include "gtest/gtest.h"

namespace infini {

class DataGenerator {
  private:
    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); };
    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); };

  public:
    void operator()(void *data, size_t size, DataType dataType) {
        switch (dataType) {
        case DataType::UInt32:
            fill(reinterpret_cast<uint32_t *>(data), size);
            break;
        case DataType::Float32:
            fill(reinterpret_cast<float *>(data), size);
            break;
        default:
            IT_TODO_HALT();
        }
    }
};

class IncrementalGenerator : public DataGenerator {
    void fill(uint32_t *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = i;
        }
    }
};
} // namespace infini
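
The generator above dispatches on DataType and forwards to a type-specific fill(), so a custom generator only needs to override the overloads it supports. A hedged sketch following the IncrementalGenerator pattern (OneGenerator is a hypothetical name, not part of this commit):

// Hypothetical example: fills a UInt32 tensor with ones.
class OneGenerator : public DataGenerator {
    void fill(uint32_t *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = 1;
        }
    }
};
// Usage, as in the tests: tensor->setData(OneGenerator());
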
@@ -17,7 +17,7 @@ void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
-           KernelAttrs{device, op->getOpType(), DataType::Int32};
+           KernelAttrs{device, op->getOpType(), DataType::UInt32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
@@ -64,7 +64,7 @@ double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
-           KernelAttrs{device, op->getOpType(), DataType::Int32};
+           KernelAttrs{device, op->getOpType(), DataType::UInt32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
@@ -45,6 +45,10 @@ void TensorObj::copyData(VType *dptr) {
        data[i] = dptr[i];
    }
}
void TensorObj::copyData(vector<VType> dataVector) {
    IT_ASSERT(dataVector.size() >= size());
    copyData(dataVector.data());
}

void TensorObj::printData() const {
    IT_ASSERT(data != nullptr);
@@ -0,0 +1,60 @@
#include "operators/conv.h"
#include "core/kernel.h"

namespace infini {

template <typename T> class NaiveConv : public Kernel {
    void compute(const Operator &_op, const PerfRecord &record) const override {
        auto op = as<ConvObj>(_op);
        T *iptr = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
        T *wptr = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
        T *optr = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
        auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        int cpg = op->getChannelPerGroup();
        int g = op->getNumGroups();
        IT_ASSERT(f % g == 0, "Illegal number of channel");
        auto outDim = op->getOutput()->getDims();
        int oh = outDim[2], ow = outDim[3];
        for (int nn = 0; nn < n; nn++) {
#pragma omp parallel for
            for (int ff = 0; ff < f; ff++) {
                for (int hh = 0; hh < oh; hh++)
                    for (int ww = 0; ww < ow; ww++) {
                        int gidx = ff / (f / g);
                        VType val = 0;
                        for (int cc = 0; cc < cpg; cc++)
                            for (int rr = 0; rr < r; rr++)
                                for (int ss = 0; ss < s; ss++) {
                                    // clang-format off
                                    int posH = hh * sh + rr * dh - ph;
                                    int posW = ww * sw + ss * dw - pw;
                                    if (posH < 0 || posH >= h || posW < 0 || posW >= w)
                                        continue;
                                    auto iOffset = posW + w * (posH + h * ((cc + gidx * cpg) + c * nn)),
                                         wOffset = ss + s * (rr + r * (cc + cpg * ff));
                                    auto inputVal = iptr[iOffset], weightVal = wptr[wOffset];
                                    val += weightVal * inputVal;
                                    // clang-format on
                                }
                        // TODO: check correctness, oh & ow or h & w?
                        auto oOffset = ww + ow * (hh + oh * (ff + f * nn));
                        optr[oOffset] = val;
                    }
            }
        }
    }

    void compute(const Operator &op) const override { compute(op, {}); }

    PerfRecord tune(const Operator &op) const override {
        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
    }
};

REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::UInt32,
                NaiveConv<uint32_t>, "ConvNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::Float32, NaiveConv<float>,
                "ConvNaive_CPU_float32");

} // namespace infini
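
For reference, the iOffset and wOffset expressions in the kernel above are the usual row-major flattening of the NCHW input and FCRS weight layouts. A hedged restatement as a standalone helper (offsetNCHW is an illustrative name, not part of this commit):

// Illustrative only: equivalent to the inline offset math in NaiveConv;
// the outermost extent (n or f) is not needed for the offset itself.
inline int offsetNCHW(int c, int h, int w, int nn, int cc, int hh, int ww) {
    return ((nn * c + cc) * h + hh) * w + ww;
}
// iOffset == offsetNCHW(c, h, w, nn, cc + gidx * cpg, posH, posW)
// wOffset == offsetNCHW(cpg, r, s, ff, cc, rr, ss)
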
@@ -30,7 +30,7 @@ template <typename T> class NaiveMatmul : public Kernel {
    }
};

-REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Int32,
+REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::UInt32,
                NaiveMatmul<uint32_t>, "MatmulNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Float32,
                NaiveMatmul<float>, "MatmulNaive_CPU_float32");
@@ -0,0 +1,101 @@
#include "operators/conv.h"

namespace infini {

ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
                 int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
                 ActType act)
    : OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph),
      pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act),
      padding(PaddingMode::Other) {
    setAuxilaryAttributes(PaddingMode::Other);
    IT_ASSERT(checkValid(graph));
}

ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
                 PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
                 ActType act)
    : OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1),
      pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
    IT_ASSERT(mode != PaddingMode::Other);
    setAuxilaryAttributes(mode);
    IT_ASSERT(checkValid(graph));
}

string ConvObj::toString() const {
    std::ostringstream os;
    os << "Conv[" << getGuid() << "]";
    os << "(";
    if (inputs.size() == 2) {
        os << vecToString(inputs[0]->getDims()) << ",";
        os << vecToString(inputs[1]->getDims()) << ",";
    }
    os << "p=[" << ph << "," << pw << "],";
    os << "s=[" << sh << "," << sw << "],";
    os << "d=[" << dh << "," << dw << "],";
    os << "act=" << enum_to_underlying(act) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "weight=" << inputs[1]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
    const auto &input = inputs[0], &weight = inputs[1];
    auto n = input->getDims()[0];
    auto h = input->getDims()[2];
    auto w = input->getDims()[3];
    auto f = weight->getDims()[0];
    auto r = weight->getDims()[2];
    auto s = weight->getDims()[3];
    int on = n, oc = f;
    int oh = 0, ow = 0;
    // For the NCHW+FCRS layout, C of the input must be divisible by C of the weight
    if (input->getDims()[1] % weight->getDims()[1] != 0)
        return {};
    // Set padding size
    if (padding == PaddingMode::Other) {
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    } else if (padding == PaddingMode::Same) {
        oh = h / sh;
        ow = w / sw;
        // ph = (h - oh * sh + (r - sh) * dh) / 2;
        // pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (padding == PaddingMode::Valid) {
        int ph = 0;
        int pw = 0;
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    }
    return {{{on, oc, oh, ow}}};
}

vector<int> ConvObj::getWorkloadVector() const {
    return {
        enum_to_underlying(type), n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw,
        enum_to_underlying(act)};
}

vector<int> ConvObj::getOpAttrVector() const {
    IT_TODO_HALT(); // should padding mode / ph+pw be in attrs?
    return {enum_to_underlying(type), c, f, r, s, ph, pw, sh, sw, dh, dw,
            enum_to_underlying(act)};
}

void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
    n = inputs[0]->getDims()[0], c = inputs[0]->getDims()[1],
    h = inputs[0]->getDims()[2], w = inputs[0]->getDims()[3],
    f = inputs[1]->getDims()[0], r = inputs[1]->getDims()[2],
    s = inputs[1]->getDims()[3];
    if (mode == PaddingMode::Same) {
        int oh = h / sh;
        int ow = w / sw;
        ph = (h - oh * sh + (r - sh) * dh) / 2;
        pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (mode == PaddingMode::Valid) {
        ph = pw = 0;
    }
}

} // namespace infini
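
As a sanity check of the explicit-padding branch of inferShape, take the dilation & stride case from TEST(Conv, ShapeInference) below (input {1, 3, 4, 4}, weight {2, 3, 3, 3}, ph = pw = 1, sh = 2, sw = 1, dh = 1, dw = 2):

    oh = (h - (r - sh) * dh + ph * 2) / sh = (4 - (3 - 2) * 1 + 2) / 2 = 2
    ow = (w - (s - sw) * dw + pw * 2) / sw = (4 - (3 - 1) * 2 + 2) / 1 = 2

which matches the expected output Shape{1, 2, 2, 2} in that test.
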
@@ -7,39 +7,39 @@ namespace infini {

TEST(Graph, build_and_run) {
    Graph g = make_ref<GraphObj>();
-   Tensor i0 = g->addTensor({1, 2, 3}, DataType::Int32);
-   Tensor w0 = g->addTensor({1, 3, 4}, DataType::Int32);
-   Tensor o0 = g->addTensor({1, 2, 4}, DataType::Int32);
+   Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
+   Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
+   Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
    g->dataMalloc();
-   i0->copyData(vector<VType>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data());
-   w0->copyData(vector<VType>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data());
+   i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+   w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
    RunEngine(Device::CPU).run(g);
    // check answer
-   auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::Int32);
+   auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
    ans->dataMalloc();
-   ans->copyData(vector<VType>{38, 44, 50, 56, 83, 98, 113, 128}.data());
+   ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(o0->equalData(ans));
}

TEST(Graph, perf_engine) {
    Graph g = make_ref<GraphObj>();
-   Tensor i0 = g->addTensor({1, 2, 3}, DataType::Int32);
-   Tensor w0 = g->addTensor({1, 3, 4}, DataType::Int32);
+   Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
+   Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
    auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);

    g->dataMalloc();
-   i0->copyData(vector<VType>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data());
-   w0->copyData(vector<VType>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data());
+   i0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+   w0->copyData({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    RunEngine(Device::CPU).run(g, true, true);
    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
    // The example matmul takes 0.0036ms with one core
    EXPECT_GT(perfTime, 0);
    EXPECT_LT(perfTime, 0.01);
    // check answer
-   auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::Int32);
+   auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32);
    ans->dataMalloc();
-   ans->copyData(vector<VType>{38, 44, 50, 56, 83, 98, 113, 128}.data());
+   ans->copyData({38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(matmul->getOutput()->equalData(ans));
}

@@ -9,9 +9,9 @@ TEST(Hash, OperatorHash) {
    OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
    { // build with addOpWithOutputs
        Graph g = make_ref<GraphObj>();
-       Tensor i0 = g->addTensor({1, 2, 3}, DataType::Int32);
-       Tensor w0 = g->addTensor({1, 3, 4}, DataType::Int32);
-       Tensor o0 = g->addTensor({1, 2, 4}, DataType::Int32);
+       Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
+       Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
+       Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
        auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
        key1 = matmul->getOpPerfKey();
        EXPECT_NE(key1.hash, 0);
@@ -19,8 +19,8 @@ TEST(Hash, OperatorHash) {
    }
    { // build with addOp
        Graph g = make_ref<GraphObj>();
-       Tensor i0 = g->addTensor({2, 2, 3}, DataType::Int32);
-       Tensor w0 = g->addTensor({2, 3, 4}, DataType::Int32);
+       Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
+       Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
        auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
        key2 = matmul->getOpPerfKey();
        EXPECT_NE(key2.hash, 0);
@@ -0,0 +1,63 @@
#include "core/graph.h"
#include "core/run_enigne.h"
#include "operators/conv.h"
#include "test.h"

namespace infini {

TEST(Conv, ShapeInference) {
    // Padding modes
    {
        Graph g = make_ref<GraphObj>();
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
    }
    {
        Graph g = make_ref<GraphObj>();
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv =
            g->addOp<ConvObj>(i0, w0, nullptr, ConvObj::PaddingMode::Same);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
    }
    {
        Graph g = make_ref<GraphObj>();
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv =
            g->addOp<ConvObj>(i0, w0, nullptr, ConvObj::PaddingMode::Valid);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2}));
    }
    { // dilation & stride
        Graph g = make_ref<GraphObj>();
        Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
        Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
        auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 2, 2}));
    }
}

TEST(Conv, NaiveCPU) {
    Graph g = make_ref<GraphObj>();
    Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::UInt32);
    Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
    auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);

    g->dataMalloc();
    i0->setData(IncrementalGenerator());
    w0->setData(IncrementalGenerator());
    RunEngine(Device::CPU).run(g, true, true);
    double perfTime = RunEngine(Device::CPU).getPerfTime(g);
    // The example matmul takes 0.0036ms with one core
    EXPECT_GT(perfTime, 0);
    EXPECT_LT(perfTime, 5);
    // check answer
    auto ans = make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32);
    ans->dataMalloc();
    ans->copyData({4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
    EXPECT_TRUE(conv->getOutput()->equalData(ans));
}

} // namespace infini