Cpu backend2 (#77)

fix review change Device::MKL to Device::INTELCPU fix mkl linkage fix errors according to merge from master now can call mkl backend fix softmax/flatten with axis from onnx. modify README.md fix memory refree add env_lotus_intelcpu.sh fix compile merge from branch cpu_backend fix something add gather fix something FIX: directory rename from "mkl" to "intelcpu" ADD: use oneMKL dpcpp interface to implement matmul kernel. ADD: add dpcpp as compiler for mkl, and fix warnings for clang compiling. add dpcpp kernel for pow. ADD: mkl kernel for pad. ADD: slice mkl kernel. ADD: reshape/flatten/identity mkl kernel. ADD: split mkl kernel. fix compile error FIX: fix flattenObj with axis. ADD reduce_mean mkl kernel. Add concat mkl kernel. batchNorm for mkl kernel. sigmoid mkl kernel. ADD: add mkl kernel for pooling add more tests for softmax Now softmax cuda kernel supports any axes. mkl kernel for softmax softmax add axis to softmax operator add mkl kernel for abs tanh ADD: relu kernel for mkl fix binary mkl primitives. add mkl kernel for binary operators fix compiler error move stream to runtime clang format add MemoryFormat for tensorObj. use post_ops for fused conv/deconv Distinguish mkl op_timer from cuda op timer. add act optype to conv and deconv add operator timer add mkl kernel for convTransposed minor fix for group conv do not use cblas_sgemm_batch CpuRuntimeObj->NativeCpuRuntimeObj add matmul op for mkl
2023-04-17 12:15:23 +08:00 · 2023-04-17 12:15:23 +08:00 · c8b2c8ed32
parent fe1afe38fa
commit c8b2c8ed32
72 changed files with 2209 additions and 218 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -5,7 +5,7 @@ project(InfiniTensor C CXX)
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
 option(USE_CUDA "Support CUDA GPU" OFF)
 option(USE_BANG "Support BANG MLU" OFF)
-option(USE_MKL "Support MKL" OFF)
+option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
 option(USE_PROTOBUF "Serialize and deserialize tensors" ON)
 option(BUILD_TEST "Build tests" ON)
@ -19,10 +19,6 @@ set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
-
 find_package(
  Python
  COMPONENTS Interpreter Development
@ -35,6 +31,20 @@ endif()
 if(OpenMP_CXX_FOUND)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif()
+
+
+# GoogleTest is pulled in here, before the later line that appends -Werror to
+# CMAKE_CXX_FLAGS. A subdirectory sees the variable values in effect when
+# add_subdirectory() is called, so googletest is configured with -g -Wall but
+# without -Werror (presumably so third-party warnings do not fail the build).
+if(BUILD_TEST)
+  # Force gmock and the gtest install rules off in the cache.
+  set(BUILD_GMOCK
+      OFF
+      CACHE BOOL "Do not build gmock" FORCE)
+  set(INSTALL_GTEST
+      OFF
+      CACHE BOOL "Do not install gtest" FORCE)
+  # NOTE(review): -g -Wall is appended again further down together with
+  # -Werror, so these two flags end up duplicated for the project's own targets.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall ")    
+  add_subdirectory(3rd-party/googletest)
+  # SYSTEM marks the gtest headers as system headers for the compiler.
+  include_directories(SYSTEM 3rd-party/googletest/googletest/include)
+endif()
+
 #Protobuf
 if(USE_PROTOBUF)
  add_definitions(-D TENSOR_PROTOBUF)
@ -47,14 +57,12 @@ if(USE_PROTOBUF)
  set(PROTO_PATH "${CMAKE_CURRENT_SOURCE_DIR}/proto")
  file(GLOB PROTO_FILES "${PROTO_PATH}/data.proto")
  protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES})
-  message(${PROTO_SRCS} "-----------" ${PROTO_FILES})
-  message(${PROTO_HDRS} "-----------" ${PROTO_FILES})
+  set_source_files_properties (${PROTO_SRCS} PROPERTIES COMPILE_FLAGS -Wno-unused-variable)
  add_library(tensor_proto SHARED ${PROTO_SRCS} ${PROTO_HDRS})
  target_link_libraries(tensor_proto PUBLIC ${PROTOBUF_LIBRARIES})
 endif()

 include_directories(include)
-
 # Pybind11
 add_subdirectory(3rd-party/pybind11)
 include_directories(3rd-party/pybind11/include)
@ -63,16 +71,9 @@ include_directories(3rd-party/pybind11/include)
 add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
 include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)

-if(BUILD_TEST)
-  set(BUILD_GMOCK
-      OFF
-      CACHE BOOL "Do not build gmock" FORCE)
-  set(INSTALL_GTEST
-      OFF
-      CACHE BOOL "Do not install gtest" FORCE)
-  add_subdirectory(3rd-party/googletest)
-  include_directories(3rd-party/googletest/googletest/include)
-endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion

 # Source files
 file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
@ -87,9 +88,9 @@ if(USE_BANG)
  list (APPEND SRC ${SRC_BANG})
 endif()

-if(USE_MKL)
-  file(GLOB_RECURSE SRC_MKL src/mkl/*.cc src/kernels/mkl/*.cc )
-  list (APPEND SRC ${SRC_MKL})
+if(USE_INTELCPU)
+  file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
+  list (APPEND SRC ${SRC_INTELCPU})
 endif()

 # Libraries
@ -113,19 +114,28 @@ if(USE_BACKTRACE)
  target_link_libraries(InfiniTensor dw)
 endif()

-if(USE_MKL)
+# Intel CPU backend: oneMKL (through the DPC++ compiler) plus oneDNN.
+if(USE_INTELCPU)
+  add_compile_definitions(USE_INTELCPU=1) 
  find_package(MKL CONFIG REQUIRED)
-  target_link_libraries(InfiniTensor  $<LINK_ONLY:MKL::MKL>)
+
+  # Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html
+  target_link_libraries(InfiniTensor sycl OpenCL)
+
  set(DNNL_CONFIGURATION "cpu_gomp")
  find_package(dnnl CONFIG REQUIRED)
+  # NOTE(review): find_package(... REQUIRED) already stops configuration when
+  # dnnl is missing, so the else() branch below looks unreachable -- confirm.
  if(dnnl_FOUND)          
-      add_compile_definitions(USE_MKL=1)
      include_directories(BEFORE ${dnnl_DIR}/../../../cpu_gomp/include/)
      link_directories(${dnnl_DIR}/../../../cpu_gomp/lib)   
      target_link_libraries(InfiniTensor  dnnl)       
  else()
-      message(FATAL_ERROR ”dnnl library not found”)
+      message(FATAL_ERROR "dnnl library not found")
  endif()
+  # Warnings that stay warnings (not errors) even though -Werror is enabled.
+  set(WNO_ERRORS "-Wno-error=unused-parameter -Wno-error=unused-function -Wno-error=unused-private-field -Wno-error=ignored-attributes -Wno-error=unused-const-variable -Wno-error=inconsistent-missing-override -Wno-error=unused-variable -Wno-error=tautological-constant-compare")
+  # -DMKL_ILP64 and -qmkl=parallel are added to the common flags and to the
+  # Release / RelWithDebInfo flag sets; only the common flags add -Werror here.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMKL_ILP64 -qmkl=parallel -Werror  ${WNO_ERRORS}")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # MKL and warning flags for Release builds
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # MKL and warning flags for RelWithDebInfo builds
+  
+  find_package(IntelDPCPP REQUIRED)
 endif()

 if(USE_CUDA)
@ -210,8 +220,8 @@ if(BUILD_TEST)
    if (USE_BANG)
      build_test(test/kernels/bang/*.cc)
    endif()
-    if (USE_MKL)
-      build_test(test/kernels/mkl/*.cc)
+    if (USE_INTELCPU)
+      build_test(test/kernels/intelcpu/*.cc)
    endif()
  endif()
  if(BUILD_TEST_PET)
--- a/7
+++ b/7
@ -2,6 +2,7 @@

 TYPE ?= release
 CUDA ?= off
+INTELCPU ?= off

 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) 

@ -9,9 +10,13 @@ ifeq ($(CUDA), ON)
 	CMAKE_OPT += -DUSE_CUDA=ON
 endif

+ifeq ($(INTELCPU), ON)
+	CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
+endif
+
 build:
 	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j22

 clean:
 	rm -rf build
--- a/README.md
+++ b/README.md
@ -1,12 +1,19 @@
 # InfiniTensor

 ## Compilation on Lotus
-
+## Compilation for cuda
 ``` bash
 # Enter the root of InfiniTensor
 source test/script/env_lotus.sh
 make CUDA=ON
 ```
+## Compilation for intelcpu
+``` bash
+# Enter the root of InfiniTensor
+source test/script/env_lotus.sh intelcpu
+mkdir build && cd build
+cmake -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp .. && make -j 12
+```

 ### Make Commands

--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -66,10 +66,10 @@ class GraphHandlerObj {
    Tensor relu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
    Tensor tanh(Tensor x, Tensor y);
-    Tensor softmax(Tensor x, Tensor y);
+    Tensor softmax(Tensor x, Tensor y, int axis);
    Tensor abs(Tensor x, Tensor y);
    Tensor identity(Tensor x, Tensor y);
-    Tensor flatten(Tensor s, Tensor y);
+    Tensor flatten(Tensor s, Tensor y, int axis);
    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
    Tensor concat(TensorVec inputs, Tensor output, int dim);
    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -28,7 +28,7 @@ using OpVec = vector<Operator>;

 using VType = uint32_t;

-enum class Device { CPU = 1, CUDA, BANG, MKL };
+enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
 /***************** Forward declaration end *****************/

 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
@ -53,7 +53,6 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                     bool profiling = false) const = 0;
    virtual void *alloc(size_t size) = 0;
    virtual void dealloc(void *ptr) = 0;
-    void prepareAndRun(Graph &graph, bool tune = false, bool profiling = false);
    /**
     * @brief Get the execution time of each operator in performance record. No
     * execution happens.
@ -65,7 +64,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
    double getPerfTime(const Graph &graph, bool profiling = false) const;
    Blob allocBlob(size_t size);
    bool isCpu() const {
-        return device == Device::CPU || device == Device::MKL;
+        return device == Device::CPU || device == Device::INTELCPU;
    }
    bool isCuda() const { return device == Device::CUDA; }
    bool isBang() const { return device == Device::BANG; }
--- a/include/cuda/softmax.h
+++ b/include/cuda/softmax.h
@ -0,0 +1,6 @@
+#pragma once
+
+namespace infini {
+// Host-side launcher for the CUDA softmax kernel; the definition lives in
+// src/kernels/cuda/softmax.cu, where the parameters are named
+// (max_threadblock_size, blockNum, in, out, dimSize, stride).
+//   max_threadblock_size  selects the thread-block size of the launch
+//                         (256, 128, 64 or 32 threads).
+//   batch_size            number of thread blocks launched; each block
+//                         handles one softmax vector.
+//   x, y                  input and output buffers (presumably device
+//                         pointers -- confirm against callers).
+//   dim                   number of elements along the softmax axis.
+//   stride                distance, in elements, between consecutive entries
+//                         along the softmax axis.
+void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
+                    float *y, int dim, int stride);
+}
--- a/include/intelcpu/mkl_kernel_without_config.h
+++ b/include/intelcpu/mkl_kernel_without_config.h
@ -0,0 +1,40 @@
+#pragma once
+#include "core/kernel.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+
+// Base class for Intel CPU kernels that have no tunable configuration.
+// Subclasses implement compute(op, context); this class adapts that to the
+// Kernel interface (compute with a PerfRecord, and tune).
+class MklKernelWithoutConfig : public Kernel {
+  public:
+    // Runs the kernel and then waits on the runtime's stream. The PerfRecord
+    // argument is ignored because there is nothing to configure.
+    // NOTE(review): the dynamic_cast result is dereferenced without a null
+    // check, so _context is assumed to be a MklRuntimeObj -- confirm.
+    virtual void compute(const Operator &op, const PerfRecord &record,
+                         const RuntimeObj *_context) const override {
+        compute(op, _context);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        context->sync();
+    }
+    // The actual kernel body, provided by each concrete kernel.
+    virtual void compute(const Operator &op,
+                         const RuntimeObj *context) const = 0;
+    // Premise: op is idempotent since it is called multiple times.
+    // Builds a PerfRecordObj from the value returned by timeit(), which is
+    // given the kernel invocation and a callback that syncs the stream.
+    virtual PerfRecord tune(const Operator &op,
+                            const RuntimeObj *_context) const override {
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
+                                              [&]() { context->sync(); }));
+    }
+
+  protected:
+    // Maps a tensor rank to the plain oneDNN format tag used for user memory:
+    // 2 -> nc, 3 -> ncw, 4 -> nchw, 5 -> ncdhw. Any other rank reaches
+    // IT_TODO_HALT().
+    dnnl::memory::format_tag getUserFormatTag(int nDim) const {
+        if (nDim == 2)
+            return dnnl::memory::format_tag::nc;
+        else if (nDim == 3)
+            return dnnl::memory::format_tag::ncw;
+        else if (nDim == 4)
+            return dnnl::memory::format_tag::nchw;
+        else if (nDim == 5)
+            return dnnl::memory::format_tag::ncdhw;
+        else
+            IT_TODO_HALT();
+    }
+};
+
+} // namespace infini
--- a/include/intelcpu/mkl_runtime.h
+++ b/include/intelcpu/mkl_runtime.h
@ -7,9 +7,9 @@
 #include <dnnl_debug.h>
 #include <mkl.h>
 namespace infini {
-// TODO move utility function to alone file
 class MklRuntimeObj : public CpuRuntimeObj {
    dnnl_engine_t engine;
+    dnnl_stream_t stream;

  public:
    MklRuntimeObj();
@ -26,8 +26,10 @@ class MklRuntimeObj : public CpuRuntimeObj {
                          sizeof(uint64_t), 64);
    };

-    string toString() const override { return "CPU MKL Runtime"; };
+    string toString() const override { return "INTELCPU Runtime"; };
    dnnl::engine getEngine() const { return dnnl::engine(engine, true); }
+    dnnl::stream getStream() const { return dnnl::stream(stream, true); }
+    void sync() const;
 };

 } // namespace infini
--- a/include/intelcpu/operator_timer.h
+++ b/include/intelcpu/operator_timer.h
--- a/include/nnet/routine.h
+++ b/include/nnet/routine.h
@ -22,6 +22,7 @@ class RoutineNode {

  public:
    RoutineNode(Expr _expr, const vector<Tensor> &_inputs);
+    // Virtual destructor: RoutineNode declares pure virtual functions and is
+    // meant to be subclassed.
+    virtual ~RoutineNode() {}
    virtual string toReadable() const = 0;
    const Expr &getExpr() const { return expr; }
    const vector<Tensor> &getInputs() const { return inputs; }
--- a/include/operators/reshape.h
+++ b/include/operators/reshape.h
@ -42,6 +42,7 @@ class ReshapeObj : public OperatorObj {
 *
 */
 class FlattenObj : public OperatorObj {
+    int axis;

  public:
    /**
@ -51,7 +52,7 @@ class FlattenObj : public OperatorObj {
     * @param input The input tensor.
     * @param output The output one-dimensional tensor.
     */
-    FlattenObj(GraphObj *graph, Tensor input, Tensor output);
+    FlattenObj(GraphObj *graph, Tensor input, Tensor output, int axis);
    OP_CLONE(FlattenObj);

    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
--- a/include/operators/resize.h
+++ b/include/operators/resize.h
@ -75,6 +75,9 @@ class ResizeObj : public OperatorObj {
        IT_ASSERT((size_t)i < scales.size());
        return scales.at(i);
    }
+
+    // Returns a copy of the whole scales vector.
+    vector<float> getScales() const { return scales; }
+
    float getRoi(int i) const {
        if (coMode == ECoordinateTransMode::tfCropAndResize) {
            IT_ASSERT(size_t(i) < roi.size());
--- a/include/operators/softmax.h
+++ b/include/operators/softmax.h
@ -0,0 +1,27 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+// Softmax operator with an explicit axis: one input, one output.
+class SoftmaxObj : public OperatorObj {
+    // Axis along which softmax is computed.
+    int axis;
+
+  public:
+    // Constructs a softmax over `axis` of `input`, writing to `output`.
+    // The definition is not in this header.
+    SoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int axis);
+
+    OP_CLONE(SoftmaxObj);
+
+    // The output shape is the shape of the first input.
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
+        return {{inputs[0]->getDims()}};
+    };
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+    // Returns the stored axis.
+    int getAxis() const { return axis; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
--- a/include/operators/unary.h
+++ b/include/operators/unary.h
@ -39,6 +39,7 @@ class UnaryObj : public OperatorObj {
 DEFINE_UNARY_OBJ(Relu, OpType::Relu)
 DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
 DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
-DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
+// DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
 DEFINE_UNARY_OBJ(Abs, OpType::Abs)
+
 }; // namespace infini
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@ -25,12 +25,7 @@ from onnx.shape_inference import infer_shapes
 from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
 from functools import reduce

-cpu_runtime = backend.cpu_runtime()
-
-
-def cuda_runtime():
-    return backend.cuda_runtime()
-
+runtime = backend.runtime()

 class OnnxStub:
    inputs: Dict[str, backend.Tensor] = {}
@ -253,6 +248,7 @@ class OnnxStub:
                tensors[node.output[0]] = self.handler.softmax(
                    tensors[node.input[0]],
                    tensors.get(node.output[0]),
+                    next((attr.i for attr in node.attribute if attr.name == "axis")),
                )
            elif node.op_type == "Abs":
                tensors[node.output[0]] = self.handler.abs(
@ -265,14 +261,11 @@ class OnnxStub:
                    tensors.get(node.output[0]),
                )
            elif node.op_type == "Flatten":
-                # FIXME axis must be 1
-                axis = next(
-                    (attr.i for attr in node.attribute if attr.name == "axis"), None
-                )
-                assert axis == None or axis == 1
+                
                tensors[node.output[0]] = self.handler.flatten(
                    tensors[node.input[0]],
                    tensors.get(node.output[0]),
+                    next((attr.i for attr in node.attribute if attr.name == "axis")),
                )
            elif node.op_type == "Reshape":
                input_shape = next(
@ -583,6 +576,9 @@ def from_onnx(model: ModelProto, runtime):
    stub = OnnxStub(model, runtime)
    return stub.inputs, stub.outputs, stub.handler

+def run_onnx(model: ModelProto, runtime) -> None:
+    """Build an OnnxStub for `model` on `runtime` and call its `run()` method.
+
+    Nothing is returned; the stub is discarded afterwards.
+    """
+    stub = OnnxStub(model, runtime)
+    stub.run()

 def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
    for attr in node.attribute:
--- a/pyinfinitensor/tests/test_onnx.py
+++ b/pyinfinitensor/tests/test_onnx.py
@ -8,16 +8,28 @@ from onnx.helper import (
    make_tensor_value_info,
 )
 from onnx.checker import check_model
-from pyinfinitensor.onnx import from_onnx, backend, cpu_runtime
+from pyinfinitensor.onnx import from_onnx, backend, runtime, run_onnx


 def make_and_import_model(graph: onnx.GraphProto):
    model = make_model(graph)
    check_model(model)
-    from_onnx(model, cpu_runtime)
+    from_onnx(model, runtime)


 class TestStringMethods(unittest.TestCase):
+    #def test_run(self):
+    #    model_file = next(
+    #        (name for name in os.listdir() if name.endswith(".onnx")), None
+    #    )
+    #    if model_file != None:
+    #        print(
+    #            "model: {file}({size:.2f} MiB)".format(
+    #                file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
+    #            )
+    #        )
+    #        run_onnx(onnx.load(model_file), runtime)
+
    def test_load(self):
        model_file = next(
            (name for name in os.listdir() if name.endswith(".onnx")), None
@ -28,7 +40,7 @@ class TestStringMethods(unittest.TestCase):
                    file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
                )
            )
-            from_onnx(onnx.load(model_file), cpu_runtime)
+            from_onnx(onnx.load(model_file), runtime)

    def test_tensor(self):
        x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3])
@ -177,7 +189,7 @@ class TestStringMethods(unittest.TestCase):
    def test_softmax(self):
        x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
        y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
-        softmax = make_node("Softmax", ["x"], ["y"], name="softmax")
+        softmax = make_node("Softmax", ["x"], ["y"], axis=2, name="softmax")
        make_and_import_model(make_graph([softmax], "softmax", [x], [y]))

    def test_abs(self):
@ -194,9 +206,8 @@ class TestStringMethods(unittest.TestCase):

    def test_flatten(self):
        x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
-        y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 1 * 3 * 5 * 7])
-        flatten = make_node("Flatten", ["x"], ["y"], name="flatten")
-        # FIXME 后端要求产生 Π(dims) 长的一维张量，onnx 产生 1×Π(dims) 的二维张量
+        y = make_tensor_value_info("y", TensorProto.FLOAT, [1*3,  5 * 7])
+        flatten = make_node("Flatten", ["x"], ["y"], axis=2, name="flatten")
        # make_and_import_model(
        make_graph([flatten], "flatten", [x], [y])
        # )
@ -289,10 +300,10 @@ class TestStringMethods(unittest.TestCase):
        graph = make_graph([matmul, add], "lr", [x, a, b], [y])
        model = make_model(graph)
        check_model(model)
-        from_onnx(model, cpu_runtime)
+        from_onnx(model, runtime)

    def test_frontend(self):
-        handler = backend.GraphHandler(cpu_runtime)
+        handler = backend.GraphHandler(runtime)
        a = handler.tensor([1, 2, 3], 12)
        b = handler.tensor([1, 2, 3], 12)
        c = handler.tensor([1, 2, 3], 12)
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@ -10,6 +10,7 @@
 #include "operators/reduce_mean.h"
 #include "operators/reshape.h"
 #include "operators/slice.h"
+#include "operators/softmax.h"
 #include "operators/unary.h"

 namespace infini {
@ -126,11 +127,29 @@ DEFINE_ELEMENT_WISE_METHOD(pow, Pow)
 DEFINE_UNARY_METHOD(relu, Relu)
 DEFINE_UNARY_METHOD(sigmoid, Sigmoid)
 DEFINE_UNARY_METHOD(tanh, Tanh)
-DEFINE_UNARY_METHOD(softmax, Softmax)
 DEFINE_UNARY_METHOD(abs, Abs)
 // see operators/reshape.h
 DEFINE_UNARY_METHOD(identity, Identity)
-DEFINE_UNARY_METHOD(flatten, Flatten)
+
+// Adds a Softmax operator over `axis` to the graph and returns its output
+// tensor. When `output` is null, the output of the newly added operator is
+// returned; otherwise the supplied tensor is used and returned.
+Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
+    // No output tensor supplied: take the one owned by the new operator.
+    if (!output)
+        return g->addOp<SoftmaxObj>(std::move(input), output, axis)
+            ->getOutput();
+
+    // Output tensor supplied by the caller: attach it and hand it back.
+    g->addOpWithOutputs<SoftmaxObj>(std::move(input), output, axis);
+    return output;
+}
+
+// Adds a Flatten operator with the given `axis` to the graph and returns its
+// output tensor. When `output` is null, the output of the newly added
+// operator is returned; otherwise the supplied tensor is used and returned.
+Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
+    // No output tensor supplied: take the one owned by the new operator.
+    if (!output)
+        return g->addOp<FlattenObj>(std::move(input), output, axis)
+            ->getOutput();
+
+    // Output tensor supplied by the caller: attach it and hand it back.
+    g->addOpWithOutputs<FlattenObj>(std::move(input), output, axis);
+    return output;
+}

 Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
    if (reshaped) {
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@ -6,10 +6,6 @@
 #include <chrono>
 #include <cstring>
 namespace infini {
-void RuntimeObj::prepareAndRun(Graph &graph, bool tune, bool profiling) {
-    run(graph, tune, profiling);
-}
-
 void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
    if (!tune && profiling)
        IT_TODO_HALT();
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@ -9,7 +9,7 @@
 namespace infini {

 TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
-    : TensorBaseObj(shape.size(), dtype, runtime), shape(std::move(shape_)),
+    : TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
      _size(shape.empty()
                ? 0
                : std::accumulate(shape.begin(), shape.end(), 1,
--- a/src/cuda/cuda_utility.cu
+++ b/src/cuda/cuda_utility.cu
@ -5,7 +5,7 @@ __global__ void cudaPrintFloatImpl(float *x, int len) {
    int start = threadIdx.x + blockDim.x * blockIdx.x;
    if (start == 0) {
        for (int i = 0; i < len; ++i) {
-            printf("%.3f ", x[i]);
+            printf("%.7f ", x[i]);
        }
        printf("\n");
    }
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@ -12,8 +12,9 @@
 #include "cuda/cuda_runtime.h"
 #include "cuda/operator_timer.h"
 #endif
-#ifdef USE_MKL
-#include "mkl/operator_timer.h"
+#ifdef USE_INTELCPU
+#include "intelcpu/mkl_runtime.h"
+#include "intelcpu/operator_timer.h"
 #endif
 namespace py = pybind11;

@ -30,7 +31,7 @@ void register_operator_timer(py::module &m) {
    m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
 #endif

-#ifdef USE_MKL
+#ifdef USE_INTELCPU
    using namespace opTimer;
    m.def("getPerfConvMkl", &getPerfConvMkl);
    m.def("getPerfConvTransposed2dMkl", &getPerfConvTransposed2dMkl);
@ -111,6 +112,10 @@ static int tensor_dtype(Tensor t) {
 static Ref<CudaRuntimeObj> cuda_runtime() { return make_ref<CudaRuntimeObj>(); }
 #endif

+#ifdef USE_INTELCPU
+// Factory exposed to Python: returns a MklRuntimeObj as a generic runtime
+// reference.
+static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
+#endif
+
 static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
    IT_ASSERT(op->getOpType() == OpType::Conv);
    auto conv = dynamic_cast<const ConvObj *>(op.get());
@ -158,10 +163,14 @@ static Shape reshape_shape_of(Operator op) {

 void export_functions(py::module &m) {
 #define FUNCTION(NAME) def(#NAME, &NAME)
-    m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
 #ifdef USE_CUDA
-        .FUNCTION(cuda_runtime)
+    m.def("runtime", cuda_runtime)
+#elif USE_INTELCPU
+    m.def("runtime", intelcpu_runtime)
+#else
+    m.def("runtime", &NativeCpuRuntimeObj::getInstance)
 #endif
+
        .FUNCTION(conv_attrs_of)
        .FUNCTION(batch_norm_attrs_of)
        .FUNCTION(pool_attrs_of)
--- a/src/intelcpu/mkl_runtime.cc
+++ b/src/intelcpu/mkl_runtime.cc
@ -0,0 +1,19 @@
+#include "intelcpu/mkl_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+namespace infini {
+// Creates the oneDNN CPU engine (index 0) and a stream on it with the default
+// stream flags.
+// NOTE(review): the status codes returned by dnnl_engine_create and
+// dnnl_stream_create are not checked.
+MklRuntimeObj::MklRuntimeObj() : CpuRuntimeObj(Device::INTELCPU) {
+    dnnl_engine_create(&engine, dnnl_engine_kind_t::dnnl_cpu, 0);
+    dnnl_stream_create(
+        &stream, engine,
+        static_cast<dnnl_stream_flags_t>(dnnl_stream_default_flags));
+}
+
+// Releases MKL's internal buffers, then destroys the stream before the engine
+// it was created on.
+MklRuntimeObj::~MklRuntimeObj() {
+    mkl_free_buffers();
+    dnnl_stream_destroy(stream);
+    dnnl_engine_destroy(engine);
+}
+
+// Waits on the runtime's oneDNN stream.
+void MklRuntimeObj::sync() const { getStream().wait(); }
+} // namespace infini
--- a/src/intelcpu/operator_timer.cc
+++ b/src/intelcpu/operator_timer.cc
@ -1,7 +1,7 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
 #include "utils/data_generator.h"
--- a/src/kernels/cpu/conv.cc
+++ b/src/kernels/cpu/conv.cc
@ -10,8 +10,13 @@ template <typename T> class NaiveConv : public CpuKernelWithoutConfig {
        T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
        T *optr = op->getOutput()->getRawDataPtr<T *>();
-        auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
-        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        //  Clang will give an error of " reference to local binding 'sh'
+        //  declared in enclosing function" if we write like this:
+        //        auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
+        int n, c, h, w, f, r, s;
+        std::tie(n, c, h, w, f, r, s) = op->getNCHWFRS();
+        int ph, pw, sh, sw, dh, dw;
+        std::tie(ph, pw, sh, sw, dh, dw) = op->getPadStrideDilation();
        int cpg = op->getChannelPerGroup();
        int g = op->getNumGroups();
        IT_ASSERT(f % g == 0, "Illegal number of channel");
@ -23,7 +28,7 @@ template <typename T> class NaiveConv : public CpuKernelWithoutConfig {
                for (int hh = 0; hh < oh; hh++)
                    for (int ww = 0; ww < ow; ww++) {
                        int gidx = ff / (f / g);
-                        VType val = 0;
+                        T val = 0;
                        for (int cc = 0; cc < cpg; cc++)
                            for (int rr = 0; rr < r; rr++)
                                for (int ss = 0; ss < s; ss++) {
--- a/src/kernels/cpu/membound.cc
+++ b/src/kernels/cpu/membound.cc
@ -30,8 +30,8 @@ class MemboundInterpreter : public Kernel {
        // }

        nnet::RangeOp range = nnet::as<nnet::RangeOpNode>(op->getNnetExpr());
-        const auto &rangeShape = range->getOutputShape();
-        const auto &outputShape = output->getDims();
+        // const auto &rangeShape = range->getOutputShape();
+        // const auto &outputShape = output->getDims();
        // rangeShape and outputShape may extra dims of length 1.
        // But their sizes should be the same.
        IT_ASSERT((ssize_t)range->getOutputSize() == (ssize_t)output->size());
--- a/src/kernels/cuda/resize.cu
+++ b/src/kernels/cuda/resize.cu
@ -213,7 +213,7 @@ void resize_kernel_nearest(float *in, float *out, const MetaData &metaData,
                                   sizeof(p_cooridnate_trans_mode_func[0]));
    IT_ASSERT(nearestMode <
              sizeof(p_nearest_mode_fun) / sizeof(p_nearest_mode_fun[0]));
-    _resize_kernel_nearest<<<blocksize, gridsize>>>(
+    _resize_kernel_nearest<<<gridsize, blocksize>>>(
        in, out, metaData, num, coordinateMode, nearestMode);
 }

@ -223,7 +223,7 @@ void resize_kernel_linear(float *in, float *out, const MetaData &metaData,
    auto gridsize = (num + blocksize - 1) / blocksize;
    IT_ASSERT(coordinateMode < sizeof(p_cooridnate_trans_mode_func) /
                                   sizeof(p_cooridnate_trans_mode_func[0]));
-    _resize_kernel_linear_coeff<<<blocksize, gridsize>>>(in, out, metaData, num,
+    _resize_kernel_linear_coeff<<<gridsize, blocksize>>>(in, out, metaData, num,
                                                         coordinateMode);
 }

@ -233,7 +233,7 @@ void resize_kernel_cubic(float *in, float *out, const MetaData &metaData,
    auto gridsize = (num + blocksize - 1) / blocksize;
    IT_ASSERT(coordinateMode < sizeof(p_cooridnate_trans_mode_func) /
                                   sizeof(p_cooridnate_trans_mode_func[0]));
-    _resize_kernel_cubic_coeff<<<blocksize, gridsize>>>(in, out, metaData, num,
+    _resize_kernel_cubic_coeff<<<gridsize, blocksize>>>(in, out, metaData, num,
                                                        coordinateMode);
 }
 } // namespace infini
--- a/src/kernels/cuda/softmax.cc
+++ b/src/kernels/cuda/softmax.cc
@ -0,0 +1,30 @@
+#include "operators/softmax.h"
+#include "cuda/cuda_kernel_wihtout_config.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/softmax.h"
+
+namespace infini {
+// CUDA softmax kernel wrapper. Despite the "Cudnn" in its name, it calls the
+// hand-written softmax_kernel() rather than cuDNN.
+class SoftmaxCudnn : public CudaKernelWithoutConfig {
+
+    // Computes softmax of input 0 along op->getAxis() into output 0.
+    // _context is not used here.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SoftmaxObj>(_op);
+        auto x = op->getInputs(0)->getRawDataPtr<float *>();
+        auto y = op->getOutput(0)->getRawDataPtr<float *>();
+        auto dims = op->getInputs(0)->getDims();
+
+        // Despite its name, batch_size is the total number of elements
+        // (product of all dimensions).
+        int batch_size = 1;
+        for (size_t i = 0; i < dims.size(); ++i)
+            batch_size *= dims[i];
+        // Extent of the softmax axis.
+        // NOTE(review): indexes dims directly, so the axis is assumed to be
+        // non-negative and in range here -- confirm.
+        int dim = dims[op->getAxis()];
+
+        // One thread block per softmax vector. batch_size / block_num equals
+        // dim, so the thread-block size is chosen from the axis length.
+        // NOTE(review): a zero-sized dimension makes one of these divisions
+        // divide by zero.
+        int block_num = batch_size / dim;
+        int max_threadblock_size = batch_size / block_num;
+        softmax_kernel(max_threadblock_size, block_num, x, y, dim,
+                       op->getInputs(0)->getStride().at(op->getAxis()));
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCudnn,
+                "Softmax_CUDA_Float32");
+} // namespace infini
--- a/src/kernels/cuda/softmax.cu
+++ b/src/kernels/cuda/softmax.cu
@ -0,0 +1,77 @@
+#include "cuda/cuda_common.h"
+#include "cuda/softmax.h"
+#include <cub/cub.cuh>
+
+// Partial softmax state combined by reduce_md_op:
+//   data - largest value seen so far,
+//   d    - sum of exp(value - data) over the values seen so far.
+struct __align__(8) MD {
+    float data;
+    float d;
+};
+
+// Merges two partial softmax states. The result keeps the larger maximum and
+// rescales the other state's sum by exp(smaller_max - larger_max) before
+// adding it, so both sums are expressed relative to the same maximum.
+__device__ __forceinline__ MD reduce_md_op(MD a, MD b) {
+    bool a_bigger = (a.data > b.data);
+    MD bigger_m = a_bigger ? a : b;
+    MD smaller_m = a_bigger ? b : a;
+    MD res;
+    res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.data - bigger_m.data);
+    res.data = bigger_m.data;
+    return res;
+}
+
+// Softmax over one strided vector per thread block.
+//   in, out  - input and output buffers with the same layout
+//   dimSize  - number of elements in each softmax vector
+//   stride   - distance, in elements, between consecutive vector entries
+// The maximum and the sum of exponentials are accumulated together in one
+// pass (MD / reduce_md_op); a second pass writes the normalized values.
+template <int THREADBLOCK_SIZE>
+__launch_bounds__(THREADBLOCK_SIZE) __global__
+    void online_softmax(const float *__restrict in, float *__restrict out,
+                        int dimSize, int stride) {
+
+    // reposition in and out to data for the current vector
+    // For block index b the offset is (b % stride) + (b - b % stride) * dimSize;
+    // for b < stride this is simply b.
+    // NOTE(review): assumes a contiguous layout in which `stride` is the
+    // number of elements spanned by the dimensions after the axis -- confirm.
+    int blockOffset = blockIdx.x;
+    if (blockIdx.x >= stride) {
+        int tmp = blockIdx.x % stride;
+        blockOffset = tmp + (blockIdx.x - tmp) * dimSize;
+    }
+    in += blockOffset;
+    out += blockOffset;
+
+    // Identity element for reduce_md_op: maximum -FLT_MAX, empty sum.
+    MD md_partial;
+    md_partial.data = -FLT_MAX;
+    md_partial.d = 0.0F;
+
+    // Each thread folds entries threadIdx.x, threadIdx.x + THREADBLOCK_SIZE,
+    // ... of the vector into its private partial state.
+    for (int elem_id = threadIdx.x; elem_id < dimSize;
+         elem_id += THREADBLOCK_SIZE) {
+        MD new_elem;
+        new_elem.data = in[elem_id * stride];
+        new_elem.d = 1.0F;
+        md_partial = reduce_md_op(md_partial, new_elem);
+    }
+
+    // Block-wide reduction over all THREADBLOCK_SIZE threads.
+    // Threads with threadIdx.x >= dimSize read no input and contribute only
+    // the identity element set above.
+    typedef cub::BlockReduce<MD, THREADBLOCK_SIZE> BlockReduce;
+
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    __shared__ MD md_total;
+
+    // Thread 0 publishes the reduced state through shared memory.
+    MD md = BlockReduce(temp_storage).Reduce(md_partial, reduce_md_op);
+    if (threadIdx.x == 0)
+        md_total = md;
+    __syncthreads();
+
+    // Second pass: out = exp(in - max) / sum.
+    float d_total_inverse = __fdividef(1.0F, md_total.d);
+    for (int elem_id = threadIdx.x; elem_id < dimSize;
+         elem_id += THREADBLOCK_SIZE)
+        out[elem_id * stride] =
+            __expf(in[elem_id * stride] - md_total.data) * d_total_inverse;
+}
+
+namespace infini {
+// Launches online_softmax with blockNum thread blocks, one per softmax
+// vector. The thread-block size is 256, 128, 64 or 32, chosen from
+// max_threadblock_size.
+// NOTE(review): the first threshold is 255 rather than 256, and the launch
+// result is not checked here -- confirm both are intended.
+void softmax_kernel(int max_threadblock_size, int blockNum, float *in,
+                    float *out, int dimSize, int stride) {
+    if (max_threadblock_size >= 255)
+        online_softmax<256><<<blockNum, 256>>>(in, out, dimSize, stride);
+    else if (max_threadblock_size >= 128)
+        online_softmax<128><<<blockNum, 128>>>(in, out, dimSize, stride);
+    else if (max_threadblock_size >= 64)
+        online_softmax<64><<<blockNum, 64>>>(in, out, dimSize, stride);
+    else
+        online_softmax<32><<<blockNum, 32>>>(in, out, dimSize, stride);
+}
+} // namespace infini
--- a/src/kernels/cuda/unary.cc
+++ b/src/kernels/cuda/unary.cc
@ -60,48 +60,6 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
    }
 };

-class SoftmaxCudnn : public CudaKernelWithoutConfig {
-    virtual cudnnSoftmaxAlgorithm_t getAlgorithmType() const = 0;
-    virtual cudnnSoftmaxMode_t getModeType() const = 0;
-    virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
-    void compute(const Operator &_op,
-                 const RuntimeObj *_context) const override {
-        auto op = as<UnaryObj>(_op);
-        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
-
-        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
-        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-
-        cudnnTensorDescriptor_t inputDesc, outputDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
-        int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-
-        // get inputs
-        checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
-        checkCudnnError(cudnnSetTensor4dDescriptor(
-            inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
-
-        // get outputs
-        checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
-        checkCudnnError(cudnnSetTensor4dDescriptor(
-            outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
-
-        auto [alpha, beta] = getAlphBeta();
-        cudnnStatus_t stat = cudnnSoftmaxForward(
-            context->cudnnHandle(), getAlgorithmType(), getModeType(), &alpha,
-            inputDesc, inputData, &beta, outputDesc, outputData);
-        if (stat != CUDNN_STATUS_SUCCESS)
-            return;
-
-        // Destories in CUDA does not require sync. But cuDNN does not state
-        // whether sync is required before destories.
-        checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
-        checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));
-    }
-};
-
 class ReluCudnn : public ActivationCudnn {
    cudnnActivationMode_t getOpType() const override {
        return CUDNN_ACTIVATION_RELU;
@ -120,17 +78,6 @@ class TanhCudnn : public ActivationCudnn {
    }
 };

-class NormalSoftmaxCudnn : public SoftmaxCudnn {
-    cudnnSoftmaxAlgorithm_t getAlgorithmType() const override {
-        return CUDNN_SOFTMAX_ACCURATE;
-    }
-    cudnnSoftmaxMode_t getModeType() const override {
-        return CUDNN_SOFTMAX_MODE_INSTANCE;
-    }
-};
-
-REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32,
-                NormalSoftmaxCudnn, "Softmax_CUDA_Float32");
 REGISTER_KERNEL(Device::CUDA, OpType::Relu, DataType::Float32, ReluCudnn,
                "Relu_CUDA_Float32");
 REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, DataType::Float32, SigmoidCudnn,
--- a/src/kernels/intelcpu/batch_norm.cc
+++ b/src/kernels/intelcpu/batch_norm.cc
@ -0,0 +1,68 @@
+#include "operators/batch_norm.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Inference-mode batch normalization implemented with the oneDNN
+// batch_normalization_forward primitive.
+class MklBatchNorm : public MklKernelWithoutConfig {
+    // Normalizes input 0 into the operator output using the statistics and
+    // affine parameters supplied as inputs 1..4 (mean, variance, scale, shift
+    // in that order, as bound to the primitive arguments below).
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<BatchNormObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        float *const srcData = op->getInputs(0)->getRawDataPtr<float *>();
+        float *const dstData = op->getOutput()->getRawDataPtr<float *>();
+
+        // The per-channel descriptor below writes index 1 of the shape, so the
+        // input must have at least two dimensions.
+        const auto inDims = op->getInputs(0)->getDims();
+        IT_ASSERT(inDims.size() >= 2);
+
+        //  create user memory that describes data layout in the buffers
+        std::vector<dnnl_dim_t> dims;
+        for (size_t i = 0; i < inDims.size(); ++i)
+            dims.push_back(inDims[i]);
+
+        auto srcMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(dims.size()));
+        auto srcMemory = dnnl::memory(srcMd, context->getEngine(), srcData);
+
+        auto dstMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(dims.size()));
+        auto output = dnnl::memory(dstMd, context->getEngine(), dstData);
+
+        // Statistics/affine tensors share one descriptor: same rank as the
+        // input, all extents 1 except dimension 1 which matches the input.
+        std::vector<dnnl_dim_t> meanDims(dims.size(), 1);
+        meanDims[1] = dims[1];
+        auto meanMd = dnnl::memory::desc(meanDims, dnnl::memory::data_type::f32,
+                                         getUserFormatTag(meanDims.size()));
+
+        auto meanMemory =
+            dnnl::memory(meanMd, context->getEngine(),
+                         op->getInputs(1)->getRawDataPtr<float *>());
+        auto varMemory =
+            dnnl::memory(meanMd, context->getEngine(),
+                         op->getInputs(2)->getRawDataPtr<float *>());
+        auto scaleMemory =
+            dnnl::memory(meanMd, context->getEngine(),
+                         op->getInputs(3)->getRawDataPtr<float *>());
+        auto biasMemory =
+            dnnl::memory(meanMd, context->getEngine(),
+                         op->getInputs(4)->getRawDataPtr<float *>());
+        using op_desc_t = dnnl::batch_normalization_forward::desc;
+        using pd_t = dnnl::batch_normalization_forward::primitive_desc;
+
+        // use_global_stats stands for use mean and var as inputs
+        auto opDesc =
+            op_desc_t(dnnl::prop_kind::forward_inference, srcMd, op->getEps(),
+                      dnnl::normalization_flags::use_global_stats |
+                          dnnl::normalization_flags::use_shift |
+                          dnnl::normalization_flags::use_scale);
+        auto primDesc = pd_t(opDesc, context->getEngine());
+
+        // create and execute primitive
+        dnnl::batch_normalization_forward(primDesc).execute(
+            context->getStream(), {{DNNL_ARG_SRC, srcMemory},
+                                   {DNNL_ARG_DST, output},
+                                   {DNNL_ARG_MEAN, meanMemory},
+                                   {DNNL_ARG_VARIANCE, varMemory},
+                                   {DNNL_ARG_SCALE, scaleMemory},
+                                   {DNNL_ARG_SHIFT, biasMemory}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::BatchNorm, DataType::Float32,
+                MklBatchNorm, "BatchNorm_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/concat.cc
+++ b/src/kernels/intelcpu/concat.cc
@ -0,0 +1,58 @@
+#include "operators/concat.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Concatenates all operator inputs along op->getDim() with the oneDNN concat
+// primitive.
+class MklConcat : public MklKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ConcatObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        // Copies a tensor shape into the dim vector type used by oneDNN.
+        auto toDnnlDims = [](const auto &shape) {
+            std::vector<dnnl_dim_t> converted;
+            const int rank = shape.size();
+            for (int axis = 0; axis < rank; ++axis)
+                converted.push_back(shape.at(axis));
+            return converted;
+        };
+
+        // One descriptor and one memory object per input, wrapping the
+        // input's own buffer.
+        std::vector<dnnl::memory::desc> inputDescs;
+        std::vector<dnnl::memory> inputMems;
+        for (size_t i = 0; i < op->getInputs().size(); i++) {
+            const auto inShape = toDnnlDims(op->getInputs(i)->getDims());
+            auto inDesc =
+                dnnl::memory::desc(inShape, dnnl::memory::data_type::f32,
+                                   getUserFormatTag(inShape.size()));
+            inputDescs.push_back(inDesc);
+            inputMems.push_back(
+                dnnl::memory(inDesc, context->getEngine(),
+                             op->getInputs(i)->getRawDataPtr<float *>()));
+        }
+
+        // Destination descriptor built from the output tensor shape.
+        const auto outShape = toDnnlDims(op->getOutput(0)->getDims());
+        auto dstMd = dnnl::memory::desc(outShape, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(outShape.size()));
+        auto primDesc =
+            dnnl::concat::primitive_desc(dstMd, static_cast<int>(op->getDim()),
+                                         inputDescs, context->getEngine());
+
+        float *const dstData = op->getOutput()->getRawDataPtr<float *>();
+        auto output = dnnl::memory(dstMd, context->getEngine(), dstData);
+
+        // Bind the destination plus every source (DNNL_ARG_MULTIPLE_SRC + i)
+        // and run the primitive on the runtime's stream.
+        std::unordered_map<int, dnnl::memory> args;
+        args.insert({DNNL_ARG_DST, output});
+        int srcSlot = DNNL_ARG_MULTIPLE_SRC;
+        for (const auto &mem : inputMems)
+            args.insert({srcSlot++, mem});
+        dnnl::concat(primDesc).execute(context->getStream(), args);
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Concat, DataType::Float32, MklConcat,
+                "Concat_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/conv.cc
+++ b/src/kernels/intelcpu/conv.cc
@ -1,6 +1,6 @@
 #include "operators/conv.h"
 #include "core/kernel.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"

 namespace infini {
 struct ConvMklPerfRecordObj : public PerfRecordObj {
@ -167,20 +167,19 @@ class MklConv : public Kernel {
    }

    void compute(const Operator &_op, const PerfRecord &_record,
-                 const RuntimeObj *_context) const {
+                 const RuntimeObj *_context) const override {
        auto op = as<ConvObj>(_op);
        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
        auto record = as<ConvMklPerfRecordObj>(_record);

-        dnnl::stream stream(context->getEngine());
        std::vector<dnnl::primitive> prims;
        std::vector<std::unordered_map<int, dnnl::memory>> primArgs;
        IT_ASSERT(createPrimitives(op, record, context, true, prims, primArgs));

        IT_ASSERT(prims.size() == primArgs.size());
        for (size_t i = 0; i < prims.size(); ++i)
-            prims.at(i).execute(stream, primArgs.at(i));
-        stream.wait();
+            prims.at(i).execute(context->getStream(), primArgs.at(i));
+        context->getStream().wait();
    }

    void compute(const Operator &op, const RuntimeObj *context) const override {
@ -209,17 +208,19 @@ class MklConv : public Kernel {
                continue;

            IT_ASSERT(prims.size() == primArgs.size());
-            dnnl::stream stream(context->getEngine());
+            // does context->getStream() need to be attached to runtime, and
+            // delete after each use?
            for (size_t i = 0; i < prims.size(); ++i)
-                prims.at(i).execute(stream, primArgs.at(i));
-            stream.wait();
+                prims.at(i).execute(context->getStream(), primArgs.at(i));
+            context->getStream().wait();

            record.time = timeit(
                [&]() {
                    for (size_t i = 0; i < prims.size(); ++i)
-                        prims.at(i).execute(stream, primArgs.at(i));
+                        prims.at(i).execute(context->getStream(),
+                                            primArgs.at(i));
                },
-                [&]() { stream.wait(); });
+                [&]() { context->getStream().wait(); });

            // Update the tune result
            if (ret.time > record.time)
@ -232,6 +233,6 @@ class MklConv : public Kernel {
        return make_ref<ConvMklPerfRecordObj>(ret);
    }
 };
-REGISTER_KERNEL(Device::MKL, OpType::Conv, DataType::Float32, MklConv,
+REGISTER_KERNEL(Device::INTELCPU, OpType::Conv, DataType::Float32, MklConv,
                "MklConv_CPU_float32");
 } // namespace infini
--- a/src/kernels/intelcpu/conv_transposed.cc
+++ b/src/kernels/intelcpu/conv_transposed.cc
@ -1,5 +1,5 @@
 #include "core/kernel.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"
 #include "operators/conv.h"

 namespace infini {
@ -244,7 +244,7 @@ class MklConvTranspose : public Kernel {
        return make_ref<ConvTransposeMklPerfRecordObj>(ret);
    }
 };
-REGISTER_KERNEL(Device::MKL, OpType::ConvTrans, DataType::Float32,
+REGISTER_KERNEL(Device::INTELCPU, OpType::ConvTrans, DataType::Float32,
                MklConvTranspose, "MklConvTrans_CPU_float32");

 } // namespace infini
--- a/src/kernels/intelcpu/element_wise.cc
+++ b/src/kernels/intelcpu/element_wise.cc
@ -0,0 +1,133 @@
+#include "operators/element_wise.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+// Element-wise binary operators (Add/Sub/Mul/Div) implemented with the oneDNN
+// binary primitive.
+class MklBinary : public MklKernelWithoutConfig {
+    // Maps the operator type to the matching oneDNN binary algorithm. Operator
+    // types not listed here hit IT_TODO_HALT().
+    dnnl::algorithm getAlgorithem(const Ref<ElementWiseObj> &op) const {
+        switch (op->getOpType()) {
+        case OpType::Add:
+            return dnnl::algorithm::binary_add;
+        case OpType::Sub:
+            return dnnl::algorithm::binary_sub;
+        case OpType::Mul:
+            return dnnl::algorithm::binary_mul;
+        case OpType::Div:
+            return dnnl::algorithm::binary_div;
+
+        default:
+            IT_TODO_HALT();
+        }
+        return dnnl::algorithm::undef;
+    }
+
+    // Computes output = input0 <op> input1.
+    //
+    // Each tensor gets a descriptor built from its OWN shape. Previously all
+    // three descriptors reused input 0's shape, so an input 1 with fewer
+    // elements was read past its end. Shapes with a lower rank are left-padded
+    // with 1s so that all descriptors have the same rank; which operands
+    // oneDNN allows to broadcast depends on the library version, and an
+    // unsupported combination is rejected when the primitive descriptor is
+    // created instead of reading out of bounds.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ElementWiseObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        const auto aShape = op->getInputs(0)->getDims();
+        const auto bShape = op->getInputs(1)->getDims();
+        const auto cShape = op->getOutput()->getDims();
+
+        // Common rank shared by the three descriptors.
+        size_t rank = aShape.size();
+        if (bShape.size() > rank)
+            rank = bShape.size();
+        if (cShape.size() > rank)
+            rank = cShape.size();
+
+        // Converts a shape to oneDNN dims, left-padded with 1s up to `rank`.
+        auto toDnnlDims = [rank](const auto &shape) {
+            std::vector<dnnl_dim_t> dims(rank - shape.size(), 1);
+            for (size_t i = 0; i < shape.size(); ++i)
+                dims.push_back(shape[i]);
+            return dims;
+        };
+
+        //  create user memory that describes data layout in the buffers
+        auto srcMd1 =
+            dnnl::memory::desc(toDnnlDims(aShape), dnnl::memory::data_type::f32,
+                               getUserFormatTag(rank));
+        auto srcMemory1 = dnnl::memory(srcMd1, context->getEngine(), aData);
+
+        auto srcMd2 =
+            dnnl::memory::desc(toDnnlDims(bShape), dnnl::memory::data_type::f32,
+                               getUserFormatTag(rank));
+        auto srcMemory2 = dnnl::memory(srcMd2, context->getEngine(), bData);
+
+        auto dstMd =
+            dnnl::memory::desc(toDnnlDims(cShape), dnnl::memory::data_type::f32,
+                               getUserFormatTag(rank));
+        auto output = dnnl::memory(dstMd, context->getEngine(), cData);
+
+        auto binaryDesc =
+            dnnl::binary::desc(getAlgorithem(op), srcMd1, srcMd2, dstMd);
+        auto primDesc =
+            dnnl::binary::primitive_desc(binaryDesc, context->getEngine());
+
+        // create and execute binary primitive
+        dnnl::binary(primDesc).execute(context->getStream(),
+                                       {{DNNL_ARG_SRC_0, srcMemory1},
+                                        {DNNL_ARG_SRC_1, srcMemory2},
+                                        {DNNL_ARG_DST, output}});
+    }
+};
+
+// Element-wise unary operators (Relu/Tanh/Abs/Sigmoid) implemented with the
+// oneDNN eltwise_forward primitive.
+class MklUnary : public MklKernelWithoutConfig {
+    // Selects the oneDNN eltwise algorithm for the operator type; any other
+    // operator type hits IT_TODO_HALT().
+    dnnl::algorithm getAlgorithem(const Ref<UnaryObj> &op) const {
+        const auto type = op->getOpType();
+        if (type == OpType::Relu)
+            return dnnl::algorithm::eltwise_relu;
+        if (type == OpType::Tanh)
+            return dnnl::algorithm::eltwise_tanh;
+        if (type == OpType::Abs)
+            return dnnl::algorithm::eltwise_abs;
+        if (type == OpType::Sigmoid)
+            return dnnl::algorithm::eltwise_logistic;
+        IT_TODO_HALT();
+        return dnnl::algorithm::undef;
+    }
+
+    // Applies the selected algorithm to input 0 and writes the operator
+    // output. Source and destination share one memory descriptor.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<UnaryObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        void *const inPtr = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const outPtr = (op->getOutput()->getRawDataPtr<void *>());
+
+        // Shape of input 0 in oneDNN's dim type.
+        std::vector<dnnl_dim_t> shape;
+        const size_t rank = op->getInputs(0)->getDims().size();
+        for (size_t axis = 0; axis < rank; ++axis)
+            shape.push_back(op->getInputs(0)->getDims()[axis]);
+
+        auto tensorMd =
+            dnnl::memory::desc(shape, dnnl::memory::data_type::f32,
+                               getUserFormatTag(shape.size()), false);
+        auto inMem = dnnl::memory(tensorMd, context->getEngine(), inPtr);
+        auto outMem = dnnl::memory(tensorMd, context->getEngine(), outPtr);
+
+        // Extra algorithm parameter passed to the eltwise descriptor; always
+        // 0 here.
+        const float alpha = 0.0f;
+
+        auto eltwiseDesc = dnnl::eltwise_forward::desc(
+            dnnl::prop_kind::forward_inference, getAlgorithem(op), tensorMd,
+            alpha);
+        auto eltwisePd = dnnl::eltwise_forward::primitive_desc(
+            eltwiseDesc, context->getEngine());
+
+        // create and execute the eltwise primitive
+        dnnl::eltwise_forward(eltwisePd)
+            .execute(context->getStream(),
+                     {{DNNL_ARG_SRC, inMem}, {DNNL_ARG_DST, outMem}});
+    }
+};
+
+REGISTER_KERNEL(Device::INTELCPU, OpType::Add, DataType::Float32, MklBinary,
+                "Add_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Sub, DataType::Float32, MklBinary,
+                "Sub_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Mul, DataType::Float32, MklBinary,
+                "Mul_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Div, DataType::Float32, MklBinary,
+                "Div_Mkl_Float32");
+
+REGISTER_KERNEL(Device::INTELCPU, OpType::Relu, DataType::Float32, MklUnary,
+                "Relu_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Sigmoid, DataType::Float32, MklUnary,
+                "Sigmoid_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Tanh, DataType::Float32, MklUnary,
+                "Tanh_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Abs, DataType::Float32, MklUnary,
+                "Abs_Mkl_Float32");
+} // namespace infini
--- a/src/kernels/intelcpu/extend.cc
+++ b/src/kernels/intelcpu/extend.cc
@ -0,0 +1,45 @@
+#include "operators/extend.h"
+#include "core/kernel.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+#include <CL/sycl.hpp>
+#include <math.h>
+
+namespace infini {
+// Extend operator implemented as a SYCL kernel on the CPU device.
+class MklExtend : public MklKernelWithoutConfig {
+    // Copies the input into device memory, fills every output element from
+    // the input element it maps to, then copies the result back to the output
+    // tensor buffer.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ExtendObj>(_op);
+        auto hostIn = op->getInputs(0)->getRawDataPtr<float *>();
+        auto hostOut = op->getOutput(0)->getRawDataPtr<float *>();
+        int inCount = op->getInputs(0)->size();
+        int outCount = op->getOutput(0)->size();
+
+        sycl::queue queue(sycl::cpu_selector{});
+        auto devIn = sycl::malloc_device<float>(inCount, queue);
+        auto devOut = sycl::malloc_device<float>(outCount, queue);
+
+        queue.memcpy(devIn, hostIn, inCount * sizeof(float));
+        queue.wait();
+
+        // blockSize = product of the input extents from axis getDim() to the
+        // last axis; an out-of-range axis leaves it at 1.
+        int blockSize = 1;
+        auto inShape = op->getInputs(0)->getDims();
+        for (size_t axis = (size_t)op->getDim(); axis < inShape.size(); ++axis)
+            blockSize *= inShape[axis];
+        // Each input block occupies (getNum() + 1) consecutive blocks in the
+        // output.
+        auto blockSizeOuter = (op->getNum() + 1) * blockSize;
+
+        queue
+            .parallel_for(sycl::range<1>(outCount),
+                          [=](sycl::id<1> outIdx) {
+                              auto inIdx = outIdx % blockSize +
+                                           outIdx / blockSizeOuter * blockSize;
+                              devOut[outIdx] = devIn[inIdx];
+                          })
+            .wait();
+
+        queue.memcpy(hostOut, devOut, outCount * sizeof(float));
+        queue.wait();
+        sycl::free(devIn, queue);
+        sycl::free(devOut, queue);
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Extend, DataType::Float32, MklExtend,
+                "Extend_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/gather.cc
+++ b/src/kernels/intelcpu/gather.cc
@ -0,0 +1,86 @@
+#include "operators/gather.h"
+#include "core/kernel.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+#include <CL/sycl.hpp>
+#include <math.h>
+
+namespace infini {
+// Gather operator implemented as a SYCL kernel on the CPU device.
+class MklGather : public MklKernelWithoutConfig {
+    // For every output element, decomposes its flat offset into coordinates,
+    // replaces the coordinate on `axis` with the value read from the index
+    // tensor (input 1), and copies the addressed element of input 0.
+    //
+    // Shapes and strides are passed to the kernel in fixed-size arrays, so
+    // every tensor involved must have rank <= kMaxRank; this is asserted up
+    // front because a larger rank would otherwise overflow those arrays.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<GatherObj>(_op);
+        auto in = op->getInputs(0);
+        auto index = op->getInputs(1);
+        auto out = op->getOutput();
+        int iSize = in->size();
+        int oSize = out->size();
+        int idxSize = index->size();
+
+        int inNDim = in->getDims().size();
+        int oNDim = out->getDims().size();
+        int idxNDim = index->getDims().size();
+        int axis = op->getAxis();
+
+        constexpr int kMaxRank = 4;
+        IT_ASSERT(inNDim <= kMaxRank);
+        IT_ASSERT(oNDim <= kMaxRank);
+        IT_ASSERT(idxNDim <= kMaxRank);
+
+        int outDim[kMaxRank] = {0};
+        int idxDim[kMaxRank] = {0};
+        int idxStride[kMaxRank] = {0};
+        int inStride[kMaxRank] = {0};
+        for (int i = 0; i < oNDim; ++i)
+            outDim[i] = out->getDims()[i];
+        for (int i = 0; i < idxNDim; ++i) {
+            idxDim[i] = index->getDims()[i];
+            idxStride[i] = index->getStride()[i];
+        }
+        for (int i = 0; i < inNDim; ++i) {
+            inStride[i] = in->getStride()[i];
+        }
+
+        sycl::queue q(sycl::cpu_selector{});
+        auto inDevice = sycl::malloc_device<float>(iSize, q);
+        auto indexDevice = sycl::malloc_device<uint32_t>(idxSize, q);
+        auto outDevice = sycl::malloc_device<float>(oSize, q);
+
+        q.memcpy(inDevice, in->getRawDataPtr<float *>(), iSize * sizeof(float));
+        q.memcpy(indexDevice, index->getRawDataPtr<uint32_t *>(),
+                 idxSize * sizeof(uint32_t));
+        q.wait();
+
+        // The work-item id is named `gid` so it does not shadow the `index`
+        // tensor above.
+        q.parallel_for(sycl::range<1>(oSize), [=](sycl::id<1> gid) {
+             int offset = 0;
+             int gOffset = gid;
+             // Walk the input axes from last to first; `k` tracks the
+             // matching output axis.
+             for (int i = inNDim - 1, k = oNDim - 1; i >= 0; --i) {
+                 int idx = 0;
+                 if (i == axis) {
+                     // The output axes that replace `axis` have the shape of
+                     // the index tensor: peel them off to locate the index
+                     // element.
+                     int idxOffset = 0;
+                     for (int j = idxNDim - 1; j >= 0; --j) {
+                         int p = gOffset % idxDim[j];
+                         gOffset = gOffset / idxDim[j];
+                         idxOffset += p * idxStride[j];
+                     }
+
+                     idx = indexDevice[idxOffset];
+                     k = k - idxNDim;
+
+                 } else {
+                     idx = gOffset % outDim[k];
+                     gOffset = gOffset / outDim[k];
+                     --k;
+                 }
+                 offset += idx * inStride[i];
+             }
+
+             outDevice[gid] = inDevice[offset];
+         }).wait();
+
+        q.memcpy(out->getRawDataPtr<float *>(), outDevice,
+                 oSize * sizeof(float));
+        q.wait();
+        sycl::free(inDevice, q);
+        sycl::free(outDevice, q);
+        sycl::free(indexDevice, q);
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Gather, DataType::Float32, MklGather,
+                "Gather_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/matmul.cc
+++ b/src/kernels/intelcpu/matmul.cc
@ -1,9 +1,8 @@
 #include "operators/matmul.h"
 #include "core/kernel.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"

 namespace infini {
-
 template <typename T> class MklMatmul : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
@ -32,7 +31,7 @@ template <typename T> class MklMatmul : public CpuKernelWithoutConfig {
    }
 };

-REGISTER_KERNEL(Device::MKL, OpType::Matmul, DataType::Float32,
-                MklMatmul<float>, "MklMatmul_CPU_float32");
+/*REGISTER_KERNEL(Device::INTELCPU, OpType::Matmul, DataType::Float32,
+                MklMatmul<float>, "MklMatmul_CPU_float32");*/

 } // namespace infini
--- a/src/kernels/intelcpu/matmul_dpcpp.cc
+++ b/src/kernels/intelcpu/matmul_dpcpp.cc
@ -0,0 +1,75 @@
+#include "core/kernel.h"
+#include "intelcpu/mkl_runtime.h"
+#include "mkl.h"
+#include "oneapi/mkl/blas.hpp"
+#include "operators/matmul.h"
+#include <CL/sycl.hpp>
+
+namespace infini {
+// Batched matrix multiplication implemented with the oneMKL DPC++
+// row-major gemm_batch interface on the SYCL CPU device.
+template <typename T> class MklDpcppMatmul : public CpuKernelWithoutConfig {
+    // Computes C = op(A) * op(B) for each of the `b` batches, where op() is an
+    // optional transpose taken from the operator. Bias inputs and fused
+    // activations are not supported and are rejected by assertions.
+    //
+    // SYCL exceptions (synchronous or asynchronous) are reported on std::cout
+    // and are not propagated to the caller.
+    void compute(const Operator &_op,
+                 const RuntimeObj *context) const override {
+        auto op = as<MatmulObj>(_op);
+        IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
+        const T *A = op->getInputs(0)->getRawDataPtr<T *>();
+        const T *B = op->getInputs(1)->getRawDataPtr<T *>();
+        T *C = op->getOutput()->getRawDataPtr<T *>();
+        IT_ASSERT(op->getAct() == ActType::None);
+        const int m = op->getM(), n = op->getN(), k = op->getK(),
+                  b = op->getB();
+
+        auto opA = op->getTransA() ? oneapi::mkl::transpose::trans
+                                   : oneapi::mkl::transpose::nontrans;
+        auto opB = op->getTransB() ? oneapi::mkl::transpose::trans
+                                   : oneapi::mkl::transpose::nontrans;
+        // ldA is always a.col, and ldB is always b.col when row major
+        const int ldA =
+            std::max((opA == oneapi::mkl::transpose::nontrans) ? k : m, 1);
+        const int ldB =
+            std::max((opB == oneapi::mkl::transpose::nontrans) ? n : k, 1);
+        const int ldC = std::max(n, 1);
+
+        const float alpha = 1.f, beta = 0.f;
+        // TODO: Intel MKL ERROR will occur when using cblas_sgemm_batch
+        /*for (int i = 0; i < b; ++i) {
+            cblas_sgemm(CblasRowMajor, opA, opB, m, n, k, alpha, A + m * k * i,
+                        ldA, B + k * n * i, ldB, beta, C + m * n * i, ldC);
+        }*/
+
+        // Catch asynchronous exceptions
+        auto exception_handler = [](cl::sycl::exception_list exceptions) {
+            for (std::exception_ptr const &e : exceptions) {
+                try {
+                    std::rethrow_exception(e);
+                } catch (cl::sycl::exception const &e) {
+                    std::cout
+                        << "Caught asynchronous SYCL exception during GEMM:\n"
+                        << e.what() << std::endl;
+                }
+            }
+        };
+
+        // create execution queue and buffers of matrix data
+        cl::sycl::queue main_queue(sycl::cpu_selector{}, exception_handler);
+
+        cl::sycl::buffer<float, 1> A_buffer(A, op->getInputs(0)->size());
+        cl::sycl::buffer<float, 1> B_buffer(B, op->getInputs(1)->size());
+        cl::sycl::buffer<float, 1> C_buffer(C, op->getOutput(0)->size());
+
+        // add oneapi::mkl::blas::gemm to execution queue
+        try {
+            oneapi::mkl::blas::row_major::gemm_batch(
+                main_queue, opA, opB, m, n, k, alpha, A_buffer, ldA, m * k,
+                B_buffer, ldB, k * n, beta, C_buffer, ldC, m * n, b);
+            // Block until the GEMM has finished and hand any asynchronous
+            // errors to exception_handler; without this call the handler is
+            // never given the chance to report them.
+            main_queue.wait_and_throw();
+        } catch (cl::sycl::exception const &e) {
+            std::cout << "\t\tCaught synchronous SYCL exception during GEMM:\n"
+                      << e.what() << std::endl;
+        }
+    }
+};
+
+REGISTER_KERNEL(Device::INTELCPU, OpType::Matmul, DataType::Float32,
+                MklDpcppMatmul<float>, "MklDpcppMatmul_CPU_float32");
+
+} // namespace infini
--- a/src/kernels/intelcpu/pad.cc
+++ b/src/kernels/intelcpu/pad.cc
@ -0,0 +1,58 @@
+#include "operators/pad.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Pad operator: copies input 0 into a sub-region of the output buffer using a
+// oneDNN reorder.
+class MklPad : public MklKernelWithoutConfig {
+    // Builds a descriptor for the output-shaped buffer, takes the sub-memory
+    // of input shape located at offsets op->getPads(), and reorders the input
+    // into it.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PadObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        // Logical dims of the input tensor.
+        std::vector<dnnl_dim_t> dims;
+        for (size_t i = 0; i < op->getInputs(0)->getDims().size(); ++i) {
+            dims.push_back(op->getInputs(0)->getDims()[i]);
+        }
+        auto paddedMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                           getUserFormatTag(dims.size()));
+
+        // dst md
+        // The raw descriptor of paddedMd is patched so that its padded dims
+        // equal the output shape and its padded offsets equal the first `ndim`
+        // entries of op->getPads().
+        auto oDims = op->getOutput(0)->getDims();
+        int ndim = oDims.size();
+        std::vector<dnnl_dim_t> paddedDims, offsets;
+        for (int i = 0; i < ndim; ++i) {
+            paddedDims.push_back(oDims.at(i));
+            paddedMd.data.padded_dims[i] = oDims.at(i);
+            paddedMd.data.padded_offsets[i] = op->getPads().at(i);
+            offsets.push_back(op->getPads().at(i));
+        }
+        // will fill padded area with zero.
+        // NOTE(review): paddedMemory is never used after construction; the
+        // zero fill relies on oneDNN zero-padding the buffer when a memory
+        // object is created over a user handle - confirm this holds for the
+        // oneDNN version in use.
+        auto paddedMemory =
+            dnnl::memory(paddedMd, context->getEngine(),
+                         op->getOutput(0)->getRawDataPtr<float *>());
+
+        // Plain descriptor of the whole output tensor.
+        auto dstMd =
+            dnnl::memory::desc(paddedDims, dnnl::memory::data_type::f32,
+                               getUserFormatTag(paddedDims.size()));
+
+        // copy src to the submemory of dst
+        // create submemory
+        // `md` views the input-shaped window of the output that starts at
+        // `offsets`; `mem` wraps the same output buffer as paddedMemory.
+        auto md = dstMd.submemory_desc(dims, offsets);
+        auto mem = dnnl::memory(md, context->getEngine(),
+                                op->getOutput(0)->getRawDataPtr<float *>());
+
+        auto srcMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(dims.size()));
+        auto srcMemory =
+            dnnl::memory(srcMd, context->getEngine(),
+                         op->getInputs(0)->getRawDataPtr<float *>());
+
+        // copy data to submemory
+        dnnl::reorder(srcMemory, mem)
+            .execute(context->getStream(),
+                     {{DNNL_ARG_FROM, srcMemory}, {DNNL_ARG_TO, mem}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Pad, DataType::Float32, MklPad,
+                "Pad_Mkl_Float32");
+} // namespace infini
--- a/src/kernels/intelcpu/pooling.cc
+++ b/src/kernels/intelcpu/pooling.cc
@ -0,0 +1,84 @@
+#include "operators/pooling.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Shared implementation of 2-D pooling on top of the oneDNN pooling_v2_forward
+// primitive; subclasses choose the pooling algorithm.
+class MklPooling : public MklKernelWithoutConfig {
+    // Pooling algorithm supplied by the concrete kernel.
+    virtual dnnl::algorithm getAlgorithm() const = 0;
+
+    // Pools input 0 into the operator output. The destination layout is left
+    // to oneDNN (format_tag::any); when the chosen layout differs from the
+    // user layout, the result is produced in a temporary memory object and
+    // reordered into the output buffer.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PoolingObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        float *const inPtr = op->getInputs(0)->getRawDataPtr<float *>();
+        float *const outPtr = op->getOutput()->getRawDataPtr<float *>();
+
+        // Problem sizes: input NCHW, kernel RxS, padding/stride/dilation, and
+        // the output spatial extent taken from the last two output dims.
+        auto [n, c, h, w, r, s] = op->getNCHWRS();
+        auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
+        auto rank = op->getOutput()->getDims().size();
+        auto outH = op->getOutput()->getDims()[rank - 2];
+        auto outW = op->getOutput()->getDims()[rank - 1];
+
+        const auto f32 = dnnl::memory::data_type::f32;
+        auto srcMd =
+            dnnl::memory::desc({n, c, h, w}, f32, getUserFormatTag(rank));
+        auto srcMemory = dnnl::memory(srcMd, context->getEngine(), inPtr);
+
+        // Layout of the user-visible output buffer.
+        auto userDstMd = dnnl::memory::desc({n, c, outH, outW}, f32,
+                                            getUserFormatTag(rank));
+        // Destination with the layout left for oneDNN to choose.
+        auto anyDstMd = dnnl::memory::desc({n, c, outH, outW}, f32,
+                                           dnnl::memory::format_tag::any);
+
+        using op_desc_t = dnnl::pooling_v2_forward::desc;
+        using pd_t = dnnl::pooling_v2_forward::primitive_desc;
+
+        auto opDesc = op_desc_t(dnnl::prop_kind::forward_inference,
+                                getAlgorithm(), srcMd, anyDstMd, {sh, sw},
+                                {r, s}, {dh - 1, dw - 1}, {ph, pw}, {ph, pw});
+        auto primDesc = pd_t(opDesc, context->getEngine());
+        auto pooling = dnnl::pooling_v2_forward(primDesc);
+
+        if (primDesc.dst_desc() == userDstMd) {
+            // The primitive writes the user layout: pool straight into the
+            // output buffer.
+            auto output =
+                dnnl::memory(primDesc.dst_desc(), context->getEngine(), outPtr);
+            pooling.execute(context->getStream(), {{DNNL_ARG_SRC, srcMemory},
+                                                   {DNNL_ARG_DST, output}});
+            return;
+        }
+
+        // Otherwise pool into a library-allocated buffer and reorder it into
+        // the user layout.
+        auto scratch = dnnl::memory(primDesc.dst_desc(), context->getEngine());
+        pooling.execute(context->getStream(),
+                        {{DNNL_ARG_SRC, srcMemory}, {DNNL_ARG_DST, scratch}});
+
+        auto output = dnnl::memory(userDstMd, context->getEngine(), outPtr);
+        dnnl::reorder(scratch, output)
+            .execute(context->getStream(),
+                     {{DNNL_ARG_FROM, scratch}, {DNNL_ARG_TO, output}});
+    }
+};
+
+// Average pooling kernel: selects the oneDNN average algorithm that includes
+// padding in the averaged window.
+class MklAvgPool : public MklPooling {
+    dnnl::algorithm getAlgorithm() const override {
+        return dnnl::algorithm::pooling_avg_include_padding;
+    }
+};
+
+// Max pooling kernel: selects the oneDNN max pooling algorithm.
+class MklMaxPool : public MklPooling {
+    dnnl::algorithm getAlgorithm() const override {
+        return dnnl::algorithm::pooling_max;
+    }
+};
+
+REGISTER_KERNEL(Device::INTELCPU, OpType::AvgPool, DataType::Float32,
+                MklAvgPool, "AvgPool_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::MaxPool, DataType::Float32,
+                MklMaxPool, "MaxPool_Mkl_Float32");
+} // namespace infini
--- a/src/kernels/intelcpu/pow.cc
+++ b/src/kernels/intelcpu/pow.cc
@ -0,0 +1,43 @@
+#include "core/kernel.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/element_wise.h"
+#include <CL/sycl.hpp>
+#include <math.h>
+
+namespace infini {
+// Element-wise Pow kernel implemented on a SYCL queue: both inputs are staged
+// into device allocations, the power is evaluated in a parallel_for, and the
+// result is copied back into the output tensor's buffer.
+class MklPow : public MklKernelWithoutConfig {
+    // TODO: check whether the staging copies below can be avoided.
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<PowObj>(_op);
+        auto baseHost = op->getInputs(0)->getRawDataPtr<float *>();
+        auto expHost = op->getInputs(1)->getRawDataPtr<float *>();
+        auto resultHost = op->getOutput(0)->getRawDataPtr<float *>();
+        // The element count of input 0 sizes every buffer and the kernel
+        // range.
+        int size = op->getInputs(0)->size();
+        const auto bytes = size * sizeof(float);
+
+        // cpu_selector uses OpenCL as backend; host_selector bypasses the
+        // OpenCL backend and runs directly on CPU hardware.
+        sycl::queue q(sycl::cpu_selector{});
+        auto baseDev = sycl::malloc_device<float>(size, q);
+        auto expDev = sycl::malloc_device<float>(size, q);
+        auto resultDev = sycl::malloc_device<float>(size, q);
+
+        // Stage the inputs, waiting on the queue after each copy.
+        q.memcpy(baseDev, baseHost, bytes);
+        q.wait();
+        q.memcpy(expDev, expHost, bytes);
+        q.wait();
+
+        // One work-item per element.
+        auto done = q.parallel_for(sycl::range<1>(size), [=](sycl::id<1> i) {
+            resultDev[i] = pow(baseDev[i], expDev[i]);
+        });
+        done.wait();
+
+        // Copy the result back, then release the device allocations.
+        q.memcpy(resultHost, resultDev, bytes);
+        q.wait();
+        sycl::free(baseDev, q);
+        sycl::free(expDev, q);
+        sycl::free(resultDev, q);
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Pow, DataType::Float32, MklPow,
+                "Pow_Mkl_Float32");
+
+}; // namespace infini
--- a/src/kernels/intelcpu/reduce.cc
+++ b/src/kernels/intelcpu/reduce.cc
@ -0,0 +1,69 @@
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/reduce_mean.h"
+
+namespace infini {
+// ReduceMean kernel: wraps the input and output buffers in strided oneDNN
+// memory descriptors and runs a dnnl::reduction primitive on them.
+class MklReduce : public MklKernelWithoutConfig {
+    // Reduction algorithm handed to the oneDNN op descriptor.
+    dnnl::algorithm getAlgorithm() const {
+        return dnnl::algorithm::reduction_mean;
+    }
+
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ReduceMeanObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        auto input = op->getInputs(0);
+
+        float *const srcData = input->getRawDataPtr<float *>();
+        float *const dstData = op->getOutput()->getRawDataPtr<float *>();
+
+        // Source descriptor: dims and strides taken straight from the input.
+        const auto inShape = input->getDims();
+        const auto inStrideVec = input->getStride();
+        const size_t rank = inShape.size();
+        std::vector<dnnl_dim_t> inDims, inStrides;
+        for (size_t d = 0; d < rank; ++d) {
+            inDims.push_back(inShape[d]);
+            inStrides.push_back(inStrideVec[d]);
+        }
+
+        // Destination descriptor always has the input's rank.
+        std::vector<dnnl_dim_t> oDims(rank, 0), oStrides(rank, 1);
+        if (op->getKeepDims()) {
+            // Output tensor already carries the reduced axes: copy its
+            // dims and strides.
+            const auto outShape = op->getOutput(0)->getDims();
+            const auto outStrideVec = op->getOutput(0)->getStride();
+            for (size_t d = 0; d < outShape.size(); ++d) {
+                oDims[d] = outShape[d];
+                oStrides[d] = outStrideVec[d];
+            }
+        } else {
+            // Reduced axes get extent 1, the rest keep the input extent;
+            // strides are the dense ones for that shape.
+            int stride = 1;
+            for (int d = (int)rank - 1; d >= 0; --d) {
+                oDims[d] = op->isReduced(d) ? 1 : inDims[d];
+                oStrides[d] = stride;
+                stride *= oDims[d];
+            }
+        }
+
+        auto srcMd =
+            dnnl::memory::desc(inDims, dnnl::memory::data_type::f32, inStrides);
+        auto dstMd =
+            dnnl::memory::desc(oDims, dnnl::memory::data_type::f32, oStrides);
+        auto srcMemory = dnnl::memory(srcMd, context->getEngine(), srcData);
+        auto output = dnnl::memory(dstMd, context->getEngine(), dstData);
+
+        // Build the primitive descriptor and execute on the runtime stream.
+        auto opDesc =
+            dnnl::reduction::desc(getAlgorithm(), srcMd, dstMd, 0, 0);
+        auto primDesc =
+            dnnl::reduction::primitive_desc(opDesc, context->getEngine());
+        dnnl::reduction(primDesc).execute(
+            context->getStream(),
+            {{DNNL_ARG_SRC, srcMemory}, {DNNL_ARG_DST, output}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::ReduceMean, DataType::Float32,
+                MklReduce, "ReduceMean_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/reshape.cc
+++ b/src/kernels/intelcpu/reshape.cc
@ -0,0 +1,50 @@
+#include "operators/reshape.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Copy kernel shared by Reshape, Identity and Flatten: views the input buffer
+// through a descriptor reshaped to the output dims and reorders it into the
+// output buffer.
+class MklReshape : public MklKernelWithoutConfig {
+    void compute(const Operator &op,
+                 const RuntimeObj *_context) const override {
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        auto input = op->getInputs(0);
+        auto result = op->getOutput(0);
+
+        // Source descriptor built from the input dims.
+        std::vector<dnnl_dim_t> dims;
+        for (auto extent : input->getDims())
+            dims.push_back(extent);
+        auto srcMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(dims.size()));
+
+        // Same buffer seen with the output dims.
+        std::vector<dnnl_dim_t> reshapeDims;
+        for (auto extent : result->getDims())
+            reshapeDims.push_back(extent);
+        auto reshapeMd = srcMd.reshape(reshapeDims);
+        auto reshapeMemory = dnnl::memory(reshapeMd, context->getEngine(),
+                                          input->getRawDataPtr<float *>());
+
+        // Destination descriptor over the output buffer.
+        auto dstMd =
+            dnnl::memory::desc(reshapeDims, dnnl::memory::data_type::f32,
+                               getUserFormatTag(reshapeDims.size()));
+        auto output = dnnl::memory(dstMd, context->getEngine(),
+                                   result->getRawDataPtr<float *>());
+
+        // Copy the data into the destination.
+        dnnl::reorder(reshapeMemory, output)
+            .execute(context->getStream(),
+                     {{DNNL_ARG_FROM, reshapeMemory}, {DNNL_ARG_TO, output}});
+    }
+};
+// Reshape, Identity and Flatten all use the same copy kernel.
+REGISTER_KERNEL(Device::INTELCPU, OpType::Reshape, DataType::Float32,
+                MklReshape, "Reshape_Mkl_Float32");
+// Name fixed from the misspelt "Identify_Mkl_Float32" to match the OpType.
+REGISTER_KERNEL(Device::INTELCPU, OpType::Identity, DataType::Float32,
+                MklReshape, "Identity_Mkl_Float32");
+REGISTER_KERNEL(Device::INTELCPU, OpType::Flatten, DataType::Float32,
+                MklReshape, "Flatten_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/resize.cc
+++ b/src/kernels/intelcpu/resize.cc
@ -0,0 +1,80 @@
+#include "operators/resize.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Resize kernel backed by the oneDNN resampling_forward primitive. Only the
+// mode combinations checked below are handled; everything else halts through
+// IT_TODO_HALT().
+class MklResize : public MklKernelWithoutConfig {
+    // Maps the operator's coefficient mode to a oneDNN resampling algorithm.
+    // Nearest is accepted only together with the "ceil" nearest mode.
+    dnnl::algorithm getAlgorithm(Ref<ResizeObj> op) const {
+        switch (op->getMode()) {
+        case ResizeObj::ECoeffMode::nearest: {
+            if (op->getNearestMode() !=
+                enum_to_underlying(ResizeObj::ENearestMode::ceil))
+                IT_TODO_HALT();
+            return dnnl::algorithm::resampling_nearest;
+        }
+        case ResizeObj::ECoeffMode::linear:
+            return dnnl::algorithm::resampling_linear;
+
+        default:
+            IT_TODO_HALT();
+        }
+        // Fallback return so every control path yields a value.
+        return dnnl::algorithm::resampling_nearest;
+    }
+
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<ResizeObj>(_op);
+
+        // Only the half-pixel coordinate transformation is handled.
+        if (op->getCoordinateTransMode() !=
+            enum_to_underlying(ResizeObj::ECoordinateTransMode::halfPixel))
+            IT_TODO_HALT();
+
+        int nDim = op->getInputs(0)->getDims().size();
+        // Parenthesised explicitly: '&&' binds tighter than '||', so the
+        // leading-dims restriction has always applied to the 5-D case only.
+        // NOTE(review): confirm that 3-D/4-D inputs are meant to be exempt.
+        IT_ASSERT(nDim == 3 || nDim == 4 ||
+                  (nDim == 5 &&
+                   (op->getInputs(0)->getDims()[0] == 1 &&
+                    op->getInputs(0)->getDims()[1] == 1) &&
+                   (op->getOutput(0)->getDims()[0] == 1 &&
+                    op->getOutput(0)->getDims()[1] == 1)));
+
+        // Take a single snapshot of the scales. Calling getScales() once for
+        // begin() and again for end() pairs iterators that belong to two
+        // different (and already destroyed) vectors if the getter returns by
+        // value.
+        const std::vector<float> allScales = op->getScales();
+        IT_ASSERT(allScales.size() == static_cast<size_t>(nDim));
+        // Skip the two leading entries; the rest is passed to oneDNN.
+        std::vector<float> scales(allScales.begin() + 2, allScales.end());
+
+        //  create user memory that describes data layout in the buffers
+        std::vector<dnnl_dim_t> idims, odims;
+        for (size_t i = 0; i < op->getInputs(0)->getDims().size(); ++i) {
+            idims.push_back(op->getInputs(0)->getDims()[i]);
+            odims.push_back(op->getOutput(0)->getDims()[i]);
+        }
+
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+
+        float *const srcData = op->getInputs(0)->getRawDataPtr<float *>();
+        float *const dstData = op->getOutput()->getRawDataPtr<float *>();
+
+        auto srcMd = dnnl::memory::desc(idims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(idims.size()));
+        auto srcMemory = dnnl::memory(srcMd, context->getEngine(), srcData);
+
+        auto dstMd = dnnl::memory::desc(odims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(odims.size()));
+        auto output = dnnl::memory(dstMd, context->getEngine(), dstData);
+
+        using op_desc_t = dnnl::resampling_forward::desc;
+        using pd_t = dnnl::resampling_forward::primitive_desc;
+
+        auto opDesc = op_desc_t(dnnl::prop_kind::forward_inference,
+                                getAlgorithm(op), scales, srcMd, dstMd);
+        auto primDesc = pd_t(opDesc, context->getEngine());
+
+        // create and execute primitive
+        dnnl::resampling_forward(primDesc).execute(
+            context->getStream(),
+            {{DNNL_ARG_SRC, srcMemory}, {DNNL_ARG_DST, output}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Resize, DataType::Float32, MklResize,
+                "Resize_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/slice.cc
+++ b/src/kernels/intelcpu/slice.cc
@ -0,0 +1,46 @@
+#include "operators/slice.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Slice kernel: describes the requested window as a oneDNN sub-memory of the
+// input buffer and reorders it into the output buffer.
+class MklSlice : public MklKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SliceObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        auto input = op->getInputs(0);
+        auto result = op->getOutput(0);
+
+        // Descriptor of the whole input tensor.
+        std::vector<dnnl_dim_t> dims;
+        for (auto extent : input->getDims())
+            dims.push_back(extent);
+        auto srcMd = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(dims.size()));
+
+        // Window: extents come from the output dims, offsets from the
+        // operator's start indices.
+        const auto outShape = result->getDims();
+        const int ndim = outShape.size();
+        std::vector<dnnl_dim_t> sDims, offsets;
+        for (int d = 0; d < ndim; ++d) {
+            sDims.push_back(outShape.at(d));
+            offsets.push_back(op->getStart().at(d));
+        }
+        auto sliceMd = srcMd.submemory_desc(sDims, offsets);
+        auto sliceMemory = dnnl::memory(sliceMd, context->getEngine(),
+                                        input->getRawDataPtr<float *>());
+
+        // Destination descriptor over the output buffer.
+        auto dstMd = dnnl::memory::desc(sDims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(sDims.size()));
+        auto output = dnnl::memory(dstMd, context->getEngine(),
+                                   result->getRawDataPtr<float *>());
+
+        // Copy the window into the destination.
+        dnnl::reorder(sliceMemory, output)
+            .execute(context->getStream(),
+                     {{DNNL_ARG_FROM, sliceMemory}, {DNNL_ARG_TO, output}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Slice, DataType::Float32, MklSlice,
+                "Slice_Mkl_Float32");
+} // namespace infini
--- a/src/kernels/intelcpu/softmax.cc
+++ b/src/kernels/intelcpu/softmax.cc
@ -0,0 +1,43 @@
+#include "operators/softmax.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Softmax kernel: runs dnnl::softmax_forward along the operator's axis, with
+// input and output described by the same dims.
+class MklSoftmax : public MklKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SoftmaxObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        auto input = op->getInputs(0);
+
+        float *const srcData = input->getRawDataPtr<float *>();
+        float *const dstData = op->getOutput()->getRawDataPtr<float *>();
+
+        // Both descriptors are built from the input dims.
+        std::vector<dnnl_dim_t> dims;
+        for (auto extent : input->getDims())
+            dims.push_back(extent);
+        const auto f32 = dnnl::memory::data_type::f32;
+
+        auto srcMd =
+            dnnl::memory::desc(dims, f32, getUserFormatTag(dims.size()));
+        auto srcMemory = dnnl::memory(srcMd, context->getEngine(), srcData);
+
+        auto dstMd =
+            dnnl::memory::desc(dims, f32, getUserFormatTag(dims.size()));
+        auto output = dnnl::memory(dstMd, context->getEngine(), dstData);
+
+        // Inference-only softmax over op->getAxis().
+        auto opDesc = dnnl::softmax_forward::desc(
+            dnnl::prop_kind::forward_inference, srcMd, op->getAxis());
+        auto primDesc = dnnl::softmax_forward::primitive_desc(
+            opDesc, context->getEngine());
+
+        // Create and execute the primitive on the runtime stream.
+        dnnl::softmax_forward(primDesc).execute(
+            context->getStream(),
+            {{DNNL_ARG_SRC, srcMemory}, {DNNL_ARG_DST, output}});
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Softmax, DataType::Float32,
+                MklSoftmax, "Softmax_Mkl_Float32");
+}; // namespace infini
--- a/src/kernels/intelcpu/split.cc
+++ b/src/kernels/intelcpu/split.cc
@ -0,0 +1,54 @@
+#include "operators/split.h"
+#include "intelcpu/mkl_kernel_without_config.h"
+#include "intelcpu/mkl_runtime.h"
+
+namespace infini {
+// Split kernel: for every output, takes a sub-memory of the input along the
+// split dimension and reorders it into that output's buffer.
+class MklSplit : public MklKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<SplitObj>(_op);
+        auto context = dynamic_cast<const MklRuntimeObj *>(_context);
+        auto input = op->getInputs(0);
+
+        // Descriptor of the whole input tensor.
+        std::vector<dnnl_dim_t> srcDims;
+        for (auto extent : input->getDims())
+            srcDims.push_back(extent);
+        auto srcMd = dnnl::memory::desc(srcDims, dnnl::memory::data_type::f32,
+                                        getUserFormatTag(srcDims.size()));
+
+        // Running start position along the split dimension.
+        int offset = 0;
+        for (size_t outIdx = 0; outIdx < op->getOutputs().size(); outIdx++) {
+            auto part = op->getOutput(outIdx);
+            const auto partShape = part->getDims();
+            const int ndim = partShape.size();
+
+            // Window extents are this output's dims; only the split
+            // dimension gets a non-zero offset.
+            std::vector<dnnl_dim_t> partDims, partOffsets(ndim, 0);
+            for (int d = 0; d < ndim; ++d)
+                partDims.push_back(partShape.at(d));
+            partOffsets[op->getDim()] = offset;
+
+            auto splitMd = srcMd.submemory_desc(partDims, partOffsets);
+            auto splitMemory = dnnl::memory(splitMd, context->getEngine(),
+                                            input->getRawDataPtr<float *>());
+
+            auto dstMd =
+                dnnl::memory::desc(partDims, dnnl::memory::data_type::f32,
+                                   getUserFormatTag(partDims.size()));
+            auto output = dnnl::memory(dstMd, context->getEngine(),
+                                       part->getRawDataPtr<float *>());
+
+            // Copy the window into this output.
+            dnnl::reorder(splitMemory, output)
+                .execute(context->getStream(),
+                         {{DNNL_ARG_FROM, splitMemory}, {DNNL_ARG_TO, output}});
+
+            // The next output starts where this one ended.
+            offset += partDims.at(op->getDim());
+        }
+    }
+};
+REGISTER_KERNEL(Device::INTELCPU, OpType::Split, DataType::Float32, MklSplit,
+                "Split_Mkl_Float32");
+}; // namespace infini
--- a/src/mkl/mkl_runtime.cc
+++ b/src/mkl/mkl_runtime.cc
@ -1,13 +0,0 @@
-#include "mkl/mkl_runtime.h"
-#include "core/graph.h"
-#include "core/kernel.h"
-namespace infini {
-MklRuntimeObj::MklRuntimeObj() : CpuRuntimeObj(Device::MKL) {
-    dnnl_engine_create(&engine, dnnl_engine_kind_t::dnnl_cpu, 0);
-}
-
-MklRuntimeObj::~MklRuntimeObj() {
-    mkl_free_buffers();
-    dnnl_engine_destroy(engine);
-}
-} // namespace infini
--- a/src/operators/reshape.cc
+++ b/src/operators/reshape.cc
@ -39,18 +39,25 @@ vector<int> ReshapeObj::getOpAttrVector() const {
    return ret;
 }

-FlattenObj::FlattenObj(GraphObj *graph, Tensor input, Tensor output)
+FlattenObj::FlattenObj(GraphObj *graph, Tensor input, Tensor output, int _axis) // _axis may be negative
    : OperatorObj(OpType::Flatten, {input}, {output}) {
+    if (_axis >= 0 && (size_t)_axis < input->getDims().size())
+        axis = _axis; // already a valid non-negative index
+    else if (_axis <= -1 && (size_t)_axis >= -input->getDims().size()) // unsigned wraparound: same as _axis >= -rank
+        axis = _axis + input->getDims().size(); // negative axis counts from the last dimension
+    else
+        IT_ASSERT(0); // axis outside [-rank, rank)
    IT_ASSERT(checkValid(graph));
 }

 optional<vector<Shape>> FlattenObj::inferShape(const TensorVec &inputs) const {
-    int size = 1;
+    int sizeB = 1, sizeE = 1; // products of the dims before axis / from axis onward
    auto dims = getInputs(0)->getDims();
-    for (size_t i = 0; i < dims.size(); ++i)
-        size *= dims.at(i);
+    int ndim = dims.size();
+    for (int i = 0; i < ndim; ++i)
+        ((i < axis) ? sizeB : sizeE) *= dims.at(i); // fold each extent into the matching product

-    return {{{sizeB, sizeE}}};
+    return {{{sizeB, sizeE}}}; // one output with shape {sizeB, sizeE}
 }

 std::string FlattenObj::toString() const {
@ -59,18 +66,20 @@ std::string FlattenObj::toString() const {
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
-    os << "output=" << outputs[0]->getGuid() << ")";
+    os << "output=" << outputs[0]->getGuid() << ",";
+    os << "axis=" << axis << ")";
    return os.str();
 }

 vector<int> FlattenObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
+    ret.emplace(ret.begin(), axis); // inserted before the type, so the result is {type, axis, dims...}
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
 }

 vector<int> FlattenObj::getOpAttrVector() const {
-    return {enum_to_underlying(type)};
+    return {enum_to_underlying(type), axis}; // operator type followed by the flatten axis
 }

 IdentityObj::IdentityObj(GraphObj *graph, Tensor input, Tensor output)
--- a/src/operators/resize.cc
+++ b/src/operators/resize.cc
@ -70,25 +70,6 @@ void ResizeObj::init(const Tensor &input, const Tensor &sizes,
        }
    }
 }
-/*
-Operator ResizeObj::clone(TensorVec inputs, TensorVec outputs) {
-    Tensor roi{nullptr}, sizes{nullptr}, scales{nullptr};
-    if (inputs.size() == 3)
-        roi = inputs[2];
-    if (isResizeBySizes())
-        sizes = inputs[1];
-    else
-        scales = inputs[1];
-
-    if (mode == ECoeffMode::nearest)
-        return make_ref<ResizeObj>(nullptr, inputs[0], outputs[0], axes,
-                                   inputs[1], nullptr, roi, ratioPolicy,
-                                   nearestMode, coMode);
-    else
-        return make_ref<ResizeObj>(nullptr, inputs[0], outputs[0], axes,
-                                   inputs[1], nullptr, roi, mode, ratioPolicy,
-                                   coMode);
-}*/

 void ResizeObj::InitBySizes(Tensor input, Tensor sizes,
                            const std::optional<vector<int>> &axes) {
--- a/src/operators/softmax.cc
+++ b/src/operators/softmax.cc
@ -0,0 +1,37 @@
+#include "operators/softmax.h"
+
+namespace infini {
+
+// Builds a Softmax operator over `input`. `_axis` is accepted in
+// [-rank, rank); a negative value counts from the last dimension and is stored
+// normalised to a non-negative index. Any other value trips IT_ASSERT(0).
+SoftmaxObj::SoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int _axis)
+    : OperatorObj(OpType::Softmax, {input}, {output}) {
+    const int rank = static_cast<int>(input->getDims().size());
+    if (_axis >= 0 && _axis < rank) {
+        axis = _axis;
+    } else if (_axis < 0 && _axis >= -rank) {
+        axis = _axis + rank;
+    } else {
+        IT_ASSERT(0);
+    }
+    IT_ASSERT(checkValid(graph));
+}
+
+// Renders "<OpName>[guid](<input dims>,input=<guid>,output=<guid>,axis=<n>)".
+std::string SoftmaxObj::toString() const {
+    std::ostringstream text;
+    text << OpRegistry::getOpName(type) << "[" << getGuid() << "]"
+         << "(" << vecToString(inputs[0]->getDims()) << ","
+         << "input=" << inputs[0]->getGuid() << ","
+         << "output=" << outputs[0]->getGuid() << ","
+         << "axis=" << axis << ")";
+    return text.str();
+}
+
+// Workload key: {operator type, axis, output dims...}.
+vector<int> SoftmaxObj::getWorkloadVector() const {
+    const Shape outDims = outputs[0]->getDims();
+    vector<int> workload;
+    workload.reserve(outDims.size() + 2);
+    workload.push_back(enum_to_underlying(type));
+    workload.push_back(axis);
+    for (auto extent : outDims)
+        workload.push_back(extent);
+    return workload;
+}
+
+// Attribute key: {operator type, axis}.
+vector<int> SoftmaxObj::getOpAttrVector() const {
+    vector<int> attrs{enum_to_underlying(type), axis};
+    return attrs;
+}
+} // namespace infini
--- a/test/kernels/cuda/test_cuda_reshape.cc
+++ b/test/kernels/cuda/test_cuda_reshape.cc
@ -51,7 +51,7 @@ TEST(CUDA_Flatten, run) {
    // Build CUDA graph
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto i = g->cloneTensor(icpu);
-    auto op = g->addOp<FlattenObj>(i, nullptr);
+    auto op = g->addOp<FlattenObj>(i, nullptr, 2);

    // allocate CUDA memory
    g->dataMalloc();
--- a/test/kernels/cuda/test_cuda_softmax.cc
+++ b/test/kernels/cuda/test_cuda_softmax.cc
@ -0,0 +1,142 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/softmax.h"
+#include "test.h"
+#include <cmath>
+namespace infini {
+
+// Softmax along axis 1 of a 2x4 tensor whose second row is the first row
+// shifted by 10000; both rows are expected to give the same distribution.
+TEST(cuDNN_Softmax, run_axis1) {
+    // Runtimes: native CPU for staging and checking, CUDA for execution.
+    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
+    auto gpuRuntime = make_ref<CudaRuntimeObj>();
+
+    // Host-side input.
+    Tensor hostInput =
+        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, hostRuntime);
+    hostInput->dataMalloc();
+    hostInput->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+
+    // CUDA graph holding a single Softmax operator.
+    Graph graph = make_ref<GraphObj>(gpuRuntime);
+    auto deviceInput = graph->cloneTensor(hostInput);
+    auto softmax = graph->addOp<SoftmaxObj>(deviceInput, nullptr, 1);
+    graph->dataMalloc();
+    gpuRuntime->run(graph);
+
+    // Bring the result back to the host and print the device tensor.
+    auto deviceResult = softmax->getOutput();
+    auto hostResult = deviceResult->clone(hostRuntime);
+    cudaPrintTensor(deviceResult);
+
+    const vector<float> expected{0.032058604, 0.08714432, 0.23688284,
+                                 0.6439143,   0.032058604, 0.08714432,
+                                 0.23688284,  0.6439143};
+    EXPECT_TRUE(hostResult->equalData(expected));
+}
+
+// Softmax along axis 0 of a 2x4 tensor: each column pairs x with x + 10000,
+// so the expected result is 0 for the first row and 1 for the second.
+TEST(cuDNN_Softmax, run_axis0) {
+    // Runtimes: native CPU for staging and checking, CUDA for execution.
+    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
+    auto gpuRuntime = make_ref<CudaRuntimeObj>();
+
+    // Host-side input.
+    Tensor hostInput =
+        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, hostRuntime);
+    hostInput->dataMalloc();
+    hostInput->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+
+    // CUDA graph holding a single Softmax operator.
+    Graph graph = make_ref<GraphObj>(gpuRuntime);
+    auto deviceInput = graph->cloneTensor(hostInput);
+    auto softmax = graph->addOp<SoftmaxObj>(deviceInput, nullptr, 0);
+    graph->dataMalloc();
+    gpuRuntime->run(graph);
+
+    // Bring the result back to the host and print the device tensor.
+    auto deviceResult = softmax->getOutput();
+    auto hostResult = deviceResult->clone(hostRuntime);
+    cudaPrintTensor(deviceResult);
+
+    const vector<float> expected{0., 0., 0., 0., 1, 1, 1, 1};
+    EXPECT_TRUE(hostResult->equalData(expected));
+}
+
+// Softmax along axis 1 of a 2x2x2x2 tensor filled by IncrementalGenerator.
+TEST(cuDNN_Softmax2, run_axis1) {
+    // Runtimes: native CPU for staging and checking, CUDA for execution.
+    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
+    auto gpuRuntime = make_ref<CudaRuntimeObj>();
+
+    // Host-side input.
+    Tensor hostInput =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, hostRuntime);
+    hostInput->dataMalloc();
+    hostInput->setData(IncrementalGenerator());
+
+    // CUDA graph holding a single Softmax operator.
+    Graph graph = make_ref<GraphObj>(gpuRuntime);
+    auto deviceInput = graph->cloneTensor(hostInput);
+    auto softmax = graph->addOp<SoftmaxObj>(deviceInput, nullptr, 1);
+    graph->dataMalloc();
+    gpuRuntime->run(graph);
+
+    // Bring the result back to the host and print the device tensor.
+    auto deviceResult = softmax->getOutput();
+    auto hostResult = deviceResult->clone(hostRuntime);
+    cudaPrintTensor(deviceResult);
+
+    const vector<float> expected{
+        0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
+        0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
+        0.9820138, 0.9820138, 0.9820138, 0.9820138};
+    EXPECT_TRUE(hostResult->equalData(expected));
+}
+
+// Softmax along axis 2 of a 2x2x2x2 tensor filled by IncrementalGenerator.
+TEST(cuDNN_Softmax2, run_axis2) {
+    // Runtimes: native CPU for staging and checking, CUDA for execution.
+    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
+    auto gpuRuntime = make_ref<CudaRuntimeObj>();
+
+    // Host-side input.
+    Tensor hostInput =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, hostRuntime);
+    hostInput->dataMalloc();
+    hostInput->setData(IncrementalGenerator());
+
+    // CUDA graph holding a single Softmax operator.
+    Graph graph = make_ref<GraphObj>(gpuRuntime);
+    auto deviceInput = graph->cloneTensor(hostInput);
+    auto softmax = graph->addOp<SoftmaxObj>(deviceInput, nullptr, 2);
+    graph->dataMalloc();
+    gpuRuntime->run(graph);
+
+    // Bring the result back to the host and print the device tensor.
+    auto deviceResult = softmax->getOutput();
+    auto hostResult = deviceResult->clone(hostRuntime);
+    cudaPrintTensor(deviceResult);
+
+    const vector<float> expected{
+        0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
+        0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
+        0.1192029, 0.1192029, 0.8807971, 0.8807971};
+    EXPECT_TRUE(hostResult->equalData(expected));
+}
+
+// Softmax along axis 3 of a 2x2x2x2 tensor filled by IncrementalGenerator.
+TEST(cuDNN_Softmax2, run_axis3) {
+    // Runtimes: native CPU for staging and checking, CUDA for execution.
+    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
+    auto gpuRuntime = make_ref<CudaRuntimeObj>();
+
+    // Host-side input.
+    Tensor hostInput =
+        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, hostRuntime);
+    hostInput->dataMalloc();
+    hostInput->setData(IncrementalGenerator());
+
+    // CUDA graph holding a single Softmax operator.
+    Graph graph = make_ref<GraphObj>(gpuRuntime);
+    auto deviceInput = graph->cloneTensor(hostInput);
+    auto softmax = graph->addOp<SoftmaxObj>(deviceInput, nullptr, 3);
+    graph->dataMalloc();
+    gpuRuntime->run(graph);
+
+    // Bring the result back to the host and print the device tensor.
+    auto deviceResult = softmax->getOutput();
+    auto hostResult = deviceResult->clone(hostRuntime);
+    cudaPrintTensor(deviceResult);
+
+    const vector<float> expected{
+        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
+        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
+        0.2689414, 0.7310586, 0.2689414, 0.7310586};
+    EXPECT_TRUE(hostResult->equalData(expected));
+}
+} // namespace infini
--- a/test/kernels/cuda/test_cuda_unary.cc
+++ b/test/kernels/cuda/test_cuda_unary.cc
@ -41,7 +41,6 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,

 TEST(cuDNN_Unary, run) {
    testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    testUnary<SoftmaxObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<AbsObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
    testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
--- a/test/kernels/intelcpu/test_mkl_batch_norm.cc
+++ b/test/kernels/intelcpu/test_mkl_batch_norm.cc
@ -0,0 +1,36 @@
+
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/batch_norm.h"
+#include "test.h"
+
+namespace infini {
+// BatchNorm on a 1x3x2x2 incremental input with per-channel mean/variance,
+// unit scale and zero bias, executed on the MKL runtime.
+TEST(MklBatchNorm, run) {
+    auto runtime = make_ref<MklRuntimeObj>();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    // Input plus the four per-channel parameter tensors.
+    const Shape perChannel{1, 3, 1, 1};
+    auto input = g->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
+    auto mean = g->addTensor(perChannel, DataType::Float32);
+    auto var = g->addTensor(perChannel, DataType::Float32);
+    auto scale = g->addTensor(perChannel, DataType::Float32);
+    auto bias = g->addTensor(perChannel, DataType::Float32);
+    auto op = g->addOp<BatchNormObj>(input, nullptr, mean, var, scale, bias,
+                                     0.9, 0);
+
+    // Allocate, then fill every tensor.
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+    mean->copyin(vector<float>{1, 6, 9});
+    var->copyin(vector<float>{4, 1, 9});
+    scale->setData(OneGenerator());
+    bias->setData(ZeroGenerator());
+
+    runtime->run(g);
+
+    const vector<float> expected{-0.5, 0, 0.5,        1, -2,        -1,
+                                 0,    1, -0.3333333, 0, 0.3333333, 0.6666667};
+    auto result = op->getOutput();
+    EXPECT_TRUE(result->equalData(expected));
+}
+
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_concat.cc
+++ b/test/kernels/intelcpu/test_mkl_concat.cc
@ -0,0 +1,29 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/concat.h"
+
+#include "test.h"
+
+namespace infini {
+
+// Concatenates three tensors along dimension 2 on the MKL runtime and checks
+// the interleaved result.
+TEST(Concat, Mkl) {
+    Runtime runtime = MklRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    // The inputs differ only in the extent of dimension 2 (3, 1 and 2).
+    auto first = g->addTensor({2, 2, 3, 1}, DataType::Float32);
+    auto second = g->addTensor({2, 2, 1, 1}, DataType::Float32);
+    auto third = g->addTensor({2, 2, 2, 1}, DataType::Float32);
+    auto op = g->addOp<ConcatObj>(TensorVec{first, second, third}, nullptr, 2);
+
+    g->dataMalloc();
+    first->setData(IncrementalGenerator());
+    second->setData(OneGenerator());
+    third->setData(OneGenerator());
+
+    runtime->run(g);
+
+    const vector<float> expected{0, 1, 2, 1, 1, 1, 3, 4,  5,  1, 1, 1,
+                                 6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1};
+    EXPECT_TRUE(op->getOutput()->equalData(expected));
+}
+
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_conv.cc
+++ b/test/kernels/intelcpu/test_mkl_conv.cc
@ -2,7 +2,7 @@
 #include "core/kernel.h"
 #include "core/perf_engine.h"
 #include "core/runtime.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"
 #include "operators/conv.h"

 #include "test.h"
@ -17,18 +17,15 @@ void testConvDnnl(

    Tensor i0 = gMkl->addTensor({1, 3, 4, 4}, DataType::Float32);
    Tensor w0 = gMkl->addTensor({2, 3, 3, 3}, DataType::Float32);
+
+    // Build  graph
+    auto conv = gMkl->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
    // Malloc data for all tensors in a graph.
    gMkl->dataMalloc();
    i0->setData(generator);
    w0->setData(generator);

-    // Build  graph
-    auto conv = gMkl->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
-    // allocate CUDA memory
-    gMkl->dataMalloc();
-    // Execute on CUDA
    mklRuntime->run(gMkl);
-    // check results on CPU
    EXPECT_TRUE(conv->getOutput(0)->equalData(ansVec));
 }

@ -57,7 +54,7 @@ TEST(mkl_Conv, tune) {

    // check record
    auto kernelAttrs =
-        KernelAttrs{Device::MKL, conv->getOpType(), DataType::Float32};
+        KernelAttrs{Device::INTELCPU, conv->getOpType(), DataType::Float32};
    auto perfKey = PerfEngine::Key{kernelAttrs, conv->getOpPerfKey()};
    std::optional<PerfRecord> perfData =
        PerfEngine::getInstance().getPerfData(perfKey);
--- a/test/kernels/intelcpu/test_mkl_conv_transposed.cc
+++ b/test/kernels/intelcpu/test_mkl_conv_transposed.cc
@ -1,7 +1,7 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"
 #include "operators/conv.h"

 #include "test.h"
@ -26,7 +26,7 @@ void testConvTransposedMkl(
    i0->setData(generator);
    w0->setData(generator);

-    runtime->prepareAndRun(gMkl);
+    runtime->run(gMkl);
    EXPECT_TRUE(conv->getOutput()->equalData(ansVec));
 }

@ -50,7 +50,7 @@ TEST(mkl_ConvTransposed, run1) {
    i0->setData(IncrementalGenerator());
    w0->setData(IncrementalGenerator());

-    runtime->prepareAndRun(gMkl);
+    runtime->run(gMkl);
    EXPECT_TRUE(conv->getOutput()->equalData(vector<float>{
        162, 351,  569,  413,  224,  405,  876,  1417, 1024, 553,
        747, 1611, 2598, 1869, 1005, 639,  1368, 2191, 1564, 835,
@ -71,10 +71,10 @@ TEST(mkl_ConvTransposed, tune) {
    w0->setData(IncrementalGenerator());

    bool tune = true;
-    runtime->prepareAndRun(gMkl, tune);
+    runtime->run(gMkl, tune);
    // check record
    auto kernelAttrs =
-        KernelAttrs{Device::MKL, conv->getOpType(), DataType::Float32};
+        KernelAttrs{Device::INTELCPU, conv->getOpType(), DataType::Float32};
    auto perfKey = PerfEngine::Key{kernelAttrs, conv->getOpPerfKey()};
    std::optional<PerfRecord> perfData =
        PerfEngine::getInstance().getPerfData(perfKey);
--- a/test/kernels/intelcpu/test_mkl_element_wise.cc
+++ b/test/kernels/intelcpu/test_mkl_element_wise.cc
@ -0,0 +1,84 @@
+
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/element_wise.h"
+#include "operators/unary.h"
+#include "test.h"
+
+namespace infini {
+
+using ExpectOutput = vector<float>;
+// Run binary op T on the MKL runtime with both operands filled by `generator`
+// and compare the op's output against `ansVec`.
+template <class T>
+void testBinary(const std::function<void(void *, size_t, DataType)> &generator,
+                const Shape &shape, const ExpectOutput &ansVec) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+    Tensor lhs = graph->addTensor(shape, DataType::Float32);
+    Tensor rhs = graph->addTensor(shape, DataType::Float32);
+    auto binaryOp = graph->addOp<T>(lhs, rhs, nullptr);
+    // Malloc data for all tensors in the graph, then fill both operands.
+    graph->dataMalloc();
+    lhs->setData(generator);
+    rhs->setData(generator);
+    mklRuntime->run(graph);
+    auto result = binaryOp->getOutput();
+    EXPECT_TRUE(result->equalData(ansVec));
+}
+
+TEST(dnnl_Binary, run) {
+    const Shape dims{1, 2, 2, 3}; // one shape shared by all four operators
+    testBinary<AddObj>(IncrementalGenerator(), dims,
+                       ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
+    testBinary<SubObj>(IncrementalGenerator(), dims,
+                       ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+    testBinary<MulObj>(
+        IncrementalGenerator(), dims,
+        ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
+    testBinary<DivObj>(OneGenerator(), dims,
+                       ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+}
+
+TEST(sycl_Pow, run) {
+    const ExpectOutput expected{1, 1, 4, 27}; // matches x^x for x = 0..3
+    testBinary<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1}, expected);
+}
+
+// Run unary op T on the native CPU runtime and on a fresh MKL runtime, with
+// inputs filled by the same `generator`, and require equal output data.
+template <class T>
+void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
+               const Shape &shape) {
+    Runtime nativeRuntime = NativeCpuRuntimeObj::getInstance();
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+
+    // Reference result from the native CPU runtime.
+    Graph refGraph = make_ref<GraphObj>(nativeRuntime);
+    Tensor refInput = refGraph->addTensor(shape, DataType::Float32);
+    auto refOp = refGraph->addOp<T>(refInput, nullptr);
+    refGraph->dataMalloc();
+    refInput->setData(generator);
+    nativeRuntime->run(refGraph);
+
+    // Same single-op graph on the MKL runtime.
+    Graph mklGraph = make_ref<GraphObj>(mklRuntime);
+    auto mklInput = mklGraph->addTensor(shape, DataType::Float32);
+    auto mklOp = mklGraph->addOp<T>(mklInput, nullptr);
+    mklGraph->dataMalloc();
+    mklInput->setData(generator);
+    mklRuntime->run(mklGraph);
+
+    // The two runtimes must agree.
+    EXPECT_TRUE(refOp->getOutput()->equalData(mklOp->getOutput()));
+}
+
+TEST(dnnl_Unary, run) {
+    testUnary<ReluObj>(IncrementalGenerator(), {1, 2, 2, 3});
+    testUnary<SigmoidObj>(IncrementalGenerator(), {1, 2, 2, 3});
+    testUnary<AbsObj>(IncrementalGenerator(), {1, 2, 2, 3});
+    testUnary<TanhObj>(IncrementalGenerator(), {1, 2, 2, 3});
+}
+
+}; // namespace infini
--- a/test/kernels/intelcpu/test_mkl_extend.cc
+++ b/test/kernels/intelcpu/test_mkl_extend.cc
@ -0,0 +1,31 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/extend.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(MKL_Extend, run) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Input of shape {2, 3, 2, 2}; ExtendObj is built with arguments (1, 1).
+    Tensor input = graph->addTensor(Shape{2, 3, 2, 2}, DataType::Float32);
+    auto extendOp = graph->addOp<ExtendObj>(input, nullptr, 1, 1);
+    graph->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    // Each run of 12 consecutive values appears twice in the expected data.
+    const vector<float> expected{
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,
+        4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+        20, 21, 22, 23, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+
+    EXPECT_TRUE(extendOp->getOutput()->equalData(expected));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_gather.cc
+++ b/test/kernels/intelcpu/test_mkl_gather.cc
@ -0,0 +1,60 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/gather.h"
+
+#include "test.h"
+
+namespace infini {
+TEST(Gather, Mkl) {
+    {
+        Runtime runtime = MklRuntimeObj::getInstance();
+        Graph g = make_ref<GraphObj>(runtime);
+        auto input = g->addTensor({3, 2}, DataType::Float32);
+        auto index = g->addTensor({2, 2}, DataType::UInt32);
+        auto op = g->addOp<GatherObj>(input, index, nullptr, 0);
+        // Allocate once, after the whole graph is built, then fill the inputs.
+        g->dataMalloc();
+        input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
+        index->copyin(vector<uint32_t>{0, 1, 1, 2});
+
+        runtime->run(g);
+
+        EXPECT_TRUE(
+            op->getOutput()->equalData(vector<float>{1, 2, 3, 4, 3, 4, 5, 6}));
+    }
+    {
+        Runtime runtime = MklRuntimeObj::getInstance();
+        Graph g = make_ref<GraphObj>(runtime);
+        auto input = g->addTensor({3, 3}, DataType::Float32);
+        auto index = g->addTensor({1, 2}, DataType::UInt32);
+        auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
+        // Allocate once, after the whole graph is built, then fill the inputs.
+        g->dataMalloc();
+        input->setData(IncrementalGenerator());
+        index->copyin(vector<uint32_t>{0, 2});
+
+        runtime->run(g);
+
+        EXPECT_TRUE(
+            op->getOutput()->equalData(vector<float>{0, 2, 3, 5, 6, 8}));
+    }
+    {
+        Runtime runtime = MklRuntimeObj::getInstance();
+        Graph g = make_ref<GraphObj>(runtime);
+        auto input = g->addTensor({2, 4, 2}, DataType::Float32);
+        auto index = g->addTensor({3, 1}, DataType::UInt32);
+        auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
+        // Allocate once, after the whole graph is built, then fill the inputs.
+        g->dataMalloc();
+        input->setData(IncrementalGenerator());
+        index->copyin(vector<uint32_t>{0, 3, 1});
+
+        runtime->run(g);
+
+        EXPECT_TRUE(op->getOutput()->equalData(
+            vector<float>{0, 1, 6, 7, 2, 3, 8, 9, 14, 15, 10, 11}));
+    }
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_matmul.cc
+++ b/test/kernels/intelcpu/test_mkl_matmul.cc
@ -2,7 +2,7 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include "core/runtime.h"
-#include "mkl/mkl_runtime.h"
+#include "intelcpu/mkl_runtime.h"
 #include "operators/matmul.h"

 #include "test.h"
@ -27,7 +27,6 @@ void testMatmulMkl(

    gCpu->dataMalloc();
    cpuRuntime->run(gCpu);
-    matmul->getOutput()->printData();
    EXPECT_TRUE(matmul->getOutput()->equalData(ansVec));
 }

--- a/test/kernels/intelcpu/test_mkl_pad.cc
+++ b/test/kernels/intelcpu/test_mkl_pad.cc
@ -0,0 +1,30 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/pad.h"
+#include "test.h"
+
+namespace infini {
+TEST(Pad, Mkl) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Graph: a {1, 2, 3, 2} input feeding a single PadObj.
+    Tensor input = graph->addTensor(Shape{1, 2, 3, 2}, DataType::Float32);
+    auto padOp = graph->addOp<PadObj>(input, nullptr, vector<int>{1, 0, 1, 1},
+                                      vector<int>{0, 3});
+    graph->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    // Expected layout: 18 zeros, then every pair of input values followed by
+    // one zero, then another 18 zeros.
+    const vector<float> expected{
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,
+        0, 1, 0, 2, 3, 0, 4, 5, 0, 6, 7, 0, 8, 9, 0, 10, 11, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0};
+
+    EXPECT_TRUE(padOp->getOutput()->equalData(expected));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_pooling.cc
+++ b/test/kernels/intelcpu/test_mkl_pooling.cc
@ -0,0 +1,47 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/pooling.h"
+#include "test.h"
+
+namespace infini {
+using KDPS = vector<int>;
+using ExpectOutput = vector<float>;
+
+// Run pooling op T on the MKL runtime and compare its output with `ansVec`.
+// `kdps` must hold exactly the eight integer arguments forwarded to T.
+template <class T>
+void testPoolMkl(const std::function<void(void *, size_t, DataType)> &generator,
+                 const Shape &shape, const KDPS &kdps,
+                 const ExpectOutput &ansVec) {
+    // Fatal check: a shorter `kdps` would be indexed out of bounds below.
+    ASSERT_EQ(kdps.size(), (size_t)8);
+    Runtime runtime = MklRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    Tensor i0 = g->addTensor(shape, DataType::Float32);
+    auto pool = g->addOp<T>(i0, nullptr, kdps[0], kdps[1], kdps[2], kdps[3],
+                            kdps[4], kdps[5], kdps[6], kdps[7]);
+    g->dataMalloc();
+    i0->setData(generator);
+
+    runtime->run(g);
+    EXPECT_TRUE(pool->getOutput()->equalData(ansVec));
+}
+
+TEST(mkl_MaxPool, run) {
+    const ExpectOutput expected{6,  8,  9,  16, 18, 19, 21, 23, 24,
+                                31, 33, 34, 41, 43, 44, 46, 48, 49};
+    testPoolMkl<MaxPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5},
+                            KDPS{3, 3, 1, 1, 1, 1, 2, 2}, expected);
+}
+
+TEST(mkl_AvgPool, run) {
+    const ExpectOutput expected{
+        1.333333,  3.0000,  2.666667,  7.0000,    12.0000,   9.0000,
+        8.0000,    13.0000, 9.333333,  12.44444,  19.666667, 13.777778,
+        23.666667, 37.0000, 25.666667, 19.111111, 29.666667, 20.444444};
+    testPoolMkl<AvgPoolObj>(IncrementalGenerator(), Shape{1, 2, 5, 5},
+                            KDPS{3, 3, 1, 1, 1, 1, 2, 2}, expected);
+}
+
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_reduce.cc
+++ b/test/kernels/intelcpu/test_mkl_reduce.cc
@ -0,0 +1,52 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/reduce_mean.h"
+
+#include "test.h"
+
+namespace infini {
+
+// Build a single ReduceMeanObj over a `shape` input on the MKL runtime, feed
+// it `data`, and check that the output holds `ExpectData`.
+void test_reducemean(const Shape &shape, const vector<float> &data,
+                     const optional<const vector<int>> &axis, bool keepDims,
+                     const vector<float> &ExpectData) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    Tensor input = graph->addTensor(shape, DataType::Float32);
+    auto meanOp = graph->addOp<ReduceMeanObj>(input, nullptr, axis, keepDims);
+    // Allocate the graph's tensors, then upload the input values.
+    graph->dataMalloc();
+    input->copyin(data);
+
+    // Execute
+    mklRuntime->run(graph);
+
+    auto output = meanOp->getOutput();
+    EXPECT_TRUE(output->equalData(ExpectData));
+}
+
+TEST(MKL_ReduceMean, run) {
+    // No axis list: the expected value is the mean of all twelve inputs.
+    const vector<float> small{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2};
+
+    test_reducemean(Shape{3, 2, 2}, small, std::nullopt, true,
+                    vector<float>{18.25});
+    test_reducemean(Shape{1, 3, 2, 2, 1}, small, std::nullopt, false,
+                    vector<float>{18.25});
+
+    // Axis list {1, 2} over the values 0..23, with and without keepDims.
+    const vector<float> ramp{0,  1,  2,  3,  4,  5,  6,  7,
+                             8,  9,  10, 11, 12, 13, 14, 15,
+                             16, 17, 18, 19, 20, 21, 22, 23};
+
+    test_reducemean(Shape{2, 3, 2, 2}, ramp, vector<int>{1, 2}, false,
+                    vector<float>{5, 6, 17, 18});
+    test_reducemean(Shape{2, 3, 2, 2, 1}, ramp, vector<int>{1, 2}, true,
+                    vector<float>{5, 6, 17, 18});
+}
+
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_reshape.cc
+++ b/test/kernels/intelcpu/test_mkl_reshape.cc
@ -0,0 +1,57 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/reshape.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Reshape, Mkl) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Reshape {2, 3, 3, 4} -> {3, 2, 4, 3}.
+    Tensor src = graph->addTensor({2, 3, 3, 4}, DataType::Float32);
+    auto reshapeOp = graph->addOp<ReshapeObj>(src, nullptr, Shape{3, 2, 4, 3});
+    graph->dataMalloc();
+    src->setData(IncrementalGenerator());
+    mklRuntime->run(graph);
+
+    // The cloned output is expected to hold the same data as the input.
+    auto outCopy = graph->cloneTensor(reshapeOp->getOutput(0));
+    EXPECT_TRUE(outCopy->equalData(src));
+}
+
+TEST(Flatten, Mkl) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // FlattenObj with axis argument 2 on a {2, 3, 3, 4} input.
+    Tensor src = graph->addTensor({2, 3, 3, 4}, DataType::Float32);
+    auto flattenOp = graph->addOp<FlattenObj>(src, nullptr, 2);
+    graph->dataMalloc();
+    src->setData(IncrementalGenerator());
+    mklRuntime->run(graph);
+
+    // The cloned output is expected to hold the same data as the input.
+    auto outCopy = graph->cloneTensor(flattenOp->getOutput(0));
+    EXPECT_TRUE(outCopy->equalData(src));
+}
+
+TEST(Identity, Mkl) {
+    Runtime runtime = MklRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    auto input = g->addTensor({2, 3, 3, 4}, DataType::Float32);
+    auto op = g->addOp<IdentityObj>(input, nullptr);
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    runtime->run(g);
+
+    // The cloned IdentityObj output must hold the same data as the input.
+    auto o = g->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(o->equalData(input));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_resize.cc
+++ b/test/kernels/intelcpu/test_mkl_resize.cc
@ -0,0 +1,30 @@
+#include "cmath"
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/resize.h"
+#include "test.h"
+namespace infini {
+TEST(Resize, Mkl_downsample_sizes_nearest) {
+    // Stage the input and the requested sizes on the native CPU runtime.
+    Runtime nativeRuntime = NativeCpuRuntimeObj::getInstance();
+    Graph gNative = make_ref<GraphObj>(nativeRuntime);
+    Tensor data = gNative->addTensor({1, 1, 2, 4}, DataType::Float32);
+    Tensor targetDims = gNative->addTensor({4}, DataType::UInt32);
+    gNative->dataMalloc();
+    data->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+    targetDims->copyin(vector<uint32_t>{1, 1, 1, 3});
+
+    // Clone both tensors into a graph on a fresh MKL runtime and resize there.
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+    Graph gMkl = make_ref<GraphObj>(mklRuntime);
+    auto resizeOp = gMkl->addOp<ResizeObj>(
+        gMkl->cloneTensor(data), nullptr, std::nullopt,
+        gMkl->cloneTensor(targetDims), nullptr, nullptr,
+        ResizeObj::EKeepAspectRatioPolicy::stretch,
+        ResizeObj::ENearestMode::ceil);
+    gMkl->dataMalloc();
+    mklRuntime->run(gMkl);
+    EXPECT_TRUE(resizeOp->getOutput(0)->equalData(vector<float>{5, 7, 8}));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_slice.cc
+++ b/test/kernels/intelcpu/test_mkl_slice.cc
@ -0,0 +1,26 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/slice.h"
+#include "test.h"
+
+namespace infini {
+TEST(MKL_Slice, run) {
+    Runtime mklRuntime = MklRuntimeObj::getInstance();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // One SliceObj over a {3, 2, 1, 5} input; the three int vectors and the
+    // trailing std::nullopt are handed to the operator as-is.
+    Tensor input = graph->addTensor(Shape{3, 2, 1, 5}, DataType::Float32);
+    auto sliceOp = graph->addOp<SliceObj>(
+        input, nullptr, vector<int>{1, 1}, vector<int>{1, 4},
+        vector<int>{0, 3}, std::nullopt);
+    graph->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    const vector<float> expected{11, 12, 13, 14, 16, 17, 18, 19};
+    EXPECT_TRUE(sliceOp->getOutput()->equalData(expected));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_softmax.cc
+++ b/test/kernels/intelcpu/test_mkl_softmax.cc
@ -0,0 +1,83 @@
+
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/softmax.h"
+#include "test.h"
+
+namespace infini {
+TEST(MklSoftmax, run) {
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Softmax with axis argument 1 on a {2, 4} tensor. The second row is the
+    // first row shifted by 10000; both rows expect the same probabilities.
+    Tensor logits = graph->addTensor(Shape{2, 4}, DataType::Float32);
+    auto softmaxOp = graph->addOp<SoftmaxObj>(logits, nullptr, 1);
+    graph->dataMalloc();
+    logits->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    mklRuntime->run(graph);
+
+    const vector<float> expected{
+        0.032058604, 0.08714432, 0.23688284, 0.6439143,
+        0.032058604, 0.08714432, 0.23688284, 0.6439143};
+    EXPECT_TRUE(softmaxOp->getOutput(0)->equalData(expected));
+}
+
+TEST(MklSoftmax, run_axis1) {
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Generator-filled {2, 2, 2, 2} input, softmax with axis argument 1.
+    Tensor logits = graph->addTensor(Shape{2, 2, 2, 2}, DataType::Float32);
+    auto softmaxOp = graph->addOp<SoftmaxObj>(logits, nullptr, 1);
+    graph->dataMalloc();
+    logits->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    const vector<float> expected{
+        0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
+        0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
+        0.9820138, 0.9820138, 0.9820138, 0.9820138};
+    EXPECT_TRUE(softmaxOp->getOutput(0)->equalData(expected));
+}
+
+TEST(MklSoftmax, run_axis2) {
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Generator-filled {2, 2, 2, 2} input, softmax with axis argument 2.
+    Tensor logits = graph->addTensor(Shape{2, 2, 2, 2}, DataType::Float32);
+    auto softmaxOp = graph->addOp<SoftmaxObj>(logits, nullptr, 2);
+    graph->dataMalloc();
+    logits->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    const vector<float> expected{
+        0.119203, 0.119203, 0.880797, 0.880797, 0.119203, 0.119203, 0.880797,
+        0.880797, 0.119203, 0.119203, 0.880797, 0.880797, 0.119203, 0.119203,
+        0.880797, 0.880797};
+    EXPECT_TRUE(softmaxOp->getOutput(0)->equalData(expected));
+}
+
+TEST(MklSoftmax, run_axis3) {
+    auto mklRuntime = make_ref<MklRuntimeObj>();
+    Graph graph = make_ref<GraphObj>(mklRuntime);
+
+    // Generator-filled {2, 2, 2, 2} input, softmax with axis argument 3.
+    Tensor logits = graph->addTensor(Shape{2, 2, 2, 2}, DataType::Float32);
+    auto softmaxOp = graph->addOp<SoftmaxObj>(logits, nullptr, 3);
+    graph->dataMalloc();
+    logits->setData(IncrementalGenerator());
+
+    mklRuntime->run(graph);
+
+    const vector<float> expected{
+        0.2689414, 0.7310585, 0.2689414, 0.7310585, 0.2689414, 0.7310585,
+        0.2689414, 0.7310585, 0.2689414, 0.7310585, 0.2689414, 0.7310585,
+        0.2689414, 0.7310585, 0.2689414, 0.7310585};
+    EXPECT_TRUE(softmaxOp->getOutput(0)->equalData(expected));
+}
+} // namespace infini
--- a/test/kernels/intelcpu/test_mkl_split.cc
+++ b/test/kernels/intelcpu/test_mkl_split.cc
@ -0,0 +1,33 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "intelcpu/mkl_runtime.h"
+#include "operators/split.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Split, Mkl) {
+    Runtime runtime = MklRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    // One SplitObj, built with arguments (1, 3), over a {2, 10, 2, 1} input.
+    auto input = g->addTensor({2, 10, 2, 1}, DataType::Float32);
+    auto op = g->addOp<SplitObj>(input, std::nullopt, 1, 3);
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+    runtime->run(g);
+
+    // Fatal on mismatch: the three outputs are fetched by index right below.
+    ASSERT_EQ(op->getOutputs().size(), (size_t)3);
+    auto o0 = g->cloneTensor(op->getOutput(0));
+    auto o1 = g->cloneTensor(op->getOutput(1));
+    auto o2 = g->cloneTensor(op->getOutput(2));
+    EXPECT_TRUE(
+        o0->equalData(vector<float>{0, 1, 2, 3, 4, 5, 20, 21, 22, 23, 24, 25}));
+    EXPECT_TRUE(o1->equalData(
+        vector<float>{6, 7, 8, 9, 10, 11, 26, 27, 28, 29, 30, 31}));
+    EXPECT_TRUE(o2->equalData(vector<float>{12, 13, 14, 15, 16, 17, 18, 19, 32,
+                                            33, 34, 35, 36, 37, 38, 39}));
+}
+
+} // namespace infini
--- a/test/operators/test_reshape.cc
+++ b/test/operators/test_reshape.cc
@ -21,8 +21,26 @@ TEST(Flatten, ShapeInference) {
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
-        auto op = g->addOp<FlattenObj>(i, nullptr);
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{72}));
+        auto op = g->addOp<FlattenObj>(i, nullptr, 1);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 36}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
+        auto op = g->addOp<FlattenObj>(i, nullptr, 0);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 72}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
+        auto op = g->addOp<FlattenObj>(i, nullptr, -1);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{18, 4}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
+        auto op = g->addOp<FlattenObj>(i, nullptr, -2);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{6, 12}));
    }
 }

--- a/test/script/env_lotus.sh
+++ b/test/script/env_lotus.sh
@ -1,5 +1,26 @@
+#!/bin/bash
+
 . /home/spack/spack/share/spack/setup-env.sh
-spack load cuda@11.0.2 cudnn@8.0.3.33-11.0 intel-oneapi-dnn@2022.1.0 intel-oneapi-mkl@2022.1.0 
+if [ "$#" -eq 0 ] || [ "$1" = "cuda" ]
+then
+    echo "Load CUDA environment."
+    spack load cuda@11.0.2 cudnn@8.0.3.33-11.0
    export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-9.4.0/gcc-9.4.0-st36klijpsnquihiy463hmedsyhoc3g6/bin/gcc
+elif [ "$1" = "intelcpu" ]
+then
+    echo "Load INTELCPU environment."
+    spack load intel-oneapi-dnn@2022.1.0 intel-oneapi-mkl@2022.1.0 intel-oneapi-compilers@2022.1.0
    # The default dnnl library is cpu_dpcpp_gpu_dpcpp which requires libsycl.so, after "spack load", and need to change to gomp explicitly.
    export LD_LIBRARY_PATH=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-12.1.0/intel-oneapi-dnn-2022.1.0-7rs6ht57zozyxhxx6s2qlrqzmqknhgzx/dnnl/2022.1.0/cpu_gomp/lib/:$LD_LIBRARY_PATH
+
+    #  dlopen of the MKL libs fails when they are used from Python.
+    #  According to "https://groups.google.com/g/kaldi-help/c/m3nyQke0HS0/m/4fj8gkSWAgAJ", it is recommended to use mkl_rt instead,
+    #  but mkl_rt does not support DPC++; see https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2023-0/using-the-single-dynamic-library.html
+    #  Preloading the missing libs works; see https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/mkl-fails-to-load/m-p/1155538
+
+    export MKLLIB_PATH=/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-12.1.0/intel-oneapi-mkl-2022.1.0-mf6te62fo6wxlo33jwwwgg5kljoagc6g/mkl/2022.1.0/
+    export LD_PRELOAD=$MKLLIB_PATH/lib/intel64/libmkl_def.so.2:$MKLLIB_PATH/lib/intel64/libmkl_avx2.so.2:$MKLLIB_PATH/lib/intel64/libmkl_core.so:$MKLLIB_PATH/lib/intel64/libmkl_intel_lp64.so:$MKLLIB_PATH/lib/intel64/libmkl_intel_thread.so:/home/spack/spack/opt/spack/linux-ubuntu22.04-broadwell/gcc-11.3.0/intel-oneapi-compilers-2022.1.0-qrq4a63scjip455bpxvl5ipgqbllwecj/compiler/2022.1.0/linux/compiler/lib/intel64_lin/libiomp5.so
+else
+    echo "Bad option. Please enter 'cuda' or 'intelcpu'. CUDA will be loaded by default if nothing specified." >&2
+    return 1 2>/dev/null || exit 1  # non-zero status whether sourced or executed
+fi