diff --git a/include/operators/matmul.h b/include/operators/matmul.h
index 40a9af9f..bacdea69 100644
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@@ -44,6 +44,7 @@ class MatmulObj : public OperatorObj {
     int getM() const { return m; }
     int getN() const { return n; }
     int getK() const { return k; }
+    auto getBMNK() const { return tuple{b, m, n, k}; }
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc
new file mode 100644
index 00000000..5ef55696
--- /dev/null
+++ b/src/kernels/cuda/matmul.cc
@@ -0,0 +1,92 @@
+#include "operators/matmul.h"
+#include "core/kernel.h"
+#include "cuda/cuda_runtime.h"
+#include <chrono>
+#include <functional>
+
+namespace infini {
+struct MatmulCudnnPerfRecord : public PerfRecord {
+    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+};
+constexpr int N_ALGO = 24;
+constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = {
+    CUBLAS_GEMM_ALGO0,  CUBLAS_GEMM_ALGO1,  CUBLAS_GEMM_ALGO2,
+    CUBLAS_GEMM_ALGO3,  CUBLAS_GEMM_ALGO4,  CUBLAS_GEMM_ALGO5,
+    CUBLAS_GEMM_ALGO6,  CUBLAS_GEMM_ALGO7,  CUBLAS_GEMM_ALGO8,
+    CUBLAS_GEMM_ALGO9,  CUBLAS_GEMM_ALGO10, CUBLAS_GEMM_ALGO11,
+    CUBLAS_GEMM_ALGO12, CUBLAS_GEMM_ALGO13, CUBLAS_GEMM_ALGO14,
+    CUBLAS_GEMM_ALGO15, CUBLAS_GEMM_ALGO16, CUBLAS_GEMM_ALGO17,
+    CUBLAS_GEMM_ALGO18, CUBLAS_GEMM_ALGO19, CUBLAS_GEMM_ALGO20,
+    CUBLAS_GEMM_ALGO21, CUBLAS_GEMM_ALGO22, CUBLAS_GEMM_ALGO23,
+};
+
+class matmulCublas : public Kernel {
+    bool do_compute(const Operator &_op, const PerfRecord &_record,
+                    const RuntimeObj *_context) const {
+        auto op = as<MatmulObj>(_op);
+        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
+        void *const inAData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const inBData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const outData = (op->getOutput()->getRawDataPtr<void *>());
+        auto record = dynamic_cast<const MatmulCudnnPerfRecord &>(_record);
+
+        const auto [b, m, n, k] = op->getBMNK();
+        auto opA =
+            op->getTransA() ? CUBLAS_OP_T : CUBLAS_OP_N; // BLAS_N = col major
+        auto opB = op->getTransB() ? CUBLAS_OP_T : CUBLAS_OP_N;
+        const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n,
+                  ldc = n;
+        const float alpha = 1.f, beta = 0.f;
+        // TODO:use compute type
+        cublasStatus_t stat;
+        if (b > 1) {
+            stat = cublasGemmStridedBatchedEx(
+                context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
+                CUDA_R_32F, ldb, k * n, inAData, CUDA_R_32F, lda, m * k, &beta,
+                outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F, record.algo);
+        } else {
+            stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k,
+                                &alpha, inBData, CUDA_R_32F, ldb, inAData,
+                                CUDA_R_32F, lda, &beta, outData, CUDA_R_32F,
+                                ldc, CUDA_R_32F, record.algo);
+        }
+        return (stat == CUBLAS_STATUS_SUCCESS);
+    }
+
+    void compute(const Operator &_op, const PerfRecord &_record,
+                 const RuntimeObj *_context) const override {
+        IT_ASSERT(do_compute(_op, _record, _context));
+    }
+
+    void compute(const Operator &op, const RuntimeObj *context) const override {
+        MatmulCudnnPerfRecord record; // use default record;
+        compute(op, record, context);
+    }
+
+    PerfRecord tune(const Operator &_op,
+                    const RuntimeObj *_context) const override {
+        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
+        auto op = as<MatmulObj>(_op);
+        MatmulCudnnPerfRecord ret;
+        ret.time = std::numeric_limits<double>::max();
+        for (int i = 0; i < N_ALGO; i++) {
+            MatmulCudnnPerfRecord rcd;
+            rcd.algo = ALGOS[i];
+            if (!do_compute(_op, rcd, _context))
+                continue;
+            rcd.time = timeit([&]() { do_compute(_op, rcd, _context); },
+                              [&]() { context->sync(); });
+            if (rcd.time < ret.time)
+                ret = rcd;
+        }
+        IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
+                                                                 "algorithm "
+                                                                 "found");
+        return ret;
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Matmul, DataType::Float32, matmulCublas,
+                "Matmul_cuBLAS_CUDA_Float32");
+
+}; // namespace infini
\ No newline at end of file
diff --git a/test/operators/test_matmul.cc b/test/operators/test_matmul.cc
new file mode 100644
index 00000000..97df77c2
--- /dev/null
+++ b/test/operators/test_matmul.cc
@@ -0,0 +1,95 @@
+
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
+#include "operators/matmul.h"
+
+#include "test.h"
+
+namespace infini {
+using ExpectOutput = vector<float>;
+
+TEST(Matmul, ShapeInference) {
+    auto runtime = CpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        auto A = g->addTensor(Shape{1, 3, 5});
+        auto B = g->addTensor(Shape{1, 5, 2});
+        auto matmul = g->addOp<MatmulObj>(A, B, nullptr);
+        auto C = matmul->getOutputs()[0];
+        EXPECT_EQ(C->getDims(), (Shape{1, 3, 2}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        auto A = g->addTensor(Shape{3, 5, 4});
+        auto B = g->addTensor(Shape{3, 5, 2});
+        auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, false);
+        auto C = matmul->getOutputs()[0];
+        EXPECT_EQ(C->getDims(), (Shape{3, 4, 2}));
+    }
+}
+void testMatmulCuda(
+    const std::function<void(void *, size_t, DataType)> &generatorA,
+    const std::function<void(void *, size_t, DataType)> &generatorB,
+    bool transA, bool transB, const Shape &shapeA, const Shape &shapeB,
+    const ExpectOutput &ansVec) {
+    auto cpuRuntime = CpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
+    auto ACpu = gCpu->addTensor(shapeA, DataType::Float32);
+    auto BCpu = gCpu->addTensor(shapeB, DataType::Float32);
+    gCpu->dataMalloc();
+    ACpu->setData(generatorA);
+    BCpu->setData(generatorB);
+
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    auto gCuda = make_ref<GraphObj>(cudaRuntime);
+    auto ACuda = gCuda->cloneTensor(ACpu);
+    auto BCuda = gCuda->cloneTensor(BCpu);
+    auto matmul =
+        gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr, transA, transB);
+
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    cudaRuntime->run(gCuda);
+
+    auto CCpu = gCpu->cloneTensor(matmul->getOutput());
+    // CCpu->printData();
+    //  check results on CPU
+    EXPECT_TRUE(CCpu->equalData(ansVec));
+    // print a tensor/operator/graph by print()
+    // gCuda->print();
+}
+
+TEST(Matmul, cuBlas) {
+    testMatmulCuda(IncrementalGenerator(), OneGenerator(), false, false,
+                   Shape{1, 3, 5}, Shape{1, 5, 2},
+                   ExpectOutput{10, 10, 35, 35, 60, 60});
+    testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true, false,
+                   Shape{2, 3, 4}, Shape{2, 3, 2},
+                   ExpectOutput{40, 52, 46, 61, 52, 70, 58, 79, 400, 448, 424,
+                                475, 448, 502, 472, 529});
+}
+
+TEST(Matmul, tune) {
+    auto cpuRuntime = CpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
+    auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32);
+    auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32);
+    gCpu->dataMalloc();
+    ACpu->setData(IncrementalGenerator());
+    BCpu->setData(IncrementalGenerator());
+
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    auto gCuda = make_ref<GraphObj>(cudaRuntime);
+    auto ACuda = gCuda->cloneTensor(ACpu);
+    auto BCuda = gCuda->cloneTensor(BCpu);
+    auto matmul = gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr);
+
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    cudaRuntime->run(gCuda, true);
+}
+
+}; // namespace infini
\ No newline at end of file