diff --git a/include/operators/matmul.h b/include/operators/matmul.h index 40a9af9f..bacdea69 100644 --- a/include/operators/matmul.h +++ b/include/operators/matmul.h @@ -44,6 +44,7 @@ class MatmulObj : public OperatorObj { int getM() const { return m; } int getN() const { return n; } int getK() const { return k; } + auto getBMNK() const { return tuple{b, m, n, k}; } private: vector getWorkloadVector() const override; diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc new file mode 100644 index 00000000..5ef55696 --- /dev/null +++ b/src/kernels/cuda/matmul.cc @@ -0,0 +1,92 @@ +#include "operators/matmul.h" +#include "core/kernel.h" +#include "cuda/cuda_runtime.h" +#include +#include + +namespace infini { +struct MatmulCudnnPerfRecord : public PerfRecord { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; +}; +constexpr int N_ALGO = 24; +constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = { + CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2, + CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5, + CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7, CUBLAS_GEMM_ALGO8, + CUBLAS_GEMM_ALGO9, CUBLAS_GEMM_ALGO10, CUBLAS_GEMM_ALGO11, + CUBLAS_GEMM_ALGO12, CUBLAS_GEMM_ALGO13, CUBLAS_GEMM_ALGO14, + CUBLAS_GEMM_ALGO15, CUBLAS_GEMM_ALGO16, CUBLAS_GEMM_ALGO17, + CUBLAS_GEMM_ALGO18, CUBLAS_GEMM_ALGO19, CUBLAS_GEMM_ALGO20, + CUBLAS_GEMM_ALGO21, CUBLAS_GEMM_ALGO22, CUBLAS_GEMM_ALGO23, +}; + +class matmulCublas : public Kernel { + bool do_compute(const Operator &_op, const PerfRecord &_record, + const RuntimeObj *_context) const { + auto op = as(_op); + auto context = dynamic_cast(_context); + void *const inAData = (op->getInputs(0)->getRawDataPtr()); + void *const inBData = (op->getInputs(1)->getRawDataPtr()); + void *const outData = (op->getOutput()->getRawDataPtr()); + auto record = dynamic_cast(_record); + + const auto [b, m, n, k] = op->getBMNK(); + auto opA = + op->getTransA() ? CUBLAS_OP_T : CUBLAS_OP_N; // BLAS_N = col major + auto opB = op->getTransB() ? CUBLAS_OP_T : CUBLAS_OP_N; + const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n, + ldc = n; + const float alpha = 1.f, beta = 0.f; + // TODO:use compute type + cublasStatus_t stat; + if (b > 1) { + stat = cublasGemmStridedBatchedEx( + context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData, + CUDA_R_32F, ldb, k * n, inAData, CUDA_R_32F, lda, m * k, &beta, + outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F, record.algo); + } else { + stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k, + &alpha, inBData, CUDA_R_32F, ldb, inAData, + CUDA_R_32F, lda, &beta, outData, CUDA_R_32F, + ldc, CUDA_R_32F, record.algo); + } + return (stat == CUBLAS_STATUS_SUCCESS); + } + + void compute(const Operator &_op, const PerfRecord &_record, + const RuntimeObj *_context) const override { + IT_ASSERT(do_compute(_op, _record, _context)); + } + + void compute(const Operator &op, const RuntimeObj *context) const override { + MatmulCudnnPerfRecord record; // use default record; + compute(op, record, context); + } + + PerfRecord tune(const Operator &_op, + const RuntimeObj *_context) const override { + auto context = dynamic_cast(_context); + auto op = as(_op); + MatmulCudnnPerfRecord ret; + ret.time = std::numeric_limits::max(); + for (int i = 0; i < N_ALGO; i++) { + MatmulCudnnPerfRecord rcd; + rcd.algo = ALGOS[i]; + if (!do_compute(_op, rcd, _context)) + continue; + rcd.time = timeit([&]() { do_compute(_op, rcd, _context); }, + [&]() { context->sync(); }); + if (rcd.time < ret.time) + ret = rcd; + } + IT_ASSERT(ret.time < std::numeric_limits::max(), "No valid " + "algorithm " + "found"); + return ret; + } +}; + +REGISTER_KERNEL(Device::CUDA, OpType::Matmul, DataType::Float32, matmulCublas, + "Matmul_cuBLAS_CUDA_Float32"); + +}; // namespace infini \ No newline at end of file diff --git a/test/operators/test_matmul.cc b/test/operators/test_matmul.cc new file mode 100644 index 00000000..97df77c2 --- /dev/null +++ b/test/operators/test_matmul.cc @@ -0,0 +1,95 @@ + +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "cuda/cuda_runtime.h" +#include "cuda/cuda_utility.h" +#include "operators/matmul.h" + +#include "test.h" + +namespace infini { +using ExpectOutput = vector; + +TEST(Matmul, ShapeInference) { + auto runtime = CpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{1, 3, 5}); + auto B = g->addTensor(Shape{1, 5, 2}); + auto matmul = g->addOp(A, B, nullptr); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{1, 3, 2})); + } + { + Graph g = make_ref(runtime); + auto A = g->addTensor(Shape{3, 5, 4}); + auto B = g->addTensor(Shape{3, 5, 2}); + auto matmul = g->addOp(A, B, nullptr, true, false); + auto C = matmul->getOutputs()[0]; + EXPECT_EQ(C->getDims(), (Shape{3, 4, 2})); + } +} +void testMatmulCuda( + const std::function &generatorA, + const std::function &generatorB, + bool transA, bool transB, const Shape &shapeA, const Shape &shapeB, + const ExpectOutput &ansVec) { + auto cpuRuntime = CpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(cpuRuntime); + auto ACpu = gCpu->addTensor(shapeA, DataType::Float32); + auto BCpu = gCpu->addTensor(shapeB, DataType::Float32); + gCpu->dataMalloc(); + ACpu->setData(generatorA); + BCpu->setData(generatorB); + + auto cudaRuntime = make_ref(); + auto gCuda = make_ref(cudaRuntime); + auto ACuda = gCuda->cloneTensor(ACpu); + auto BCuda = gCuda->cloneTensor(BCpu); + auto matmul = + gCuda->addOp(ACuda, BCuda, nullptr, transA, transB); + + // allocate CUDA memory + gCuda->dataMalloc(); + cudaRuntime->run(gCuda); + + auto CCpu = gCpu->cloneTensor(matmul->getOutput()); + // CCpu->printData(); + // check results on CPU + EXPECT_TRUE(CCpu->equalData(ansVec)); + // print a tensor/operator/graph by print() + // gCuda->print(); +} + +TEST(Matmul, cuBlas) { + testMatmulCuda(IncrementalGenerator(), OneGenerator(), false, false, + Shape{1, 3, 5}, Shape{1, 5, 2}, + ExpectOutput{10, 10, 35, 35, 60, 60}); + testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true, false, + Shape{2, 3, 4}, Shape{2, 3, 2}, + ExpectOutput{40, 52, 46, 61, 52, 70, 58, 79, 400, 448, 424, + 475, 448, 502, 472, 529}); +} + +TEST(Matmul, tune) { + auto cpuRuntime = CpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(cpuRuntime); + auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32); + auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32); + gCpu->dataMalloc(); + ACpu->setData(IncrementalGenerator()); + BCpu->setData(IncrementalGenerator()); + + auto cudaRuntime = make_ref(); + auto gCuda = make_ref(cudaRuntime); + auto ACuda = gCuda->cloneTensor(ACpu); + auto BCuda = gCuda->cloneTensor(BCpu); + auto matmul = gCuda->addOp(ACuda, BCuda, nullptr); + + // allocate CUDA memory + gCuda->dataMalloc(); + cudaRuntime->run(gCuda, true); +} + +}; // namespace infini \ No newline at end of file