InfiniTensor/test/kernels/cuda/test_cuda_matmul.cc


#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/matmul.h"

#include "test.h"

namespace infini {
// Expected result values; the tests pass this to equalData() on the output
// tensor copied back to the CPU.
using ExpectOutput = vector<float>;

// Builds A and B on the CPU, runs a single Matmul on a CUDA runtime and checks
// the output, cloned back into the CPU graph, against ansVec.
//   generatorA / generatorB: handed to setData() to fill A and B
//   transA / transB:         forwarded to the MatmulObj operator
//   shapeA / shapeB:         shapes of the Float32 input tensors
//   ansVec:                  values the output is compared with via equalData()
void testMatmulCuda(
    const std::function<void(void *, size_t, DataType)> &generatorA,
    const std::function<void(void *, size_t, DataType)> &generatorB,
    bool transA, bool transB, const Shape &shapeA, const Shape &shapeB,
    const ExpectOutput &ansVec) {
    // Host graph: declare both operands, allocate them, then fill them.
    Graph hostGraph = make_ref<GraphObj>(NativeCpuRuntimeObj::getInstance());
    auto hostA = hostGraph->addTensor(shapeA, DataType::Float32);
    auto hostB = hostGraph->addTensor(shapeB, DataType::Float32);
    hostGraph->dataMalloc();
    hostA->setData(generatorA);
    hostB->setData(generatorB);

    // Device graph: clone the operands and append the Matmul operator.
    // nullptr is passed as the operator's third argument.
    auto deviceRuntime = make_ref<CudaRuntimeObj>();
    auto deviceGraph = make_ref<GraphObj>(deviceRuntime);
    auto deviceA = deviceGraph->cloneTensor(hostA);
    auto deviceB = deviceGraph->cloneTensor(hostB);
    auto product = deviceGraph->addOp<MatmulObj>(deviceA, deviceB, nullptr,
                                                 transA, transB);

    // Allocate CUDA memory for the graph, then execute it.
    deviceGraph->dataMalloc();
    deviceRuntime->run(deviceGraph);

    // Clone the operator output into the CPU graph and verify it there.
    // Debugging aids: hostC->printData(); deviceGraph->print();
    auto hostC = hostGraph->cloneTensor(product->getOutput());
    EXPECT_TRUE(hostC->equalData(ansVec));
}

// Checks the CUDA Matmul result for two fixed inputs via testMatmulCuda().
TEST(cuBLAS_Matmul, run) {
    {
        // Neither operand is transposed; B is filled by OneGenerator.
        const Shape lhsShape{1, 3, 5};
        const Shape rhsShape{1, 5, 2};
        const ExpectOutput expected{10, 10, 35, 35, 60, 60};
        testMatmulCuda(IncrementalGenerator(), OneGenerator(), false, false,
                       lhsShape, rhsShape, expected);
    }
    {
        // transA is set; both operands are filled by IncrementalGenerator.
        const Shape lhsShape{2, 3, 4};
        const Shape rhsShape{2, 3, 2};
        const ExpectOutput expected{40,  52,  46,  61,  52,  70,  58,  79,
                                    400, 448, 424, 475, 448, 502, 472, 529};
        testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true,
                       false, lhsShape, rhsShape, expected);
    }
}

// Runs a single Matmul on the CUDA runtime with the second argument of run()
// set to true (presumably enabling kernel tuning, as the test name suggests --
// confirm against CudaRuntimeObj::run).
// NOTE(review): the test has no assertions; it only checks that the run
// completes.
TEST(cuBLAS_Matmul, tune) {
    // Host graph: create and fill both Float32 operands.
    auto cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
    auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32);
    auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32);
    gCpu->dataMalloc();
    ACpu->setData(IncrementalGenerator());
    BCpu->setData(IncrementalGenerator());

    // Device graph: clone the operands and append the Matmul operator.
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    auto gCuda = make_ref<GraphObj>(cudaRuntime);
    auto ACuda = gCuda->cloneTensor(ACpu);
    auto BCuda = gCuda->cloneTensor(BCpu);
    // The returned operator handle is not needed here, so it is not bound to
    // a local.
    gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr);

    // allocate CUDA memory
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda, true);
}

}; // namespace infini
Fix CMake USE_CUDA (#36) * Fix: build lib without cuda * Chore: rename GBMM and G2BMM files * Fix: seperate CUDA tests from operator tests * Fix: CMake CMP0104 * Chore: fix typo * Chore: remove unused headers Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-21 12:28:00 +08:00
			`#include "core/graph.h"`
			`#include "core/kernel.h"`
			`#include "core/runtime.h"`
			`#include "cuda/cuda_runtime.h"`
			`#include "cuda/cuda_utility.h"`
			`#include "operators/matmul.h"`

			`#include "test.h"`

			`namespace infini {`
			`using ExpectOutput = vector<float>;`

			`void testMatmulCuda(`
			`const std::function<void(void *, size_t, DataType)> &generatorA,`
			`const std::function<void(void *, size_t, DataType)> &generatorB,`
			`bool transA, bool transB, const Shape &shapeA, const Shape &shapeB,`
			`const ExpectOutput &ansVec) {`
ADD: add mkl runtime for intel cpu , and add mkl kernel for matmul/conv/convtransposed. (#61) * move memory format transformation to TensorObj clang format add MemoryFormat for tensorObj. use post_ops for fused conv/deconv Distinguish mkl op_timer from cuda op timer. add act optype to conv and deconv add operator timer add mkl kernel for convTransposed minor fix for group conv do not use cblas_sgemm_batch CpuRuntimeObj->NativeCpuRuntimeObj add matmul op for mkl * fix: fix bugs when rebasing from master fix: fix bugs when rebasing from master * fix: update api after rebasing * fix: fix format; fix onnx import * fix: fix clang-format * [fix] fix conv_transpose test * [fix] use stronger test case for transposed conv * [fix] remove tensor memory format; fix mkl transpose conv * [fix] add FIXME tag for op_timer python api --------- Co-authored-by: whjthu <haojie0429@gmail.com> 2023-03-27 21:28:49 +08:00			`auto cpuRuntime = NativeCpuRuntimeObj::getInstance();`
Fix CMake USE_CUDA (#36) * Fix: build lib without cuda * Chore: rename GBMM and G2BMM files * Fix: seperate CUDA tests from operator tests * Fix: CMake CMP0104 * Chore: fix typo * Chore: remove unused headers Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-21 12:28:00 +08:00			`Graph gCpu = make_ref<GraphObj>(cpuRuntime);`
			`auto ACpu = gCpu->addTensor(shapeA, DataType::Float32);`
			`auto BCpu = gCpu->addTensor(shapeB, DataType::Float32);`
			`gCpu->dataMalloc();`
			`ACpu->setData(generatorA);`
			`BCpu->setData(generatorB);`

			`auto cudaRuntime = make_ref<CudaRuntimeObj>();`
			`auto gCuda = make_ref<GraphObj>(cudaRuntime);`
			`auto ACuda = gCuda->cloneTensor(ACpu);`
			`auto BCuda = gCuda->cloneTensor(BCpu);`
			`auto matmul =`
			`gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr, transA, transB);`

			`// allocate CUDA memory`
			`gCuda->dataMalloc();`
			`cudaRuntime->run(gCuda);`

			`auto CCpu = gCpu->cloneTensor(matmul->getOutput());`
			`// CCpu->printData();`
			`// check results on CPU`
			`EXPECT_TRUE(CCpu->equalData(ansVec));`
			`// print a tensor/operator/graph by print()`
			`// gCuda->print();`
			`}`

			`TEST(cuBLAS_Matmul, run) {`
			`testMatmulCuda(IncrementalGenerator(), OneGenerator(), false, false,`
			`Shape{1, 3, 5}, Shape{1, 5, 2},`
			`ExpectOutput{10, 10, 35, 35, 60, 60});`
			`testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true, false,`
			`Shape{2, 3, 4}, Shape{2, 3, 2},`
			`ExpectOutput{40, 52, 46, 61, 52, 70, 58, 79, 400, 448, 424,`
			`475, 448, 502, 472, 529});`
			`}`

			`TEST(cuBLAS_Matmul, tune) {`
ADD: add mkl runtime for intel cpu , and add mkl kernel for matmul/conv/convtransposed. (#61) * move memory format transformation to TensorObj clang format add MemoryFormat for tensorObj. use post_ops for fused conv/deconv Distinguish mkl op_timer from cuda op timer. add act optype to conv and deconv add operator timer add mkl kernel for convTransposed minor fix for group conv do not use cblas_sgemm_batch CpuRuntimeObj->NativeCpuRuntimeObj add matmul op for mkl * fix: fix bugs when rebasing from master fix: fix bugs when rebasing from master * fix: update api after rebasing * fix: fix format; fix onnx import * fix: fix clang-format * [fix] fix conv_transpose test * [fix] use stronger test case for transposed conv * [fix] remove tensor memory format; fix mkl transpose conv * [fix] add FIXME tag for op_timer python api --------- Co-authored-by: whjthu <haojie0429@gmail.com> 2023-03-27 21:28:49 +08:00			`auto cpuRuntime = NativeCpuRuntimeObj::getInstance();`
Fix CMake USE_CUDA (#36) * Fix: build lib without cuda * Chore: rename GBMM and G2BMM files * Fix: seperate CUDA tests from operator tests * Fix: CMake CMP0104 * Chore: fix typo * Chore: remove unused headers Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-21 12:28:00 +08:00			`Graph gCpu = make_ref<GraphObj>(cpuRuntime);`
			`auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32);`
			`auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32);`
			`gCpu->dataMalloc();`
			`ACpu->setData(IncrementalGenerator());`
			`BCpu->setData(IncrementalGenerator());`

			`auto cudaRuntime = make_ref<CudaRuntimeObj>();`
			`auto gCuda = make_ref<GraphObj>(cudaRuntime);`
			`auto ACuda = gCuda->cloneTensor(ACpu);`
			`auto BCuda = gCuda->cloneTensor(BCpu);`
			`auto matmul = gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr);`

			`// allocate CUDA memory`
			`gCuda->dataMalloc();`
			`cudaRuntime->run(gCuda, true);`
			`}`

ADD: batch norm operator and cuda kernel. (#44) fix numInputs of batchNorm, add new line in file ending. ADD: batch norm operator and cuda kernel. add training remove comments. fix compile error. add batch norm operator and cuda kernel. 2022-10-15 16:29:28 +08:00			`}; // namespace infini`