InfiniTensor/test/kernels/cuda/test_cuda_GBMM.cc

#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/GBMM.h"
#include "test.h"

namespace infini {
using ExpectOutput = vector<float>;

// Builds a GBMM operator on a CUDA graph from two inputs that are filled on
// the CPU, checks the dims of the operator's output tensor, then allocates
// the CUDA graph's data and runs it. Only the output shape is asserted; the
// computed values are not compared against a reference.
TEST(CUDA_GBMM, ShapeInference) {
    // Problem sizes. `d` is forwarded as the last GBMMObj argument
    // (presumably the dilation -- TODO confirm against operators/GBMM.h).
    const int bs = 1;
    const int seqlen = 10000;
    const int w = 1000;
    const int featlen = 512;
    const int heads = 8;
    const int d = 4;
    const int hidden = featlen;
    const int hiddenPerHead = hidden / heads;

    // Shapes shared by the input tensors and the expected output.
    const int batch = bs * heads;
    const Shape bandShape{batch, seqlen, w * 2 + 1};
    const Shape featShape{batch, seqlen, hiddenPerHead};

    // Create and fill both inputs on the native CPU runtime.
    auto hostRuntime = NativeCpuRuntimeObj::getInstance();
    Graph hostGraph = make_ref<GraphObj>(hostRuntime);
    auto hostA = hostGraph->addTensor(bandShape, DataType::Float32);
    auto hostB = hostGraph->addTensor(featShape, DataType::Float32);
    hostGraph->dataMalloc();
    hostA->setData(IncrementalGenerator());
    hostB->setData(IncrementalGenerator());

    // Clone the inputs into a CUDA graph and attach the operator; passing
    // nullptr as the output leaves it to addOp to provide one.
    auto deviceRuntime = make_ref<CudaRuntimeObj>();
    auto deviceGraph = make_ref<GraphObj>(deviceRuntime);
    auto deviceA = deviceGraph->cloneTensor(hostA);
    auto deviceB = deviceGraph->cloneTensor(hostB);
    auto gbmmOp = deviceGraph->addOp<GBMMObj>(deviceA, deviceB, nullptr, d);

    // The output is expected to have the same dims as the second input.
    EXPECT_EQ(gbmmOp->getOutput()->getDims(), featShape);

    // Allocate device data and execute the graph on the CUDA runtime.
    deviceGraph->dataMalloc();
    deviceRuntime->run(deviceGraph);
}

} // namespace infini
Operators g2bmm&gbmm transplantation (#24) * Function tune and corresponding testcase. Add: Tune function in /src/kernel/cuda/conv.cc and corresponding testcase in test_conv. Fix: A little bug of perfRecord using in /src/core/runtime.cc. * Tune part debug Add: recover the code, fixed the commit error. Add: some annotations in tune function * clang format test * Fix: mem leak in CUDA Runtime and Conv * Fix: sync in conv and default sync in timeit * Change the way to tune operator conv. Timeit function cudNNUnfused -> Timeit function cudnnConvolutionForward. * Change: merge the common part of cudnnunfused&tune into cudnndescriptoraccess * clang test * clang-format * clang-format bash. * Added operator G2BMM and corresponding testcase. Added files related to operator G2BMM creating&calling. Added custom_ops.cuh&custom_op.h. * Add operator GBMML * new version * Fix: G2BMM and GBMM kernel bugs * Added testcase of operator GBMML * clang format * Added cmake option REQUIRE_GCC9 * Delete redundant file * Renamed class GBMML into GBMM * clang format * Reviewed. * Added cudahostcompiler option. * Add: explicit CMAKE_CUDA_HOST_COMPILER * Rename gbmm kernel * Fix: nvcc warning in GBMM and G2BMM Co-authored-by: wcz112 <wcz19@mails.tsinghua.edu.cn> Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-08 21:31:35 +08:00			`#include "core/graph.h"`
			`#include "core/kernel.h"`
			`#include "core/runtime.h"`
			`#include "cuda/cuda_runtime.h"`
			`#include "cuda/cuda_utility.h"`
			`#include "operators/GBMM.h"`
			`#include "test.h"`

			`namespace infini {`
			`using ExpectOutput = vector<float>;`

Fix CMake USE_CUDA (#36) * Fix: build lib without cuda * Chore: rename GBMM and G2BMM files * Fix: separate CUDA tests from operator tests * Fix: CMake CMP0104 * Chore: fix typo * Chore: remove unused headers Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-21 12:28:00 +08:00			`TEST(CUDA_GBMM, ShapeInference) {`
Operators g2bmm&gbmm transplantation (#24) * Function tune and corresponding testcase. Add: Tune function in /src/kernel/cuda/conv.cc and corresponding testcase in test_conv. Fix: A little bug of perfRecord using in /src/core/runtime.cc. * Tune part debug Add: recover the code, fixed the commit error. Add: some annotations in tune function * clang format test * Fix: mem leak in CUDA Runtime and Conv * Fix: sync in conv and default sync in timeit * Change the way to tune operator conv. Timeit function cudNNUnfused -> Timeit function cudnnConvolutionForward. * Change: merge the common part of cudnnunfused&tune into cudnndescriptoraccess * clang test * clang-format * clang-format bash. * Added operator G2BMM and corresponding testcase. Added files related to operator G2BMM creating&calling. Added custom_ops.cuh&custom_op.h. * Add operator GBMML * new version * Fix: G2BMM and GBMM kernel bugs * Added testcase of operator GBMML * clang format * Added cmake option REQUIRE_GCC9 * Delete redundant file * Renamed class GBMML into GBMM * clang format * Reviewed. * Added cudahostcompiler option. * Add: explicit CMAKE_CUDA_HOST_COMPILER * Rename gbmm kernel * Fix: nvcc warning in GBMM and G2BMM Co-authored-by: wcz112 <wcz19@mails.tsinghua.edu.cn> Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-08 21:31:35 +08:00			`const int bs = 1, seqlen = 10000, w = 1000, featlen = 512, heads = 8, d = 4;`
			`const int hidden = featlen, hiddenPerHead = hidden / heads;`
ADD: add mkl runtime for intel cpu , and add mkl kernel for matmul/conv/convtransposed. (#61) * move memory format transformation to TensorObj clang format add MemoryFormat for tensorObj. use post_ops for fused conv/deconv Distinguish mkl op_timer from cuda op timer. add act optype to conv and deconv add operator timer add mkl kernel for convTransposed minor fix for group conv do not use cblas_sgemm_batch CpuRuntimeObj->NativeCpuRuntimeObj add matmul op for mkl * fix: fix bugs when rebasing from master fix: fix bugs when rebasing from master * fix: update api after rebasing * fix: fix format; fix onnx import * fix: fix clang-format * [fix] fix conv_transpose test * [fix] use stronger test case for transposed conv * [fix] remove tensor memory format; fix mkl transpose conv * [fix] add FIXME tag for op_timer python api --------- Co-authored-by: whjthu <haojie0429@gmail.com> 2023-03-27 21:28:49 +08:00			`auto cpuRuntime = NativeCpuRuntimeObj::getInstance();`
Operators g2bmm&gbmm transplantation (#24) * Function tune and corresponding testcase. Add: Tune function in /src/kernel/cuda/conv.cc and corresponding testcase in test_conv. Fix: A little bug of perfRecord using in /src/core/runtime.cc. * Tune part debug Add: recover the code, fixed the commit error. Add: some annotations in tune function * clang format test * Fix: mem leak in CUDA Runtime and Conv * Fix: sync in conv and default sync in timeit * Change the way to tune operator conv. Timeit function cudNNUnfused -> Timeit function cudnnConvolutionForward. * Change: merge the common part of cudnnunfused&tune into cudnndescriptoraccess * clang test * clang-format * clang-format bash. * Added operator G2BMM and corresponding testcase. Added files related to operator G2BMM creating&calling. Added custom_ops.cuh&custom_op.h. * Add operator GBMML * new version * Fix: G2BMM and GBMM kernel bugs * Added testcase of operator GBMML * clang format * Added cmake option REQUIRE_GCC9 * Delete redundant file * Renamed class GBMML into GBMM * clang format * Reviewed. * Added cudahostcompiler option. * Add: explicit CMAKE_CUDA_HOST_COMPILER * Rename gbmm kernel * Fix: nvcc warning in GBMM and G2BMM Co-authored-by: wcz112 <wcz19@mails.tsinghua.edu.cn> Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com> 2022-09-08 21:31:35 +08:00			`Graph gCpu = make_ref<GraphObj>(cpuRuntime);`
			`auto ACpu = gCpu->addTensor(Shape{bs * heads, seqlen, w * 2 + 1},`
			`DataType::Float32);`
			`auto BCpu = gCpu->addTensor(Shape{bs * heads, seqlen, hiddenPerHead},`
			`DataType::Float32);`
			`gCpu->dataMalloc();`
			`ACpu->setData(IncrementalGenerator());`
			`BCpu->setData(IncrementalGenerator());`

			`auto cudaRuntime = make_ref<CudaRuntimeObj>();`
			`auto gCuda = make_ref<GraphObj>(cudaRuntime);`
			`auto ACuda = gCuda->cloneTensor(ACpu);`
			`auto BCuda = gCuda->cloneTensor(BCpu);`
			`auto GBMM = gCuda->addOp<GBMMObj>(ACuda, BCuda, nullptr, d);`
			`EXPECT_EQ(GBMM->getOutput()->getDims(),`
			`(Shape{bs * heads, seqlen, hiddenPerHead}));`

			`gCuda->dataMalloc();`
			`cudaRuntime->run(gCuda);`
			`}`

ADD: batch norm operator and cuda kernel. (#44) fix numInputs of batchNorm, add new line in file ending. ADD: batch norm operator and cuda kernel. add training remove comments. fix compile error. add batch norm operator and cuda kernel. 2022-10-15 16:29:28 +08:00			`} // namespace infini`