InfiniTensor/test/kernels/cuda/test_cuda_conv_fp16.cc

#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include <bitset>

#include "test.h"

namespace infini {

/**
 * Runs one Float16 convolution on the CUDA runtime and compares the result
 * with a list of expected values.
 *
 * A {1, 3, 4, 4} input and a {2, 3, 3, 3} weight tensor (both Float16) are
 * filled with `generator`, cloned into a CUDA graph, convolved, and the
 * output is cloned back into the CPU graph for the comparison.
 *
 * @param generator Callback used to fill the input and weight tensors.
 * @param ansVec    Expected output values, checked via Tensor::equalData.
 */
void testConvCudnnFP16(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {

    // One runtime + graph pair per device. The CPU runtime is obtained from
    // its shared instance; the CUDA runtime is created fresh for this test.
    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
    Graph hostGraph = make_ref<GraphObj>(hostRuntime);
    Runtime deviceRuntime = make_ref<CudaRuntimeObj>();
    Graph deviceGraph = make_ref<GraphObj>(deviceRuntime);

    // Declare the operands in the CPU graph, allocate them, then fill them.
    Tensor hostInput = hostGraph->addTensor({1, 3, 4, 4}, DataType::Float16);
    Tensor hostWeight = hostGraph->addTensor({2, 3, 3, 3}, DataType::Float16);
    hostGraph->dataMalloc();
    hostInput->setData(generator);
    hostWeight->setData(generator);

    // Mirror both operands into the CUDA graph.
    Tensor deviceInput = deviceGraph->cloneTensor(hostInput);
    Tensor deviceWeight = deviceGraph->cloneTensor(hostWeight);

    // Add the convolution; passing nullptr as the output lets the graph
    // create it. NOTE(review): the six integers are presumably padding,
    // stride and dilation (h, w each) -- confirm against ConvObj's ctor.
    auto convOp = deviceGraph->addOp<ConvObj>(deviceInput, deviceWeight,
                                              nullptr, 1, 1, 2, 1, 1, 2);

    // Allocate the CUDA graph's tensors, then fill the operands once more
    // now that the allocation has happened.
    deviceGraph->dataMalloc();
    deviceInput->setData(generator);
    deviceWeight->setData(generator);

    // Run the graph on the CUDA runtime.
    deviceRuntime->run(deviceGraph);

    // Bring the result into the CPU graph and verify it there.
    auto hostOutput = hostGraph->cloneTensor(convOp->getOutput());
    EXPECT_TRUE(hostOutput->equalData(ansVec));

    // Dump the CUDA graph for inspection.
    deviceGraph->print();
}

// Correctness check: incrementally generated inputs must produce the eight
// reference output values listed below.
TEST(cuDNN_Conv_FP16, run) {
    const vector<float> expectedOutput{48, 48, 72, 72, 48, 48, 72, 72};
    testConvCudnnFP16(IncrementalGenerator(), expectedOutput);
}

// Tuning smoke test: builds a Float16 convolution on a {1, 3, 224, 224}
// input and runs the CUDA graph with tuning enabled. No output values are
// checked here; the test only exercises the run-with-tune path.
TEST(cuDNN_Conv_FP16, tune) {
    // One runtime + graph pair per device. The CPU runtime is obtained from
    // its shared instance; the CUDA runtime is created fresh for this test.
    Runtime hostRuntime = NativeCpuRuntimeObj::getInstance();
    Graph hostGraph = make_ref<GraphObj>(hostRuntime);
    Runtime deviceRuntime = make_ref<CudaRuntimeObj>();
    Graph deviceGraph = make_ref<GraphObj>(deviceRuntime);

    // Declare the operands in the CPU graph, allocate them, then fill them.
    Tensor hostInput =
        hostGraph->addTensor({1, 3, 224, 224}, DataType::Float16);
    Tensor hostWeight = hostGraph->addTensor({2, 3, 3, 3}, DataType::Float16);
    hostGraph->dataMalloc();
    hostInput->setData(IncrementalGenerator());
    hostWeight->setData(IncrementalGenerator());

    // Mirror both operands into the CUDA graph.
    Tensor deviceInput = deviceGraph->cloneTensor(hostInput);
    Tensor deviceWeight = deviceGraph->cloneTensor(hostWeight);

    // Add the convolution; passing nullptr as the output lets the graph
    // create it. NOTE(review): the six integers are presumably padding,
    // stride and dilation (h, w each) -- confirm against ConvObj's ctor.
    auto convOp = deviceGraph->addOp<ConvObj>(deviceInput, deviceWeight,
                                              nullptr, 1, 1, 1, 1, 1, 1);

    // Allocate the CUDA graph's tensors, then fill the operands once more
    // now that the allocation has happened.
    deviceGraph->dataMalloc();
    deviceInput->setData(IncrementalGenerator());
    deviceWeight->setData(IncrementalGenerator());

    // Run on the CUDA runtime with the tuning flag switched on.
    bool enableTuning = true;
    deviceRuntime->run(deviceGraph, enableTuning);
}
} // namespace infini
支持fp16 dtype (#96) * add conv_half kernel * Conv Kernel FP16 * dcj: replace "DataType::Float32" with "op->getDType()" to support more DataType * feat: support Float16 dtype * fix: set default clang-format to 14 version * fix: 按照review意见修改 * fix: add data convert to convfp16 kernel test * test: add conv_fp16 kernel test --------- Co-authored-by: zhangyue207 <zhangyue@qiyuanlab.com> Co-authored-by: kilinchange <kilinchange@163.com> 2023-08-02 16:38:16 +08:00			`#include "core/graph.h"`
			`#include "core/kernel.h"`
			`#include "core/runtime.h"`
			`#include "cuda/cuda_runtime.h"`
			`#include "cuda/cuda_utility.h"`
			`#include "operators/conv.h"`
			`#include <bitset>`

			`#include "test.h"`

			`namespace infini {`

			`void testConvCudnnFP16(`
			`const std::function<void(void *, size_t, DataType)> &generator,`
			`vector<float> ansVec) {`

			`// Construct Runtime and graph for CPU and CUDA`
			`Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton`
			`Graph gCpu = make_ref<GraphObj>(cpu);`
			`Runtime cuda = make_ref<CudaRuntimeObj>();`
			`Graph gCuda = make_ref<GraphObj>(cuda);`
			`// Set input data on CPU in a CPU Graph`
			`Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float16);`
			`Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float16);`
			`// Malloc data for all tensors in a graph. Do we need implicit allocation?`
			`gCpu->dataMalloc();`
			`i0Cpu->setData(generator);`
			`w0Cpu->setData(generator);`

			`// Copy input tensors from CPU to CUDA`
			`Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);`
			`Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);`
			`// Build CUDA graph`
			`auto conv =`
			`gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);`
			`// allocate CUDA memory`
			`gCuda->dataMalloc();`
memory_allocator (#103) * - add LazyAllocator class - calculate memory consumption at present * - basic function of lazy_allocator, remaining test * - modify LazyAllocator * - modify InfiniTensor to fit LazyAllocator * - add setDataBlob - modify alignment - fix GraphObj::dataMalloc * - modified alignment value(64bytes -> 8bytes) - fix LazyAllocator::getPtr() - some dubug codes and commonts - do alignment by chaning size instead of tailAddr * - fix some problem * - translate chinese comments to english * - format codes * - fix test * - code format * - modify codes as YdrMaser and bitzyz suggested * - code format * - modify codes as constroy suggested * - codes format * - modify alignment on cuda * - code format * - add test_lazy_allocator - fix tests where not add input tensor into graph.tensors - fix tests where init tensor's data before calling graph->dataMallocate() * - code format * - remove gpu runtime in test_lazy_allocator * - fix test_lazy_allocator: remove cuda include * - add test * - code format * - add ifdef for test of allocator * - code format * - fix test: remove unused ifdef * - fix bang test * - code format * Merge branch 'master' into dcj/memory_allocator * fix: fix cuda conv_fp16 run fail * fix bang_runtime.cc and cuda_runtime.cc * - update mkl code * - fix codes for mkl * - code format * - remove unused commented codes - add an empty line at the end of the blob.cc --------- Co-authored-by: zhangyunze <z13785159769@163.com> 2023-08-13 13:39:35 +08:00			`i0Cuda->setData(generator);`
			`w0Cuda->setData(generator);`
支持fp16 dtype (#96) * add conv_half kernel * Conv Kernel FP16 * dcj: replace "DataType::Float32" with "op->getDType()" to support more DataType * feat: support Float16 dtype * fix: set default clang-format to 14 version * fix: 按照review意见修改 * fix: add data convert to convfp16 kernel test * test: add conv_fp16 kernel test --------- Co-authored-by: zhangyue207 <zhangyue@qiyuanlab.com> Co-authored-by: kilinchange <kilinchange@163.com> 2023-08-02 16:38:16 +08:00			`// Execute on CUDA`
			`cuda->run(gCuda);`
			`// copy output from CUDA to CPU`
			`auto o0Cpu = gCpu->cloneTensor(conv->getOutput());`
			`// check results on CPU`
			`EXPECT_TRUE(o0Cpu->equalData(ansVec));`
			`// print a tensor/operator/graph by print()`
			`gCuda->print();`
			`}`

			`TEST(cuDNN_Conv_FP16, run) {`
			`testConvCudnnFP16(IncrementalGenerator(),`
			`vector<float>{48, 48, 72, 72, 48, 48, 72, 72});`
			`}`

			`TEST(cuDNN_Conv_FP16, tune) {`
			`Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton`
			`Graph gCpu = make_ref<GraphObj>(cpu);`
			`Runtime cuda = make_ref<CudaRuntimeObj>();`
			`Graph gCuda = make_ref<GraphObj>(cuda);`
			`// Set input data on CPU in a CPU Graph`
			`Tensor i0Cpu = gCpu->addTensor({1, 3, 224, 224}, DataType::Float16);`
			`Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float16);`
			`// Malloc data for all tensors in a graph. Do we need implicit allocation?`
			`gCpu->dataMalloc();`
			`i0Cpu->setData(IncrementalGenerator());`
			`w0Cpu->setData(IncrementalGenerator());`

			`// Copy input tensors from CPU to CUDA`
			`Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);`
			`Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);`
			`// Build CUDA graph`
			`auto conv =`
			`gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);`
			`// allocate CUDA memory`
			`gCuda->dataMalloc();`
memory_allocator (#103) * - add LazyAllocator class - calculate memory consumption at present * - basic function of lazy_allocator, remaining test * - modify LazyAllocator * - modify InfiniTensor to fit LazyAllocator * - add setDataBlob - modify alignment - fix GraphObj::dataMalloc * - modified alignment value(64bytes -> 8bytes) - fix LazyAllocator::getPtr() - some dubug codes and commonts - do alignment by chaning size instead of tailAddr * - fix some problem * - translate chinese comments to english * - format codes * - fix test * - code format * - modify codes as YdrMaser and bitzyz suggested * - code format * - modify codes as constroy suggested * - codes format * - modify alignment on cuda * - code format * - add test_lazy_allocator - fix tests where not add input tensor into graph.tensors - fix tests where init tensor's data before calling graph->dataMallocate() * - code format * - remove gpu runtime in test_lazy_allocator * - fix test_lazy_allocator: remove cuda include * - add test * - code format * - add ifdef for test of allocator * - code format * - fix test: remove unused ifdef * - fix bang test * - code format * Merge branch 'master' into dcj/memory_allocator * fix: fix cuda conv_fp16 run fail * fix bang_runtime.cc and cuda_runtime.cc * - update mkl code * - fix codes for mkl * - code format * - remove unused commented codes - add an empty line at the end of the blob.cc --------- Co-authored-by: zhangyunze <z13785159769@163.com> 2023-08-13 13:39:35 +08:00			`i0Cuda->setData(IncrementalGenerator());`
			`w0Cuda->setData(IncrementalGenerator());`
支持fp16 dtype (#96) * add conv_half kernel * Conv Kernel FP16 * dcj: replace "DataType::Float32" with "op->getDType()" to support more DataType * feat: support Float16 dtype * fix: set default clang-format to 14 version * fix: 按照review意见修改 * fix: add data convert to convfp16 kernel test * test: add conv_fp16 kernel test --------- Co-authored-by: zhangyue207 <zhangyue@qiyuanlab.com> Co-authored-by: kilinchange <kilinchange@163.com> 2023-08-02 16:38:16 +08:00			`// Execute on CUDA`
			`bool tune = true;`
			`cuda->run(gCuda, tune);`
			`}`
			`} // namespace infini`