From 0dc53470890fcc64e6d1f520577766867f4217fd Mon Sep 17 00:00:00 2001 From: kilinchange <44265800+kilinchange@users.noreply.github.com> Date: Sun, 13 Aug 2023 13:39:35 +0800 Subject: [PATCH] memory_allocator (#103) * - add LazyAllocator class - calculate memory consumption at present * - basic function of lazy_allocator, remaining test * - modify LazyAllocator * - modify InfiniTensor to fit LazyAllocator * - add setDataBlob - modify alignment - fix GraphObj::dataMalloc * - modified alignment value(64bytes -> 8bytes) - fix LazyAllocator::getPtr() - some dubug codes and commonts - do alignment by chaning size instead of tailAddr * - fix some problem * - translate chinese comments to english * - format codes * - fix test * - code format * - modify codes as YdrMaser and bitzyz suggested * - code format * - modify codes as constroy suggested * - codes format * - modify alignment on cuda * - code format * - add test_lazy_allocator - fix tests where not add input tensor into graph.tensors - fix tests where init tensor's data before calling graph->dataMallocate() * - code format * - remove gpu runtime in test_lazy_allocator * - fix test_lazy_allocator: remove cuda include * - add test * - code format * - add ifdef for test of allocator * - code format * - fix test: remove unused ifdef * - fix bang test * - code format * Merge branch 'master' into dcj/memory_allocator * fix: fix cuda conv_fp16 run fail * fix bang_runtime.cc and cuda_runtime.cc * - update mkl code * - fix codes for mkl * - code format * - remove unused commented codes - add an empty line at the end of the blob.cc --------- Co-authored-by: zhangyunze --- CMakeLists.txt | 1 + example | 2 +- include/core/graph.h | 5 +- include/core/lazy_allocator.h | 84 +++++++ include/core/tensor.h | 6 + src/core/blob.cc | 4 +- src/core/graph.cc | 53 ++++- src/core/lazy_allocator.cc | 143 ++++++++++++ src/core/tensor.cc | 2 + src/kernels/intelcpu/element_wise.cc | 4 +- src/kernels/intelcpu/matmul_dpcpp.cc | 2 +- src/kernels/intelcpu/pooling.cc | 2 +- test/core/test_lazy_allocator.cc | 96 ++++++++ test/kernels/bang/test_bang_bangcKernel.cc | 10 +- test/kernels/bang/test_bang_conv.cc | 10 +- test/kernels/bang/test_bang_element_wise.cc | 2 + test/kernels/bang/test_bang_matmul.cc | 10 +- test/kernels/bang/test_bang_optensor.cc | 10 +- test/kernels/bang/test_bang_unary.cc | 5 +- test/kernels/cuda/test_cuda_clip.cc | 5 +- test/kernels/cuda/test_cuda_concat.cc | 13 +- test/kernels/cuda/test_cuda_conv.cc | 4 + test/kernels/cuda/test_cuda_conv_fp16.cc | 4 + .../cuda/test_cuda_conv_transposed_2d.cc | 8 + test/kernels/cuda/test_cuda_element_wise.cc | 6 +- test/kernels/cuda/test_cuda_extend.cc | 3 +- test/kernels/cuda/test_cuda_gather.cc | 21 +- test/kernels/cuda/test_cuda_matmul.cc | 2 + test/kernels/cuda/test_cuda_pad.cc | 3 +- test/kernels/cuda/test_cuda_pooling.cc | 3 +- test/kernels/cuda/test_cuda_reduce_mean.cc | 3 +- test/kernels/cuda/test_cuda_reshape.cc | 3 + test/kernels/cuda/test_cuda_resize.cc | 208 +++++++++++++----- test/kernels/cuda/test_cuda_slice.cc | 1 + test/kernels/cuda/test_cuda_softmax.cc | 15 +- test/kernels/cuda/test_cuda_split.cc | 6 +- test/kernels/cuda/test_cuda_unary.cc | 5 +- test/kernels/intelcpu/test_mkl_gather.cc | 15 +- test/kernels/intelcpu/test_mkl_matmul.cc | 5 +- test/kernels/intelcpu/test_mkl_resize.cc | 12 +- test/operators/test_clip.cc | 4 +- 41 files changed, 658 insertions(+), 142 deletions(-) create mode 100644 include/core/lazy_allocator.h create mode 100644 src/core/lazy_allocator.cc create mode 100644 test/core/test_lazy_allocator.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index f5fd43ce..d2993c04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,6 +234,7 @@ function(build_test files) endfunction() if(BUILD_TEST) + add_compile_definitions(BUILD_TEST=1) enable_testing() if(USE_TRACE) build_test(test/trace/*.cc) diff --git a/example b/example index d6ac8c8c..51d31052 160000 --- a/example +++ b/example @@ -1 +1 @@ -Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9 +Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77 diff --git a/include/core/graph.h b/include/core/graph.h index dab31d79..3efd893f 100644 --- a/include/core/graph.h +++ b/include/core/graph.h @@ -1,4 +1,5 @@ #pragma once +#include "core/lazy_allocator.h" #include "core/operator.h" #include "core/tensor.h" @@ -9,9 +10,11 @@ class GraphObj : public Object { Runtime runtime; TensorVec tensors; OpVec ops; + LazyAllocator allocator; public: - explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){}; + explicit GraphObj(Runtime runtime) + : runtime(runtime), allocator(runtime), sorted(false){}; GraphObj(Runtime runtime, OpVec ops_in); string toString() const override; Runtime getRuntime() const { return runtime; } diff --git a/include/core/lazy_allocator.h b/include/core/lazy_allocator.h new file mode 100644 index 00000000..228639a3 --- /dev/null +++ b/include/core/lazy_allocator.h @@ -0,0 +1,84 @@ +#pragma once +#include "core/runtime.h" +#include "core/tensor.h" +#ifdef BUILD_TEST +#include "gtest/gtest.h" +#endif +#include +#include +#include + +namespace infini { + +class LazyAllocator { + private: +#ifdef BUILD_TEST + FRIEND_TEST(LazyAllocator, testMergeFreeBlocks); + + FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock); +#endif + + Runtime runtime; + + size_t used; + + size_t peak; + + size_t alignment; + + // pointer to the memory actually allocated + void *ptr; + + struct freeBlockInfo { + size_t addr; + size_t blockSize; + }; + + struct cmpFreeBlockInfo { + bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const { + return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize) + : (a.addr < b.addr); + } + }; + + // free balanced tree, maintains all free memory blocks + std::set freeBlocks; + + // key: head address offset of the free memory block + // value: blockSize of the block + std::unordered_map headAddrToBlockSize; + + // key: tail address offset of the free memory block + // value: blockSize of the block + std::unordered_map tailAddrToBlockSize; + + public: + LazyAllocator(Runtime runtime); + + virtual ~LazyAllocator(); + + // function: simulate memory allocation + // arguments: + // size: size of memory block to be allocated + // return: head address offset of the allocated memory block + size_t alloc(size_t size); + + // function: simulate memory free + // arguments: + // addr: head address offset of memory block to be free + // size: size of memory block to be freed + void free(size_t addr, size_t size); + + // function: perform actual memory allocation + // return: pointer to the head address of the allocated memory + void *getPtr(); + + void info(); + + private: + // function: memory alignment, rouned up + // return: size of the aligned memory block + size_t getAlignedSize(size_t size); +}; + +} // namespace infini diff --git a/include/core/tensor.h b/include/core/tensor.h index d2fad79e..6dadd0d9 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -71,10 +71,16 @@ class TensorObj : public TensorBaseObj { void copyData(const TensorObj *src); void copyData(const Tensor &src) { copyData(src.get()); } + // TODO: Rename this function later, because it is confused that it will + // change the field data, but actually it generates data and maybe copy to + // device. // FIXME: std::fucntion copies the generator instead of passing it by ref. // Thus the internal state of generator cannot be updated. void setData( std::function const &generator) const; + + void setDataBlob(const Blob &blob); + Tensor clone() const { auto obj = make_ref(*this); obj->freeData(); diff --git a/src/core/blob.cc b/src/core/blob.cc index a5a71f30..c8ae3f62 100644 --- a/src/core/blob.cc +++ b/src/core/blob.cc @@ -5,7 +5,7 @@ namespace infini { BlobObj::~BlobObj() { // Avoid cycled inclusion - runtime->dealloc(ptr); + // destruction is performed in LazyAllocator } -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/src/core/graph.cc b/src/core/graph.cc index f8934b65..05f45fae 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -5,7 +5,7 @@ namespace infini { GraphObj::GraphObj(Runtime runtime, OpVec ops_in) - : runtime(runtime), sorted(false) { + : runtime(runtime), allocator(runtime), sorted(false) { map tensorPool; // Clone tensors for (const auto &op : ops_in) { @@ -124,9 +124,58 @@ void GraphObj::optimize() { } void GraphObj::dataMalloc() { + // topological sorting first + IT_ASSERT(topo_sort() == true); + // count the number of times all tensors are used + std::unordered_map tensorToRefCount; + // record the memory address offsets of all tensors to be allocated + std::unordered_map tensorToOffset; + + // record all constant tensors, including weight tensors and input tensors + std::unordered_set constTensor; for (auto &tensor : tensors) { - tensor->dataMalloc(); + if (tensor.get()->getSource() == nullptr) { + // allocate memory for all constant tensors first, and this memory + // will not be reused later + constTensor.insert(tensor.get()); + tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes()); + } else { + tensorToRefCount[tensor.get()] = tensor->getTargets().size(); + } } + // traverse in topological order and simulate memory allocation + for (auto &op : ops) { + // memory should be allocated for the output first + auto outputs = op->getOutputs(); + for (auto &tensor : outputs) { + tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes()); + } + auto inputs = op->getInputs(); + for (auto &tensor : inputs) { + if (constTensor.find(tensor.get()) == constTensor.end()) { + auto tensorIter = tensorToRefCount.find(tensor.get()); + IT_ASSERT(tensorIter != tensorToRefCount.end()); + tensorToRefCount[tensor.get()] -= 1; + if (tensorToRefCount[tensor.get()] == 0) { + // indicate that this tensor will no longer be used and + // perform memory free + tensorToRefCount.erase(tensor.get()); + allocator.free(tensorToOffset[tensor.get()], + tensor->getBytes()); + } + } + } + } + + // perform actual memory allocation + for (auto &tensor : tensors) { + IT_ASSERT(tensorToOffset.find(tensor.get()) != tensorToOffset.end()); + tensor->setDataBlob(make_ref( + tensor->runtime, static_cast(allocator.getPtr()) + + tensorToOffset[tensor.get()])); + } + + allocator.info(); } Tensor GraphObj::addTensor(Shape dim, DataType dtype) { diff --git a/src/core/lazy_allocator.cc b/src/core/lazy_allocator.cc new file mode 100644 index 00000000..bb7f766f --- /dev/null +++ b/src/core/lazy_allocator.cc @@ -0,0 +1,143 @@ +#include "core/lazy_allocator.h" +#include + +namespace infini { + +// In +// cuda-c-programming-guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses): +// Any address of a variable residing in global memory or returned by one of the +// memory allocation routines from the driver or runtime API is always aligned +// to at least 256 bytes. +constexpr size_t alignmentInBytesForCUDA = 256; + +LazyAllocator::LazyAllocator(Runtime runtime) : runtime(runtime) { + used = 0; + peak = 0; + ptr = nullptr; + if (runtime->isCuda()) { + // TODO: the alignment on cuda might need further discussion + alignment = alignmentInBytesForCUDA; + } else { + // 'alignment' defaults to sizeof(uint64_t), because it is the length of + // the longest data type currently supported by the DataType field of + // the tensor + // TODO: the alignment on bang might need further discussion + alignment = sizeof(uint64_t); + } +} + +LazyAllocator::~LazyAllocator() { + if (this->ptr != nullptr) { + runtime->dealloc(this->ptr); + } +} + +size_t LazyAllocator::alloc(size_t size) { + IT_ASSERT(this->ptr == nullptr); + // pad the size to the multiple of alignment + size = this->getAlignedSize(size); + auto it = this->freeBlocks.lower_bound(freeBlockInfo{(size_t)0, size}); + + size_t retAddr = this->peak; + if (it != this->freeBlocks.end()) { + // found an alvailable free memory block for allocation + size_t blockSize = it->blockSize; + retAddr = it->addr; + size_t tailAddr = retAddr + size; + // update the map of head and tail address offset of memory blocks + this->headAddrToBlockSize.erase(retAddr); + this->tailAddrToBlockSize.erase(tailAddr); + // memory block splitting + if (blockSize > tailAddr - retAddr) { + freeBlockInfo newBlock = {tailAddr, + blockSize - (tailAddr - retAddr)}; + this->headAddrToBlockSize[tailAddr] = newBlock.blockSize; + this->tailAddrToBlockSize[retAddr + blockSize] = newBlock.blockSize; + this->freeBlocks.insert(newBlock); + } + // update the free balanced tree + this->freeBlocks.erase(it); + this->used += tailAddr - retAddr; + } else { + // the allocated memory space is not sufficient for reallocation, it + // needs to be extended + auto blockTailWithPeak = this->tailAddrToBlockSize.find(this->peak); + if (blockTailWithPeak != this->tailAddrToBlockSize.end()) { + // there is a free block located at the end of the currently + // allocated memory, where this free block has its tail address as + // 'peak' + retAddr = this->peak - blockTailWithPeak->second; + IT_ASSERT(blockTailWithPeak->second < size); + this->peak += (size - blockTailWithPeak->second); + // updata freeBlocks, headAddrToBlockSize and tailAddrToBlockSize + freeBlockInfo endBlock = {retAddr, blockTailWithPeak->second}; + this->freeBlocks.erase(endBlock); + this->headAddrToBlockSize.erase(endBlock.addr); + this->tailAddrToBlockSize.erase(endBlock.addr + endBlock.blockSize); + } else { + this->peak = this->peak + size; + } + this->used += size; + } + + return retAddr; +} + +void LazyAllocator::free(size_t addr, size_t size) { + IT_ASSERT(this->ptr == nullptr); + size = getAlignedSize(size); + auto tailAddr = addr + size; + freeBlockInfo block = {addr, tailAddr - addr}; + this->headAddrToBlockSize[addr] = block.blockSize; + this->tailAddrToBlockSize[tailAddr] = block.blockSize; + auto preFreeBlockIter = this->tailAddrToBlockSize.find(addr); + auto subFreeBlockIter = this->headAddrToBlockSize.find(tailAddr); + if (preFreeBlockIter != this->tailAddrToBlockSize.end()) { + // the head address of the memory block to be freed matches the end of a + // free block, merge them together + size_t preBlockSize = preFreeBlockIter->second; + this->headAddrToBlockSize.erase(block.addr); + this->headAddrToBlockSize[block.addr - preBlockSize] += block.blockSize; + this->tailAddrToBlockSize.erase(block.addr); + this->tailAddrToBlockSize[tailAddr] += preBlockSize; + block.addr -= preBlockSize; + block.blockSize += preBlockSize; + // delete the preceding adjacent free block + this->freeBlocks.erase(freeBlockInfo{block.addr, preBlockSize}); + } + if (subFreeBlockIter != this->headAddrToBlockSize.end()) { + // the tail address of the memory block to be freed matches the start of + // a free block, merge them together + auto subBlockSize = subFreeBlockIter->second; + this->headAddrToBlockSize.erase(tailAddr); + this->headAddrToBlockSize[block.addr] += subBlockSize; + this->tailAddrToBlockSize.erase(tailAddr); + this->tailAddrToBlockSize[tailAddr + subBlockSize] += block.blockSize; + tailAddr += subBlockSize; + block.blockSize += subBlockSize; + // delete the succeeding adjacent memory block + this->freeBlocks.erase( + freeBlockInfo{tailAddr - subBlockSize, subBlockSize}); + } + this->freeBlocks.insert(block); + this->used -= size; +} + +void *LazyAllocator::getPtr() { + if (this->ptr == nullptr) { + this->ptr = runtime->alloc(this->peak); + printf("LazyAllocator really alloc: %p %lu bytes\n", this->ptr, peak); + } + return this->ptr; +} + +size_t LazyAllocator::getAlignedSize(size_t size) { + return ((size - 1) / this->alignment + 1) * this->alignment; +} + +void LazyAllocator::info() { + std::cout << "Used memory: " << this->used + << ", peak memory: " << this->peak << std::endl; +} + +} // namespace infini diff --git a/src/core/tensor.cc b/src/core/tensor.cc index c80ff8f7..f7c35d78 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -150,6 +150,8 @@ void TensorObj::setData( } } +void TensorObj::setDataBlob(const Blob &blob) { this->data = blob; } + void TensorObj::load(std::string file_path) { loadTensorData(this, file_path); } void TensorObj::save(std::string file_path) { saveTensorData(this, file_path); } diff --git a/src/kernels/intelcpu/element_wise.cc b/src/kernels/intelcpu/element_wise.cc index dbc19b32..2bccc819 100644 --- a/src/kernels/intelcpu/element_wise.cc +++ b/src/kernels/intelcpu/element_wise.cc @@ -6,7 +6,7 @@ namespace infini { class MklBinary : public MklKernelWithoutConfig { dnnl::algorithm getAlgorithem(const Ref &op) const { - switch (op->getOpType()) { + switch (op->getOpType().underlying()) { case OpType::Add: return dnnl::algorithm::binary_add; case OpType::Sub: @@ -64,7 +64,7 @@ class MklBinary : public MklKernelWithoutConfig { class MklUnary : public MklKernelWithoutConfig { dnnl::algorithm getAlgorithem(const Ref &op) const { - switch (op->getOpType()) { + switch (op->getOpType().underlying()) { case OpType::Relu: return dnnl::algorithm::eltwise_relu; case OpType::Tanh: diff --git a/src/kernels/intelcpu/matmul_dpcpp.cc b/src/kernels/intelcpu/matmul_dpcpp.cc index fd77ee39..8fdddfe2 100644 --- a/src/kernels/intelcpu/matmul_dpcpp.cc +++ b/src/kernels/intelcpu/matmul_dpcpp.cc @@ -69,7 +69,7 @@ template class MklDpcppMatmul : public CpuKernelWithoutConfig { } }; -REGISTER_KERNEL(Device::INTELCPU, OpType::Matmul, DataType::Float32, +REGISTER_KERNEL(Device::INTELCPU, OpType::MatMul, DataType::Float32, MklDpcppMatmul, "MklDpcppMatmul_CPU_float32"); } // namespace infini diff --git a/src/kernels/intelcpu/pooling.cc b/src/kernels/intelcpu/pooling.cc index d27238fe..cfe8364f 100644 --- a/src/kernels/intelcpu/pooling.cc +++ b/src/kernels/intelcpu/pooling.cc @@ -77,7 +77,7 @@ class MklMaxPool : public MklPooling { } }; -REGISTER_KERNEL(Device::INTELCPU, OpType::AvgPool, DataType::Float32, +REGISTER_KERNEL(Device::INTELCPU, OpType::AveragePool, DataType::Float32, MklAvgPool, "AvgPool_Mkl_Float32"); REGISTER_KERNEL(Device::INTELCPU, OpType::MaxPool, DataType::Float32, MklMaxPool, "MaxPool_Mkl_Float32"); diff --git a/test/core/test_lazy_allocator.cc b/test/core/test_lazy_allocator.cc new file mode 100644 index 00000000..736b1e7d --- /dev/null +++ b/test/core/test_lazy_allocator.cc @@ -0,0 +1,96 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +TEST(LazyAllocator, testMergeFreeBlocks) { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = make_ref(shape, DataType::Float32, runtime); + LazyAllocator allocator = LazyAllocator(runtime); + // allocate a->b->c->d + allocator.alloc(a->getBytes()); + size_t offsetB = allocator.alloc(b->getBytes()); + size_t offsetC = allocator.alloc(c->getBytes()); + allocator.alloc(d->getBytes()); + // free b and c + allocator.free(offsetB, b->getBytes()); + allocator.free(offsetC, c->getBytes()); + // expected to be a->mergedFreeBlock->d, where mergedFreeBlock is the result + // of merging the memory blocks corresponding to the already freed b and c + EXPECT_EQ(allocator.freeBlocks.size(), 1); + EXPECT_EQ(allocator.freeBlocks.begin()->addr, offsetB); + EXPECT_EQ(allocator.freeBlocks.begin()->blockSize, + allocator.getAlignedSize(b->getBytes()) + + allocator.getAlignedSize(c->getBytes())); +} + +TEST(LazyAllocator, testAlloc) { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = make_ref(shape, DataType::Float32, runtime); + LazyAllocator allocator = LazyAllocator(runtime); + // allocate a->b->c + allocator.alloc(a->getBytes()); + size_t offsetB = allocator.alloc(b->getBytes()); + allocator.alloc(c->getBytes()); + // free b, then allocate d + allocator.free(offsetB, b->getBytes()); + size_t offsetC = allocator.alloc(d->getBytes()); + // expected to be a->d->c + EXPECT_EQ(offsetB, offsetC); +} + +TEST(LazyAllocator, testAllocWithEndFreeBlock) { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = + make_ref(Shape{2, 2, 2, 3}, DataType::Float32, runtime); + LazyAllocator allocator = LazyAllocator(runtime); + // allocate a->b->c + allocator.alloc(a->getBytes()); + allocator.alloc(b->getBytes()); + size_t offsetC = allocator.alloc(c->getBytes()); + allocator.info(); + // free c, then allocate d + allocator.free(offsetC, c->getBytes()); + size_t offsetD = allocator.alloc(d->getBytes()); + allocator.info(); + // expected to be a->b->d, with no free block between b and c + EXPECT_EQ(allocator.freeBlocks.size(), 0); + EXPECT_EQ(offsetC, offsetD); +} + +TEST(LazyAllocator, testGetPtr) { + Shape shape = Shape{1, 2, 2, 3}; + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Tensor a = make_ref(shape, DataType::Float32, runtime); + Tensor b = make_ref(shape, DataType::Float32, runtime); + Tensor c = make_ref(shape, DataType::Float32, runtime); + Tensor d = make_ref(shape, DataType::Float32, runtime); + LazyAllocator allocator = LazyAllocator(runtime); + // allocate a->b->c->d + allocator.alloc(a->getBytes()); + allocator.alloc(b->getBytes()); + allocator.alloc(c->getBytes()); + allocator.alloc(d->getBytes()); + // multiple calls to the getPtr() function should return the same pointer + void *ptr1 = allocator.getPtr(); + void *ptr2 = allocator.getPtr(); + EXPECT_EQ(ptr1, ptr2); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_bangcKernel.cc b/test/kernels/bang/test_bang_bangcKernel.cc index 8c4e62e7..b89850f7 100644 --- a/test/kernels/bang/test_bang_bangcKernel.cc +++ b/test/kernels/bang/test_bang_bangcKernel.cc @@ -21,12 +21,8 @@ void testBangcKernel( // Build input data on CPU Tensor inputCpu1 = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu1->dataMalloc(); - inputCpu1->setData(generator); Tensor inputCpu2 = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu2->dataMalloc(); - inputCpu2->setData(generator); // inputCpu1->printData(); // inputCpu2->printData(); @@ -37,6 +33,8 @@ void testBangcKernel( auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); bangGraph->dataMalloc(); + inputGpu1->setData(generator); + inputGpu2->setData(generator); bangRuntime->run(bangGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); @@ -44,7 +42,11 @@ void testBangcKernel( // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu1, inputCpu2, nullptr); + cpuGraph->addTensor(inputCpu1); + cpuGraph->addTensor(inputCpu2); cpuGraph->dataMalloc(); + inputCpu1->setData(generator); + inputCpu2->setData(generator); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // outputCpu->printData(); diff --git a/test/kernels/bang/test_bang_conv.cc b/test/kernels/bang/test_bang_conv.cc index 0b415b0f..0ac49cff 100644 --- a/test/kernels/bang/test_bang_conv.cc +++ b/test/kernels/bang/test_bang_conv.cc @@ -19,12 +19,8 @@ void testConv(const std::function &generatorA, // Build input data on CPU Tensor inputCpu1 = make_ref(shapeA, DataType::Float32, cpuRuntime); - inputCpu1->dataMalloc(); - inputCpu1->setData(generatorA); Tensor inputCpu2 = make_ref(shapeB, DataType::Float32, cpuRuntime); - inputCpu2->dataMalloc(); - inputCpu2->setData(generatorB); // MLU Graph bangGraph = make_ref(bangRuntime); @@ -33,6 +29,8 @@ void testConv(const std::function &generatorA, auto mluOp = bangGraph->addOp(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1); bangGraph->dataMalloc(); + inputMlu1->setData(generatorA); + inputMlu2->setData(generatorB); bangRuntime->run(bangGraph); auto outputMlu = mluOp->getOutput(); auto outputMlu2Cpu = outputMlu->clone(cpuRuntime); @@ -40,7 +38,11 @@ void testConv(const std::function &generatorA, Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1); + cpuGraph->addTensor(inputCpu1); + cpuGraph->addTensor(inputCpu2); cpuGraph->dataMalloc(); + inputCpu1->setData(generatorA); + inputCpu2->setData(generatorB); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); outputCpu->print(); diff --git a/test/kernels/bang/test_bang_element_wise.cc b/test/kernels/bang/test_bang_element_wise.cc index 7dc6ac3e..e1809ff0 100644 --- a/test/kernels/bang/test_bang_element_wise.cc +++ b/test/kernels/bang/test_bang_element_wise.cc @@ -33,6 +33,8 @@ void testElementWiseCnnl( // allocate BANG memory g->dataMalloc(); + a->setData(generator); + b->setData(generator); // Execute on BANG bangRuntime->run(g); diff --git a/test/kernels/bang/test_bang_matmul.cc b/test/kernels/bang/test_bang_matmul.cc index f6a47802..a68cb541 100644 --- a/test/kernels/bang/test_bang_matmul.cc +++ b/test/kernels/bang/test_bang_matmul.cc @@ -20,12 +20,8 @@ void testMatmul(const std::function &generatorA, // Build input data on CPU Tensor inputCpu1 = make_ref(shapeA, DataType::Float32, cpuRuntime); - inputCpu1->dataMalloc(); - inputCpu1->setData(generatorA); Tensor inputCpu2 = make_ref(shapeB, DataType::Float32, cpuRuntime); - inputCpu2->dataMalloc(); - inputCpu2->setData(generatorB); // MLU Graph bangGraph = make_ref(bangRuntime); @@ -33,13 +29,19 @@ void testMatmul(const std::function &generatorA, auto inputMlu2 = bangGraph->cloneTensor(inputCpu2); auto mluOp = bangGraph->addOp(inputMlu1, inputMlu2, nullptr); bangGraph->dataMalloc(); + inputMlu1->setData(generatorA); + inputMlu2->setData(generatorB); bangRuntime->run(bangGraph); auto outputMlu = mluOp->getOutput(); auto outputMlu2Cpu = outputMlu->clone(cpuRuntime); // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu1, inputCpu2, nullptr); + cpuGraph->addTensor(inputCpu1); + cpuGraph->addTensor(inputCpu2); cpuGraph->dataMalloc(); + inputCpu1->setData(generatorA); + inputCpu2->setData(generatorB); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); outputCpu->print(); diff --git a/test/kernels/bang/test_bang_optensor.cc b/test/kernels/bang/test_bang_optensor.cc index c46f80d3..08a7034f 100644 --- a/test/kernels/bang/test_bang_optensor.cc +++ b/test/kernels/bang/test_bang_optensor.cc @@ -19,12 +19,8 @@ void testOptensor( // Build input data on CPU Tensor inputCpu1 = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu1->dataMalloc(); - inputCpu1->setData(generator); Tensor inputCpu2 = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu2->dataMalloc(); - inputCpu2->setData(generator); // GPU Graph bangGraph = make_ref(bangRuntime); @@ -32,13 +28,19 @@ void testOptensor( auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); bangGraph->dataMalloc(); + inputGpu1->setData(generator); + inputGpu2->setData(generator); bangRuntime->run(bangGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu1, inputCpu2, nullptr); + cpuGraph->addTensor(inputCpu1); + cpuGraph->addTensor(inputCpu2); cpuGraph->dataMalloc(); + inputCpu1->setData(generator); + inputCpu2->setData(generator); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // Check diff --git a/test/kernels/bang/test_bang_unary.cc b/test/kernels/bang/test_bang_unary.cc index 68534a3f..36b9c160 100644 --- a/test/kernels/bang/test_bang_unary.cc +++ b/test/kernels/bang/test_bang_unary.cc @@ -17,21 +17,22 @@ void testUnary(const std::function &generator, // Build input data on CPU Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(generator); // GPU Graph bangGraph = make_ref(bangRuntime); auto inputGpu = bangGraph->cloneTensor(inputCpu); auto gpuOp = bangGraph->addOp(inputGpu, nullptr); bangGraph->dataMalloc(); + inputGpu->setData(generator); bangRuntime->run(bangGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu, nullptr); + cpuGraph->addTensor(inputCpu); cpuGraph->dataMalloc(); + inputCpu->setData(generator); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // Check diff --git a/test/kernels/cuda/test_cuda_clip.cc b/test/kernels/cuda/test_cuda_clip.cc index 2c6abaf7..3402cce0 100644 --- a/test/kernels/cuda/test_cuda_clip.cc +++ b/test/kernels/cuda/test_cuda_clip.cc @@ -18,8 +18,6 @@ void testClip(const std::function &generator, // Build input data on CPU Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(generator); // GPU Graph cudaGraph = make_ref(cudaRuntime); @@ -28,13 +26,16 @@ void testClip(const std::function &generator, float max = 4.0; auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, min, max); cudaGraph->dataMalloc(); + inputGpu->setData(generator); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu, nullptr, min, max); + cpuGraph->addTensor(inputCpu); cpuGraph->dataMalloc(); + inputCpu->setData(generator); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // Check diff --git a/test/kernels/cuda/test_cuda_concat.cc b/test/kernels/cuda/test_cuda_concat.cc index 41832e82..4bc7e950 100644 --- a/test/kernels/cuda/test_cuda_concat.cc +++ b/test/kernels/cuda/test_cuda_concat.cc @@ -58,11 +58,16 @@ TEST(Concat, Cuda) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp(TensorVec{gCuda->cloneTensor(t1), - gCuda->cloneTensor(t2), - gCuda->cloneTensor(t3)}, - nullptr, 2); + auto t1Gpu = gCuda->cloneTensor(t1); + auto t2Gpu = gCuda->cloneTensor(t2); + auto t3Gpu = gCuda->cloneTensor(t3); + + auto op = + gCuda->addOp(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2); gCuda->dataMalloc(); + t1Gpu->setData(IncrementalGenerator()); + t2Gpu->setData(OneGenerator()); + t3Gpu->setData(OneGenerator()); cudaRuntime->run(gCuda); // cudaPrintTensor(op->getOutput()); diff --git a/test/kernels/cuda/test_cuda_conv.cc b/test/kernels/cuda/test_cuda_conv.cc index 657ecd17..d096ecec 100644 --- a/test/kernels/cuda/test_cuda_conv.cc +++ b/test/kernels/cuda/test_cuda_conv.cc @@ -33,6 +33,8 @@ void testConvCudnn( gCuda->addOp(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2); // allocate CUDA memory gCuda->dataMalloc(); + i0Cuda->setData(generator); + w0Cuda->setData(generator); // Execute on CUDA cuda->run(gCuda); // copy output from CUDA to CPU @@ -72,6 +74,8 @@ TEST(cuDNN_Conv, tune) { gCuda->addOp(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1); // allocate CUDA memory gCuda->dataMalloc(); + i0Cuda->setData(IncrementalGenerator()); + w0Cuda->setData(IncrementalGenerator()); // Execute on CUDA bool tune = true; cuda->run(gCuda, tune); diff --git a/test/kernels/cuda/test_cuda_conv_fp16.cc b/test/kernels/cuda/test_cuda_conv_fp16.cc index 994e2dee..12f86748 100644 --- a/test/kernels/cuda/test_cuda_conv_fp16.cc +++ b/test/kernels/cuda/test_cuda_conv_fp16.cc @@ -35,6 +35,8 @@ void testConvCudnnFP16( gCuda->addOp(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2); // allocate CUDA memory gCuda->dataMalloc(); + i0Cuda->setData(generator); + w0Cuda->setData(generator); // Execute on CUDA cuda->run(gCuda); // copy output from CUDA to CPU @@ -71,6 +73,8 @@ TEST(cuDNN_Conv_FP16, tune) { gCuda->addOp(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1); // allocate CUDA memory gCuda->dataMalloc(); + i0Cuda->setData(IncrementalGenerator()); + w0Cuda->setData(IncrementalGenerator()); // Execute on CUDA bool tune = true; cuda->run(gCuda, tune); diff --git a/test/kernels/cuda/test_cuda_conv_transposed_2d.cc b/test/kernels/cuda/test_cuda_conv_transposed_2d.cc index 8def6a32..0c8899e4 100644 --- a/test/kernels/cuda/test_cuda_conv_transposed_2d.cc +++ b/test/kernels/cuda/test_cuda_conv_transposed_2d.cc @@ -36,6 +36,8 @@ void testConvTransposedCudnn( padding, padding, stride, stride, dilation, dilation); gCuda->dataMalloc(); + i0Cuda->setData(generator); + w0Cuda->setData(generator); // Execute on CUDA cuda->run(gCuda); // copy output from CUDA to CPU @@ -70,6 +72,8 @@ void testConvTransposedNHWCCudnn( i0Cuda, w0Cuda, nullptr, padding, padding, stride, stride, dilation, dilation); gCuda->dataMalloc(); + i0Cuda->setData(generator); + w0Cuda->setData(generator); // Execute on CUDA cuda->run(gCuda); // copy output from CUDA to CPU @@ -115,6 +119,8 @@ TEST(cuDNN_ConvTransposed, run1) { auto conv = gCuda->addOp(i0Cuda, w0Cuda, nullptr, 0, 0); gCuda->dataMalloc(); + i0Cuda->setData(IncrementalGenerator()); + w0Cuda->setData(IncrementalGenerator()); // Execute on CUDA cuda->run(gCuda); // copy output from CUDA to CPU @@ -148,6 +154,8 @@ TEST(cuDNN_ConvTransposed, tune) { auto conv = gCuda->addOp(i0Cuda, w0Cuda, nullptr); // allocate CUDA memory gCuda->dataMalloc(); + i0Cuda->setData(IncrementalGenerator()); + w0Cuda->setData(IncrementalGenerator()); // Execute on CUDA bool tune = true; cuda->run(gCuda, tune); diff --git a/test/kernels/cuda/test_cuda_element_wise.cc b/test/kernels/cuda/test_cuda_element_wise.cc index a5c04f77..05872388 100644 --- a/test/kernels/cuda/test_cuda_element_wise.cc +++ b/test/kernels/cuda/test_cuda_element_wise.cc @@ -19,12 +19,8 @@ void testElementWiseCudnn( // Build input data on CPU Tensor acpu = make_ref(shape, DataType::Float32, cpuRuntime); - acpu->dataMalloc(); - acpu->setData(generator); Tensor bcpu = make_ref(shape, DataType::Float32, cpuRuntime); - bcpu->dataMalloc(); - bcpu->setData(generator); // Build CUDA graph Graph g = make_ref(cudaRuntime); @@ -34,6 +30,8 @@ void testElementWiseCudnn( // allocate CUDA memory g->dataMalloc(); + a->setData(generator); + b->setData(generator); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_extend.cc b/test/kernels/cuda/test_cuda_extend.cc index a0f431f3..10ca84cc 100644 --- a/test/kernels/cuda/test_cuda_extend.cc +++ b/test/kernels/cuda/test_cuda_extend.cc @@ -16,8 +16,6 @@ TEST(CUDA_Extend, run) { // Build input data on CPU Tensor icpu = make_ref(Shape{2, 3, 2, 2}, DataType::Float32, cpuRuntime); - icpu->dataMalloc(); - icpu->setData(IncrementalGenerator()); // Build CUDA graph Graph g = make_ref(cudaRuntime); @@ -26,6 +24,7 @@ TEST(CUDA_Extend, run) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_gather.cc b/test/kernels/cuda/test_cuda_gather.cc index 90620a89..9dc987ba 100644 --- a/test/kernels/cuda/test_cuda_gather.cc +++ b/test/kernels/cuda/test_cuda_gather.cc @@ -186,9 +186,12 @@ TEST(Gather, Cuda) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 0); + auto inputCuda = gCuda->cloneTensor(input); + auto indexCuda = gCuda->cloneTensor(index); + auto op = gCuda->addOp(inputCuda, indexCuda, nullptr, 0); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4, 5, 6}); + indexCuda->copyin(vector{0, 1, 1, 2}); cudaRuntime->run(gCuda); // cudaPrintTensor(op->getOutput()); @@ -207,9 +210,12 @@ TEST(Gather, Cuda) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1); + auto inputCuda = gCuda->cloneTensor(input); + auto indexCuda = gCuda->cloneTensor(index); + auto op = gCuda->addOp(inputCuda, indexCuda, nullptr, 1); gCuda->dataMalloc(); + inputCuda->setData(IncrementalGenerator()); + indexCuda->copyin(vector{0, 2}); cudaRuntime->run(gCuda); // cudaPrintTensor(op->getOutput()); @@ -228,9 +234,12 @@ TEST(Gather, Cuda) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1); + auto inputCuda = gCuda->cloneTensor(input); + auto indexCuda = gCuda->cloneTensor(index); + auto op = gCuda->addOp(inputCuda, indexCuda, nullptr, 1); gCuda->dataMalloc(); + inputCuda->setData(IncrementalGenerator()); + indexCuda->copyin(vector{0, 3, 1}); cudaRuntime->run(gCuda); // cudaPrintTensor(op->getOutput()); diff --git a/test/kernels/cuda/test_cuda_matmul.cc b/test/kernels/cuda/test_cuda_matmul.cc index 805096c4..1f9bd677 100644 --- a/test/kernels/cuda/test_cuda_matmul.cc +++ b/test/kernels/cuda/test_cuda_matmul.cc @@ -32,6 +32,8 @@ void testMatmulCuda( // allocate CUDA memory gCuda->dataMalloc(); + ACuda->setData(generatorA); + BCuda->setData(generatorB); cudaRuntime->run(gCuda); auto CCpu = gCpu->cloneTensor(matmul->getOutput()); diff --git a/test/kernels/cuda/test_cuda_pad.cc b/test/kernels/cuda/test_cuda_pad.cc index ead88962..aca9af5a 100644 --- a/test/kernels/cuda/test_cuda_pad.cc +++ b/test/kernels/cuda/test_cuda_pad.cc @@ -13,8 +13,6 @@ TEST(Pad, Cuda) { // Build input data on CPU Tensor icpu = make_ref(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime); - icpu->dataMalloc(); - icpu->setData(IncrementalGenerator()); // Build CUDA graph; Graph g = make_ref(cudaRuntime); @@ -24,6 +22,7 @@ TEST(Pad, Cuda) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_pooling.cc b/test/kernels/cuda/test_cuda_pooling.cc index f055a881..7347b951 100644 --- a/test/kernels/cuda/test_cuda_pooling.cc +++ b/test/kernels/cuda/test_cuda_pooling.cc @@ -19,8 +19,6 @@ void testPoolCudnn( // Build input data on CPU Tensor i0cpu = make_ref(shape, DataType::Float32, cpuRuntime); - i0cpu->dataMalloc(); - i0cpu->setData(generator); // Build CUDA graph Graph g = make_ref(cudaRuntime); @@ -30,6 +28,7 @@ void testPoolCudnn( // allocate CUDA memory g->dataMalloc(); + i0->setData(generator); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_reduce_mean.cc b/test/kernels/cuda/test_cuda_reduce_mean.cc index 830c49c4..2ad672a7 100644 --- a/test/kernels/cuda/test_cuda_reduce_mean.cc +++ b/test/kernels/cuda/test_cuda_reduce_mean.cc @@ -17,8 +17,6 @@ void test_reducemean(const Shape &shape, const vector &data, // Build input data on CPU Tensor icpu = make_ref(shape, DataType::Float32, cpuRuntime); - icpu->dataMalloc(); - icpu->copyin(data); // Build CUDA graph Graph g = make_ref(cudaRuntime); @@ -27,6 +25,7 @@ void test_reducemean(const Shape &shape, const vector &data, // allocate CUDA memory g->dataMalloc(); + i->copyin(data); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_reshape.cc b/test/kernels/cuda/test_cuda_reshape.cc index 7e4a9c0c..a4b39c31 100644 --- a/test/kernels/cuda/test_cuda_reshape.cc +++ b/test/kernels/cuda/test_cuda_reshape.cc @@ -26,6 +26,7 @@ TEST(CUDA_Reshape, run) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); @@ -55,6 +56,7 @@ TEST(CUDA_Flatten, run) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); @@ -84,6 +86,7 @@ TEST(CUDA_Identity, run) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_resize.cc b/test/kernels/cuda/test_cuda_resize.cc index 7b096790..57c3aaf5 100644 --- a/test/kernels/cuda/test_cuda_resize.cc +++ b/test/kernels/cuda/test_cuda_resize.cc @@ -19,11 +19,15 @@ TEST(Resize, Cuda_downsample_sizes_nearest) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + sizesCuda->copyin(vector{1, 1, 1, 3}); + cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -44,13 +48,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notlarger) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, vector{2, 3}, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, vector{2, 3}, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::notLarger, ResizeObj::ENearestMode::roundPreferFloor, ResizeObj::ECoordinateTransMode::halfPixel); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + sizesCuda->copyin(vector{7, 8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -74,13 +81,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notsmaller) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, vector{2, 3}, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, vector{2, 3}, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::notSmaller, ResizeObj::ENearestMode::roundPreferFloor, ResizeObj::ECoordinateTransMode::halfPixel); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + sizesCuda->copyin(vector{7, 8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -105,13 +115,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_ceil_half_pixel) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ENearestMode::ceil, ResizeObj::ECoordinateTransMode::halfPixel); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 8, 8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -138,13 +152,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_floor_align_corners) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, vector{3, 2}, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, vector{3, 2}, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ENearestMode::floor, ResizeObj::ECoordinateTransMode::alignCorners); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{8, 8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -171,13 +189,18 @@ TEST(Resize, Cuda_upsample_sizes_nearest_round_prefer_ceil_asymmetri) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ENearestMode::roundPreferCeil, ResizeObj::ECoordinateTransMode::asymmetric); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 8, 8}); + cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -203,10 +226,13 @@ TEST(Resize, Cuda_downsample_scales_nearest) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp(gCuda->cloneTensor(input), nullptr, - std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scalesCuda->copyin(vector{1, 1, 0.6, 0.6}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -227,10 +253,13 @@ TEST(Resize, Cuda_upsample_scales_nearest) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp(gCuda->cloneTensor(input), nullptr, - std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + scalesCuda->copyin(vector{1, 1, 2, 3}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -253,10 +282,13 @@ TEST(Resize, Cuda_upsample_scales_nearest_axes_3_2) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp(gCuda->cloneTensor(input), nullptr, - vector{3, 2}, nullptr, - gCuda->cloneTensor(scales), nullptr); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, vector{3, 2}, + nullptr, scalesCuda, nullptr); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + scalesCuda->copyin(vector{3, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -279,10 +311,14 @@ TEST(Resize, Cuda_downsample_scales_linear) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr, + ResizeObj::ECoeffMode::linear); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scalesCuda->copyin(vector{1, 1, 0.6, 0.6}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -303,12 +339,15 @@ TEST(Resize, Cuda_downsample_scales_linear_aligncorners) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear, - ResizeObj::EKeepAspectRatioPolicy::none, + inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr, + ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none, ResizeObj::ECoordinateTransMode::alignCorners); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scalesCuda->copyin(vector{1, 1, 0.6, 0.6}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -329,10 +368,14 @@ TEST(Resize, Cuda_upsample_scales_linear) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr, + ResizeObj::ECoeffMode::linear); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + scalesCuda->copyin(vector{1, 1, 2, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -355,12 +398,15 @@ TEST(Resize, Cuda_upsample_scales_linear_align_corners) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear, - ResizeObj::EKeepAspectRatioPolicy::none, + inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr, + ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none, ResizeObj::ECoordinateTransMode::alignCorners); gCuda->dataMalloc(); + inputCuda->copyin(vector{1, 2, 3, 4}); + scalesCuda->copyin(vector{1, 1, 2, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -384,13 +430,17 @@ TEST(Resize, Cuda_downsample_sizes_linear_pytorchhalfpixel) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, - gCuda->cloneTensor(sizes), nullptr, nullptr, + inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ECoordinateTransMode::pytorchHalfPixel); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 3, 1}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -414,13 +464,19 @@ TEST(Resize, Cuda_tf_crop_and_resize) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); + auto roiCuda = gCuda->cloneTensor(roi); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, - gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi), + inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, roiCuda, ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ECoordinateTransMode::tfCropAndResize); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 3, 3}); + roiCuda->copyin(vector{0, 0, 0.4, 0.6, 1, 1, 0.6, 0.8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -445,13 +501,19 @@ TEST(Resize, Cuda_tf_crop_and_resize_axes_3_2) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); + auto roiCuda = gCuda->cloneTensor(roi); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, vector{3, 2}, - gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi), + inputCuda, nullptr, vector{3, 2}, sizesCuda, nullptr, roiCuda, ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::stretch, ResizeObj::ECoordinateTransMode::tfCropAndResize); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{3, 3}); + roiCuda->copyin(vector{0.6, 0.4, 0.8, 0.6}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -474,10 +536,15 @@ TEST(Resize, Cuda_downsample_scales_cubic) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr, + ResizeObj::ECoeffMode::cubic); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + scalesCuda->copyin(vector{1.0, 1.0, 0.8, 0.8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -501,12 +568,16 @@ TEST(Resize, Cuda_downsample_scales_cubic_align_corners) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic, - ResizeObj::EKeepAspectRatioPolicy::none, + inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr, + ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none, ResizeObj::ECoordinateTransMode::alignCorners); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + scalesCuda->copyin(vector{1.0, 1.0, 0.8, 0.8}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU auto oCpu = gCpu->cloneTensor(op->getOutput(0)); @@ -529,10 +600,15 @@ TEST(Resize, Cuda_upsample_scales_cubic) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); + auto op = gCuda->addOp(inputCuda, nullptr, std::nullopt, nullptr, + scalesCuda, nullptr, + ResizeObj::ECoeffMode::cubic); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + scalesCuda->copyin(vector{1.0, 1.0, 2, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU auto oCpu = gCpu->cloneTensor(op->getOutput(0)); @@ -566,12 +642,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_align_corners) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic, - ResizeObj::EKeepAspectRatioPolicy::none, + inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr, + ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none, ResizeObj::ECoordinateTransMode::alignCorners); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + scalesCuda->copyin(vector{1.0, 1.0, 2, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU auto oCpu = gCpu->cloneTensor(op->getOutput(0)); @@ -605,12 +685,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_asymmetric) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto scalesCuda = gCuda->cloneTensor(scales); auto op = gCuda->addOp( - gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr, - gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic, - ResizeObj::EKeepAspectRatioPolicy::none, + inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr, + ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none, ResizeObj::ECoordinateTransMode::asymmetric); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + scalesCuda->copyin(vector{1.0, 1.0, 2, 2}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU auto oCpu = gCpu->cloneTensor(op->getOutput(0)); @@ -640,12 +724,16 @@ TEST(Resize, Cuda_downsample_sizes_cubic) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = - gCuda->addOp(gCuda->cloneTensor(input), nullptr, - std::nullopt, gCuda->cloneTensor(sizes), + gCuda->addOp(inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::stretch); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 3, 3}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU @@ -674,12 +762,16 @@ TEST(Resize, Cuda_upsample_sizes_cubic) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); + auto inputCuda = gCuda->cloneTensor(input); + auto sizesCuda = gCuda->cloneTensor(sizes); auto op = - gCuda->addOp(gCuda->cloneTensor(input), nullptr, - std::nullopt, gCuda->cloneTensor(sizes), + gCuda->addOp(inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr, ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::stretch); gCuda->dataMalloc(); + inputCuda->copyin( + vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + sizesCuda->copyin(vector{1, 1, 9, 10}); cudaRuntime->run(gCuda); // copy output from CUDA to CPU auto oCpu = gCpu->cloneTensor(op->getOutput(0)); diff --git a/test/kernels/cuda/test_cuda_slice.cc b/test/kernels/cuda/test_cuda_slice.cc index 850fc2a8..5962d8cc 100644 --- a/test/kernels/cuda/test_cuda_slice.cc +++ b/test/kernels/cuda/test_cuda_slice.cc @@ -25,6 +25,7 @@ TEST(CUDA_Slice, run) { // allocate CUDA memory g->dataMalloc(); + i->setData(IncrementalGenerator()); // Execute on CUDA cudaRuntime->run(g); diff --git a/test/kernels/cuda/test_cuda_softmax.cc b/test/kernels/cuda/test_cuda_softmax.cc index 5a07ca78..9ce9705d 100644 --- a/test/kernels/cuda/test_cuda_softmax.cc +++ b/test/kernels/cuda/test_cuda_softmax.cc @@ -16,14 +16,13 @@ TEST(cuDNN_Softmax, run_axis1) { // Build input data on CPU Tensor inputCpu = make_ref(Shape{2, 4}, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 10003}); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, 1); cudaGraph->dataMalloc(); + inputGpu->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 10003}); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); @@ -42,14 +41,13 @@ TEST(cuDNN_Softmax, run_axis0) { // Build input data on CPU Tensor inputCpu = make_ref(Shape{2, 4}, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 10003}); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, 0); cudaGraph->dataMalloc(); + inputGpu->copyin(vector{0, 1, 2, 3, 10000, 10001, 10002, 10003}); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); @@ -67,14 +65,13 @@ TEST(cuDNN_Softmax2, run_axis1) { // Build input data on CPU Tensor inputCpu = make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(IncrementalGenerator()); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, 1); cudaGraph->dataMalloc(); + inputGpu->setData(IncrementalGenerator()); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); @@ -94,14 +91,13 @@ TEST(cuDNN_Softmax2, run_axis2) { // Build input data on CPU Tensor inputCpu = make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(IncrementalGenerator()); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, 2); cudaGraph->dataMalloc(); + inputGpu->setData(IncrementalGenerator()); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); @@ -121,14 +117,13 @@ TEST(cuDNN_Softmax2, run_axis3) { // Build input data on CPU Tensor inputCpu = make_ref(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(IncrementalGenerator()); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, 3); cudaGraph->dataMalloc(); + inputGpu->setData(IncrementalGenerator()); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); diff --git a/test/kernels/cuda/test_cuda_split.cc b/test/kernels/cuda/test_cuda_split.cc index 9b68a70c..163bba5c 100644 --- a/test/kernels/cuda/test_cuda_split.cc +++ b/test/kernels/cuda/test_cuda_split.cc @@ -19,9 +19,11 @@ TEST(Split, Cuda) { auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); - auto op = - gCuda->addOp(gCuda->cloneTensor(input), std::nullopt, 1, 3); + auto inputGpu = gCuda->cloneTensor(input); + auto op = gCuda->addOp(inputGpu, std::nullopt, 1, 3); gCuda->dataMalloc(); + inputGpu->setData(IncrementalGenerator()); + cudaRuntime->run(gCuda); // copy output from CUDA to CPU diff --git a/test/kernels/cuda/test_cuda_unary.cc b/test/kernels/cuda/test_cuda_unary.cc index c7beb760..22fed565 100644 --- a/test/kernels/cuda/test_cuda_unary.cc +++ b/test/kernels/cuda/test_cuda_unary.cc @@ -18,21 +18,22 @@ void testUnary(const std::function &generator, // Build input data on CPU Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(generator); // GPU Graph cudaGraph = make_ref(cudaRuntime); auto inputGpu = cudaGraph->cloneTensor(inputCpu); auto gpuOp = cudaGraph->addOp(inputGpu, nullptr); cudaGraph->dataMalloc(); + inputGpu->setData(generator); cudaRuntime->run(cudaGraph); auto outputGpu = gpuOp->getOutput(); auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); // CPU Graph cpuGraph = make_ref(cpuRuntime); auto cpuOp = cpuGraph->addOp(inputCpu, nullptr); + cpuGraph->addTensor(inputCpu); cpuGraph->dataMalloc(); + inputCpu->setData(generator); cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // Check diff --git a/test/kernels/intelcpu/test_mkl_gather.cc b/test/kernels/intelcpu/test_mkl_gather.cc index 1fc1f09b..cbe68533 100644 --- a/test/kernels/intelcpu/test_mkl_gather.cc +++ b/test/kernels/intelcpu/test_mkl_gather.cc @@ -13,12 +13,11 @@ TEST(Gather, Cuda) { Graph g = make_ref(runtime); auto input = g->addTensor({3, 2}, DataType::Float32); auto index = g->addTensor({2, 2}, DataType::UInt32); - g->dataMalloc(); - input->copyin(vector{1, 2, 3, 4, 5, 6}); - index->copyin(vector{0, 1, 1, 2}); auto op = g->addOp(input, index, nullptr, 0); g->dataMalloc(); + input->copyin(vector{1, 2, 3, 4, 5, 6}); + index->copyin(vector{0, 1, 1, 2}); runtime->run(g); EXPECT_TRUE( @@ -29,12 +28,11 @@ TEST(Gather, Cuda) { Graph g = make_ref(runtime); auto input = g->addTensor({3, 3}, DataType::Float32); auto index = g->addTensor({1, 2}, DataType::UInt32); - g->dataMalloc(); - input->setData(IncrementalGenerator()); - index->copyin(vector{0, 2}); auto op = g->addOp(input, index, nullptr, 1); g->dataMalloc(); + input->setData(IncrementalGenerator()); + index->copyin(vector{0, 2}); runtime->run(g); EXPECT_TRUE( @@ -45,12 +43,11 @@ TEST(Gather, Cuda) { Graph g = make_ref(runtime); auto input = g->addTensor({2, 4, 2}, DataType::Float32); auto index = g->addTensor({3, 1}, DataType::UInt32); - g->dataMalloc(); - input->setData(IncrementalGenerator()); - index->copyin(vector{0, 3, 1}); auto op = g->addOp(input, index, nullptr, 1); g->dataMalloc(); + input->setData(IncrementalGenerator()); + index->copyin(vector{0, 3, 1}); runtime->run(g); EXPECT_TRUE(op->getOutput()->equalData( diff --git a/test/kernels/intelcpu/test_mkl_matmul.cc b/test/kernels/intelcpu/test_mkl_matmul.cc index 8fcfe964..d5a5e9a7 100644 --- a/test/kernels/intelcpu/test_mkl_matmul.cc +++ b/test/kernels/intelcpu/test_mkl_matmul.cc @@ -19,13 +19,12 @@ void testMatmulMkl( Graph gCpu = make_ref(cpuRuntime); auto ACpu = gCpu->addTensor(shapeA, DataType::Float32); auto BCpu = gCpu->addTensor(shapeB, DataType::Float32); - gCpu->dataMalloc(); - ACpu->setData(generatorA); - BCpu->setData(generatorB); auto matmul = gCpu->addOp(ACpu, BCpu, nullptr, transA, transB); gCpu->dataMalloc(); + ACpu->setData(generatorA); + BCpu->setData(generatorB); cpuRuntime->run(gCpu); EXPECT_TRUE(matmul->getOutput()->equalData(ansVec)); } diff --git a/test/kernels/intelcpu/test_mkl_resize.cc b/test/kernels/intelcpu/test_mkl_resize.cc index c3c71a9d..68847829 100644 --- a/test/kernels/intelcpu/test_mkl_resize.cc +++ b/test/kernels/intelcpu/test_mkl_resize.cc @@ -18,11 +18,15 @@ TEST(Resize, Mkl_downsample_sizes_nearest) { auto runtime = make_ref(); Graph g = make_ref(runtime); - auto op = g->addOp(g->cloneTensor(input), nullptr, std::nullopt, - g->cloneTensor(sizes), nullptr, nullptr, - ResizeObj::EKeepAspectRatioPolicy::stretch, - ResizeObj::ENearestMode::ceil); + auto input2 = g->cloneTensor(input); + auto sizes2 = g->cloneTensor(sizes); + auto op = + g->addOp(input2, nullptr, std::nullopt, sizes2, nullptr, + nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch, + ResizeObj::ENearestMode::ceil); g->dataMalloc(); + input2->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + sizes2->copyin(vector{1, 1, 1, 3}); runtime->run(g); EXPECT_TRUE(op->getOutput(0)->equalData(vector{5, 7, 8})); diff --git a/test/operators/test_clip.cc b/test/operators/test_clip.cc index 424f0c60..6453c3f9 100644 --- a/test/operators/test_clip.cc +++ b/test/operators/test_clip.cc @@ -15,15 +15,15 @@ void testClip(const std::function &generator, // Build input data on CPU Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); - inputCpu->dataMalloc(); - inputCpu->setData(generator); // GPU Graph Graph = make_ref(cpuRuntime); float min = 1.0; float max = 4.0; auto Op = Graph->addOp(inputCpu, nullptr, min, max); + Graph->addTensor(inputCpu); Graph->dataMalloc(); + inputCpu->setData(generator); cpuRuntime->run(Graph); auto output = Op->getOutput(); inputCpu->printData();