memory_allocator (#103)

* - add LazyAllocator class
- calculate memory consumption at present

* - basic function of lazy_allocator, remaining test

* - modify LazyAllocator

* - modify InfiniTensor to fit LazyAllocator

* - add setDataBlob
- modify alignment
- fix GraphObj::dataMalloc

* - modified alignment value(64bytes -> 8bytes)
- fix LazyAllocator::getPtr()
- some debug code and comments
- do alignment by changing size instead of tailAddr

* - fix some problem

* - translate chinese comments to english

* - format codes

* - fix test

* - code format

* - modify codes as YdrMaser and bitzyz suggested

* - code format

* - modify codes as constroy suggested

* - codes format

* - modify alignment on cuda

* - code format

* - add test_lazy_allocator
- fix tests where not add input tensor into graph.tensors
- fix tests where init tensor's data before calling graph->dataMallocate()

* - code format

* - remove gpu runtime in test_lazy_allocator

* - fix test_lazy_allocator: remove cuda include

* - add test

* - code format

* - add ifdef for test of allocator

* - code format

* - fix test: remove unused ifdef

* - fix bang test

* - code format

* Merge branch 'master' into dcj/memory_allocator

* fix: fix cuda conv_fp16 run fail

* fix bang_runtime.cc and cuda_runtime.cc

* - update mkl code

* - fix codes for mkl

* - code format

* - remove unused commented codes
- add an empty line at the end of the blob.cc

---------

Co-authored-by: zhangyunze <z13785159769@163.com>
This commit is contained in:
kilinchange 2023-08-13 13:39:35 +08:00 committed by GitHub
parent bd9e1aeb3f
commit 0dc5347089
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
41 changed files with 658 additions and 142 deletions

View File

@ -234,6 +234,7 @@ function(build_test files)
endfunction()
if(BUILD_TEST)
add_compile_definitions(BUILD_TEST=1)
enable_testing()
if(USE_TRACE)
build_test(test/trace/*.cc)

@ -1 +1 @@
Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77

View File

@ -1,4 +1,5 @@
#pragma once
#include "core/lazy_allocator.h"
#include "core/operator.h"
#include "core/tensor.h"
@ -9,9 +10,11 @@ class GraphObj : public Object {
Runtime runtime;
TensorVec tensors;
OpVec ops;
LazyAllocator allocator;
public:
explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
explicit GraphObj(Runtime runtime)
: runtime(runtime), allocator(runtime), sorted(false){};
GraphObj(Runtime runtime, OpVec ops_in);
string toString() const override;
Runtime getRuntime() const { return runtime; }

View File

@ -0,0 +1,84 @@
#pragma once
#include "core/runtime.h"
#include "core/tensor.h"
#ifdef BUILD_TEST
#include "gtest/gtest.h"
#endif
#include <cstddef>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
namespace infini {
// Simulates tensor memory allocation ahead of time so that one real
// allocation (getPtr) can back every tensor at a precomputed offset.
// alloc()/free() operate on offsets only; no device memory is touched
// until getPtr() is called.
class LazyAllocator {
private:
#ifdef BUILD_TEST
FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
#endif
Runtime runtime;
// bytes currently in simulated use
size_t used;
// high-water mark of the simulated arena; size of the real allocation
size_t peak;
// every simulated block size is rounded up to a multiple of this
size_t alignment;
// pointer to the memory actually allocated
void *ptr;
struct freeBlockInfo {
size_t addr;
size_t blockSize;
};
// orders free blocks by size first, then by address (best-fit search)
struct cmpFreeBlockInfo {
bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
: (a.addr < b.addr);
}
};
// free balanced tree, maintains all free memory blocks
std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
// key: head address offset of the free memory block
// value: blockSize of the block
std::unordered_map<size_t, size_t> headAddrToBlockSize;
// key: tail address offset of the free memory block
// value: blockSize of the block
std::unordered_map<size_t, size_t> tailAddrToBlockSize;
public:
LazyAllocator(Runtime runtime);
virtual ~LazyAllocator();
// function: simulate memory allocation
// arguments
// size: size of memory block to be allocated
// return: head address offset of the allocated memory block
size_t alloc(size_t size);
// function: simulate memory free
// arguments:
// addr: head address offset of memory block to be free
// size: size of memory block to be freed
void free(size_t addr, size_t size);
// function: perform actual memory allocation
// return: pointer to the head address of the allocated memory
void *getPtr();
// prints used/peak statistics to stdout
void info();
private:
// function: memory alignment, rounded up
// return: size of the aligned memory block
size_t getAlignedSize(size_t size);
};
} // namespace infini

View File

@ -71,10 +71,16 @@ class TensorObj : public TensorBaseObj {
void copyData(const TensorObj *src);
void copyData(const Tensor &src) { copyData(src.get()); }
// TODO: Rename this function later, because it is confused that it will
// change the field data, but actually it generates data and maybe copy to
// device.
// FIXME: std::fucntion copies the generator instead of passing it by ref.
// Thus the internal state of generator cannot be updated.
void setData(
std::function<void(void *, size_t, DataType)> const &generator) const;
void setDataBlob(const Blob &blob);
Tensor clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();

View File

@ -5,7 +5,7 @@ namespace infini {
BlobObj::~BlobObj() {
// Avoid cycled inclusion
runtime->dealloc(ptr);
// destruction is performed in LazyAllocator
}
} // namespace infini
} // namespace infini

View File

@ -5,7 +5,7 @@
namespace infini {
GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
: runtime(runtime), sorted(false) {
: runtime(runtime), allocator(runtime), sorted(false) {
map<UidBaseType, Tensor> tensorPool;
// Clone tensors
for (const auto &op : ops_in) {
@ -124,9 +124,58 @@ void GraphObj::optimize() {
}
void GraphObj::dataMalloc() {
// topological sorting first
IT_ASSERT(topo_sort() == true);
// count the number of times all tensors are used
std::unordered_map<TensorObj *, size_t> tensorToRefCount;
// record the memory address offsets of all tensors to be allocated
std::unordered_map<TensorObj *, size_t> tensorToOffset;
// record all constant tensors, including weight tensors and input tensors
std::unordered_set<TensorObj *> constTensor;
for (auto &tensor : tensors) {
tensor->dataMalloc();
if (tensor.get()->getSource() == nullptr) {
// allocate memory for all constant tensors first, and this memory
// will not be reused later
constTensor.insert(tensor.get());
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
} else {
tensorToRefCount[tensor.get()] = tensor->getTargets().size();
}
}
// traverse in topological order and simulate memory allocation
for (auto &op : ops) {
// memory should be allocated for the output first
auto outputs = op->getOutputs();
for (auto &tensor : outputs) {
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
}
auto inputs = op->getInputs();
for (auto &tensor : inputs) {
if (constTensor.find(tensor.get()) == constTensor.end()) {
auto tensorIter = tensorToRefCount.find(tensor.get());
IT_ASSERT(tensorIter != tensorToRefCount.end());
tensorToRefCount[tensor.get()] -= 1;
if (tensorToRefCount[tensor.get()] == 0) {
// indicate that this tensor will no longer be used and
// perform memory free
tensorToRefCount.erase(tensor.get());
allocator.free(tensorToOffset[tensor.get()],
tensor->getBytes());
}
}
}
}
// perform actual memory allocation
for (auto &tensor : tensors) {
IT_ASSERT(tensorToOffset.find(tensor.get()) != tensorToOffset.end());
tensor->setDataBlob(make_ref<BlobObj>(
tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
tensorToOffset[tensor.get()]));
}
allocator.info();
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {

143
src/core/lazy_allocator.cc Normal file
View File

@ -0,0 +1,143 @@
#include "core/lazy_allocator.h"
#include <utility>
namespace infini {
// In
// cuda-c-programming-guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses):
// Any address of a variable residing in global memory or returned by one of the
// memory allocation routines from the driver or runtime API is always aligned
// to at least 256 bytes.
constexpr size_t alignmentInBytesForCUDA = 256;
// Construct an allocator bound to `runtime`.
// The runtime determines the alignment used when padding block sizes:
// 256 bytes on CUDA (see the programming-guide note above); otherwise
// sizeof(uint64_t), the length of the longest data type currently
// supported by the DataType field of the tensor.
// Members are set in the initializer list so none is ever read
// uninitialized (the original assigned them in the constructor body).
LazyAllocator::LazyAllocator(Runtime runtime)
    : runtime(runtime), used(0), peak(0),
      // TODO: the alignment on cuda and bang might need further discussion
      alignment(runtime->isCuda() ? alignmentInBytesForCUDA
                                  : sizeof(uint64_t)),
      ptr(nullptr) {}
// Release the backing buffer, if getPtr() ever performed the real
// allocation; a purely simulated allocator owns no memory.
LazyAllocator::~LazyAllocator() {
    if (ptr != nullptr) {
        runtime->dealloc(ptr);
    }
}
// Simulate an allocation of `size` bytes; no real memory is touched.
// Strategy is best-fit: take the smallest free block that can hold the
// aligned size, otherwise grow the simulated arena at `peak`.
// Returns the byte offset of the block within the (future) arena.
size_t LazyAllocator::alloc(size_t size) {
    // simulation must finish before getPtr() performs the real allocation
    IT_ASSERT(this->ptr == nullptr);
    // pad the size to the multiple of alignment
    size = this->getAlignedSize(size);
    // freeBlocks is ordered by (blockSize, addr), so lower_bound yields
    // the smallest free block with blockSize >= size (best fit)
    auto it = this->freeBlocks.lower_bound(freeBlockInfo{(size_t)0, size});
    size_t retAddr = this->peak;
    if (it != this->freeBlocks.end()) {
        // found an available free memory block for allocation
        size_t blockSize = it->blockSize;
        retAddr = it->addr;
        size_t tailAddr = retAddr + size;
        // update the map of head and tail address offset of memory blocks;
        // erasing tailAddr only removes an entry when blockSize == size
        // (otherwise the chosen block's tail lies at retAddr + blockSize)
        this->headAddrToBlockSize.erase(retAddr);
        this->tailAddrToBlockSize.erase(tailAddr);
        // memory block splitting: keep the unused remainder free
        if (blockSize > tailAddr - retAddr) {
            freeBlockInfo newBlock = {tailAddr,
                                      blockSize - (tailAddr - retAddr)};
            this->headAddrToBlockSize[tailAddr] = newBlock.blockSize;
            // overwrites the split block's old tail entry in place
            this->tailAddrToBlockSize[retAddr + blockSize] = newBlock.blockSize;
            this->freeBlocks.insert(newBlock);
        }
        // update the free balanced tree
        this->freeBlocks.erase(it);
        this->used += tailAddr - retAddr;
    } else {
        // the allocated memory space is not sufficient for reallocation, it
        // needs to be extended
        auto blockTailWithPeak = this->tailAddrToBlockSize.find(this->peak);
        if (blockTailWithPeak != this->tailAddrToBlockSize.end()) {
            // there is a free block located at the end of the currently
            // allocated memory, where this free block has its tail address as
            // 'peak'; reuse it and extend the arena only by the shortfall
            retAddr = this->peak - blockTailWithPeak->second;
            // if this end block were big enough, best-fit would have found it
            IT_ASSERT(blockTailWithPeak->second < size);
            this->peak += (size - blockTailWithPeak->second);
            // update freeBlocks, headAddrToBlockSize and tailAddrToBlockSize
            freeBlockInfo endBlock = {retAddr, blockTailWithPeak->second};
            this->freeBlocks.erase(endBlock);
            this->headAddrToBlockSize.erase(endBlock.addr);
            this->tailAddrToBlockSize.erase(endBlock.addr + endBlock.blockSize);
        } else {
            this->peak = this->peak + size;
        }
        this->used += size;
    }
    return retAddr;
}
// Simulate freeing the block at offset `addr` of `size` bytes.
// The freed block is coalesced with any directly adjacent free block on
// either side, keeping freeBlocks and the two head/tail maps consistent.
void LazyAllocator::free(size_t addr, size_t size) {
    // only valid during simulation, before the real allocation happens
    IT_ASSERT(this->ptr == nullptr);
    // alloc() padded the size, so free must pad identically
    size = getAlignedSize(size);
    auto tailAddr = addr + size;
    freeBlockInfo block = {addr, tailAddr - addr};
    // provisionally register the freed block; merging below may rewrite
    // or remove these entries
    this->headAddrToBlockSize[addr] = block.blockSize;
    this->tailAddrToBlockSize[tailAddr] = block.blockSize;
    // a free block ending exactly at `addr` is our left neighbor
    auto preFreeBlockIter = this->tailAddrToBlockSize.find(addr);
    // a free block starting exactly at `tailAddr` is our right neighbor
    auto subFreeBlockIter = this->headAddrToBlockSize.find(tailAddr);
    if (preFreeBlockIter != this->tailAddrToBlockSize.end()) {
        // the head address of the memory block to be freed matches the end of a
        // free block, merge them together
        size_t preBlockSize = preFreeBlockIter->second;
        this->headAddrToBlockSize.erase(block.addr);
        this->headAddrToBlockSize[block.addr - preBlockSize] += block.blockSize;
        this->tailAddrToBlockSize.erase(block.addr);
        this->tailAddrToBlockSize[tailAddr] += preBlockSize;
        block.addr -= preBlockSize;
        block.blockSize += preBlockSize;
        // delete the preceding adjacent free block
        this->freeBlocks.erase(freeBlockInfo{block.addr, preBlockSize});
    }
    if (subFreeBlockIter != this->headAddrToBlockSize.end()) {
        // the tail address of the memory block to be freed matches the start of
        // a free block, merge them together
        auto subBlockSize = subFreeBlockIter->second;
        this->headAddrToBlockSize.erase(tailAddr);
        this->headAddrToBlockSize[block.addr] += subBlockSize;
        this->tailAddrToBlockSize.erase(tailAddr);
        this->tailAddrToBlockSize[tailAddr + subBlockSize] += block.blockSize;
        tailAddr += subBlockSize;
        block.blockSize += subBlockSize;
        // delete the succeeding adjacent memory block
        this->freeBlocks.erase(
            freeBlockInfo{tailAddr - subBlockSize, subBlockSize});
    }
    // insert the (possibly merged) block into the free tree
    this->freeBlocks.insert(block);
    this->used -= size;
}
// Perform the actual memory allocation, lazily, on first call.
// Allocates `peak` bytes (the simulated high-water mark) from the runtime;
// subsequent calls return the same pointer without reallocating.
void *LazyAllocator::getPtr() {
    if (this->ptr == nullptr) {
        this->ptr = runtime->alloc(this->peak);
        // %zu is the portable conversion specifier for size_t; the previous
        // %lu is wrong on platforms where size_t != unsigned long (LLP64)
        printf("LazyAllocator really alloc: %p %zu bytes\n", this->ptr, peak);
    }
    return this->ptr;
}
// Round `size` up to the next multiple of `alignment` (0 stays 0).
size_t LazyAllocator::getAlignedSize(size_t size) {
    size_t bumped = size + this->alignment - 1;
    return bumped - bumped % this->alignment;
}
void LazyAllocator::info() {
std::cout << "Used memory: " << this->used
<< ", peak memory: " << this->peak << std::endl;
}
} // namespace infini

View File

@ -150,6 +150,8 @@ void TensorObj::setData(
}
}
void TensorObj::setDataBlob(const Blob &blob) { this->data = blob; }
void TensorObj::load(std::string file_path) { loadTensorData(this, file_path); }
void TensorObj::save(std::string file_path) { saveTensorData(this, file_path); }

View File

@ -6,7 +6,7 @@
namespace infini {
class MklBinary : public MklKernelWithoutConfig {
dnnl::algorithm getAlgorithem(const Ref<ElementWiseObj> &op) const {
switch (op->getOpType()) {
switch (op->getOpType().underlying()) {
case OpType::Add:
return dnnl::algorithm::binary_add;
case OpType::Sub:
@ -64,7 +64,7 @@ class MklBinary : public MklKernelWithoutConfig {
class MklUnary : public MklKernelWithoutConfig {
dnnl::algorithm getAlgorithem(const Ref<UnaryObj> &op) const {
switch (op->getOpType()) {
switch (op->getOpType().underlying()) {
case OpType::Relu:
return dnnl::algorithm::eltwise_relu;
case OpType::Tanh:

View File

@ -69,7 +69,7 @@ template <typename T> class MklDpcppMatmul : public CpuKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::INTELCPU, OpType::Matmul, DataType::Float32,
REGISTER_KERNEL(Device::INTELCPU, OpType::MatMul, DataType::Float32,
MklDpcppMatmul<float>, "MklDpcppMatmul_CPU_float32");
} // namespace infini

View File

@ -77,7 +77,7 @@ class MklMaxPool : public MklPooling {
}
};
REGISTER_KERNEL(Device::INTELCPU, OpType::AvgPool, DataType::Float32,
REGISTER_KERNEL(Device::INTELCPU, OpType::AveragePool, DataType::Float32,
MklAvgPool, "AvgPool_Mkl_Float32");
REGISTER_KERNEL(Device::INTELCPU, OpType::MaxPool, DataType::Float32,
MklMaxPool, "MaxPool_Mkl_Float32");

View File

@ -0,0 +1,96 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "test.h"
namespace infini {
// Freeing two adjacent blocks must coalesce them into one free block
// whose size is the sum of the two aligned sizes.
TEST(LazyAllocator, testMergeFreeBlocks) {
    Shape shape = Shape{1, 2, 2, 3};
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    LazyAllocator allocator = LazyAllocator(runtime);
    // allocate a->b->c->d
    allocator.alloc(a->getBytes());
    size_t offsetB = allocator.alloc(b->getBytes());
    size_t offsetC = allocator.alloc(c->getBytes());
    allocator.alloc(d->getBytes());
    // free b and c
    allocator.free(offsetB, b->getBytes());
    allocator.free(offsetC, c->getBytes());
    // expected to be a->mergedFreeBlock->d, where mergedFreeBlock is the result
    // of merging the memory blocks corresponding to the already freed b and c
    // (inspecting freeBlocks directly is allowed via FRIEND_TEST)
    EXPECT_EQ(allocator.freeBlocks.size(), 1);
    EXPECT_EQ(allocator.freeBlocks.begin()->addr, offsetB);
    EXPECT_EQ(allocator.freeBlocks.begin()->blockSize,
              allocator.getAlignedSize(b->getBytes()) +
                  allocator.getAlignedSize(c->getBytes()));
}
// A freed slot of matching size must be reused by the next allocation.
TEST(LazyAllocator, testAlloc) {
    Shape shape = Shape{1, 2, 2, 3};
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    LazyAllocator allocator = LazyAllocator(runtime);
    // lay out a->b->c
    allocator.alloc(a->getBytes());
    size_t offsetB = allocator.alloc(b->getBytes());
    allocator.alloc(c->getBytes());
    // releasing b leaves a hole that d (same size) should fill exactly
    allocator.free(offsetB, b->getBytes());
    size_t offsetD = allocator.alloc(d->getBytes());
    // expected layout: a->d->c, i.e. d reuses b's slot
    EXPECT_EQ(offsetB, offsetD);
}
// When the only free block sits at the end of the arena, an oversized
// allocation must reuse it and extend the peak only by the shortfall,
// leaving no free block behind.
TEST(LazyAllocator, testAllocWithEndFreeBlock) {
    Shape shape = Shape{1, 2, 2, 3};
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    // d is larger than c, so it cannot fit in c's slot without extension
    Tensor d =
        make_ref<TensorObj>(Shape{2, 2, 2, 3}, DataType::Float32, runtime);
    LazyAllocator allocator = LazyAllocator(runtime);
    // allocate a->b->c
    allocator.alloc(a->getBytes());
    allocator.alloc(b->getBytes());
    size_t offsetC = allocator.alloc(c->getBytes());
    allocator.info();
    // free c, then allocate d
    allocator.free(offsetC, c->getBytes());
    size_t offsetD = allocator.alloc(d->getBytes());
    allocator.info();
    // expected to be a->b->d, with no free block between b and c
    // (freeBlocks is visible here via FRIEND_TEST)
    EXPECT_EQ(allocator.freeBlocks.size(), 0);
    EXPECT_EQ(offsetC, offsetD);
}
// getPtr() performs the real allocation exactly once; every later call
// must hand back the identical base pointer.
TEST(LazyAllocator, testGetPtr) {
    Shape shape = Shape{1, 2, 2, 3};
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
    LazyAllocator allocator = LazyAllocator(runtime);
    // simulate four allocations: a->b->c->d
    allocator.alloc(a->getBytes());
    allocator.alloc(b->getBytes());
    allocator.alloc(c->getBytes());
    allocator.alloc(d->getBytes());
    // multiple calls to the getPtr() function should return the same pointer
    void *firstPtr = allocator.getPtr();
    void *secondPtr = allocator.getPtr();
    EXPECT_EQ(firstPtr, secondPtr);
}
} // namespace infini

View File

@ -21,12 +21,8 @@ void testBangcKernel(
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generator);
// inputCpu1->printData();
// inputCpu2->printData();
@ -37,6 +33,8 @@ void testBangcKernel(
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
bangGraph->dataMalloc();
inputGpu1->setData(generator);
inputGpu2->setData(generator);
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@ -44,7 +42,11 @@ void testBangcKernel(
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generator);
inputCpu2->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// outputCpu->printData();

View File

@ -19,12 +19,8 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
@ -33,6 +29,8 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
auto mluOp =
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
bangGraph->dataMalloc();
inputMlu1->setData(generatorA);
inputMlu2->setData(generatorB);
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
@ -40,7 +38,11 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp =
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();

View File

@ -33,6 +33,8 @@ void testElementWiseCnnl(
// allocate BANG memory
g->dataMalloc();
a->setData(generator);
b->setData(generator);
// Execute on BANG
bangRuntime->run(g);

View File

@ -20,12 +20,8 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generatorA);
Tensor inputCpu2 =
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generatorB);
// MLU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
@ -33,13 +29,19 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
bangGraph->dataMalloc();
inputMlu1->setData(generatorA);
inputMlu2->setData(generatorB);
bangRuntime->run(bangGraph);
auto outputMlu = mluOp->getOutput();
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generatorA);
inputCpu2->setData(generatorB);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
outputCpu->print();

View File

@ -19,12 +19,8 @@ void testOptensor(
// Build input data on CPU
Tensor inputCpu1 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu1->dataMalloc();
inputCpu1->setData(generator);
Tensor inputCpu2 =
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu2->dataMalloc();
inputCpu2->setData(generator);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
@ -32,13 +28,19 @@ void testOptensor(
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
bangGraph->dataMalloc();
inputGpu1->setData(generator);
inputGpu2->setData(generator);
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
cpuGraph->addTensor(inputCpu1);
cpuGraph->addTensor(inputCpu2);
cpuGraph->dataMalloc();
inputCpu1->setData(generator);
inputCpu2->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check

View File

@ -17,21 +17,22 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
bangGraph->dataMalloc();
inputGpu->setData(generator);
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check

View File

@ -18,8 +18,6 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
@ -28,13 +26,16 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
float max = 4.0;
auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, min, max);
cudaGraph->dataMalloc();
inputGpu->setData(generator);
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr, min, max);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check

View File

@ -58,11 +58,16 @@ TEST(Concat, Cuda) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ConcatObj>(TensorVec{gCuda->cloneTensor(t1),
gCuda->cloneTensor(t2),
gCuda->cloneTensor(t3)},
nullptr, 2);
auto t1Gpu = gCuda->cloneTensor(t1);
auto t2Gpu = gCuda->cloneTensor(t2);
auto t3Gpu = gCuda->cloneTensor(t3);
auto op =
gCuda->addOp<ConcatObj>(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2);
gCuda->dataMalloc();
t1Gpu->setData(IncrementalGenerator());
t2Gpu->setData(OneGenerator());
t3Gpu->setData(OneGenerator());
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());

View File

@ -33,6 +33,8 @@ void testConvCudnn(
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory
gCuda->dataMalloc();
i0Cuda->setData(generator);
w0Cuda->setData(generator);
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
@ -72,6 +74,8 @@ TEST(cuDNN_Conv, tune) {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
// allocate CUDA memory
gCuda->dataMalloc();
i0Cuda->setData(IncrementalGenerator());
w0Cuda->setData(IncrementalGenerator());
// Execute on CUDA
bool tune = true;
cuda->run(gCuda, tune);

View File

@ -35,6 +35,8 @@ void testConvCudnnFP16(
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory
gCuda->dataMalloc();
i0Cuda->setData(generator);
w0Cuda->setData(generator);
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
@ -71,6 +73,8 @@ TEST(cuDNN_Conv_FP16, tune) {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
// allocate CUDA memory
gCuda->dataMalloc();
i0Cuda->setData(IncrementalGenerator());
w0Cuda->setData(IncrementalGenerator());
// Execute on CUDA
bool tune = true;
cuda->run(gCuda, tune);

View File

@ -36,6 +36,8 @@ void testConvTransposedCudnn(
padding, padding, stride,
stride, dilation, dilation);
gCuda->dataMalloc();
i0Cuda->setData(generator);
w0Cuda->setData(generator);
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
@ -70,6 +72,8 @@ void testConvTransposedNHWCCudnn(
i0Cuda, w0Cuda, nullptr, padding, padding, stride, stride, dilation,
dilation);
gCuda->dataMalloc();
i0Cuda->setData(generator);
w0Cuda->setData(generator);
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
@ -115,6 +119,8 @@ TEST(cuDNN_ConvTransposed, run1) {
auto conv =
gCuda->addOp<ConvTransposed2dObj>(i0Cuda, w0Cuda, nullptr, 0, 0);
gCuda->dataMalloc();
i0Cuda->setData(IncrementalGenerator());
w0Cuda->setData(IncrementalGenerator());
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
@ -148,6 +154,8 @@ TEST(cuDNN_ConvTransposed, tune) {
auto conv = gCuda->addOp<ConvTransposed2dObj>(i0Cuda, w0Cuda, nullptr);
// allocate CUDA memory
gCuda->dataMalloc();
i0Cuda->setData(IncrementalGenerator());
w0Cuda->setData(IncrementalGenerator());
// Execute on CUDA
bool tune = true;
cuda->run(gCuda, tune);

View File

@ -19,12 +19,8 @@ void testElementWiseCudnn(
// Build input data on CPU
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
acpu->dataMalloc();
acpu->setData(generator);
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
bcpu->dataMalloc();
bcpu->setData(generator);
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -34,6 +30,8 @@ void testElementWiseCudnn(
// allocate CUDA memory
g->dataMalloc();
a->setData(generator);
b->setData(generator);
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -16,8 +16,6 @@ TEST(CUDA_Extend, run) {
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{2, 3, 2, 2}, DataType::Float32, cpuRuntime);
icpu->dataMalloc();
icpu->setData(IncrementalGenerator());
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -26,6 +24,7 @@ TEST(CUDA_Extend, run) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -186,9 +186,12 @@ TEST(Gather, Cuda) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<GatherObj>(
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 0);
auto inputCuda = gCuda->cloneTensor(input);
auto indexCuda = gCuda->cloneTensor(index);
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 0);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6});
indexCuda->copyin(vector<uint32_t>{0, 1, 1, 2});
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
@ -207,9 +210,12 @@ TEST(Gather, Cuda) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<GatherObj>(
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1);
auto inputCuda = gCuda->cloneTensor(input);
auto indexCuda = gCuda->cloneTensor(index);
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
gCuda->dataMalloc();
inputCuda->setData(IncrementalGenerator());
indexCuda->copyin(vector<uint32_t>{0, 2});
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
@ -228,9 +234,12 @@ TEST(Gather, Cuda) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<GatherObj>(
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1);
auto inputCuda = gCuda->cloneTensor(input);
auto indexCuda = gCuda->cloneTensor(index);
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
gCuda->dataMalloc();
inputCuda->setData(IncrementalGenerator());
indexCuda->copyin(vector<uint32_t>{0, 3, 1});
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());

View File

@ -32,6 +32,8 @@ void testMatmulCuda(
// allocate CUDA memory
gCuda->dataMalloc();
ACuda->setData(generatorA);
BCuda->setData(generatorB);
cudaRuntime->run(gCuda);
auto CCpu = gCpu->cloneTensor(matmul->getOutput());

View File

@ -13,8 +13,6 @@ TEST(Pad, Cuda) {
// Build input data on CPU
Tensor icpu =
make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
icpu->dataMalloc();
icpu->setData(IncrementalGenerator());
// Build CUDA graph;
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -24,6 +22,7 @@ TEST(Pad, Cuda) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -19,8 +19,6 @@ void testPoolCudnn(
// Build input data on CPU
Tensor i0cpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
i0cpu->dataMalloc();
i0cpu->setData(generator);
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -30,6 +28,7 @@ void testPoolCudnn(
// allocate CUDA memory
g->dataMalloc();
i0->setData(generator);
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -17,8 +17,6 @@ void test_reducemean(const Shape &shape, const vector<float> &data,
// Build input data on CPU
Tensor icpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
icpu->dataMalloc();
icpu->copyin(data);
// Build CUDA graph
Graph g = make_ref<GraphObj>(cudaRuntime);
@ -27,6 +25,7 @@ void test_reducemean(const Shape &shape, const vector<float> &data,
// allocate CUDA memory
g->dataMalloc();
i->copyin(data);
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -26,6 +26,7 @@ TEST(CUDA_Reshape, run) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);
@ -55,6 +56,7 @@ TEST(CUDA_Flatten, run) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);
@ -84,6 +86,7 @@ TEST(CUDA_Identity, run) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -19,11 +19,15 @@ TEST(Resize, Cuda_downsample_sizes_nearest) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt,
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::stretch);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
sizesCuda->copyin(vector<uint32_t>{1, 1, 1, 3});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -44,13 +48,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notlarger) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, vector<int>{2, 3},
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, vector<int>{2, 3}, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::notLarger,
ResizeObj::ENearestMode::roundPreferFloor,
ResizeObj::ECoordinateTransMode::halfPixel);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
sizesCuda->copyin(vector<uint32_t>{7, 8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -74,13 +81,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notsmaller) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, vector<int>{2, 3},
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, vector<int>{2, 3}, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::notSmaller,
ResizeObj::ENearestMode::roundPreferFloor,
ResizeObj::ECoordinateTransMode::halfPixel);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
sizesCuda->copyin(vector<uint32_t>{7, 8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -105,13 +115,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_ceil_half_pixel) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt,
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ENearestMode::ceil,
ResizeObj::ECoordinateTransMode::halfPixel);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 8, 8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -138,13 +152,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_floor_align_corners) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, vector<int>{3, 2},
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, vector<int>{3, 2}, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ENearestMode::floor,
ResizeObj::ECoordinateTransMode::alignCorners);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{8, 8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -171,13 +189,18 @@ TEST(Resize, Cuda_upsample_sizes_nearest_round_prefer_ceil_asymmetri) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt,
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ENearestMode::roundPreferCeil,
ResizeObj::ECoordinateTransMode::asymmetric);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 8, 8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -203,10 +226,13 @@ TEST(Resize, Cuda_downsample_scales_nearest) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -227,10 +253,13 @@ TEST(Resize, Cuda_upsample_scales_nearest) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
scalesCuda->copyin(vector<float>{1, 1, 2, 3});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -253,10 +282,13 @@ TEST(Resize, Cuda_upsample_scales_nearest_axes_3_2) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
vector<int>{3, 2}, nullptr,
gCuda->cloneTensor(scales), nullptr);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, vector<int>{3, 2},
nullptr, scalesCuda, nullptr);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
scalesCuda->copyin(vector<float>{3, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -279,10 +311,14 @@ TEST(Resize, Cuda_downsample_scales_linear) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr,
ResizeObj::ECoeffMode::linear);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -303,12 +339,15 @@ TEST(Resize, Cuda_downsample_scales_linear_aligncorners) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear,
ResizeObj::EKeepAspectRatioPolicy::none,
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none,
ResizeObj::ECoordinateTransMode::alignCorners);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -329,10 +368,14 @@ TEST(Resize, Cuda_upsample_scales_linear) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr,
ResizeObj::ECoeffMode::linear);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
scalesCuda->copyin(vector<float>{1, 1, 2, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -355,12 +398,15 @@ TEST(Resize, Cuda_upsample_scales_linear_align_corners) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear,
ResizeObj::EKeepAspectRatioPolicy::none,
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none,
ResizeObj::ECoordinateTransMode::alignCorners);
gCuda->dataMalloc();
inputCuda->copyin(vector<float>{1, 2, 3, 4});
scalesCuda->copyin(vector<float>{1, 1, 2, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -384,13 +430,17 @@ TEST(Resize, Cuda_downsample_sizes_linear_pytorchhalfpixel) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt,
gCuda->cloneTensor(sizes), nullptr, nullptr,
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
ResizeObj::ECoeffMode::linear,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ECoordinateTransMode::pytorchHalfPixel);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 1});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -414,13 +464,19 @@ TEST(Resize, Cuda_tf_crop_and_resize) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto roiCuda = gCuda->cloneTensor(roi);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt,
gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi),
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, roiCuda,
ResizeObj::ECoeffMode::linear,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ECoordinateTransMode::tfCropAndResize);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 3});
roiCuda->copyin(vector<float>{0, 0, 0.4, 0.6, 1, 1, 0.6, 0.8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -445,13 +501,19 @@ TEST(Resize, Cuda_tf_crop_and_resize_axes_3_2) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto roiCuda = gCuda->cloneTensor(roi);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, vector<int>{3, 2},
gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi),
inputCuda, nullptr, vector<int>{3, 2}, sizesCuda, nullptr, roiCuda,
ResizeObj::ECoeffMode::linear,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ECoordinateTransMode::tfCropAndResize);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{3, 3});
roiCuda->copyin(vector<float>{0.6, 0.4, 0.8, 0.6});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -474,10 +536,15 @@ TEST(Resize, Cuda_downsample_scales_cubic) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr,
ResizeObj::ECoeffMode::cubic);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
scalesCuda->copyin(vector<float>{1.0, 1.0, 0.8, 0.8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -501,12 +568,16 @@ TEST(Resize, Cuda_downsample_scales_cubic_align_corners) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
ResizeObj::EKeepAspectRatioPolicy::none,
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
ResizeObj::ECoordinateTransMode::alignCorners);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
scalesCuda->copyin(vector<float>{1.0, 1.0, 0.8, 0.8});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
@ -529,10 +600,15 @@ TEST(Resize, Cuda_upsample_scales_cubic) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
scalesCuda, nullptr,
ResizeObj::ECoeffMode::cubic);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
@ -566,12 +642,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_align_corners) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
ResizeObj::EKeepAspectRatioPolicy::none,
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
ResizeObj::ECoordinateTransMode::alignCorners);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
@ -605,12 +685,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_asymmetric) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto scalesCuda = gCuda->cloneTensor(scales);
auto op = gCuda->addOp<ResizeObj>(
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
ResizeObj::EKeepAspectRatioPolicy::none,
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
ResizeObj::ECoordinateTransMode::asymmetric);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
@ -640,12 +724,16 @@ TEST(Resize, Cuda_downsample_sizes_cubic) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op =
gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
std::nullopt, gCuda->cloneTensor(sizes),
gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, sizesCuda,
nullptr, nullptr, ResizeObj::ECoeffMode::cubic,
ResizeObj::EKeepAspectRatioPolicy::stretch);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 3});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
@ -674,12 +762,16 @@ TEST(Resize, Cuda_upsample_sizes_cubic) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto sizesCuda = gCuda->cloneTensor(sizes);
auto op =
gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
std::nullopt, gCuda->cloneTensor(sizes),
gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, sizesCuda,
nullptr, nullptr, ResizeObj::ECoeffMode::cubic,
ResizeObj::EKeepAspectRatioPolicy::stretch);
gCuda->dataMalloc();
inputCuda->copyin(
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
sizesCuda->copyin(vector<uint32_t>{1, 1, 9, 10});
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput(0));

View File

@ -25,6 +25,7 @@ TEST(CUDA_Slice, run) {
// allocate CUDA memory
g->dataMalloc();
i->setData(IncrementalGenerator());
// Execute on CUDA
cudaRuntime->run(g);

View File

@ -16,14 +16,13 @@ TEST(cuDNN_Softmax, run_axis1) {
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
cudaGraph->dataMalloc();
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@ -42,14 +41,13 @@ TEST(cuDNN_Softmax, run_axis0) {
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
cudaGraph->dataMalloc();
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@ -67,14 +65,13 @@ TEST(cuDNN_Softmax2, run_axis1) {
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(IncrementalGenerator());
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@ -94,14 +91,13 @@ TEST(cuDNN_Softmax2, run_axis2) {
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(IncrementalGenerator());
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@ -121,14 +117,13 @@ TEST(cuDNN_Softmax2, run_axis3) {
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(IncrementalGenerator());
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);

View File

@ -19,9 +19,11 @@ TEST(Split, Cuda) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto op =
gCuda->addOp<SplitObj>(gCuda->cloneTensor(input), std::nullopt, 1, 3);
auto inputGpu = gCuda->cloneTensor(input);
auto op = gCuda->addOp<SplitObj>(inputGpu, std::nullopt, 1, 3);
gCuda->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(gCuda);
// copy output from CUDA to CPU

View File

@ -18,21 +18,22 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr);
cudaGraph->dataMalloc();
inputGpu->setData(generator);
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// CPU
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
cpuGraph->addTensor(inputCpu);
cpuGraph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(cpuGraph);
auto outputCpu = cpuOp->getOutput();
// Check

View File

@ -13,12 +13,11 @@ TEST(Gather, Cuda) {
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({3, 2}, DataType::Float32);
auto index = g->addTensor({2, 2}, DataType::UInt32);
g->dataMalloc();
input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
index->copyin(vector<uint32_t>{0, 1, 1, 2});
auto op = g->addOp<GatherObj>(input, index, nullptr, 0);
g->dataMalloc();
input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
index->copyin(vector<uint32_t>{0, 1, 1, 2});
runtime->run(g);
EXPECT_TRUE(
@ -29,12 +28,11 @@ TEST(Gather, Cuda) {
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({3, 3}, DataType::Float32);
auto index = g->addTensor({1, 2}, DataType::UInt32);
g->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<uint32_t>{0, 2});
auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
g->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<uint32_t>{0, 2});
runtime->run(g);
EXPECT_TRUE(
@ -45,12 +43,11 @@ TEST(Gather, Cuda) {
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({2, 4, 2}, DataType::Float32);
auto index = g->addTensor({3, 1}, DataType::UInt32);
g->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<uint32_t>{0, 3, 1});
auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
g->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<uint32_t>{0, 3, 1});
runtime->run(g);
EXPECT_TRUE(op->getOutput()->equalData(

View File

@ -19,13 +19,12 @@ void testMatmulMkl(
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
auto ACpu = gCpu->addTensor(shapeA, DataType::Float32);
auto BCpu = gCpu->addTensor(shapeB, DataType::Float32);
gCpu->dataMalloc();
ACpu->setData(generatorA);
BCpu->setData(generatorB);
auto matmul = gCpu->addOp<MatmulObj>(ACpu, BCpu, nullptr, transA, transB);
gCpu->dataMalloc();
ACpu->setData(generatorA);
BCpu->setData(generatorB);
cpuRuntime->run(gCpu);
EXPECT_TRUE(matmul->getOutput()->equalData(ansVec));
}

View File

@ -18,11 +18,15 @@ TEST(Resize, Mkl_downsample_sizes_nearest) {
auto runtime = make_ref<MklRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
auto op = g->addOp<ResizeObj>(g->cloneTensor(input), nullptr, std::nullopt,
g->cloneTensor(sizes), nullptr, nullptr,
ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ENearestMode::ceil);
auto input2 = g->cloneTensor(input);
auto sizes2 = g->cloneTensor(sizes);
auto op =
g->addOp<ResizeObj>(input2, nullptr, std::nullopt, sizes2, nullptr,
nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch,
ResizeObj::ENearestMode::ceil);
g->dataMalloc();
input2->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
sizes2->copyin(vector<uint32_t>{1, 1, 1, 3});
runtime->run(g);
EXPECT_TRUE(op->getOutput(0)->equalData(vector<float>{5, 7, 8}));

View File

@ -15,15 +15,15 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph Graph = make_ref<GraphObj>(cpuRuntime);
float min = 1.0;
float max = 4.0;
auto Op = Graph->addOp<T>(inputCpu, nullptr, min, max);
Graph->addTensor(inputCpu);
Graph->dataMalloc();
inputCpu->setData(generator);
cpuRuntime->run(Graph);
auto output = Op->getOutput();
inputCpu->printData();