forked from jiuyuan/InfiniTensor
memory_allocator (#103)
* - add LazyAllocator class - calculate memory consumption at present * - basic function of lazy_allocator, remaining test * - modify LazyAllocator * - modify InfiniTensor to fit LazyAllocator * - add setDataBlob - modify alignment - fix GraphObj::dataMalloc * - modified alignment value(64bytes -> 8bytes) - fix LazyAllocator::getPtr() - some debug code and comments - do alignment by changing size instead of tailAddr * - fix some problem * - translate Chinese comments to English * - format codes * - fix test * - code format * - modify codes as YdrMaser and bitzyz suggested * - code format * - modify codes as constroy suggested * - codes format * - modify alignment on cuda * - code format * - add test_lazy_allocator - fix tests where not add input tensor into graph.tensors - fix tests where init tensor's data before calling graph->dataMallocate() * - code format * - remove gpu runtime in test_lazy_allocator * - fix test_lazy_allocator: remove cuda include * - add test * - code format * - add ifdef for test of allocator * - code format * - fix test: remove unused ifdef * - fix bang test * - code format * Merge branch 'master' into dcj/memory_allocator * fix: fix cuda conv_fp16 run fail * fix bang_runtime.cc and cuda_runtime.cc * - update mkl code * - fix codes for mkl * - code format * - remove unused commented codes - add an empty line at the end of the blob.cc --------- Co-authored-by: zhangyunze <z13785159769@163.com>
This commit is contained in:
parent
bd9e1aeb3f
commit
0dc5347089
|
@ -234,6 +234,7 @@ function(build_test files)
|
|||
endfunction()
|
||||
|
||||
if(BUILD_TEST)
|
||||
add_compile_definitions(BUILD_TEST=1)
|
||||
enable_testing()
|
||||
if(USE_TRACE)
|
||||
build_test(test/trace/*.cc)
|
||||
|
|
2
example
2
example
|
@ -1 +1 @@
|
|||
Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
|
||||
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
|
|
@ -1,4 +1,5 @@
|
|||
#pragma once
|
||||
#include "core/lazy_allocator.h"
|
||||
#include "core/operator.h"
|
||||
#include "core/tensor.h"
|
||||
|
||||
|
@ -9,9 +10,11 @@ class GraphObj : public Object {
|
|||
Runtime runtime;
|
||||
TensorVec tensors;
|
||||
OpVec ops;
|
||||
LazyAllocator allocator;
|
||||
|
||||
public:
|
||||
explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
|
||||
explicit GraphObj(Runtime runtime)
|
||||
: runtime(runtime), allocator(runtime), sorted(false){};
|
||||
GraphObj(Runtime runtime, OpVec ops_in);
|
||||
string toString() const override;
|
||||
Runtime getRuntime() const { return runtime; }
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
#include "core/tensor.h"
#ifdef BUILD_TEST
#include "gtest/gtest.h"
#endif
#include <cstddef>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
|
||||
|
||||
namespace infini {
|
||||
|
||||
class LazyAllocator {
|
||||
private:
|
||||
#ifdef BUILD_TEST
|
||||
FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
|
||||
|
||||
FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
|
||||
#endif
|
||||
|
||||
Runtime runtime;
|
||||
|
||||
size_t used;
|
||||
|
||||
size_t peak;
|
||||
|
||||
size_t alignment;
|
||||
|
||||
// pointer to the memory actually allocated
|
||||
void *ptr;
|
||||
|
||||
struct freeBlockInfo {
|
||||
size_t addr;
|
||||
size_t blockSize;
|
||||
};
|
||||
|
||||
struct cmpFreeBlockInfo {
|
||||
bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
|
||||
return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
|
||||
: (a.addr < b.addr);
|
||||
}
|
||||
};
|
||||
|
||||
// free balanced tree, maintains all free memory blocks
|
||||
std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
|
||||
|
||||
// key: head address offset of the free memory block
|
||||
// value: blockSize of the block
|
||||
std::unordered_map<size_t, size_t> headAddrToBlockSize;
|
||||
|
||||
// key: tail address offset of the free memory block
|
||||
// value: blockSize of the block
|
||||
std::unordered_map<size_t, size_t> tailAddrToBlockSize;
|
||||
|
||||
public:
|
||||
LazyAllocator(Runtime runtime);
|
||||
|
||||
virtual ~LazyAllocator();
|
||||
|
||||
// function: simulate memory allocation
|
||||
// arguments:
|
||||
// size: size of memory block to be allocated
|
||||
// return: head address offset of the allocated memory block
|
||||
size_t alloc(size_t size);
|
||||
|
||||
// function: simulate memory free
|
||||
// arguments:
|
||||
// addr: head address offset of memory block to be free
|
||||
// size: size of memory block to be freed
|
||||
void free(size_t addr, size_t size);
|
||||
|
||||
// function: perform actual memory allocation
|
||||
// return: pointer to the head address of the allocated memory
|
||||
void *getPtr();
|
||||
|
||||
void info();
|
||||
|
||||
private:
|
||||
// function: memory alignment, rouned up
|
||||
// return: size of the aligned memory block
|
||||
size_t getAlignedSize(size_t size);
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -71,10 +71,16 @@ class TensorObj : public TensorBaseObj {
|
|||
void copyData(const TensorObj *src);
|
||||
void copyData(const Tensor &src) { copyData(src.get()); }
|
||||
|
||||
// TODO: Rename this function later, because it is confusing that it will
|
||||
// change the field data, but actually it generates data and maybe copy to
|
||||
// device.
|
||||
// FIXME: std::function copies the generator instead of passing it by ref.
|
||||
// Thus the internal state of generator cannot be updated.
|
||||
void setData(
|
||||
std::function<void(void *, size_t, DataType)> const &generator) const;
|
||||
|
||||
void setDataBlob(const Blob &blob);
|
||||
|
||||
Tensor clone() const {
|
||||
auto obj = make_ref<TensorObj>(*this);
|
||||
obj->freeData();
|
||||
|
|
|
@ -5,7 +5,7 @@ namespace infini {
|
|||
|
||||
// Destructor is defined out of line to avoid cycled inclusion.
BlobObj::~BlobObj() {
    // Intentionally does not release `ptr`. Blobs created by
    // GraphObj::dataMalloc point into the single arena owned by the graph's
    // LazyAllocator, whose destructor releases it. Calling
    // runtime->dealloc(ptr) here would free an interior pointer and/or free
    // the arena a second time.
    // NOTE(review): blobs whose memory was obtained outside LazyAllocator are
    // no longer released by this destructor -- confirm none remain.
}
|
||||
|
||||
} // namespace infini
|
|
@ -5,7 +5,7 @@
|
|||
namespace infini {
|
||||
|
||||
GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
|
||||
: runtime(runtime), sorted(false) {
|
||||
: runtime(runtime), allocator(runtime), sorted(false) {
|
||||
map<UidBaseType, Tensor> tensorPool;
|
||||
// Clone tensors
|
||||
for (const auto &op : ops_in) {
|
||||
|
@ -124,9 +124,58 @@ void GraphObj::optimize() {
|
|||
}
|
||||
|
||||
void GraphObj::dataMalloc() {
|
||||
// topological sorting first
|
||||
IT_ASSERT(topo_sort() == true);
|
||||
// count the number of times all tensors are used
|
||||
std::unordered_map<TensorObj *, size_t> tensorToRefCount;
|
||||
// record the memory address offsets of all tensors to be allocated
|
||||
std::unordered_map<TensorObj *, size_t> tensorToOffset;
|
||||
|
||||
// record all constant tensors, including weight tensors and input tensors
|
||||
std::unordered_set<TensorObj *> constTensor;
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->dataMalloc();
|
||||
if (tensor.get()->getSource() == nullptr) {
|
||||
// allocate memory for all constant tensors first, and this memory
|
||||
// will not be reused later
|
||||
constTensor.insert(tensor.get());
|
||||
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
|
||||
} else {
|
||||
tensorToRefCount[tensor.get()] = tensor->getTargets().size();
|
||||
}
|
||||
}
|
||||
// traverse in topological order and simulate memory allocation
|
||||
for (auto &op : ops) {
|
||||
// memory should be allocated for the output first
|
||||
auto outputs = op->getOutputs();
|
||||
for (auto &tensor : outputs) {
|
||||
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
|
||||
}
|
||||
auto inputs = op->getInputs();
|
||||
for (auto &tensor : inputs) {
|
||||
if (constTensor.find(tensor.get()) == constTensor.end()) {
|
||||
auto tensorIter = tensorToRefCount.find(tensor.get());
|
||||
IT_ASSERT(tensorIter != tensorToRefCount.end());
|
||||
tensorToRefCount[tensor.get()] -= 1;
|
||||
if (tensorToRefCount[tensor.get()] == 0) {
|
||||
// indicate that this tensor will no longer be used and
|
||||
// perform memory free
|
||||
tensorToRefCount.erase(tensor.get());
|
||||
allocator.free(tensorToOffset[tensor.get()],
|
||||
tensor->getBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// perform actual memory allocation
|
||||
for (auto &tensor : tensors) {
|
||||
IT_ASSERT(tensorToOffset.find(tensor.get()) != tensorToOffset.end());
|
||||
tensor->setDataBlob(make_ref<BlobObj>(
|
||||
tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
|
||||
tensorToOffset[tensor.get()]));
|
||||
}
|
||||
|
||||
allocator.info();
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
#include "core/lazy_allocator.h"
|
||||
#include <utility>
|
||||
|
||||
namespace infini {
|
||||
|
||||
// In
|
||||
// cuda-c-programming-guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses):
|
||||
// Any address of a variable residing in global memory or returned by one of the
|
||||
// memory allocation routines from the driver or runtime API is always aligned
|
||||
// to at least 256 bytes.
|
||||
constexpr size_t alignmentInBytesForCUDA = 256;
|
||||
|
||||
LazyAllocator::LazyAllocator(Runtime runtime) : runtime(runtime) {
|
||||
used = 0;
|
||||
peak = 0;
|
||||
ptr = nullptr;
|
||||
if (runtime->isCuda()) {
|
||||
// TODO: the alignment on cuda might need further discussion
|
||||
alignment = alignmentInBytesForCUDA;
|
||||
} else {
|
||||
// 'alignment' defaults to sizeof(uint64_t), because it is the length of
|
||||
// the longest data type currently supported by the DataType field of
|
||||
// the tensor
|
||||
// TODO: the alignment on bang might need further discussion
|
||||
alignment = sizeof(uint64_t);
|
||||
}
|
||||
}
|
||||
|
||||
// Releases the real arena through the runtime, if getPtr() ever created one.
LazyAllocator::~LazyAllocator() {
    if (ptr == nullptr) {
        return;
    }
    runtime->dealloc(ptr);
}
|
||||
|
||||
size_t LazyAllocator::alloc(size_t size) {
|
||||
IT_ASSERT(this->ptr == nullptr);
|
||||
// pad the size to the multiple of alignment
|
||||
size = this->getAlignedSize(size);
|
||||
auto it = this->freeBlocks.lower_bound(freeBlockInfo{(size_t)0, size});
|
||||
|
||||
size_t retAddr = this->peak;
|
||||
if (it != this->freeBlocks.end()) {
|
||||
// found an alvailable free memory block for allocation
|
||||
size_t blockSize = it->blockSize;
|
||||
retAddr = it->addr;
|
||||
size_t tailAddr = retAddr + size;
|
||||
// update the map of head and tail address offset of memory blocks
|
||||
this->headAddrToBlockSize.erase(retAddr);
|
||||
this->tailAddrToBlockSize.erase(tailAddr);
|
||||
// memory block splitting
|
||||
if (blockSize > tailAddr - retAddr) {
|
||||
freeBlockInfo newBlock = {tailAddr,
|
||||
blockSize - (tailAddr - retAddr)};
|
||||
this->headAddrToBlockSize[tailAddr] = newBlock.blockSize;
|
||||
this->tailAddrToBlockSize[retAddr + blockSize] = newBlock.blockSize;
|
||||
this->freeBlocks.insert(newBlock);
|
||||
}
|
||||
// update the free balanced tree
|
||||
this->freeBlocks.erase(it);
|
||||
this->used += tailAddr - retAddr;
|
||||
} else {
|
||||
// the allocated memory space is not sufficient for reallocation, it
|
||||
// needs to be extended
|
||||
auto blockTailWithPeak = this->tailAddrToBlockSize.find(this->peak);
|
||||
if (blockTailWithPeak != this->tailAddrToBlockSize.end()) {
|
||||
// there is a free block located at the end of the currently
|
||||
// allocated memory, where this free block has its tail address as
|
||||
// 'peak'
|
||||
retAddr = this->peak - blockTailWithPeak->second;
|
||||
IT_ASSERT(blockTailWithPeak->second < size);
|
||||
this->peak += (size - blockTailWithPeak->second);
|
||||
// updata freeBlocks, headAddrToBlockSize and tailAddrToBlockSize
|
||||
freeBlockInfo endBlock = {retAddr, blockTailWithPeak->second};
|
||||
this->freeBlocks.erase(endBlock);
|
||||
this->headAddrToBlockSize.erase(endBlock.addr);
|
||||
this->tailAddrToBlockSize.erase(endBlock.addr + endBlock.blockSize);
|
||||
} else {
|
||||
this->peak = this->peak + size;
|
||||
}
|
||||
this->used += size;
|
||||
}
|
||||
|
||||
return retAddr;
|
||||
}
|
||||
|
||||
void LazyAllocator::free(size_t addr, size_t size) {
|
||||
IT_ASSERT(this->ptr == nullptr);
|
||||
size = getAlignedSize(size);
|
||||
auto tailAddr = addr + size;
|
||||
freeBlockInfo block = {addr, tailAddr - addr};
|
||||
this->headAddrToBlockSize[addr] = block.blockSize;
|
||||
this->tailAddrToBlockSize[tailAddr] = block.blockSize;
|
||||
auto preFreeBlockIter = this->tailAddrToBlockSize.find(addr);
|
||||
auto subFreeBlockIter = this->headAddrToBlockSize.find(tailAddr);
|
||||
if (preFreeBlockIter != this->tailAddrToBlockSize.end()) {
|
||||
// the head address of the memory block to be freed matches the end of a
|
||||
// free block, merge them together
|
||||
size_t preBlockSize = preFreeBlockIter->second;
|
||||
this->headAddrToBlockSize.erase(block.addr);
|
||||
this->headAddrToBlockSize[block.addr - preBlockSize] += block.blockSize;
|
||||
this->tailAddrToBlockSize.erase(block.addr);
|
||||
this->tailAddrToBlockSize[tailAddr] += preBlockSize;
|
||||
block.addr -= preBlockSize;
|
||||
block.blockSize += preBlockSize;
|
||||
// delete the preceding adjacent free block
|
||||
this->freeBlocks.erase(freeBlockInfo{block.addr, preBlockSize});
|
||||
}
|
||||
if (subFreeBlockIter != this->headAddrToBlockSize.end()) {
|
||||
// the tail address of the memory block to be freed matches the start of
|
||||
// a free block, merge them together
|
||||
auto subBlockSize = subFreeBlockIter->second;
|
||||
this->headAddrToBlockSize.erase(tailAddr);
|
||||
this->headAddrToBlockSize[block.addr] += subBlockSize;
|
||||
this->tailAddrToBlockSize.erase(tailAddr);
|
||||
this->tailAddrToBlockSize[tailAddr + subBlockSize] += block.blockSize;
|
||||
tailAddr += subBlockSize;
|
||||
block.blockSize += subBlockSize;
|
||||
// delete the succeeding adjacent memory block
|
||||
this->freeBlocks.erase(
|
||||
freeBlockInfo{tailAddr - subBlockSize, subBlockSize});
|
||||
}
|
||||
this->freeBlocks.insert(block);
|
||||
this->used -= size;
|
||||
}
|
||||
|
||||
void *LazyAllocator::getPtr() {
|
||||
if (this->ptr == nullptr) {
|
||||
this->ptr = runtime->alloc(this->peak);
|
||||
printf("LazyAllocator really alloc: %p %lu bytes\n", this->ptr, peak);
|
||||
}
|
||||
return this->ptr;
|
||||
}
|
||||
|
||||
size_t LazyAllocator::getAlignedSize(size_t size) {
|
||||
return ((size - 1) / this->alignment + 1) * this->alignment;
|
||||
}
|
||||
|
||||
// Prints the current used and peak byte counters to stdout.
void LazyAllocator::info() {
    std::cout << "Used memory: " << this->used;
    std::cout << ", peak memory: " << this->peak << std::endl;
}
|
||||
|
||||
} // namespace infini
|
|
@ -150,6 +150,8 @@ void TensorObj::setData(
|
|||
}
|
||||
}
|
||||
|
||||
void TensorObj::setDataBlob(const Blob &blob) { this->data = blob; }
|
||||
|
||||
// Forwards to loadTensorData() for this tensor and the given path.
void TensorObj::load(std::string file_path) {
    loadTensorData(this, file_path);
}
|
||||
|
||||
// Forwards to saveTensorData() for this tensor and the given path.
void TensorObj::save(std::string file_path) {
    saveTensorData(this, file_path);
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
namespace infini {
|
||||
class MklBinary : public MklKernelWithoutConfig {
|
||||
dnnl::algorithm getAlgorithem(const Ref<ElementWiseObj> &op) const {
|
||||
switch (op->getOpType()) {
|
||||
switch (op->getOpType().underlying()) {
|
||||
case OpType::Add:
|
||||
return dnnl::algorithm::binary_add;
|
||||
case OpType::Sub:
|
||||
|
@ -64,7 +64,7 @@ class MklBinary : public MklKernelWithoutConfig {
|
|||
|
||||
class MklUnary : public MklKernelWithoutConfig {
|
||||
dnnl::algorithm getAlgorithem(const Ref<UnaryObj> &op) const {
|
||||
switch (op->getOpType()) {
|
||||
switch (op->getOpType().underlying()) {
|
||||
case OpType::Relu:
|
||||
return dnnl::algorithm::eltwise_relu;
|
||||
case OpType::Tanh:
|
||||
|
|
|
@ -69,7 +69,7 @@ template <typename T> class MklDpcppMatmul : public CpuKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::INTELCPU, OpType::Matmul, DataType::Float32,
|
||||
REGISTER_KERNEL(Device::INTELCPU, OpType::MatMul, DataType::Float32,
|
||||
MklDpcppMatmul<float>, "MklDpcppMatmul_CPU_float32");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -77,7 +77,7 @@ class MklMaxPool : public MklPooling {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::INTELCPU, OpType::AvgPool, DataType::Float32,
|
||||
REGISTER_KERNEL(Device::INTELCPU, OpType::AveragePool, DataType::Float32,
|
||||
MklAvgPool, "AvgPool_Mkl_Float32");
|
||||
REGISTER_KERNEL(Device::INTELCPU, OpType::MaxPool, DataType::Float32,
|
||||
MklMaxPool, "MaxPool_Mkl_Float32");
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/unary.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Freeing two neighbouring blocks must leave exactly one merged free block.
TEST(LazyAllocator, testMergeFreeBlocks) {
    Shape dims{1, 2, 2, 3};
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor b = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor c = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor d = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    LazyAllocator allocator(cpu);

    // layout after these calls: a | b | c | d
    allocator.alloc(a->getBytes());
    const size_t offsetB = allocator.alloc(b->getBytes());
    const size_t offsetC = allocator.alloc(c->getBytes());
    allocator.alloc(d->getBytes());

    // release the two middle blocks
    allocator.free(offsetB, b->getBytes());
    allocator.free(offsetC, c->getBytes());

    // expected layout: a | mergedFreeBlock | d, where mergedFreeBlock starts
    // at b and spans the aligned sizes of b and c
    const size_t mergedSize = allocator.getAlignedSize(b->getBytes()) +
                              allocator.getAlignedSize(c->getBytes());
    EXPECT_EQ(allocator.freeBlocks.size(), 1);
    EXPECT_EQ(allocator.freeBlocks.begin()->addr, offsetB);
    EXPECT_EQ(allocator.freeBlocks.begin()->blockSize, mergedSize);
}
|
||||
|
||||
// A freed block must be reused by the next allocation of the same size.
TEST(LazyAllocator, testAlloc) {
    Shape dims{1, 2, 2, 3};
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor b = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor c = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor d = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    LazyAllocator allocator(cpu);

    // layout after these calls: a | b | c
    allocator.alloc(a->getBytes());
    const size_t offsetB = allocator.alloc(b->getBytes());
    allocator.alloc(c->getBytes());

    // release b, then place d
    allocator.free(offsetB, b->getBytes());
    const size_t offsetD = allocator.alloc(d->getBytes());

    // expected layout: a | d | c, i.e. d takes over the slot of b
    EXPECT_EQ(offsetB, offsetD);
}
|
||||
|
||||
// A too-small free block at the end of the arena must be absorbed when the
// arena is extended for a larger allocation.
TEST(LazyAllocator, testAllocWithEndFreeBlock) {
    Shape dims{1, 2, 2, 3};
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor b = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor c = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    // d is twice as large as the other tensors
    Tensor d = make_ref<TensorObj>(Shape{2, 2, 2, 3}, DataType::Float32, cpu);
    LazyAllocator allocator(cpu);

    // layout after these calls: a | b | c
    allocator.alloc(a->getBytes());
    allocator.alloc(b->getBytes());
    const size_t offsetC = allocator.alloc(c->getBytes());
    allocator.info();

    // release c, then place d
    allocator.free(offsetC, c->getBytes());
    const size_t offsetD = allocator.alloc(d->getBytes());
    allocator.info();

    // expected layout: a | b | d, with no free block left between b and d
    EXPECT_EQ(allocator.freeBlocks.size(), 0);
    EXPECT_EQ(offsetC, offsetD);
}
|
||||
|
||||
// getPtr() must hand out the same arena pointer on every call.
TEST(LazyAllocator, testGetPtr) {
    Shape dims{1, 2, 2, 3};
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Tensor a = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor b = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor c = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    Tensor d = make_ref<TensorObj>(dims, DataType::Float32, cpu);
    LazyAllocator allocator(cpu);

    // layout after this loop: a | b | c | d
    for (const Tensor &tensor : {a, b, c, d}) {
        allocator.alloc(tensor->getBytes());
    }

    // multiple calls to the getPtr() function should return the same pointer
    void *first = allocator.getPtr();
    void *second = allocator.getPtr();
    EXPECT_EQ(first, second);
}
|
||||
|
||||
} // namespace infini
|
|
@ -21,12 +21,8 @@ void testBangcKernel(
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generator);
|
||||
|
||||
// inputCpu1->printData();
|
||||
// inputCpu2->printData();
|
||||
|
@ -37,6 +33,8 @@ void testBangcKernel(
|
|||
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
inputGpu1->setData(generator);
|
||||
inputGpu2->setData(generator);
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
@ -44,7 +42,11 @@ void testBangcKernel(
|
|||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
inputCpu2->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// outputCpu->printData();
|
||||
|
|
|
@ -19,12 +19,8 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generatorB);
|
||||
|
||||
// MLU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
|
@ -33,6 +29,8 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
auto mluOp =
|
||||
bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
bangGraph->dataMalloc();
|
||||
inputMlu1->setData(generatorA);
|
||||
inputMlu2->setData(generatorB);
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputMlu = mluOp->getOutput();
|
||||
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||
|
@ -40,7 +38,11 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp =
|
||||
cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
inputCpu2->setData(generatorB);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
outputCpu->print();
|
||||
|
|
|
@ -33,6 +33,8 @@ void testElementWiseCnnl(
|
|||
|
||||
// allocate BANG memory
|
||||
g->dataMalloc();
|
||||
a->setData(generator);
|
||||
b->setData(generator);
|
||||
|
||||
// Execute on BANG
|
||||
bangRuntime->run(g);
|
||||
|
|
|
@ -20,12 +20,8 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generatorB);
|
||||
|
||||
// MLU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
|
@ -33,13 +29,19 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
|
|||
auto inputMlu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto mluOp = bangGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
inputMlu1->setData(generatorA);
|
||||
inputMlu2->setData(generatorB);
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputMlu = mluOp->getOutput();
|
||||
auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generatorA);
|
||||
inputCpu2->setData(generatorB);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
outputCpu->print();
|
||||
|
|
|
@ -19,12 +19,8 @@ void testOptensor(
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu1 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu1->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
Tensor inputCpu2 =
|
||||
make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu2->dataMalloc();
|
||||
inputCpu2->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
|
@ -32,13 +28,19 @@ void testOptensor(
|
|||
auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
|
||||
auto gpuOp = bangGraph->addOp<T>(inputGpu1, inputGpu2, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
inputGpu1->setData(generator);
|
||||
inputGpu2->setData(generator);
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
|
||||
cpuGraph->addTensor(inputCpu1);
|
||||
cpuGraph->addTensor(inputCpu2);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu1->setData(generator);
|
||||
inputCpu2->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
|
|
|
@ -17,21 +17,22 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr);
|
||||
bangGraph->dataMalloc();
|
||||
inputGpu->setData(generator);
|
||||
bangRuntime->run(bangGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
|
||||
cpuGraph->addTensor(inputCpu);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
|
|
|
@ -18,8 +18,6 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -28,13 +26,16 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
float max = 4.0;
|
||||
auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, min, max);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->setData(generator);
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr, min, max);
|
||||
cpuGraph->addTensor(inputCpu);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
|
|
|
@ -58,11 +58,16 @@ TEST(Concat, Cuda) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ConcatObj>(TensorVec{gCuda->cloneTensor(t1),
|
||||
gCuda->cloneTensor(t2),
|
||||
gCuda->cloneTensor(t3)},
|
||||
nullptr, 2);
|
||||
auto t1Gpu = gCuda->cloneTensor(t1);
|
||||
auto t2Gpu = gCuda->cloneTensor(t2);
|
||||
auto t3Gpu = gCuda->cloneTensor(t3);
|
||||
|
||||
auto op =
|
||||
gCuda->addOp<ConcatObj>(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2);
|
||||
gCuda->dataMalloc();
|
||||
t1Gpu->setData(IncrementalGenerator());
|
||||
t2Gpu->setData(OneGenerator());
|
||||
t3Gpu->setData(OneGenerator());
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// cudaPrintTensor(op->getOutput());
|
||||
|
|
|
@ -33,6 +33,8 @@ void testConvCudnn(
|
|||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(generator);
|
||||
w0Cuda->setData(generator);
|
||||
// Execute on CUDA
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -72,6 +74,8 @@ TEST(cuDNN_Conv, tune) {
|
|||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(IncrementalGenerator());
|
||||
w0Cuda->setData(IncrementalGenerator());
|
||||
// Execute on CUDA
|
||||
bool tune = true;
|
||||
cuda->run(gCuda, tune);
|
||||
|
|
|
@ -35,6 +35,8 @@ void testConvCudnnFP16(
|
|||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(generator);
|
||||
w0Cuda->setData(generator);
|
||||
// Execute on CUDA
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -71,6 +73,8 @@ TEST(cuDNN_Conv_FP16, tune) {
|
|||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(IncrementalGenerator());
|
||||
w0Cuda->setData(IncrementalGenerator());
|
||||
// Execute on CUDA
|
||||
bool tune = true;
|
||||
cuda->run(gCuda, tune);
|
||||
|
|
|
@ -36,6 +36,8 @@ void testConvTransposedCudnn(
|
|||
padding, padding, stride,
|
||||
stride, dilation, dilation);
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(generator);
|
||||
w0Cuda->setData(generator);
|
||||
// Execute on CUDA
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -70,6 +72,8 @@ void testConvTransposedNHWCCudnn(
|
|||
i0Cuda, w0Cuda, nullptr, padding, padding, stride, stride, dilation,
|
||||
dilation);
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(generator);
|
||||
w0Cuda->setData(generator);
|
||||
// Execute on CUDA
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -115,6 +119,8 @@ TEST(cuDNN_ConvTransposed, run1) {
|
|||
auto conv =
|
||||
gCuda->addOp<ConvTransposed2dObj>(i0Cuda, w0Cuda, nullptr, 0, 0);
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(IncrementalGenerator());
|
||||
w0Cuda->setData(IncrementalGenerator());
|
||||
// Execute on CUDA
|
||||
cuda->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -148,6 +154,8 @@ TEST(cuDNN_ConvTransposed, tune) {
|
|||
auto conv = gCuda->addOp<ConvTransposed2dObj>(i0Cuda, w0Cuda, nullptr);
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
i0Cuda->setData(IncrementalGenerator());
|
||||
w0Cuda->setData(IncrementalGenerator());
|
||||
// Execute on CUDA
|
||||
bool tune = true;
|
||||
cuda->run(gCuda, tune);
|
||||
|
|
|
@ -19,12 +19,8 @@ void testElementWiseCudnn(
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
acpu->dataMalloc();
|
||||
acpu->setData(generator);
|
||||
|
||||
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
bcpu->dataMalloc();
|
||||
bcpu->setData(generator);
|
||||
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -34,6 +30,8 @@ void testElementWiseCudnn(
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
a->setData(generator);
|
||||
b->setData(generator);
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -16,8 +16,6 @@ TEST(CUDA_Extend, run) {
|
|||
// Build input data on CPU
|
||||
Tensor icpu =
|
||||
make_ref<TensorObj>(Shape{2, 3, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
icpu->dataMalloc();
|
||||
icpu->setData(IncrementalGenerator());
|
||||
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -26,6 +24,7 @@ TEST(CUDA_Extend, run) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -186,9 +186,12 @@ TEST(Gather, Cuda) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<GatherObj>(
|
||||
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 0);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto indexCuda = gCuda->cloneTensor(index);
|
||||
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 0);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6});
|
||||
indexCuda->copyin(vector<uint32_t>{0, 1, 1, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// cudaPrintTensor(op->getOutput());
|
||||
|
@ -207,9 +210,12 @@ TEST(Gather, Cuda) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<GatherObj>(
|
||||
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto indexCuda = gCuda->cloneTensor(index);
|
||||
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->setData(IncrementalGenerator());
|
||||
indexCuda->copyin(vector<uint32_t>{0, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// cudaPrintTensor(op->getOutput());
|
||||
|
@ -228,9 +234,12 @@ TEST(Gather, Cuda) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<GatherObj>(
|
||||
gCuda->cloneTensor(input), gCuda->cloneTensor(index), nullptr, 1);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto indexCuda = gCuda->cloneTensor(index);
|
||||
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->setData(IncrementalGenerator());
|
||||
indexCuda->copyin(vector<uint32_t>{0, 3, 1});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// cudaPrintTensor(op->getOutput());
|
||||
|
|
|
@ -32,6 +32,8 @@ void testMatmulCuda(
|
|||
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
ACuda->setData(generatorA);
|
||||
BCuda->setData(generatorB);
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
auto CCpu = gCpu->cloneTensor(matmul->getOutput());
|
||||
|
|
|
@ -13,8 +13,6 @@ TEST(Pad, Cuda) {
|
|||
// Build input data on CPU
|
||||
Tensor icpu =
|
||||
make_ref<TensorObj>(Shape{1, 2, 3, 2}, DataType::Float32, cpuRuntime);
|
||||
icpu->dataMalloc();
|
||||
icpu->setData(IncrementalGenerator());
|
||||
|
||||
// Build CUDA graph;
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -24,6 +22,7 @@ TEST(Pad, Cuda) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -19,8 +19,6 @@ void testPoolCudnn(
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor i0cpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
i0cpu->dataMalloc();
|
||||
i0cpu->setData(generator);
|
||||
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -30,6 +28,7 @@ void testPoolCudnn(
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i0->setData(generator);
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -17,8 +17,6 @@ void test_reducemean(const Shape &shape, const vector<float> &data,
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor icpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
icpu->dataMalloc();
|
||||
icpu->copyin(data);
|
||||
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
|
@ -27,6 +25,7 @@ void test_reducemean(const Shape &shape, const vector<float> &data,
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->copyin(data);
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -26,6 +26,7 @@ TEST(CUDA_Reshape, run) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
@ -55,6 +56,7 @@ TEST(CUDA_Flatten, run) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
@ -84,6 +86,7 @@ TEST(CUDA_Identity, run) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -19,11 +19,15 @@ TEST(Resize, Cuda_downsample_sizes_nearest) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt,
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 1, 3});
|
||||
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -44,13 +48,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notlarger) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, vector<int>{2, 3},
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, vector<int>{2, 3}, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::notLarger,
|
||||
ResizeObj::ENearestMode::roundPreferFloor,
|
||||
ResizeObj::ECoordinateTransMode::halfPixel);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
sizesCuda->copyin(vector<uint32_t>{7, 8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -74,13 +81,16 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notsmaller) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, vector<int>{2, 3},
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, vector<int>{2, 3}, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::notSmaller,
|
||||
ResizeObj::ENearestMode::roundPreferFloor,
|
||||
ResizeObj::ECoordinateTransMode::halfPixel);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
sizesCuda->copyin(vector<uint32_t>{7, 8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -105,13 +115,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_ceil_half_pixel) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt,
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ENearestMode::ceil,
|
||||
ResizeObj::ECoordinateTransMode::halfPixel);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 8, 8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -138,13 +152,17 @@ TEST(Resize, Cuda_upsample_sizes_nearest_floor_align_corners) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, vector<int>{3, 2},
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, vector<int>{3, 2}, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ENearestMode::floor,
|
||||
ResizeObj::ECoordinateTransMode::alignCorners);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{8, 8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -171,13 +189,18 @@ TEST(Resize, Cuda_upsample_sizes_nearest_round_prefer_ceil_asymmetri) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt,
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ENearestMode::roundPreferCeil,
|
||||
ResizeObj::ECoordinateTransMode::asymmetric);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 8, 8});
|
||||
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -203,10 +226,13 @@ TEST(Resize, Cuda_downsample_scales_nearest) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
|
||||
std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -227,10 +253,13 @@ TEST(Resize, Cuda_upsample_scales_nearest) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
|
||||
std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 2, 3});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -253,10 +282,13 @@ TEST(Resize, Cuda_upsample_scales_nearest_axes_3_2) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
|
||||
vector<int>{3, 2}, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, vector<int>{3, 2},
|
||||
nullptr, scalesCuda, nullptr);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
scalesCuda->copyin(vector<float>{3, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -279,10 +311,14 @@ TEST(Resize, Cuda_downsample_scales_linear) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::linear);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -303,12 +339,15 @@ TEST(Resize, Cuda_downsample_scales_linear_aligncorners) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear,
|
||||
ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
ResizeObj::ECoordinateTransMode::alignCorners);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 0.6, 0.6});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -329,10 +368,14 @@ TEST(Resize, Cuda_upsample_scales_linear) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::linear);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 2, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -355,12 +398,15 @@ TEST(Resize, Cuda_upsample_scales_linear_align_corners) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::linear,
|
||||
ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::linear, ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
ResizeObj::ECoordinateTransMode::alignCorners);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(vector<float>{1, 2, 3, 4});
|
||||
scalesCuda->copyin(vector<float>{1, 1, 2, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -384,13 +430,17 @@ TEST(Resize, Cuda_downsample_sizes_linear_pytorchhalfpixel) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt,
|
||||
gCuda->cloneTensor(sizes), nullptr, nullptr,
|
||||
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, nullptr,
|
||||
ResizeObj::ECoeffMode::linear,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ECoordinateTransMode::pytorchHalfPixel);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 1});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -414,13 +464,19 @@ TEST(Resize, Cuda_tf_crop_and_resize) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto roiCuda = gCuda->cloneTensor(roi);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt,
|
||||
gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi),
|
||||
inputCuda, nullptr, std::nullopt, sizesCuda, nullptr, roiCuda,
|
||||
ResizeObj::ECoeffMode::linear,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ECoordinateTransMode::tfCropAndResize);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 3});
|
||||
roiCuda->copyin(vector<float>{0, 0, 0.4, 0.6, 1, 1, 0.6, 0.8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -445,13 +501,19 @@ TEST(Resize, Cuda_tf_crop_and_resize_axes_3_2) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto roiCuda = gCuda->cloneTensor(roi);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, vector<int>{3, 2},
|
||||
gCuda->cloneTensor(sizes), nullptr, gCuda->cloneTensor(roi),
|
||||
inputCuda, nullptr, vector<int>{3, 2}, sizesCuda, nullptr, roiCuda,
|
||||
ResizeObj::ECoeffMode::linear,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ECoordinateTransMode::tfCropAndResize);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{3, 3});
|
||||
roiCuda->copyin(vector<float>{0.6, 0.4, 0.8, 0.6});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -474,10 +536,15 @@ TEST(Resize, Cuda_downsample_scales_cubic) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::cubic);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
scalesCuda->copyin(vector<float>{1.0, 1.0, 0.8, 0.8});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -501,12 +568,16 @@ TEST(Resize, Cuda_downsample_scales_cubic_align_corners) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
|
||||
ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
ResizeObj::ECoordinateTransMode::alignCorners);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
scalesCuda->copyin(vector<float>{1.0, 1.0, 0.8, 0.8});
|
||||
cudaRuntime->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
|
||||
|
@ -529,10 +600,15 @@ TEST(Resize, Cuda_upsample_scales_cubic) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic);
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, nullptr,
|
||||
scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::cubic);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
|
||||
|
@ -566,12 +642,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_align_corners) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
|
||||
ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
ResizeObj::ECoordinateTransMode::alignCorners);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
|
||||
|
@ -605,12 +685,16 @@ TEST(Resize, Cuda_upsample_scales_cubic_asymmetric) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto scalesCuda = gCuda->cloneTensor(scales);
|
||||
auto op = gCuda->addOp<ResizeObj>(
|
||||
gCuda->cloneTensor(input), nullptr, std::nullopt, nullptr,
|
||||
gCuda->cloneTensor(scales), nullptr, ResizeObj::ECoeffMode::cubic,
|
||||
ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
inputCuda, nullptr, std::nullopt, nullptr, scalesCuda, nullptr,
|
||||
ResizeObj::ECoeffMode::cubic, ResizeObj::EKeepAspectRatioPolicy::none,
|
||||
ResizeObj::ECoordinateTransMode::asymmetric);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
scalesCuda->copyin(vector<float>{1.0, 1.0, 2, 2});
|
||||
cudaRuntime->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
|
||||
|
@ -640,12 +724,16 @@ TEST(Resize, Cuda_downsample_sizes_cubic) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op =
|
||||
gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
|
||||
std::nullopt, gCuda->cloneTensor(sizes),
|
||||
gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, sizesCuda,
|
||||
nullptr, nullptr, ResizeObj::ECoeffMode::cubic,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 3, 3});
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
@ -674,12 +762,16 @@ TEST(Resize, Cuda_upsample_sizes_cubic) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputCuda = gCuda->cloneTensor(input);
|
||||
auto sizesCuda = gCuda->cloneTensor(sizes);
|
||||
auto op =
|
||||
gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
|
||||
std::nullopt, gCuda->cloneTensor(sizes),
|
||||
gCuda->addOp<ResizeObj>(inputCuda, nullptr, std::nullopt, sizesCuda,
|
||||
nullptr, nullptr, ResizeObj::ECoeffMode::cubic,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch);
|
||||
gCuda->dataMalloc();
|
||||
inputCuda->copyin(
|
||||
vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
sizesCuda->copyin(vector<uint32_t>{1, 1, 9, 10});
|
||||
cudaRuntime->run(gCuda);
|
||||
// copy output from CUDA to CPU
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput(0));
|
||||
|
|
|
@ -25,6 +25,7 @@ TEST(CUDA_Slice, run) {
|
|||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
i->setData(IncrementalGenerator());
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
|
|
@ -16,14 +16,13 @@ TEST(cuDNN_Softmax, run_axis1) {
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu =
|
||||
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
@ -42,14 +41,13 @@ TEST(cuDNN_Softmax, run_axis0) {
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu =
|
||||
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
@ -67,14 +65,13 @@ TEST(cuDNN_Softmax2, run_axis1) {
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu =
|
||||
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(IncrementalGenerator());
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->setData(IncrementalGenerator());
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
@ -94,14 +91,13 @@ TEST(cuDNN_Softmax2, run_axis2) {
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu =
|
||||
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(IncrementalGenerator());
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->setData(IncrementalGenerator());
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
@ -121,14 +117,13 @@ TEST(cuDNN_Softmax2, run_axis3) {
|
|||
// Build input data on CPU
|
||||
Tensor inputCpu =
|
||||
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(IncrementalGenerator());
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->setData(IncrementalGenerator());
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
|
|
|
@ -19,9 +19,11 @@ TEST(Split, Cuda) {
|
|||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto op =
|
||||
gCuda->addOp<SplitObj>(gCuda->cloneTensor(input), std::nullopt, 1, 3);
|
||||
auto inputGpu = gCuda->cloneTensor(input);
|
||||
auto op = gCuda->addOp<SplitObj>(inputGpu, std::nullopt, 1, 3);
|
||||
gCuda->dataMalloc();
|
||||
inputGpu->setData(IncrementalGenerator());
|
||||
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
// copy output from CUDA to CPU
|
||||
|
|
|
@ -18,21 +18,22 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
|
||||
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
|
||||
auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr);
|
||||
cudaGraph->dataMalloc();
|
||||
inputGpu->setData(generator);
|
||||
cudaRuntime->run(cudaGraph);
|
||||
auto outputGpu = gpuOp->getOutput();
|
||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||
// CPU
|
||||
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||
auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
|
||||
cpuGraph->addTensor(inputCpu);
|
||||
cpuGraph->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
cpuRuntime->run(cpuGraph);
|
||||
auto outputCpu = cpuOp->getOutput();
|
||||
// Check
|
||||
|
|
|
@ -13,12 +13,11 @@ TEST(Gather, Cuda) {
|
|||
Graph g = make_ref<GraphObj>(runtime);
|
||||
auto input = g->addTensor({3, 2}, DataType::Float32);
|
||||
auto index = g->addTensor({2, 2}, DataType::UInt32);
|
||||
g->dataMalloc();
|
||||
input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
|
||||
index->copyin(vector<uint32_t>{0, 1, 1, 2});
|
||||
|
||||
auto op = g->addOp<GatherObj>(input, index, nullptr, 0);
|
||||
g->dataMalloc();
|
||||
input->copyin(vector<float>{1, 2, 3, 4, 5, 6});
|
||||
index->copyin(vector<uint32_t>{0, 1, 1, 2});
|
||||
runtime->run(g);
|
||||
|
||||
EXPECT_TRUE(
|
||||
|
@ -29,12 +28,11 @@ TEST(Gather, Cuda) {
|
|||
Graph g = make_ref<GraphObj>(runtime);
|
||||
auto input = g->addTensor({3, 3}, DataType::Float32);
|
||||
auto index = g->addTensor({1, 2}, DataType::UInt32);
|
||||
g->dataMalloc();
|
||||
input->setData(IncrementalGenerator());
|
||||
index->copyin(vector<uint32_t>{0, 2});
|
||||
|
||||
auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
|
||||
g->dataMalloc();
|
||||
input->setData(IncrementalGenerator());
|
||||
index->copyin(vector<uint32_t>{0, 2});
|
||||
runtime->run(g);
|
||||
|
||||
EXPECT_TRUE(
|
||||
|
@ -45,12 +43,11 @@ TEST(Gather, Cuda) {
|
|||
Graph g = make_ref<GraphObj>(runtime);
|
||||
auto input = g->addTensor({2, 4, 2}, DataType::Float32);
|
||||
auto index = g->addTensor({3, 1}, DataType::UInt32);
|
||||
g->dataMalloc();
|
||||
input->setData(IncrementalGenerator());
|
||||
index->copyin(vector<uint32_t>{0, 3, 1});
|
||||
|
||||
auto op = g->addOp<GatherObj>(input, index, nullptr, 1);
|
||||
g->dataMalloc();
|
||||
input->setData(IncrementalGenerator());
|
||||
index->copyin(vector<uint32_t>{0, 3, 1});
|
||||
runtime->run(g);
|
||||
|
||||
EXPECT_TRUE(op->getOutput()->equalData(
|
||||
|
|
|
@ -19,13 +19,12 @@ void testMatmulMkl(
|
|||
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
|
||||
auto ACpu = gCpu->addTensor(shapeA, DataType::Float32);
|
||||
auto BCpu = gCpu->addTensor(shapeB, DataType::Float32);
|
||||
gCpu->dataMalloc();
|
||||
ACpu->setData(generatorA);
|
||||
BCpu->setData(generatorB);
|
||||
|
||||
auto matmul = gCpu->addOp<MatmulObj>(ACpu, BCpu, nullptr, transA, transB);
|
||||
|
||||
gCpu->dataMalloc();
|
||||
ACpu->setData(generatorA);
|
||||
BCpu->setData(generatorB);
|
||||
cpuRuntime->run(gCpu);
|
||||
EXPECT_TRUE(matmul->getOutput()->equalData(ansVec));
|
||||
}
|
||||
|
|
|
@ -18,11 +18,15 @@ TEST(Resize, Mkl_downsample_sizes_nearest) {
|
|||
auto runtime = make_ref<MklRuntimeObj>();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
|
||||
auto op = g->addOp<ResizeObj>(g->cloneTensor(input), nullptr, std::nullopt,
|
||||
g->cloneTensor(sizes), nullptr, nullptr,
|
||||
ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ENearestMode::ceil);
|
||||
auto input2 = g->cloneTensor(input);
|
||||
auto sizes2 = g->cloneTensor(sizes);
|
||||
auto op =
|
||||
g->addOp<ResizeObj>(input2, nullptr, std::nullopt, sizes2, nullptr,
|
||||
nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch,
|
||||
ResizeObj::ENearestMode::ceil);
|
||||
g->dataMalloc();
|
||||
input2->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
|
||||
sizes2->copyin(vector<uint32_t>{1, 1, 1, 3});
|
||||
runtime->run(g);
|
||||
|
||||
EXPECT_TRUE(op->getOutput(0)->equalData(vector<float>{5, 7, 8}));
|
||||
|
|
|
@ -15,15 +15,15 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
|
|||
|
||||
// Build input data on CPU
|
||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
inputCpu->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
|
||||
// GPU
|
||||
Graph Graph = make_ref<GraphObj>(cpuRuntime);
|
||||
float min = 1.0;
|
||||
float max = 4.0;
|
||||
auto Op = Graph->addOp<T>(inputCpu, nullptr, min, max);
|
||||
Graph->addTensor(inputCpu);
|
||||
Graph->dataMalloc();
|
||||
inputCpu->setData(generator);
|
||||
cpuRuntime->run(Graph);
|
||||
auto output = Op->getOutput();
|
||||
inputCpu->printData();
|
||||
|
|
Loading…
Reference in New Issue