forked from jiuyuan/InfiniTensor
Add: resize operator and CUDA kernel, supporting nearest/linear coefficient modes (#51)
Squashed history: add resize operator and CUDA kernel; add linear coefficient mode and a scales input; add notLarger/notSmaller keep-aspect-ratio policies; add and fix tests, including more tests for linear mode.
This commit is contained in:
parent
63d8aff985
commit
c5966f8d81
@ -0,0 +1,17 @@
#pragma once
#include "cuda/cuda_common.h"

typedef struct {
    int nDims;
    int oDims[4];
    int inDims[4];
    int inStride[4];
    float scale[4];
} MetaData;

namespace infini {
void resize_kernel_nearest(float *in, float *out, const MetaData &metaData,
                           size_t num, int coordinateMode, int nearestMode);
void resize_kernel_linear(float *in, float *out, const MetaData &metaData,
                          size_t num, int coordinateMode);
} // namespace infini
@ -0,0 +1,83 @@
#pragma once

#include "core/operator.h"

namespace infini {
class ResizeObj : public OperatorObj {
  public:
    enum class ECoordinateTransMode {
        halfPixel,
        pytorchHalfPixel,
        alignCorners,
        asymmetric,
        tfCropAndResize
    };
    enum class ENearestMode { roundPreferFloor, roundPreferCeil, floor, ceil };
    enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller };
    enum class ECoeffMode { nearest, linear, cubic };

  private:
    vector<int> axes;
    vector<float> scales;
    ECoordinateTransMode coMode; // how the src coordinate is computed from the
                                 // dst coordinate
    ECoeffMode mode;             // coefficient mode: how the dst value is
                                 // computed from its src-coordinate neighborhood
    ENearestMode nearestMode;    // used in "nearest" mode; selects how the
                                 // "nearest" pixel is picked
    EKeepAspectRatioPolicy
        ratioPolicy; // used for computing the output shape when "sizes" is given

  public:
    // nearest mode, not tf_crop_and_resize
    ResizeObj(
        GraphObj *graph, Tensor input, Tensor output,
        const std::optional<vector<int>> &axes, Tensor sizes,
        EKeepAspectRatioPolicy ratioPolicy,
        ENearestMode nearestMode = ENearestMode::roundPreferFloor,
        ECoordinateTransMode coordTransMode = ECoordinateTransMode::halfPixel);
    ResizeObj(
        GraphObj *graph, Tensor input, Tensor output,
        const std::optional<vector<int>> &axes, Tensor scales,
        ENearestMode nearestMode = ENearestMode::roundPreferFloor,
        ECoordinateTransMode coordTransMode = ECoordinateTransMode::halfPixel);

    // linear mode
    ResizeObj(
        GraphObj *graph, Tensor input, Tensor output,
        const std::optional<vector<int>> &axes, Tensor sizes,
        EKeepAspectRatioPolicy ratioPolicy, ECoeffMode mode,
        ECoordinateTransMode coordTransMode = ECoordinateTransMode::halfPixel);
    ResizeObj(
        GraphObj *graph, Tensor input, Tensor output,
        const std::optional<vector<int>> &axes, Tensor scales, ECoeffMode mode,
        ECoordinateTransMode coordTransMode = ECoordinateTransMode::halfPixel);

    vector<DataType> inferDataType(const TensorVec &inputs) const override;
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
    int numInputs() const override { return 4; }
    int numOutputs() const override { return 1; }

    ECoeffMode getMode() const { return mode; }
    int getNearestMode() const { return enum_to_underlying(nearestMode); }
    int getKeepAxesRatioPolicy() const {
        return enum_to_underlying(ratioPolicy);
    }
    int getCoordinateTransMode() const { return enum_to_underlying(coMode); }
    float getScale(int i) const {
        IT_ASSERT((size_t)i < scales.size());
        return scales[i];
    }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;

    float round_int(float x) const;
    bool checkCoordinateTransValid(int resizedCo, int origiCo) const;
    void InitBySizes(Tensor input, Tensor sizes,
                     const std::optional<vector<int>> &axes);
    void InitByScales(Tensor input, Tensor scales,
                      const std::optional<vector<int>> &axes);
};
} // namespace infini
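Note: the header above declares four ResizeObj constructors (sizes vs. scales input, nearest vs. linear coefficients). A minimal usage sketch, mirroring the first CUDA test later in this commit; runtime setup and data initialization are abbreviated, and the variable names are only examples:

    Graph g = make_ref<GraphObj>(runtime);
    auto input = g->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto sizes = g->addTensor({4}, DataType::UInt32); // later filled with {1, 1, 1, 3}
    // sizes-based nearest resize with the "stretch" keep-aspect-ratio policy
    auto op = g->addOp<ResizeObj>(input, nullptr, std::nullopt, sizes,
                                  ResizeObj::EKeepAspectRatioPolicy::stretch);
    // op->getOutput()->getDims() == Shape{1, 1, 1, 3}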
@ -0,0 +1,47 @@
#include "operators/resize.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/resize.cuh"
namespace infini {
class ResizeCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ResizeObj>(_op);
        auto in = op->getInputs(0);
        auto out = op->getOutputs()[0];

        int nDims = in->getDims().size();
        if (nDims > 4)
            IT_TODO_HALT();

        MetaData metaData;
        memset(&metaData, 0, sizeof(metaData));
        metaData.nDims = nDims;
        for (int i = 0; i < nDims; ++i) {
            metaData.inDims[i] = in->getDims()[i];
            metaData.oDims[i] = out->getDims()[i];
            metaData.inStride[i] = in->getStride()[i];
            metaData.scale[i] = op->getScale(i);
        }

        switch (op->getMode()) {
        case ResizeObj::ECoeffMode::nearest:
            resize_kernel_nearest(in->getRawDataPtr<float *>(),
                                  out->getRawDataPtr<float *>(), metaData,
                                  out->size(), op->getCoordinateTransMode(),
                                  op->getNearestMode());
            break;
        case ResizeObj::ECoeffMode::linear:
            resize_kernel_linear(in->getRawDataPtr<float *>(),
                                 out->getRawDataPtr<float *>(), metaData,
                                 out->size(), op->getCoordinateTransMode());
            break;
        default:
            IT_TODO_HALT();
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Resize, DataType::Float32, ResizeCuda,
                "Resize_CUDA_Float32");

} // namespace infini
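Note: to make the MetaData packing above concrete, consider the contiguous 1x1x2x4 input downsampled to 1x1x1x3 in the first CUDA test below (sizes {1, 1, 1, 3}, "stretch" policy). Assuming a contiguous row-major layout, the struct would hold roughly the following values (shown only for illustration):

    MetaData m;
    m.nDims = 4;
    // input dims {1, 1, 2, 4}, output dims {1, 1, 1, 3}
    m.inDims[0] = 1; m.inDims[1] = 1; m.inDims[2] = 2; m.inDims[3] = 4;
    m.oDims[0] = 1;  m.oDims[1] = 1;  m.oDims[2] = 1;  m.oDims[3] = 3;
    // contiguous strides of the input
    m.inStride[0] = 8; m.inStride[1] = 8; m.inStride[2] = 4; m.inStride[3] = 1;
    // per-axis scales: sizes[i] / inDims[i] under the stretch policy
    m.scale[0] = 1.0f; m.scale[1] = 1.0f; m.scale[2] = 0.5f; m.scale[3] = 0.75f;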
@ -0,0 +1,208 @@
#include <cmath>
#include "cuda/cuda_common.h"
#include "cuda/resize.cuh"
#include <functional>

#ifndef GPU_LAMBDA
#define GPU_LAMBDA __device__
#endif

// nearest mode
__device__ int round_prefer_ceil(float x) {
    return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5);
}

__device__ int round_prefer_floor(float x) {
    // Round half down: an exact .5 fraction goes to the smaller integer.
    return ceil(x - 0.5);
}

__device__ int prefer_floor(float x) { return std::floor(x); }

__device__ int prefer_ceil(float x) { return std::ceil(x); }

// coordinate transform mode
__device__ float half_pixel(int idx, float scale, int, int) {
    return (idx + 0.5) / scale - 0.5;
}

__device__ float pytorch_half_pixel(int idx, float scale, int length_resized,
                                    int) {
    return length_resized > 1 ? (idx + 0.5) / scale - 0.5 : 0;
}

__device__ float align_corners(int idx, float scale, int length_resized,
                               int length_original) {
    if (length_resized == 1)
        return 0;
    return (float)idx * (float)(length_original - 1) /
           (float)(length_resized - 1);
}

__device__ float asymmetric(int idx, float scale, int length_resized,
                            int length_original) {
    return idx / scale;
}

/*
__device__ float tf_crop_and_resize(int idx, float scale, int length_resized,
                                    int length_original) {
}*/

// ATTENTION: The order of device functions in these arrays must be consistent
// with the order of the corresponding enums in ResizeObj.
using nearest_mod_func_t = int (*)(float);
__device__ nearest_mod_func_t p_nearest_mode_fun[] = {
    round_prefer_floor, round_prefer_ceil, prefer_floor, prefer_ceil};

using coordinate_trans_mod_func_t = float (*)(int idx, float scale,
                                              int lenResized, int lenOriginal);
__device__ coordinate_trans_mod_func_t p_cooridnate_trans_mode_func[] = {
    half_pixel, pytorch_half_pixel, align_corners, asymmetric};

template <typename T1, typename T2>
__device__ int nearestCoordinateTrans(int dOffset, MetaData metaData,
                                      T1 transModeFun, T2 nearestModeFun) {
    int sOffset = 0;
    for (int i = metaData.nDims - 1; i >= 0; --i) {
        int dIdx = dOffset % metaData.oDims[i];
        dOffset = dOffset / metaData.oDims[i];

        if (metaData.inDims[i] == metaData.oDims[i])
            sOffset += dIdx * metaData.inStride[i];
        else {
            float scale = (float)metaData.oDims[i] / (float)metaData.inDims[i];
            int sIdx = nearestModeFun(transModeFun(
                dIdx, scale, metaData.oDims[i], metaData.inDims[i]));
            if (sIdx > metaData.inDims[i] - 1)
                sIdx = metaData.inDims[i] - 1;
            else if (sIdx < 0)
                sIdx = 0;
            sOffset += sIdx * metaData.inStride[i];
        }
    }
    return sOffset;
}

__global__ void _resize_kernel_nearest(float *in, float *out, MetaData metaData,
                                       size_t num, int coordinateMode,
                                       int nearestMode) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    auto stride = blockDim.x * gridDim.x;

    while (tid < num) {
        int offset = nearestCoordinateTrans(
            tid, metaData, p_cooridnate_trans_mode_func[coordinateMode],
            p_nearest_mode_fun[nearestMode]);
        out[tid] = in[offset];
        tid += stride;
    }
}

// ATTENTION: make sure nDims <= 4, so at most 2^4 = 16 neighbors per output.
typedef struct {
    int offset[16];
    float power[16];
} NeighborList;

__device__ int getLimitIdx(int idx, int limit) {
    if (idx < 0)
        return 0;
    if (idx > limit)
        return limit;
    return idx;
}

__global__ void _resize_kernel_linear(float *in, float *out, MetaData metaData,
                                      size_t num, int coordinateMode) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    auto stride = blockDim.x * gridDim.x;

    while (tid < num) {
        auto dOffset = tid;
        auto neighborNum = 0;
        NeighborList neighborList;
        memset(&neighborList, 0, sizeof(neighborList));
        for (int i = metaData.nDims - 1; i >= 0; --i) {
            int dIdx = dOffset % metaData.oDims[i];
            float scale = metaData.scale[i];
            float sIdx = p_cooridnate_trans_mode_func[coordinateMode](
                dIdx, scale, scale * metaData.inDims[i], metaData.inDims[i]);

            int idx = std::floor(sIdx);
            float power = 1 - (sIdx - idx);

            // update neighborList
            if (metaData.inDims[i] == 1) {
                if (neighborNum == 0) {
                    neighborList.offset[0] = 0;
                    neighborList.power[0] = power;
                    neighborNum = 1;
                } else {
                    for (int j = 0; j < neighborNum; j++) {
                        neighborList.power[j] *= power;
                    }
                }
            } else {
                if (neighborNum == 0) {
                    neighborList.offset[0] =
                        getLimitIdx(idx, metaData.inDims[i] - 1) *
                        metaData.inStride[i];
                    neighborList.power[0] = power;
                    neighborList.offset[1] =
                        getLimitIdx(idx + 1, metaData.inDims[i] - 1) *
                        metaData.inStride[i];
                    neighborList.power[1] = 1 - power;
                    neighborNum = 2;
                } else {
                    for (int j = 0; j < neighborNum; j++) {
                        neighborList.offset[j + neighborNum] =
                            neighborList.offset[j] +
                            getLimitIdx(idx + 1, metaData.inDims[i] - 1) *
                                metaData.inStride[i];
                        neighborList.power[j + neighborNum] =
                            (neighborList.power[j]) * (1 - power);

                        neighborList.offset[j] +=
                            getLimitIdx(idx, metaData.inDims[i] - 1) *
                            metaData.inStride[i];
                        neighborList.power[j] *= power;
                    }
                    neighborNum *= 2;
                }
            }

            dOffset = dOffset / metaData.oDims[i];
        }

        float val = 0;
        for (int i = 0; i < neighborNum; ++i) {
            val += in[neighborList.offset[i]] * neighborList.power[i];
        }
        out[tid] = val;
        tid += stride;
    }
}

namespace infini {
void resize_kernel_nearest(float *in, float *out, const MetaData &metaData,
                           size_t num, int coordinateMode, int nearestMode) {
    int blocksize = 32 * 16;
    auto gridsize = (num + blocksize - 1) / blocksize;
    IT_ASSERT(coordinateMode < sizeof(p_cooridnate_trans_mode_func) /
                                   sizeof(p_cooridnate_trans_mode_func[0]));
    IT_ASSERT(nearestMode <
              sizeof(p_nearest_mode_fun) / sizeof(p_nearest_mode_fun[0]));
    // Launch configuration is <<<grid, block>>>; the grid-stride loop in the
    // kernel covers any remainder.
    _resize_kernel_nearest<<<gridsize, blocksize>>>(
        in, out, metaData, num, coordinateMode, nearestMode);
}

void resize_kernel_linear(float *in, float *out, const MetaData &metaData,
                          size_t num, int coordinateMode) {
    int blocksize = 32 * 16;
    auto gridsize = (num + blocksize - 1) / blocksize;
    IT_ASSERT(coordinateMode < sizeof(p_cooridnate_trans_mode_func) /
                                   sizeof(p_cooridnate_trans_mode_func[0]));
    _resize_kernel_linear<<<gridsize, blocksize>>>(in, out, metaData, num,
                                                   coordinateMode);
}
} // namespace infini
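Note: as a quick reference for the kernels above, half_pixel maps a destination index to x_src = (x_dst + 0.5) / scale - 0.5, and the linear kernel then weights the two bracketing source indices floor(x_src) and floor(x_src) + 1 with 1 - frac(x_src) and frac(x_src) respectively (clamped to the valid range). For example, with scale 0.6 as in the linear downsample test further below, the first output element maps to x_src = 0.5 / 0.6 - 0.5 ≈ 0.333, i.e. weights of roughly 0.667 and 0.333 on source indices 0 and 1.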
@ -69,6 +69,7 @@ std::string GatherObj::toString() const {
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> GatherObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
@ -0,0 +1,248 @@
#include "operators/resize.h"
#include <cmath>
namespace infini {
ResizeObj::ResizeObj(GraphObj *graph, Tensor input, Tensor output,
                     const std::optional<vector<int>> &axes, Tensor sizes,
                     EKeepAspectRatioPolicy ratioPolicy,
                     ENearestMode nearestMode,
                     ECoordinateTransMode coordTransMode)
    : OperatorObj(OpType::Resize, {input, nullptr, nullptr, sizes}, {output}),
      coMode(coordTransMode), mode(ECoeffMode::nearest),
      nearestMode(nearestMode), ratioPolicy(ratioPolicy) {
    if (coordTransMode == ECoordinateTransMode::tfCropAndResize)
        IT_TODO_HALT();
    InitBySizes(input, sizes, axes);

    IT_ASSERT(checkValid(graph));
}

ResizeObj::ResizeObj(GraphObj *graph, Tensor input, Tensor output,
                     const std::optional<vector<int>> &axes, Tensor scales,
                     ENearestMode nearestMode,
                     ECoordinateTransMode coordTransMode)
    : OperatorObj(OpType::Resize, {input, nullptr, scales, nullptr}, {output}),
      coMode(coordTransMode), mode(ECoeffMode::nearest),
      nearestMode(nearestMode) {
    InitByScales(input, scales, axes);

    IT_ASSERT(checkValid(graph));
}

ResizeObj::ResizeObj(GraphObj *graph, Tensor input, Tensor output,
                     const std::optional<vector<int>> &axes, Tensor sizes,
                     EKeepAspectRatioPolicy ratioPolicy, ECoeffMode mode,
                     ECoordinateTransMode coordTransMode)
    : OperatorObj(OpType::Resize, {input, nullptr, nullptr, sizes}, {output}),
      coMode(coordTransMode), mode(mode), ratioPolicy(ratioPolicy) {
    if (coordTransMode == ECoordinateTransMode::tfCropAndResize)
        IT_TODO_HALT();
    InitBySizes(input, sizes, axes);

    IT_ASSERT(checkValid(graph));
}

ResizeObj::ResizeObj(GraphObj *graph, Tensor input, Tensor output,
                     const std::optional<vector<int>> &axes, Tensor scales,
                     ECoeffMode mode, ECoordinateTransMode coordTransMode)
    : OperatorObj(OpType::Resize, {input, nullptr, scales, nullptr}, {output}),
      coMode(coordTransMode), mode(mode) {
    if (coordTransMode == ECoordinateTransMode::tfCropAndResize)
        IT_TODO_HALT();
    InitByScales(input, scales, axes);

    IT_ASSERT(checkValid(graph));
}

void ResizeObj::InitBySizes(Tensor input, Tensor sizes,
                            const std::optional<vector<int>> &axes) {
    IT_ASSERT(sizes != nullptr);
    size_t size = sizes->getDims()[0];
    IT_ASSERT(size == input->getDims().size() ||
              (axes != std::nullopt && size == (*axes).size()));

    if (axes == std::nullopt)
        for (size_t i = 0; i < input->getDims().size(); ++i)
            this->axes.emplace_back(i);
    else
        // check axes
        for (size_t i = 0; i < (*axes).size(); ++i) {
            auto val = (*axes)[i];
            if (val < 0)
                IT_TODO_HALT();
            IT_ASSERT((size_t)val < inputs[0]->getDims().size());
            this->axes.emplace_back(val);
        }

    // init this->scales
    for (size_t i = 0; i < input->getDims().size(); ++i) {
        this->scales.emplace_back(1);
    }

    // copy sizes data to host.
    IT_ASSERT(sizes->getDataBlob() != nullptr);
    Runtime runtime = CpuRuntimeObj::getInstance();
    int *data = (int *)runtime->alloc(sizes->getBytes());
    sizes->getRuntime()->copyBlobToCPU(
        (void *)data, sizes->getRawDataPtr<void *>(), sizes->getBytes());

    auto inDims = input->getDims();
    int n = this->axes.size();
    switch (ratioPolicy) {
    case EKeepAspectRatioPolicy::stretch:
        for (int i = 0; i < n; ++i)
            scales[this->axes[i]] =
                (float)data[i] / (float)inDims[this->axes[i]];
        break;
    case EKeepAspectRatioPolicy::notLarger: {
        float scale = (float)data[0] / (float)inDims[this->axes[0]];
        for (int i = 1; i < n; ++i) {
            auto tmp = (float)data[i] / (float)inDims[this->axes[i]];
            scale = scale < tmp ? scale : tmp;
        }
        for (int i = 0; i < n; ++i)
            scales[this->axes[i]] = scale;
        break;
    }
    case EKeepAspectRatioPolicy::notSmaller: {
        float scale = (float)data[0] / (float)inDims[this->axes[0]];
        for (int i = 1; i < n; ++i) {
            auto tmp = (float)data[i] / (float)inDims[this->axes[i]];
            scale = scale > tmp ? scale : tmp;
        }
        for (int i = 0; i < n; ++i)
            scales[this->axes[i]] = scale;
        break;
    }
    default:
        IT_ASSERT(0);
    }

    runtime->dealloc(data);
}

void ResizeObj::InitByScales(Tensor input, Tensor scales,
                             const std::optional<vector<int>> &axes) {
    IT_ASSERT(scales != nullptr);
    size_t size = scales->getDims()[0];
    IT_ASSERT(size == input->getDims().size() ||
              (axes != std::nullopt && size == (*axes).size()));

    // copy scales data to host.
    IT_ASSERT(scales->getDataBlob() != nullptr);
    Runtime runtime = CpuRuntimeObj::getInstance();
    float *data = (float *)runtime->alloc(scales->getBytes());
    scales->getRuntime()->copyBlobToCPU(
        (void *)data, scales->getRawDataPtr<void *>(), scales->getBytes());

    // init this->scales
    for (size_t i = 0; i < input->getDims().size(); ++i) {
        this->scales.emplace_back(1);
    }

    if (axes == std::nullopt)
        for (size_t i = 0; i < input->getDims().size(); ++i) {
            this->axes.emplace_back(i);
            IT_ASSERT(data[i] > 0);
            this->scales[i] = data[i];
        }
    else
        // check axes
        for (size_t i = 0; i < (*axes).size(); ++i) {
            auto val = (*axes)[i];
            if (val < 0)
                IT_TODO_HALT();
            IT_ASSERT((size_t)val < inputs[0]->getDims().size());
            this->axes.emplace_back(val);
            IT_ASSERT(data[i] > 0);
            this->scales[val] = data[i];
        }

    runtime->dealloc(data);
}

vector<DataType> ResizeObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 4);
    auto roi = inputs[1];
    auto scales = inputs[2];
    auto sizes = inputs[3];
    IT_ASSERT(roi == nullptr || roi->getDType() == DataType::Float32);
    IT_ASSERT(scales == nullptr || scales->getDType() == DataType::Float32);
    IT_ASSERT(sizes == nullptr || sizes->getDType() == DataType::UInt32);
    return {inputs[0]->getDType()};
}

bool ResizeObj::checkCoordinateTransValid(int resizedX, int origiX) const {
    if (ECoordinateTransMode::alignCorners == coMode) {
        return (!(resizedX <= 1 && origiX != resizedX));
    }
    return true;
}

float ResizeObj::round_int(float x) const {
    return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5);
}

// The output shape depends on the sizes/scales values.
optional<vector<Shape>> ResizeObj::inferShape(const TensorVec &inputs) const {
    auto inDims = inputs[0]->getDims();
    Shape ret = inDims;
    int nDim = inDims.size();
    for (int i = 0; i < nDim; ++i) {
        int size = round_int(scales[i] * inDims[i]);
        IT_ASSERT(checkCoordinateTransValid(size, inDims[i]));
        ret[i] = size;
    }

    return {{ret}};
}

std::string ResizeObj::toString() const {
    std::ostringstream os;
    os << "Resize"
       << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    if (inputs[1] != nullptr)
        os << "roi=" << vecToString(inputs[1]->getDims()) << ",";
    if (inputs[2] != nullptr)
        os << "scales=" << vecToString(inputs[2]->getDims()) << ",";
    if (inputs[3] != nullptr)
        os << "sizes=" << vecToString(inputs[3]->getDims()) << ",";
    os << "axes=" << vecToString(axes) << ",";
    os << "coMode=" << enum_to_underlying(coMode) << ",";
    os << "nearestMode=" << enum_to_underlying(nearestMode) << ",";
    os << "ratioPolicy=" << enum_to_underlying(ratioPolicy) << ",";

    os << "input=" << inputs[0]->getGuid() << ",";
    if (inputs[1] != nullptr)
        os << inputs[1]->getGuid() << ",";
    if (inputs[2] != nullptr)
        os << inputs[2]->getGuid() << ",";
    if (inputs[3] != nullptr)
        os << inputs[3]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ResizeObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
    for (size_t i = 0; i < outputs[0]->getDims().size(); ++i)
        ret.emplace_back(outputs[0]->getDims()[i]);
    // ratioPolicy only affects the output shape, so it is not needed here.
    ret.emplace_back(enum_to_underlying(coMode));
    ret.emplace_back(enum_to_underlying(nearestMode));
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> ResizeObj::getOpAttrVector() const {
    vector<int> ret = axes;
    ret.emplace_back(enum_to_underlying(coMode));
    ret.emplace_back(enum_to_underlying(nearestMode));
    ret.emplace_back(enum_to_underlying(ratioPolicy));
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

} // namespace infini
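Note: a worked example of the keep-aspect-ratio policies, taken from the shape-inference tests below: for a {1, 3, 2, 4} input with sizes {7, 8} on axes {2, 3}, the candidate scales are 7/2 = 3.5 and 8/4 = 2; notLarger applies the minimum (2) to both axes, giving {1, 3, 4, 8}, whereas notSmaller with sizes {2, 6, 8} on axes {1, 2, 3} applies the maximum of {2/3, 3, 2}, i.e. 3, giving {1, 9, 6, 12}.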
@ -0,0 +1,370 @@
#include <cmath>
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/resize.h"
#include "test.h"
namespace infini {
TEST(Resize, Cuda_downsample_sizes_nearest) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto sizes = gCpu->addTensor({4}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    sizes->copyData(vector<uint32_t>{1, 1, 1, 3});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, std::nullopt,
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::stretch);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{1, 2, 4}));
}

TEST(Resize, Cuda_upsample_sizes_nearest_notlarger) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto sizes = gCpu->addTensor({2}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    sizes->copyData(vector<uint32_t>{7, 8});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, vector<int>{2, 3},
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::notLarger,
        ResizeObj::ENearestMode::roundPreferFloor,
        ResizeObj::ECoordinateTransMode::halfPixel);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(
        vector<float>{1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                      1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4,
                      4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4}));
}

TEST(Resize, Cuda_upsample_sizes_nearest_notsmaller) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto sizes = gCpu->addTensor({2}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    sizes->copyData(vector<uint32_t>{7, 8});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op =
        gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                vector<int>{2, 3}, gCuda->cloneTensor(sizes),
                                ResizeObj::EKeepAspectRatioPolicy::notSmaller,
                                ResizeObj::ENearestMode::roundPreferFloor,
                                ResizeObj::ECoordinateTransMode::halfPixel);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,
        2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3,
        4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4}));
}

TEST(Resize, Cuda_upsample_sizes_nearest_ceil_half_pixel) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32);
    auto sizes = gCpu->addTensor({4}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(
        vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    sizes->copyData(vector<uint32_t>{1, 1, 8, 8});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, std::nullopt,
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::stretch,
        ResizeObj::ENearestMode::ceil,
        ResizeObj::ECoordinateTransMode::halfPixel);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto o = op->getOutput(0);
    // cudaPrintTensor(o);
    auto oCpu = gCpu->cloneTensor(o);
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        1,  2,  2,  3,  3,  4,  4,  4,  5,  6,  6,  7,  7,  8,  8,  8,
        5,  6,  6,  7,  7,  8,  8,  8,  9,  10, 10, 11, 11, 12, 12, 12,
        9,  10, 10, 11, 11, 12, 12, 12, 13, 14, 14, 15, 15, 16, 16, 16,
        13, 14, 14, 15, 15, 16, 16, 16, 13, 14, 14, 15, 15, 16, 16, 16}));
}

TEST(Resize, Cuda_upsample_sizes_nearest_floor_align_corners) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32);
    auto sizes = gCpu->addTensor({2}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(
        vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    sizes->copyData(vector<uint32_t>{8, 8});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, vector<int>{3, 2},
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::stretch,
        ResizeObj::ENearestMode::floor,
        ResizeObj::ECoordinateTransMode::alignCorners);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto o = op->getOutput(0);
    // cudaPrintTensor(o);
    auto oCpu = gCpu->cloneTensor(o);
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        1, 1, 1, 2,  2,  3,  3,  4,  1,  1,  1,  2,  2,  3,  3,  4,
        1, 1, 1, 2,  2,  3,  3,  4,  5,  5,  5,  6,  6,  7,  7,  8,
        5, 5, 5, 6,  6,  7,  7,  8,  9,  9,  9,  10, 10, 11, 11, 12,
        9, 9, 9, 10, 10, 11, 11, 12, 13, 13, 13, 14, 14, 15, 15, 16}));
}

TEST(Resize, Cuda_upsample_sizes_nearest_round_prefer_ceil_asymmetric) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32);
    auto sizes = gCpu->addTensor({4}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(
        vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    sizes->copyData(vector<uint32_t>{1, 1, 8, 8});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, std::nullopt,
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::stretch,
        ResizeObj::ENearestMode::roundPreferCeil,
        ResizeObj::ECoordinateTransMode::asymmetric);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto o = op->getOutput(0);
    // cudaPrintTensor(o);
    auto oCpu = gCpu->cloneTensor(o);
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        1,  2,  2,  3,  3,  4,  4,  4,  5,  6,  6,  7,  7,  8,  8,  8,
        5,  6,  6,  7,  7,  8,  8,  8,  9,  10, 10, 11, 11, 12, 12, 12,
        9,  10, 10, 11, 11, 12, 12, 12, 13, 14, 14, 15, 15, 16, 16, 16,
        13, 14, 14, 15, 15, 16, 16, 16, 13, 14, 14, 15, 15, 16, 16, 16}));
}

TEST(Resize, Cuda_downsample_scales_nearest) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scales->copyData(vector<float>{1, 1, 0.6, 0.6});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                      std::nullopt, gCuda->cloneTensor(scales));
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{1, 3}));
}

TEST(Resize, Cuda_upsample_scales_nearest) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    scales->copyData(vector<float>{1, 1, 2, 3});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                      std::nullopt, gCuda->cloneTensor(scales));
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
}

TEST(Resize, Cuda_upsample_scales_nearest_axes_3_2) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({2}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    scales->copyData(vector<float>{3, 2});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op =
        gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                vector<int>{3, 2}, gCuda->cloneTensor(scales));
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
                                      3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4}));
}

TEST(Resize, Cuda_downsample_scales_linear) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
    scales->copyData(vector<float>{1, 1, 0.6, 0.6});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                      std::nullopt, gCuda->cloneTensor(scales),
                                      ResizeObj::ECoeffMode::linear);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{2.6666665, 4.3333331}));
}

TEST(Resize, Cuda_upsample_scales_linear) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    scales->copyData(vector<float>{1, 1, 2, 2});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(gCuda->cloneTensor(input), nullptr,
                                      std::nullopt, gCuda->cloneTensor(scales),
                                      ResizeObj::ECoeffMode::linear);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{1, 1.25, 1.75, 2, 1.5, 1.75, 2.25, 2.5,
                                      2.5, 2.75, 3.25, 3.5, 3, 3.25, 3.75, 4}));
}

TEST(Resize, Cuda_upsample_scales_linear_align_corners) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32);
    auto scales = gCpu->addTensor({4}, DataType::Float32);
    gCpu->dataMalloc();
    input->copyData(vector<float>{1, 2, 3, 4});
    scales->copyData(vector<float>{1, 1, 2, 2});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, std::nullopt,
        gCuda->cloneTensor(scales), ResizeObj::ECoeffMode::linear,
        ResizeObj::ECoordinateTransMode::alignCorners);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);
    // cudaPrintTensor(op->getOutput(0));
    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        1, 1.333333, 1.666667, 2, 1.666667, 2, 2.333333, 2.666667, 2.333333,
        2.6666667, 3, 3.333333, 3, 3.333333, 3.6666667, 4}));
}

TEST(Resize, Cuda_downsample_sizes_linear_pytorchhalfpixel) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32);
    auto sizes = gCpu->addTensor({4}, DataType::UInt32);
    gCpu->dataMalloc();
    input->copyData(
        vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    sizes->copyData(vector<uint32_t>{1, 1, 3, 1});

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto op = gCuda->addOp<ResizeObj>(
        gCuda->cloneTensor(input), nullptr, std::nullopt,
        gCuda->cloneTensor(sizes), ResizeObj::EKeepAspectRatioPolicy::stretch,
        ResizeObj::ECoeffMode::linear,
        ResizeObj::ECoordinateTransMode::pytorchHalfPixel);
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda);

    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput(0));
    // cudaPrintTensor(op->getOutput(0));
    EXPECT_TRUE(oCpu->equalData(vector<float>{1.666667, 7, 12.33333}));
}

} // namespace infini
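Note: the expected values of Cuda_downsample_scales_linear above can be reproduced by hand. With scales {1, 1, 0.6, 0.6} the output shape is {1, 1, 1, 2}; under half_pixel the single output row maps to x_src = 0.5 / 0.6 - 0.5 ≈ 0.333 (row weights ≈ 0.667 / 0.333), and the two output columns map to ≈ 0.333 and 2.0, so out[0] ≈ 0.667 * (0.667 * 1 + 0.333 * 2) + 0.333 * (0.667 * 5 + 0.333 * 6) ≈ 2.667 and out[1] = 0.667 * 3 + 0.333 * 7 ≈ 4.333, matching {2.6666665, 4.3333331}.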
@ -0,0 +1,79 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/resize.h"
#include "test.h"

namespace infini {
TEST(Resize, ShapeInference) {
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    // downsample_sizes_nearest, no axes
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 1, 2, 4}, DataType::UInt32);
        Tensor sizes = g->addTensor({4}, DataType::UInt32);
        sizes->dataMalloc();
        sizes->copyData(vector<uint32_t>{1, 1, 1, 3});
        auto op =
            g->addOp<ResizeObj>(i, nullptr, std::nullopt, sizes,
                                ResizeObj::EKeepAspectRatioPolicy::stretch);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 1, 3}));
    }
    // upsample_sizes_nearest with axes
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 1, 2, 4}, DataType::UInt32);
        Tensor sizes = g->addTensor({2}, DataType::UInt32);
        sizes->dataMalloc();
        sizes->copyData(vector<uint32_t>{1, 3});
        auto op =
            g->addOp<ResizeObj>(i, nullptr, vector<int>{2, 3}, sizes,
                                ResizeObj::EKeepAspectRatioPolicy::stretch);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 1, 3}));
    }
    // upsample_sizes_nearest_notlarger
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 3, 2, 4}, DataType::UInt32);
        Tensor sizes = g->addTensor({2}, DataType::UInt32);
        sizes->dataMalloc();
        sizes->copyData(vector<uint32_t>{7, 8});
        auto op =
            g->addOp<ResizeObj>(i, nullptr, vector<int>{2, 3}, sizes,
                                ResizeObj::EKeepAspectRatioPolicy::notLarger);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 4, 8}));
    }
    // upsample_sizes_nearest_notsmaller
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 3, 2, 4}, DataType::UInt32);
        Tensor sizes = g->addTensor({3}, DataType::UInt32);
        sizes->dataMalloc();
        sizes->copyData(vector<uint32_t>{2, 6, 8});
        auto op =
            g->addOp<ResizeObj>(i, nullptr, vector<int>{1, 2, 3}, sizes,
                                ResizeObj::EKeepAspectRatioPolicy::notSmaller);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 9, 6, 12}));
    }
    // downsample_scales
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 1, 4, 4}, DataType::UInt32);
        Tensor scales = g->addTensor({3}, DataType::Float32);
        scales->dataMalloc();
        scales->copyData(vector<float>{1, 0.8, 0.8});
        auto op = g->addOp<ResizeObj>(i, nullptr, vector<int>{1, 2, 3}, scales);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 3, 3}));
    }
    // upsample_scales
    {
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 1, 2, 2}, DataType::UInt32);
        Tensor scales = g->addTensor({4}, DataType::Float32);
        scales->dataMalloc();
        scales->copyData(vector<float>{1, 1, 2, 2});
        auto op = g->addOp<ResizeObj>(i, nullptr, std::nullopt, scales);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 4, 4}));
    }
}

} // namespace infini
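Note: output dimensions in these shape-inference tests follow from rounding scale * dim to the nearest integer; e.g. the downsample_scales case with scales {1, 0.8, 0.8} on axes {1, 2, 3} of a {1, 1, 4, 4} input gives round(0.8 * 4) = round(3.2) = 3 on the last two axes, hence {1, 1, 3, 3}.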