diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h
index 7f514ebd..c93a355f 100644
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@@ -32,9 +32,9 @@ class GraphHandlerObj {
                               float momentum, float eps, bool training);
 
     Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw);
+                   int ph, int pw, int sh, int sw, int ceilMode);
     Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw);
+                   int ph, int pw, int sh, int sw, int ceilMode);
 
     Tensor add(Tensor a, Tensor b, Tensor c);
     Tensor sub(Tensor a, Tensor b, Tensor c);
diff --git a/include/operators/pooling.h b/include/operators/pooling.h
index a163ab0f..7f28224d 100644
--- a/include/operators/pooling.h
+++ b/include/operators/pooling.h
@@ -12,6 +12,7 @@ class PoolingObj : public OperatorObj {
     int dh, dw;
     int ph, pw;
     int sh, sw;
+    int ceilMode;
     int n, c, h, w;
 
   public:
@@ -32,9 +33,12 @@ class PoolingObj : public OperatorObj {
      * @param pw Padding at the width dimension.
      * @param sh Stride at the height dimension.
      * @param sw Stride at the width dimension.
+     * @param ceilMode Whether to use ceil(1) or floor(0) to compute the output
+     * shape.
      */
     PoolingObj(GraphObj *graph, OpType optype, Tensor input, Tensor output,
-               int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw);
+               int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw,
+               int ceilMode);
     OP_CLONE(PoolingObj);
 
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
@@ -50,6 +54,7 @@ class PoolingObj : public OperatorObj {
     int getPw() const { return pw; }
     int getSh() const { return sh; }
     int getSw() const { return sw; }
+    int getCeilMode() const { return ceilMode; }
 
     auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
     auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); }
@@ -62,15 +67,15 @@ class PoolingObj : public OperatorObj {
 class MaxPoolObj : public PoolingObj {
   public:
     MaxPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw,
-               int dh, int dw, int ph, int pw, int sh, int sw)
+               int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode)
         : PoolingObj(graph, OpType::MaxPool, input, output, kh, kw, dh, dw, ph,
-                     pw, sh, sw) {}
+                     pw, sh, sw, ceilMode) {}
 };
 class AvgPoolObj : public PoolingObj {
   public:
     AvgPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw,
-               int dh, int dw, int ph, int pw, int sh, int sw)
+               int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode)
         : PoolingObj(graph, OpType::AveragePool, input, output, kh, kw, dh, dw,
-                     ph, pw, sh, sw) {}
+                     ph, pw, sh, sw, ceilMode) {}
 };
 }; // namespace infini
diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index f8e53b1c..149c284f 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -228,11 +228,12 @@ class OnnxStub:
                             "dilations": [1, 1],
                             "pads": [0, 0, 0, 0],
                             "strides": [1, 1],
+                            "ceil_mode": 0,
                         },
                     )
-                    (k, d, p, s) = (
+                    (k, d, p, s, ceil_mode) = (
                         attributes[name]
-                        for name in ["kernel_shape", "dilations", "pads", "strides"]
+                        for name in ["kernel_shape", "dilations", "pads", "strides", "ceil_mode"]
                     )
                     if p[0] != p[2] or p[1] != p[3]:
                         adapt = "{}-adapt".format(node.output[0])
@@ -250,6 +251,7 @@ class OnnxStub:
                             0,
                             s[0],
                             s[1],
+                            ceil_mode,
                         )
                     else:
                         tensors[node.output[0]] = self.handler.maxPool(
@@ -263,6 +265,7 @@ class OnnxStub:
                             p[1],
                             s[0],
                             s[1],
+                            ceil_mode,
                         )
                 elif node.op_type == "AveragePool":
                     attributes = _parse_attribute(
@@ -271,10 +274,11 @@ class OnnxStub:
                             "kernel_shape": None,
                             "pads": [0, 0, 0, 0],
                             "strides": [1, 1],
+                            "ceil_mode": 0,
                         },
                     )
-                    (k, p, s) = (
-                        attributes[name] for name in ["kernel_shape", "pads", "strides"]
+                    (k, p, s, ceil_mode) = (
+                        attributes[name] for name in ["kernel_shape", "pads", "strides", "ceil_mode"]
                     )
                     if p[0] != p[2] or p[1] != p[3]:
                         adapt = "{}-adapt".format(node.output[0])
@@ -292,6 +296,7 @@ class OnnxStub:
                             0,
                             s[0],
                             s[1],
+                            ceil_mode,
                         )
                     else:
                         tensors[node.output[0]] = self.handler.avgPool(
@@ -305,6 +310,7 @@ class OnnxStub:
                             p[1],
                             s[0],
                             s[1],
+                            ceil_mode,
                         )
                 elif node.op_type == "GlobalAveragePool":
                     [_, _, h, w] = _search_shape(model, node.input[0])
@@ -319,6 +325,7 @@ class OnnxStub:
                         0,
                         1,
                         1,
+                        0,
                     )
                 elif node.op_type == "Add":
                     tensors[node.output[0]] = self.handler.add(
@@ -866,7 +873,7 @@ class OnnxStub:
                     )
                 )
             elif ty == backend.OpTypeId.MaxPool:
-                kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
+                kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op)
                 ctx.push_node(
                     make_node(
                         ty.name,
@@ -877,10 +884,11 @@ class OnnxStub:
                         pads=[ph, pw, ph, pw],
                         dilations=[dh, dw],
                         strides=[sh, sw],
+                        ceil_mode=ceil_mode,
                     )
                 )
             elif ty == backend.OpTypeId.AveragePool:
-                kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
+                kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op)
                 ctx.push_node(
                     make_node(
                         "AveragePool",
@@ -890,6 +898,7 @@ class OnnxStub:
                         kernel_shape=[kh, kw],
                         pads=[ph, pw, ph, pw],
                         strides=[sh, sw],
+                        ceil_mode=ceil_mode,
                     )
                 )
             elif ty in [
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 0f844c34..f662cf32 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -210,10 +210,6 @@ void GraphObj::dataMalloc() {
                                      tensorToOffset[tensor.get()]));
         }
     }
-
-#ifdef DEBUG_MODE
-    allocator.info();
-#endif
 }
 
 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc
index a804a8c7..7267fddf 100644
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@@ -95,30 +95,30 @@ Tensor GraphHandlerObj::batchNormalization(Tensor input, Tensor output,
 }
 
 Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
-                                int dh, int dw, int ph, int pw, int sh,
-                                int sw) {
+                                int dh, int dw, int ph, int pw, int sh, int sw,
+                                int ceilMode) {
     if (output) {
         g->addOpWithOutputs<MaxPoolObj>(std::move(input), output, kh, kw, dh,
-                                        dw, ph, pw, sh, sw);
+                                        dw, ph, pw, sh, sw, ceilMode);
         return output;
     } else {
         return g
             ->addOp<MaxPoolObj>(std::move(input), output, kh, kw, dh, dw, ph,
-                                pw, sh, sw)
+                                pw, sh, sw, ceilMode)
             ->getOutput();
     }
 }
 Tensor GraphHandlerObj::avgPool(Tensor input, Tensor output, int kh, int kw,
-                                int dh, int dw, int ph, int pw, int sh,
-                                int sw) {
+                                int dh, int dw, int ph, int pw, int sh, int sw,
+                                int ceilMode) {
     if (output) {
         g->addOpWithOutputs<AvgPoolObj>(std::move(input), output, kh, kw, dh,
-                                        dw, ph, pw, sh, sw);
+                                        dw, ph, pw, sh, sw, ceilMode);
         return output;
     } else {
         return g
             ->addOp<AvgPoolObj>(std::move(input), output, kh, kw, dh, dw, ph,
-                                pw, sh, sw)
+                                pw, sh, sw, ceilMode)
             ->getOutput();
     }
 }
diff --git a/src/core/lazy_allocator.cc b/src/core/lazy_allocator.cc
index a5014e5c..c3407320 100644
--- a/src/core/lazy_allocator.cc
+++ b/src/core/lazy_allocator.cc
@@ -145,10 +145,10 @@ void LazyAllocator::free(size_t addr, size_t size) {
 void *LazyAllocator::getPtr() {
     if (this->ptr == nullptr) {
         this->ptr = runtime->alloc(this->peak);
-#ifdef DEBUG_MODE
-        printf("LazyAllocator really alloc non-weight: %p %lu bytes\n",
-               this->ptr, peak);
-#endif
+        // #ifdef DEBUG_MODE
+        //         printf("LazyAllocator really alloc non-weight: %p %lu
+        //         bytes\n", this->ptr, peak);
+        // #endif
     }
     return this->ptr;
 }
@@ -156,10 +156,10 @@ void *LazyAllocator::getPtr() {
 void *LazyAllocator::getWeightPtr() {
     if (this->weightPtr == nullptr) {
         this->weightPtr = runtime->alloc(this->weightPeak);
-#ifdef DEBUG_MODE
-        printf("LazyAllocator really alloc weight: %p %lu bytes\n",
-               this->weightPtr, weightPeak);
-#endif
+        // #ifdef DEBUG_MODE
+        //         printf("LazyAllocator really alloc weight: %p %lu bytes\n",
+        //                this->weightPtr, weightPeak);
+        // #endif
     }
     return this->weightPtr;
 }
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index bea3f4bc..92feba2a 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -187,14 +187,14 @@ static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
                            batchnorm->getTrainingMode());
 }
 
-static std::tuple<int, int, int, int, int, int, int, int>
+static std::tuple<int, int, int, int, int, int, int, int, int>
 pool_attrs_of(Operator op) {
     IT_ASSERT(op->getOpType() == OpType::MaxPool ||
               op->getOpType() == OpType::AveragePool);
     auto pool = dynamic_cast<const PoolingObj *>(op.get());
     return std::make_tuple(pool->getKh(), pool->getKw(), pool->getDh(),
                            pool->getDw(), pool->getPh(), pool->getPw(),
-                           pool->getSh(), pool->getSw());
+                           pool->getSh(), pool->getSw(), pool->getCeilMode());
 }
 
 static std::tuple<std::optional<float>, std::optional<float>>
diff --git a/src/kernels/bang/pooling.cc b/src/kernels/bang/pooling.cc
index 6f907705..8a91b466 100644
--- a/src/kernels/bang/pooling.cc
+++ b/src/kernels/bang/pooling.cc
@@ -30,6 +30,7 @@ class PoolingCnnl : public BangKernelWithoutConfig {
             ph, pw, pw, sh, sw, dh, dw, false));
 
         // get outputs
+        // TODO: verify ceiling mode
         auto outVec = op->getOutput()->getDims();
         int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
         cnnlTensorDescriptor_t outDesc;
diff --git a/src/kernels/cpu/pooling.cc b/src/kernels/cpu/pooling.cc
index acb88d9c..1242e14f 100644
--- a/src/kernels/cpu/pooling.cc
+++ b/src/kernels/cpu/pooling.cc
@@ -21,6 +21,7 @@ template <typename T> class NativePooling : public CpuKernelWithoutConfig {
                 auto inoffset = i * (c * ih * iw) + j * ih * iw;
                 for (auto h = 0; h < oh; h++) {
                     for (auto w = 0; w < ow; w++) {
+                        // TODO: verify ceil mode
                         T val =
                             getPoolingValue(kh, kw, h * sh - ph, w * sw - pw,
                                             ih, iw, inptr + inoffset);
diff --git a/src/kernels/cuda/pooling.cc b/src/kernels/cuda/pooling.cc
index 90aa3944..d8b2e0f8 100644
--- a/src/kernels/cuda/pooling.cc
+++ b/src/kernels/cuda/pooling.cc
@@ -29,17 +29,27 @@ class poolingCudnn : public CudaKernelWithoutConfig {
             pw, sh, sw));
 
         // get outputs
-        int outn, outc, outh, outw;
-        checkCudnnError(cudnnGetPooling2dForwardOutputDim(
-            poolingDesc, inDesc, &outn, &outc, &outh, &outw));
+        auto outDims = op->getOutput()->getDims();
+        int outn = outDims[0], outc = outDims[1], outh = outDims[2],
+            outw = outDims[3];
+        // NOTICE: cudnn pooling does not support ceil mode, so the shape
+        // inference of cudnn pooling is not consistant with our framework. Ceil
+        // mode is also supported in Pytorch and ONNX. See
+        // https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
+        // and https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
+        // for reference.
+        // TODO: Make sure the result after considering ceil mode is correct.
+        // int outn, outc, outh, outw;
+        // checkCudnnError(cudnnGetPooling2dForwardOutputDim(poolingDesc,
+        // inDesc, &outn, &outc, &outh, &outw));
         cudnnTensorDescriptor_t outDesc;
         checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
         checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
                                                    CUDNN_DATA_FLOAT, outn, outc,
                                                    outh, outw));
-        IT_ASSERT((vector{outn, outc, outh, outw}) ==
-                      op->getOutput()->getDims(),
-                  "cuDNN output shape mismatches with OP output shape");
+        // IT_ASSERT((vector{outn, outc, outh, outw}) ==
+        //               op->getOutput()->getDims(),
+        //           "cuDNN output shape mismatches with OP output shape");
 
         float alpha = 1.f, beta = 0.f;
         checkCudnnError(cudnnPoolingForward(context->cudnnHandle(), poolingDesc,
diff --git a/src/operators/pooling.cc b/src/operators/pooling.cc
index d7153699..b1bb2e3d 100644
--- a/src/operators/pooling.cc
+++ b/src/operators/pooling.cc
@@ -4,11 +4,9 @@ namespace infini {
 
 PoolingObj::PoolingObj(GraphObj *graph, OpType optype, Tensor input,
                        Tensor output, int kh, int kw, int dh, int dw, int ph,
-                       int pw, int sh, int sw)
-    : OperatorObj(optype, {input}, {output}),
-
-      kh(kh), kw(kw), dh(dh), dw(dw), ph(ph), pw(pw), sh(sh), sw(sw),
-
+                       int pw, int sh, int sw, int ceilMode)
+    : OperatorObj(optype, {input}, {output}), kh(kh), kw(kw), dh(dh), dw(dw),
+      ph(ph), pw(pw), sh(sh), sw(sw), ceilMode(ceilMode),
       n(input->getDims()[0]), c(input->getDims()[1]), h(input->getDims()[2]),
       w(input->getDims()[3]) {
     IT_ASSERT(checkValid(graph));
@@ -18,8 +16,14 @@ optional<vector<Shape>> PoolingObj::inferShape(const TensorVec &inputs) const {
     const auto &input = inputs[0];
     auto h = input->getDims()[input->getRank() - 2],
          w = input->getDims()[input->getRank() - 1];
-    int oh = (h - (kh - sh) + ph * 2) / sh;
-    int ow = (w - (kw - sw) + pw * 2) / sw;
+    int oh, ow;
+    if (ceilMode) {
+        oh = ceil(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1);
+        ow = ceil(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1);
+    } else {
+        oh = floor(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1);
+        ow = floor(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1);
+    }
     auto ret = input->getDims();
     ret[input->getRank() - 2] = oh;
     ret[input->getRank() - 1] = ow;
@@ -34,17 +38,19 @@ std::string PoolingObj::toString() const {
     os << "p=[" << ph << "," << pw << "],";
     os << "s=[" << sh << "," << sw << "],";
     os << "d=[" << dh << "," << dw << "],";
+    os << "ceil mode=" << ceilMode << ",";
     os << "input=" << inputs[0]->getGuid() << ",";
     os << "output=" << outputs[0]->getGuid() << ")";
     return os.str();
 }
 
 vector<int> PoolingObj::getWorkloadVector() const {
-    return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw};
+    return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw,
+            ceilMode};
 }
 
 vector<int> PoolingObj::getOpAttrVector() const {
-    return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw};
+    return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw, ceilMode};
 }
 
 }; // namespace infini
diff --git a/test/core/test_graph_replace.cc b/test/core/test_graph_replace.cc
index a77624ec..cada8860 100644
--- a/test/core/test_graph_replace.cc
+++ b/test/core/test_graph_replace.cc
@@ -208,16 +208,13 @@ TEST(MatchGraph, multi_output) {
     SubGraph subg0 = make_ref<SubGraphObj>(runtime, TensorVec{i});
     {
         auto maxpool =
-            subg0->addOp<MaxPoolObj>(i, nullptr, 3, 3, 0, 0, 0, 0, 2, 2);
+            subg0->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 0);
         Tensor w0 = subg0->addTensor(Shape{64, 192, 1, 1}, DataType::UInt32);
         auto conv0 = subg0->addOp<ConvObj>(maxpool->getOutput(0), w0, nullptr);
         auto relu0 = subg0->addOp<ReluObj>(conv0->getOutput(0), nullptr);
 
-        auto pad = subg0->addOp<PadObj>(maxpool->getOutput(0), nullptr,
-                                        vector<int>{0, 0, 1, 1, 0, 0, 1, 1},
-                                        std::nullopt);
-        auto avgpool = subg0->addOp<AvgPoolObj>(pad->getOutput(0), nullptr, 3,
-                                                3, 0, 0, 0, 0, 1, 1);
+        auto avgpool = subg0->addOp<AvgPoolObj>(maxpool->getOutput(0), nullptr,
+                                                3, 3, 0, 0, 0, 0, 1, 1, 0);
         subg0->setOutputs(
             TensorVec{relu0->getOutput(0), avgpool->getOutput(0)});
     }
@@ -225,8 +222,9 @@ TEST(MatchGraph, multi_output) {
     SubGraph subg1 =
         make_ref<SubGraphObj>(runtime, TensorVec{i->clone(runtime)});
     {
-        auto avgpool = subg1->addOp<AvgPoolObj>(
-            subg1->getInputsFromOutside()[0], nullptr, 3, 3, 0, 0, 0, 0, 2, 2);
+        auto avgpool =
+            subg1->addOp<AvgPoolObj>(subg1->getInputsFromOutside()[0], nullptr,
+                                     3, 3, 1, 1, 0, 0, 2, 2, 0);
 
         auto relu0 = subg1->addOp<ReluObj>(avgpool->getOutput(0), nullptr);
 
@@ -295,7 +293,7 @@ TEST(MatchGraph, multi_input_output) {
         Tensor w2 = subg0->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
         auto conv2 = subg0->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
         auto maxpool = subg0->addOp<MaxPoolObj>(relu1->getOutput(0), nullptr, 3,
-                                                3, 0, 0, 0, 0, 2, 2);
+                                                3, 1, 1, 0, 0, 2, 2, 0);
         subg0->setOutputs(
             TensorVec{conv2->getOutput(0), maxpool->getOutput(0)});
     }
@@ -317,7 +315,7 @@ TEST(MatchGraph, multi_input_output) {
         Tensor w2 = subg1->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
         auto conv2 = subg1->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
         auto maxpool = subg1->addOp<MaxPoolObj>(relu1->getOutput(0), nullptr, 3,
-                                                3, 0, 0, 0, 0, 2, 2);
+                                                3, 1, 1, 0, 0, 2, 2, 0);
         subg1->setOutputs(
             TensorVec{maxpool->getOutput(0), conv2->getOutput(0)});
     }
@@ -338,7 +336,7 @@ TEST(MatchGraph, multi_input_output) {
         Tensor w2 = subg2->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32);
         auto conv2 = subg2->addOp<ConvObj>(relu1->getOutput(0), w2, nullptr);
         auto avgpool = subg2->addOp<AvgPoolObj>(relu1->getOutput(0), nullptr, 3,
-                                                3, 0, 0, 0, 0, 2, 2);
+                                                3, 1, 1, 0, 0, 2, 2, 0);
         subg2->setOutputs(
             TensorVec{conv2->getOutput(0), avgpool->getOutput(0)});
     }
@@ -349,7 +347,7 @@ TEST(MatchGraph, multi_input_output) {
         auto i = g->addTensor(Shape{1, 64, 112, 112}, DataType::UInt32);
         auto relu = g->addOp<ReluObj>(i, nullptr);
         auto maxPool = g->addOp<MaxPoolObj>(relu->getOutput(0), nullptr, 3, 3,
-                                            0, 0, 1, 1, 2, 2);
+                                            1, 1, 1, 1, 2, 2, 0);
         auto out0 =
             v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
         auto out1 =
diff --git a/test/kernels/bang/test_bang_pooling.cc b/test/kernels/bang/test_bang_pooling.cc
index 20347ae9..4bbc8091 100644
--- a/test/kernels/bang/test_bang_pooling.cc
+++ b/test/kernels/bang/test_bang_pooling.cc
@@ -8,7 +8,8 @@
 
 namespace infini {
 
-template <class T>
+template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
+                                           int>::type = 0>
 void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
                  const Shape &shape) {
     // Runtime
@@ -23,7 +24,8 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
+    auto gpuOp =
+        bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
     bangGraph->dataMalloc();
     bangRuntime->run(bangGraph);
     auto outputGpu = gpuOp->getOutput();
diff --git a/test/kernels/cuda/test_cuda_inception.cc b/test/kernels/cuda/test_cuda_inception.cc
index 31a7b888..b6765648 100644
--- a/test/kernels/cuda/test_cuda_inception.cc
+++ b/test/kernels/cuda/test_cuda_inception.cc
@@ -29,7 +29,7 @@ TEST(CUDA_Inception_v3_block, run) {
     TensorVec outputs;
     vector<OpVec> ops;
     auto maxpool =
-        g->addOp<MaxPoolObj>(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1);
+        g->addOp<MaxPoolObj>(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1, 0);
     auto chainInput = maxpool->getOutput();
     for (auto &pathConfig : configs) {
         int inputChannels = initialChannels;
@@ -52,7 +52,7 @@ TEST(CUDA_Inception_v3_block, run) {
                 inputChannels = f;
             } else { // Add AveragePool
                 auto pool = g->addOp<AvgPoolObj>(input, nullptr, r, r, 1, 1,
-                                                 r / 2, r / 2, 1, 1);
+                                                 r / 2, r / 2, 1, 1, 0);
                 input = pool->getOutput();
                 ops.back().emplace_back(pool);
             }
diff --git a/test/kernels/cuda/test_cuda_pooling.cc b/test/kernels/cuda/test_cuda_pooling.cc
index 7347b951..7e9a6deb 100644
--- a/test/kernels/cuda/test_cuda_pooling.cc
+++ b/test/kernels/cuda/test_cuda_pooling.cc
@@ -9,7 +9,8 @@ namespace infini {
 using KDPS = vector<int>;
 using ExpectOutput = vector<float>;
 
-template <class T>
+template <class T, typename std::enable_if<std::is_base_of<PoolingObj, T>{},
+                                           int>::type = 0>
 void testPoolCudnn(
     const std::function<void(void *, size_t, DataType)> &generator,
     const Shape &shape, const KDPS &kdps, const ExpectOutput &ansVec) {
@@ -24,7 +25,7 @@ void testPoolCudnn(
     Graph g = make_ref<GraphObj>(cudaRuntime);
     auto i0 = g->cloneTensor(i0cpu);
     auto pool = g->addOp<T>(i0, nullptr, kdps[0], kdps[1], kdps[2], kdps[3],
-                            kdps[4], kdps[5], kdps[6], kdps[7]);
+                            kdps[4], kdps[5], kdps[6], kdps[7], 0);
 
     // allocate CUDA memory
     g->dataMalloc();
diff --git a/test/operators/test_pooling.cc b/test/operators/test_pooling.cc
index 8b4c52ed..1ddefa13 100644
--- a/test/operators/test_pooling.cc
+++ b/test/operators/test_pooling.cc
@@ -12,16 +12,16 @@ TEST(MaxPool, ShapeInference) {
         Graph g = make_ref<GraphObj>(cpuRuntime);
         Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
         const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
-                  sw = 2;
-        auto op =
-            g->addOp<MaxPoolObj>(i, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
+                  sw = 2, ceilMode = 0;
+        auto op = g->addOp<MaxPoolObj>(i, nullptr, kh, kw, dh, dw, ph, pw, sh,
+                                       sw, ceilMode);
         EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 80, 80}));
     }
 
     { // dilation & stride
         Graph g = make_ref<GraphObj>(cpuRuntime);
         Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32);
-        auto op = g->addOp<MaxPoolObj>(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2);
+        auto op = g->addOp<MaxPoolObj>(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2, 0);
         EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 163, 81}));
     }
 }
@@ -30,7 +30,7 @@ TEST(MaxPool, NaiveCPU) {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     Graph g = make_ref<GraphObj>(cpuRuntime);
     Tensor i = g->addTensor({1, 2, 5, 5}, DataType::UInt32);
-    auto op = g->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
+    auto op = g->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
 
     g->dataMalloc();
     i->setData(IncrementalGenerator());
@@ -49,7 +49,7 @@ TEST(AvgPool, NaiveCPU) {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     Graph g = make_ref<GraphObj>(cpuRuntime);
     Tensor i = g->addTensor({1, 2, 5, 5}, DataType::Float32);
-    auto op = g->addOp<AvgPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2);
+    auto op = g->addOp<AvgPoolObj>(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
 
     g->dataMalloc();
     i->setData(IncrementalGenerator());