diff --git a/3rd-party/backward-cpp b/3rd-party/backward-cpp
index 3bb9240c..f30744bc 160000
--- a/3rd-party/backward-cpp
+++ b/3rd-party/backward-cpp
@@ -1 +1 @@
-Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294
+Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283
diff --git a/3rd-party/googletest b/3rd-party/googletest
index b796f7d4..e2239ee6 160000
--- a/3rd-party/googletest
+++ b/3rd-party/googletest
@@ -1 +1 @@
-Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1
+Subproject commit e2239ee6043f73722e7aa812a459f54a28552929
diff --git a/3rd-party/nlohmann_json_cmake_fetchcontent b/3rd-party/nlohmann_json_cmake_fetchcontent
index 13132dd3..6aebf092 160000
--- a/3rd-party/nlohmann_json_cmake_fetchcontent
+++ b/3rd-party/nlohmann_json_cmake_fetchcontent
@@ -1 +1 @@
-Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9
+Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756
diff --git a/3rd-party/pybind11 b/3rd-party/pybind11
index 0bd8896a..1e3400b6 160000
--- a/3rd-party/pybind11
+++ b/3rd-party/pybind11
@@ -1 +1 @@
-Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406
+Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae
diff --git a/include/cuda/cuda_runtime.h b/include/cuda/cuda_runtime.h
index a4a0114a..b0b181aa 100644
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@@ -1,7 +1,6 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
-#include "nnet/dbg.h"
 
 namespace infini {
 
@@ -31,7 +30,6 @@ class CudaRuntimeObj : public RuntimeObj {
     void sync() const;
     CudaPtr alloc(size_t size) override {
         void *ptr;
-        // dbg(size);
         checkCudaError(cudaMalloc(&ptr, size));
         allocatedGPUMemorySize += size;
         allocationMap[ptr] = size;
diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc
index c94607b1..d190a689 100644
--- a/src/core/search_engine.cc
+++ b/src/core/search_engine.cc
@@ -502,9 +502,10 @@ Graph SearchEngine::fuseVertically(const Graph &graph) {
         auto bestGraph = make_ref(runtimeExec, chainOps);
         // Eliminate transpose and reshape operators
-        // if (auto eliminatedGraph = mutator->eliminateVertically(
-        //         make_ref(runtimeExec, chainOps)))
-        //     bestGraph = eliminatedGraph;
+        // FIXME: current Relu only support 3D and 4D tensors
+        if (auto eliminatedGraph = mutator->eliminateVertically(
+                make_ref(runtimeExec, chainOps)))
+            bestGraph = eliminatedGraph;
         // Fuse membound operators
         if (auto optGraph = mutator->fuseVertically(bestGraph))
             bestGraph = optGraph;
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index ba81fa21..627bb021 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -2,7 +2,6 @@
 #include "core/blob.h"
 #include "core/operator.h"
 #include "core/runtime.h"
-#include "nnet/dbg.h"
 #include "utils/dataloader.h"
 #include
 #include
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
index c8d08092..da65104a 100644
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -3,6 +3,7 @@
 #include "core/perf_engine.h"
 #include "core/runtime.h"
 #include "cuda_profiler_api.h"
+#include "nnet/dbg.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
 #ifdef INFINI_USE_TVM
@@ -165,9 +166,9 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
     auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
     // Since one TVM packed function may contaion more than one CUDA kernel, the
     // number of captured kernels may exceed the number of operators.
-    // IT_ASSERT(numCudaGraphNodes >= kernels.size(),
-    //           std::to_string(numCudaGraphNodes) +
-    //               " != " + std::to_string(kernels.size()));
+    IT_ASSERT(numCudaGraphNodes >= kernels.size(),
+              std::to_string(numCudaGraphNodes) +
+                  " != " + std::to_string(kernels.size()));
     printf("numCudaGraphNodes = %lu\n", numCudaGraphNodes);
     return timeit(
         [&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
diff --git a/src/kernels/cuda/conv.cc b/src/kernels/cuda/conv.cc
index 7f5f6031..cf214cb3 100644
--- a/src/kernels/cuda/conv.cc
+++ b/src/kernels/cuda/conv.cc
@@ -82,6 +82,9 @@ class convCudnn : public Kernel {
         // get kernels
         cudnnFilterDescriptor_t knDesc;
         checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
+        // FIXME: filter data layout is not changed with input data layout
+        // since FCRS shows better performance for NHWC inputs in some cases.
+        // This should be tunable.
         checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                    CUDNN_TENSOR_NCHW, f,
                                                    channelsPerGrp, r, s));
diff --git a/src/kernels/cuda/conv2dreduce.cc b/src/kernels/cuda/conv2dreduce.cc
index 2afbfc47..aaf1b5f6 100644
--- a/src/kernels/cuda/conv2dreduce.cc
+++ b/src/kernels/cuda/conv2dreduce.cc
@@ -41,4 +41,4 @@ REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduce, DataType::Float32,
 REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduceTranspose, DataType::Float32,
                 Conv2dReduceCuda, "Conv2dReduceTranspose_CUDA_Float32");
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc
index 00a56860..ee4e590b 100644
--- a/src/kernels/cuda/unary.cc
+++ b/src/kernels/cuda/unary.cc
@@ -32,7 +32,6 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
         } else if (dim.size() == 3) {
             n = 1, c = dim[0], h = dim[1], w = dim[2];
         } else {
-            dbg(vecToString(dim));
             IT_TODO_HALT();
         }
 
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index 8941bd23..8e9c75fa 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -574,24 +574,21 @@ Graph NMutator::transformConvToGEMMReduce(Operator _op) {
     IT_ASSERT(inputDims[2] == w);
     IT_ASSERT(inputDims[3] == c);
     const DataType dtype = A->getDType();
-    // IT_ASSERT(outputDims[0] == n);
-    // IT_ASSERT(outputDims[1] == h);
-    // IT_ASSERT(outputDims[2] == w);
-    // IT_ASSERT(outputDims[3] == f);
     auto g = make_ref(runtime);
-    dbg(vecToString(inputDims));
-    dbg(vecToString(weightDims));
     auto newA = g->addTensor(
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
-    auto newW = g->addTensor(
-        {weightDims[3], weightDims[0] * weightDims[1] * weightDims[2]}, dtype);
+
+    // If use Matmul with transpose 0,0
     // auto newW = g->addTensor(
-    //     {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
-    //     dtype);
+    //     {weightDims[3], weightDims[0] * weightDims[1] * weightDims[2]}, dtype);
+
+    // If use Matmul with transpose 0, 1
+    auto newW = g->addTensor(
+        {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
+        dtype);
     g->addOpWithOutputs(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs(g->cloneTensor(W), newW, newW->getDims());
-    Tensor newO = g->addOp(newA, newW, nullptr, 0, 0)->getOutput();
+    Tensor newO = g->addOp(newA, newW, nullptr, 0, 1)->getOutput();
     auto new1 = g->addTensor({n, h, w, f, r, s}, dtype);
     g->addOpWithOutputs(newO, new1, new1->getDims());
     g->addOpWithOutputs(
@@ -605,39 +602,27 @@ Graph NMutator::transformConvTranposeToGEMMReduce(Operator _op) {
         return nullptr;
     const auto &A = op->getInputs()[0];
     const auto &W = op->getInputs()[1];
+    // f is the de-facto input channel for ConvTranspose
     const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
     const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
     const Shape inputDims = op->getInputs(0)->getDims();
     const Shape weightDims = op->getInputs(1)->getDims();
     const Shape outputDims = op->getOutput()->getDims();
-    dbg(vecToString(inputDims));
-    dbg(vecToString(weightDims));
-    dbg(vecToString(op->getOutput()->getDims()));
-    // dbg(vecToString(op->getNCHWFRS());
-    IT_ASSERT(weightDims[0] == f);
-    IT_ASSERT(weightDims[1] == r);
-    IT_ASSERT(weightDims[2] == s);
-    IT_ASSERT(weightDims[3] == c);
-    IT_ASSERT(inputDims[0] == n);
-    IT_ASSERT(inputDims[1] == h);
-    IT_ASSERT(inputDims[2] == w);
-    IT_ASSERT(inputDims[3] == f);
     const DataType dtype = A->getDType();
     auto g = make_ref(runtime);
-    auto newA = g->addTensor(
+    auto newA = g->addTensor( // [N,H,W,F]
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
-    auto newW = g->addTensor(
+    auto newW = g->addTensor( // [F, CRS]
         {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
-        dtype); // hack
+        dtype); // HACK: this should be a transpose
-    // auto newW = g->addTensor(
-    //     {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
-    //     dtype);
     g->addOpWithOutputs(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs(g->cloneTensor(W), newW, newW->getDims());
+    // newO [NHW, CRS]
     Tensor newO = g->addOp(newA, newW, nullptr, 0, 0)->getOutput();
     auto new1 = g->addTensor({n, h, w, c, r, s}, dtype);
     g->addOpWithOutputs(newO, new1, new1->getDims());
+    // [NHW, CRS] -> [N,H,W,C]
     g->addOpWithOutputs(
         new1, nullptr, g->cloneTensor(op->getOutput()), false, 0.f, ph, pw);
     return g;
diff --git a/src/operators/conv2dreduce.cc b/src/operators/conv2dreduce.cc
index 798e6d9c..86d3700d 100644
--- a/src/operators/conv2dreduce.cc
+++ b/src/operators/conv2dreduce.cc
@@ -95,4 +95,4 @@ Conv2dReduceTranspose::inferShape(const TensorVec &inputs) const {
     return {{{on, oh, ow, of}}};
 }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/test/nnet/run_models_nnet.py b/test/nnet/run_models_nnet.py
index 008a7c21..383a4393 100644
--- a/test/nnet/run_models_nnet.py
+++ b/test/nnet/run_models_nnet.py
@@ -155,7 +155,7 @@ if __name__ == "__main__":
         # construct_convTranspose2d(runtime)
         # (load_onnx(runtime, '/mnt/auxHome/models/einnet/fsrcnn.bs1.onnx'), 'fsrcnn.bs1'),
         (ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
-        (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16")
+        (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
         # (construct_conv_nhwc(runtime, 1, 56, 32, 32, 12, 1, 1, 0, 1, 1), 'conv1x1')
     ]