Add: TVM headers and CMake include paths

Liyan Zheng 2023-04-09 17:16:36 +08:00
parent e8b4e3f03f
commit 69d894e003
5 changed files with 223 additions and 11 deletions


@@ -16,6 +16,16 @@ cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
else()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_SOURCE_DIR directory")
include(${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
endif()
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++17 when on, -std=c++17 when off
@@ -63,6 +73,12 @@ include_directories(3rd-party/pybind11/include)
add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
# TVM and DMLC for invoking TVM packed functions
include_directories(${TVM_INCLUDE_DIR})
include_directories(${DMLC_INCLUDE_DIR})
include_directories(${DLPACK_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
if(BUILD_TEST)
set(BUILD_GMOCK
OFF
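Aside: the escaped DMLC_USE_LOGGING_LIBRARY define added above redirects dmlc-core to TVM's logging header, so both libraries share a single logging implementation. Conceptually, dmlc-core consumes the macro like this (a minimal sketch of the mechanism, not the actual dmlc source):

#ifdef DMLC_USE_LOGGING_LIBRARY
#include DMLC_USE_LOGGING_LIBRARY // expands to <tvm/runtime/logging.h>
#else
// otherwise dmlc-core falls back to its built-in logging macros
#endif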


@@ -7,10 +7,10 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr;
nnet::Expr expr, simplifiedExpr;
double exec_time;
std::string hint;
HashType hash;
HashType hash, simplifiedHash;
int n, f, h, w;
public:
@@ -27,11 +27,15 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {simplifiedExpr, simplifiedHash};
}
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
HashType getHash() const;
static HashType calcHash(nnet::Expr expr);
static bool checkOOB(nnet::Expr expr);
};
} // namespace infini
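For reference, callers are expected to consume the new getter via structured bindings, as the TVM kernel below does (a usage sketch; op is any Ref<MemBoundObj>):

auto [expr, hash] = op->getSimplifiedNnetExpr(); // simplified expr and its hash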


@@ -26,7 +26,7 @@ class TVMRecordObj : public PerfRecordObj {
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVM : public Kernel {
class MemboundTVMExtractSource : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
@@ -236,6 +236,6 @@ class MemboundTVM : public Kernel {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVM,
"Membound_TVM_Ansor");
// REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVMExtractSource,
// "Membound_TVM_Ansor_extract_source");
}; // namespace infini


@@ -0,0 +1,167 @@
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
#include "nnet/Visitor/AsTVMVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/dbg.h"
#include "operators/membound.h"
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
namespace py = pybind11;
namespace infini {
class TVMRecordObj : public PerfRecordObj {
// TODO: Add more attrs
public:
size_t logSize, ptxSize;
std::string log, ptx;
std::vector<int> invokeParams;
std::string kernelName;
HashType simplifiedExprHash;
};
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVMPackedFunction : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
// TODO: load the tuned TVM function and launch it with the
// parameters stored in tvmRecord
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(false, "A TVM record is required for the membound kernel.");
}
std::string getVarName(const Tensor &t) const {
return "var_" + std::to_string(t->getGuid());
}
// Premise: op must be idempotent since tune() may execute it multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
visitor.dispatch(expr);
auto &&stmts = visitor.getStmts();
auto &&inShapes = visitor.getInputShapes();
auto &&outShape = visitor.getOutputShape();
std::vector<std::string> inputs;
for (auto &&in : op->getInputs()) {
inputs.emplace_back(getVarName(in));
}
const std::string output = getVarName(op->getOutput());
const std::string func = "membound_" + std::to_string(hash);
const std::string kernelName = func + "_kernel0";
auto res = getAnsorCode(
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
outShape, "float32", stmts, func, inputs, output, op->toString(),
expr->toReadable(), hash);
// TODO: 1. Convert Tensor to DLTensor in convertTensorToDLTensor
// 2. Store and load TVM function
// 3. Prepare PerfRecordObj
// 4. Implement MemboundTVMPackedFunction::compute
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
}
/// @brief Invoke Ansor through the Python plugin to generate and tune a
/// CUDA kernel for the given expression.
/// @param inDims Shapes of the input tensors.
/// @param inDTypes Data types of the input tensors.
/// @param outDims Shape of the output tensor.
/// @param outDType Data type of the output tensor.
/// @param lambda TVM compute statements produced by AsTVMVisitor.
/// @param funcName Generated function name.
/// @param inputNames Input array names in the generated invocation code.
/// @param outputName Output array name in the generated invocation code.
/// @param nnetExprString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return The generated kernel source code and its launch parameters.
std::pair<std::string, std::vector<int>>
getAnsorCode(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims, const std::string &outDType,
const std::string &lambda, const std::string &funcName,
const std::vector<std::string> &inputNames,
const std::string &outputName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string funcCode;
std::vector<int> invokeParams;
try {
start_interpreter();
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_op");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
inputNames, outputName, nnetExprString,
nnetSimplifiedExprString, std::to_string(hashCode));
funcCode = py::str(code[0]);
auto temp = py::list(code[3]);
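// gen_ansor_op returns six launch parameters here
// (assumed: 3 grid dims + 3 block dims)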
for (int i = 0; i < 6; ++i) {
invokeParams.push_back(temp[i].cast<int>());
}
} catch (py::error_already_set &e) {
if (e.matches(PyExc_ImportError)) {
std::cerr << "Import Error. Don't forget to set environment "
"variable PYTHONPATH to contain "
"<repo-root>/python"
<< std::endl;
}
throw;
}
return std::make_pair(funcCode, invokeParams);
}
tvm::runtime::PackedFunc getPackedFunction(string path,
string functionName) const {
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(path);
return mod.GetFunction(functionName);
}
pair<DLTensor, Ref<vector<int64_t>>>
convertTensorToDLTensor(const Tensor &tensor) const {
IT_ASSERT_TODO(tensor->getRuntime()->isCuda());
// The lifecycle of shapeInt64 is managed by the caller.
auto shapeInt64 = make_ref<vector<int64_t>>();
shapeInt64->reserve(tensor->getDims().size());
for (auto v : tensor->getDims())
shapeInt64->push_back(v);
// TODO: fill in and return the DLTensor, e.g.:
// DLTensor ret{
//     .data = tensor->getRawDataPtr<void *>(),
//     .device = DLDevice{kDLCUDA, 0},
//     .ndim = (int32_t)shapeInt64->size(),
//     .dtype = DLDataType{(uint8_t)kDLFloat, 32, 1},
//     .shape = shapeInt64->data(),
//     .strides = nullptr,
//     .byte_offset = 0,
// };
// return {ret, shapeInt64};
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
MemboundTVMPackedFunction,
"Membound_TVM_Ansor_packed_function");
}; // namespace infini
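To orient the reader, the TODO list in tune() is the missing half of this kernel: the generated source must be compiled to a shared library, then loaded and invoked through the helpers above. A hypothetical sketch of that flow for a single-input op (the .so path and the compile step are assumptions; this commit leaves both unimplemented):

// Hypothetical completion sketch, not part of this commit.
auto kernel = getPackedFunction(func + ".so", kernelName);
auto [inDL, inShape] = convertTensorToDLTensor(op->getInputs()[0]);
auto [outDL, outShape] = convertTensorToDLTensor(op->getOutput());
kernel(&inDL, &outDL); // a TVM PackedFunc accepts DLTensor* arguments directly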


@@ -1,5 +1,7 @@
#include "operators/membound.h"
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
namespace infini {
@@ -10,7 +12,19 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
hash = getHash();
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
// Fuse stages in the nnet expression to reduce the number of kernels generated by TVM
if (auto mergedExpr =
nnet::MergeMemboundMutator({expr}).merge(false, true)) {
simplifiedExpr = mergedExpr;
IT_ASSERT(!checkOOB(simplifiedExpr));
simplifiedHash = calcHash(simplifiedExpr);
} else {
simplifiedExpr = expr;
simplifiedHash = hash;
}
}
string MemBoundObj::toString() const {
@@ -33,8 +47,14 @@ string MemBoundObj::toString() const {
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << "]";
os << ", ExprHash=" << hash << ")";
os << "\n" << (expr ? expr->toReadable() : "Empty expression") << "\n";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
os << ")\n";
os << ">>> Original expr\n"
<< (expr ? expr->toReadable() : "Empty expression") << "\n";
os << ">>> Simplified expr\n"
<< (simplifiedExpr ? simplifiedExpr->toReadable() : "Empty expression")
<< "\n";
return os.str();
}
@@ -49,13 +69,18 @@ optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
}
vector<int> MemBoundObj::getWorkloadVector() const {
return {enum_to_underlying(type), (int)hash};
return {enum_to_underlying(type), (int)simplifiedHash};
}
vector<int> MemBoundObj::getOpAttrVector() const { return getWorkloadVector(); }
HashType MemBoundObj::getHash() const {
HashType MemBoundObj::calcHash(nnet::Expr expr) {
return nnet::HashVisitor().dispatch(expr);
}
bool MemBoundObj::checkOOB(nnet::Expr expr) {
return nnet::CheckOOBVisitor().checkRangeOp(
nnet::as<nnet::RangeOpNode>(expr));
}
} // namespace infini