Add: TVM headers and CMake include paths

Liyan Zheng 2023-04-09 17:16:36 +08:00
parent e8b4e3f03f
commit 69d894e003
5 changed files with 223 additions and 11 deletions


@@ -16,6 +16,16 @@ cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
else()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_SOURCE_DIR directory")
include(${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
endif()
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++17 when on, -std=c++17 when off
@@ -63,6 +73,12 @@ include_directories(3rd-party/pybind11/include)
add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
# TVM and DMLC for invoking TVM packed functions
include_directories(${TVM_INCLUDE_DIR})
include_directories(${DMLC_INCLUDE_DIR})
include_directories(${DLPACK_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
if(BUILD_TEST)
set(BUILD_GMOCK
OFF
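Aside: the escaped DMLC_USE_LOGGING_LIBRARY define added above redirects dmlc-core to TVM's logging header, so both libraries share a single logging implementation. Conceptually, dmlc-core consumes the macro like this (a minimal sketch of the mechanism, not the actual dmlc source):

#ifdef DMLC_USE_LOGGING_LIBRARY
#include DMLC_USE_LOGGING_LIBRARY // expands to <tvm/runtime/logging.h>
#else
// otherwise dmlc-core falls back to its built-in logging macros
#endif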


@@ -7,10 +7,10 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr;
nnet::Expr expr, simplifiedExpr;
double exec_time;
std::string hint;
HashType hash;
HashType hash, simplifiedHash;
int n, f, h, w;
public:
@@ -27,11 +27,15 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {simplifiedExpr, simplifiedHash};
}
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
HashType getHash() const;
static HashType calcHash(nnet::Expr expr);
static bool checkOOB(nnet::Expr expr);
};
} // namespace infini
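For reference, callers are expected to consume the new getter via structured bindings, as the TVM kernel below does (a usage sketch; op is any Ref<MemBoundObj>):

auto [expr, hash] = op->getSimplifiedNnetExpr(); // simplified expr and its hash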


@@ -26,7 +26,7 @@ class TVMRecordObj : public PerfRecordObj {
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVM : public Kernel {
class MemboundTVMExtractSource : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
@@ -236,6 +236,6 @@ class MemboundTVM : public Kernel {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVM,
"Membound_TVM_Ansor");
// REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVMExtractSource,
// "Membound_TVM_Ansor_extract_source");
}; // namespace infini


@@ -0,0 +1,167 @@
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
#include "nnet/Visitor/AsTVMVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/dbg.h"
#include "operators/membound.h"
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
namespace py = pybind11;
namespace infini {
class TVMRecordObj : public PerfRecordObj {
// TODO: Add more attrs
public:
size_t logSize, ptxSize;
std::string log, ptx;
std::vector<int> invokeParams;
std::string kernelName;
HashType simplifiedExprHash;
};
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVMPackedFunction : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
// TODO: load the tuned TVM function and launch it with the
// parameters stored in tvmRecord
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(false, "A TVM record is required for the membound kernel.");
}
std::string getVarName(const Tensor &t) const {
return "var_" + std::to_string(t->getGuid());
}
// Premise: op must be idempotent since tune() may execute it multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
visitor.dispatch(expr);
auto &&stmts = visitor.getStmts();
auto &&inShapes = visitor.getInputShapes();
auto &&outShape = visitor.getOutputShape();
std::vector<std::string> inputs;
for (auto &&in : op->getInputs()) {
inputs.emplace_back(getVarName(in));
}
const std::string output = getVarName(op->getOutput());
const std::string func = "membound_" + std::to_string(hash);
const std::string kernelName = func + "_kernel0";
auto res = getAnsorCode(
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
outShape, "float32", stmts, func, inputs, output, op->toString(),
expr->toReadable(), hash);
// TODO: 1. Convert Tensor to DLTensor in convertTensorToDLTensor
// 2. Store and load TVM function
// 3. Prepare PerfRecordObj
// 4. Implement MemboundTVMPackedFunction::compute
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
}
/// @brief Invoke Ansor through the Python plugin to generate and tune a
/// CUDA kernel for the given expression.
/// @param inDims Shapes of the input tensors.
/// @param inDTypes Data types of the input tensors.
/// @param outDims Shape of the output tensor.
/// @param outDType Data type of the output tensor.
/// @param lambda TVM compute statements produced by AsTVMVisitor.
/// @param funcName Generated function name.
/// @param inputNames Input array names in the generated invocation code.
/// @param outputName Output array name in the generated invocation code.
/// @param nnetExprString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return The generated kernel source code and its launch parameters.
std::pair<std::string, std::vector<int>>
getAnsorCode(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims, const std::string &outDType,
const std::string &lambda, const std::string &funcName,
const std::vector<std::string> &inputNames,
const std::string &outputName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string funcCode;
std::vector<int> invokeParams;
try {
start_interpreter();
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_op");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
inputNames, outputName, nnetExprString,
nnetSimplifiedExprString, std::to_string(hashCode));
funcCode = py::str(code[0]);
auto temp = py::list(code[3]);
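// gen_ansor_op returns six launch parameters here
// (assumed: 3 grid dims + 3 block dims)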
for (int i = 0; i < 6; ++i) {
invokeParams.push_back(temp[i].cast<int>());
}
} catch (py::error_already_set &e) {
if (e.matches(PyExc_ImportError)) {
std::cerr << "Import Error. Don't forget to set environment "
"variable PYTHONPATH to contain "
"<repo-root>/python"
<< std::endl;
}
throw;
}
return std::make_pair(funcCode, invokeParams);
}
tvm::runtime::PackedFunc getPackedFunction(string path,
string functionName) const {
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(path);
return mod.GetFunction(functionName);
}
pair<DLTensor, Ref<vector<int64_t>>>
convertTensorToDLTensor(const Tensor &tensor) const {
IT_ASSERT_TODO(tensor->getRuntime()->isCuda());
// The lifecycle of shapeInt64 is managed by the caller.
auto shapeInt64 = make_ref<vector<int64_t>>();
shapeInt64->reserve(tensor->getDims().size());
for (auto v : tensor->getDims())
shapeInt64->push_back(v);
// TODO: fill in and return the DLTensor, e.g.:
// DLTensor ret{
//     .data = tensor->getRawDataPtr<void *>(),
//     .device = DLDevice{kDLCUDA, 0},
//     .ndim = (int32_t)shapeInt64->size(),
//     .dtype = DLDataType{(uint8_t)kDLFloat, 32, 1},
//     .shape = shapeInt64->data(),
//     .strides = nullptr,
//     .byte_offset = 0,
// };
// return {ret, shapeInt64};
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
MemboundTVMPackedFunction,
"Membound_TVM_Ansor_packed_function");
}; // namespace infini
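To orient the reader, the TODO list in tune() is the missing half of this kernel: the generated source must be compiled to a shared library, then loaded and invoked through the helpers above. A hypothetical sketch of that flow for a single-input op (the .so path and the compile step are assumptions; this commit leaves both unimplemented):

// Hypothetical completion sketch, not part of this commit.
auto kernel = getPackedFunction(func + ".so", kernelName);
auto [inDL, inShape] = convertTensorToDLTensor(op->getInputs()[0]);
auto [outDL, outShape] = convertTensorToDLTensor(op->getOutput());
kernel(&inDL, &outDL); // a TVM PackedFunc accepts DLTensor* arguments directly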


@@ -1,5 +1,7 @@
#include "operators/membound.h"
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
namespace infini {
@@ -10,7 +12,19 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
hash = getHash();
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
// Fuse stages in the nnet expression to reduce the number of kernels generated by TVM
if (auto mergedExpr =
nnet::MergeMemboundMutator({expr}).merge(false, true)) {
simplifiedExpr = mergedExpr;
IT_ASSERT(!checkOOB(simplifiedExpr));
simplifiedHash = calcHash(simplifiedExpr);
} else {
simplifiedExpr = expr;
simplifiedHash = hash;
}
}
string MemBoundObj::toString() const {
@@ -33,8 +47,14 @@ string MemBoundObj::toString() const {
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << "]";
os << ", ExprHash=" << hash << ")";
os << "\n" << (expr ? expr->toReadable() : "Empty expression") << "\n";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
os << ")\n";
os << ">>> Original expr\n"
<< (expr ? expr->toReadable() : "Empty expression") << "\n";
os << ">>> Simplified expr\n"
<< (simplifiedExpr ? simplifiedExpr->toReadable() : "Empty expression")
<< "\n";
return os.str();
}
@@ -49,13 +69,18 @@ optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
}
vector<int> MemBoundObj::getWorkloadVector() const {
return {enum_to_underlying(type), (int)hash};
return {enum_to_underlying(type), (int)simplifiedHash};
}
vector<int> MemBoundObj::getOpAttrVector() const { return getWorkloadVector(); }
HashType MemBoundObj::getHash() const {
HashType MemBoundObj::calcHash(nnet::Expr expr) {
return nnet::HashVisitor().dispatch(expr);
}
bool MemBoundObj::checkOOB(nnet::Expr expr) {
return nnet::CheckOOBVisitor().checkRangeOp(
nnet::as<nnet::RangeOpNode>(expr));
}
} // namespace infini