forked from jiuyuan/InfiniTensor
Compare commits
39 Commits
Author | SHA1
---|---
Liyan Zheng | 28bf2f0e83
Liyan Zheng | 133d91bc76
Liyan Zheng | 37f3e9cf22
Liyan Zheng | 885a978016
Liyan Zheng | a65dbff5f9
Liyan Zheng | da3f5605e5
Liyan Zheng | 45f7841d83
Liyan Zheng | 7a1d271c79
Liyan Zheng | b2c53458d9
Liyan Zheng | e76f2d0f10
Liyan Zheng | f881db7ceb
Liyan Zheng | b72662bb9b
Liyan Zheng | 582de83629
Liyan Zheng | b6b37ccf33
Liyan Zheng | 9a009be337
Liyan Zheng | f0785225d7
Liyan Zheng | f939ebf8bb
Liyan Zheng | 31b03ef91a
Liyan Zheng | bd76a5c8d6
Liyan Zheng | 83d5842f56
Liyan Zheng | 005c113948
Liyan Zheng | 0d70643867
Liyan Zheng | f29f54687c
huangshuhong | 67e9fb9632
huangshuhong | af80830d56
huangshuhong | f19bd4d1d6
huangshuhong | 2e501de8db
Liyan Zheng | 31b1edc600
Liyan Zheng | 69d894e003
Liyan Zheng | e8b4e3f03f
Liyan Zheng | 0d8685b1e0
Liyan Zheng | 04858012bf
Liyan Zheng | 6aaf963f50
Liyan Zheng | 6441abb88f
huangshuhong | 220b76f355
Liyan Zheng | aac4bd4650
Liyan Zheng | 7478b698dc
huangshuhong | 8e6fc4136e
Liyan Zheng | bd12b67ea3
@@ -16,6 +16,16 @@ cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST

set(DEFAULT_BUILD_TYPE "RelWithDebInfo")

if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
  message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
  include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
else()
  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
    message(STATUS "Using config.cmake in CMAKE_CURRENT_SOURCE_DIR directory")
    include(${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
  endif()
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
@@ -63,6 +73,19 @@ include_directories(3rd-party/pybind11/include)
add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)

# TVM backend
if(BUILD_TEST_EINNET)
  if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
    message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
  endif()
  # TVM and DMLC for invoking TVM packed functions
  include_directories(${TVM_INCLUDE_DIR})
  include_directories(${DMLC_INCLUDE_DIR})
  include_directories(${DLPACK_INCLUDE_DIR})
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
  set(CMAKE_CXX_FLAGS "-DINFINI_USE_TVM=1") # Enable TVM codegen kernels
endif()

if(BUILD_TEST)
  set(BUILD_GMOCK
      OFF
@@ -100,6 +123,11 @@ endif()

target_link_libraries(InfiniTensor pybind11::embed)

# TVM backend
if(BUILD_TEST_EINNET)
  target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
endif()

# Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS})
@@ -219,5 +247,9 @@ if(BUILD_TEST)
endif()
if(BUILD_TEST_EINNET)
  build_test(test/nnet/test_*.cc)

  # Build expression reader
  add_executable(nnet_reader test/nnet/readlog.cc)
  target_link_libraries(nnet_reader InfiniTensor)
endif()
endif()
@@ -0,0 +1,13 @@
set(TVM_HOME "/home/zly/Apps/tvm-v0.10.0")
set(TVM_INCLUDE_DIR "${TVM_HOME}/include")
set(TVM_LIB_DIR "${TVM_HOME}/build")
set(DMLC_INCLUDE_DIR "${TVM_HOME}/3rdparty/dmlc-core/include")
set(DLPACK_INCLUDE_DIR "${TVM_HOME}/3rdparty/dlpack/include")

set(USE_CUDA ON)
set(USE_BANG OFF)

set(BUILD_TEST ON)
set(BUILD_TEST_CORE ON)
set(BUILD_TEST_PET OFF)
set(BUILD_TEST_EINNET ON)
@@ -16,9 +16,13 @@ class GraphObj : public Object {
    string toString() const override;
    Runtime getRuntime() const { return runtime; }

    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
                     TensorType tensorType = TensorType::Other);
    Tensor addTensor(const Tensor &tensor);
    TensorVec addTensor(const TensorVec &tensors);
    /**
     * @brief Clone a tensor and add it to the graph.
     */
    Tensor cloneTensor(const Tensor &tensor) {
        return addTensor(tensor->clone(runtime));
    }

@@ -79,6 +83,8 @@ class GraphObj : public Object {
        return ret;
    }

    bool selfCheck(bool assert = false) const;

  private:
    /**
     * @brief Add reverse connections and Op relationship in ctor.
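The new `addTensor` overload threads a `TensorType` through tensor creation, and `cloneTensor` registers a copy that keeps the source tensor's `Fuid`. A minimal usage sketch, assuming an already-constructed `Runtime` handle and illustrative shapes that are not taken from this diff:

```cpp
// Hedged sketch: create tensors with an explicit TensorType and clone one.
// `runtime` is assumed to be an existing Runtime instance.
Graph g = make_ref<GraphObj>(runtime);
Tensor x = g->addTensor({1, 3, 224, 224}, DataType::Float32, TensorType::Input);
Tensor y = g->addTensor({1, 1000}); // defaults to Float32 and TensorType::Other
Tensor x2 = g->cloneTensor(x);      // the clone shares x's Fuid
```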
@@ -30,6 +30,7 @@ class Mutator {
    virtual bool isMultiBranchMergable(const Graph &in_graph) {
        IT_TODO_HALT();
    }
    virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
};

} // namespace infini
@@ -39,6 +39,7 @@ enum class OpType {
    Tanh,
    Abs,
    Resize,
    PRelu,
    //
    MemBound = 300,
};
@@ -1,5 +1,6 @@
#pragma once
#include "core/common.h"
#include "core/object.h"
#include "core/ref.h"
#include <memory>

@@ -59,10 +60,11 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
     * execution happens.
     *
     * @param graph
     * @param profiling Whether to print breakdown of time
     * @param printProfiling Whether to print breakdown of time
     * @return double Return the sum of perf time for each operator
     */
    double getPerfTime(const Graph &graph, bool profiling = false) const;
    double getPerfTime(const Graph &graph, bool printProfiling = false,
                       bool allowEstimation = false) const;
    Blob allocBlob(size_t size);
    bool isCpu() const {
        return device == Device::CPU || device == Device::MKL;

@@ -77,6 +79,11 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                              size_t bytes) const = 0;
    virtual string toString() const = 0;

    map<UidBaseType, bool>
    getCompileTimeComputableAttribute(const Graph &graph) const;

    double timeNonCtcOperators(const Graph &graph) const;

  protected:
    void printProfilingData(double totTime,
                            const std::map<OpType, double> &opTime,
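`getPerfTime` renames its profiling switch and gains an `allowEstimation` flag, while `getCompileTimeComputableAttribute` and `timeNonCtcOperators` separate out operators whose outputs appear to be compile-time computable (CTC). A hedged usage sketch; `runtime` and `graph` are assumed to exist:

```cpp
// Hedged sketch of the extended profiling interface.
double total = runtime->getPerfTime(graph, /*printProfiling=*/true,
                                    /*allowEstimation=*/true);
double nonCtc = runtime->timeNonCtcOperators(graph);             // skip CTC ops
auto ctcMap = runtime->getCompileTimeComputableAttribute(graph); // UID -> is CTC
```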
@@ -4,19 +4,15 @@
#include "graph.h"
#include "mutator.h"

#include <unordered_map>

namespace infini {
class SearchEngine {
  private:
    Runtime runtimeExec;
    Ref<Mutator> mutator;
    std::function<bool(const Graph &, const Graph &)> graphTimeComparer;

  public:
    SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
        runtimeExec = _runtime;
        mutator = _mutator;
    }
    SearchEngine(Runtime runtime, Ref<Mutator> mutator);
    ~SearchEngine() {}

  private: // Configurations

@@ -24,24 +20,17 @@ class SearchEngine {
        3; // cut nodes whose #in + #out >= partitionThreshold
    size_t GRAPH_SIZE = 16; // num of best graphs.

  private: // Composed objects
    std::shared_ptr<Mutator> mutationEngine;

  public:
    std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
    struct GroupEdge {
        int v, next;
        GroupEdge() = delete;
    };

    struct Candidate { // a graph with perf
        std::shared_ptr<Graph> graph;
        double perf = INFINITY;
    };
    class MetaGraph { // a graph of subgraphs, for searching.
      public:
        MetaGraph() {}
        ~MetaGraph() {}
    // struct Candidate { // a graph with perf
    //     Graph graph;
    //     double perf = INFINITY;
    // };
    struct MetaGraphObj { // a graph of subgraphs, for searching.
        struct Node {
            Graph graph;
            std::vector<int> suc;

@@ -50,31 +39,33 @@ class SearchEngine {
        };
        std::vector<Node> nodes;
    };
    using MetaGraph = Ref<MetaGraphObj>;

    Graph run(const Graph graph); // entrance of search engine.
    Graph run(const Graph graph); // entrance to search engine.
    std::vector<Graph> search(const Graph &graph); // search for a partition.

  private:
    std::vector<Graph> partitionGraph(const Graph graph);
    std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
    std::shared_ptr<MetaGraph>
    buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
                           const std::vector<int> &plan);
    MetaGraph buildMetaGraphWithGraph(const Graph graph);
    MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
                                     const std::vector<int> &plan);
    // search horizontal merges
    std::vector<std::shared_ptr<MetaGraph>>
    searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
    void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
                        std::vector<int> &plan, std::vector<int> &frontier,
    std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
    void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
                        std::vector<int> &frontier,
                        std::vector<std::vector<int>> &plans,
                        std::unordered_set<uint64_t> &planSet);
    std::vector<Graph>
    searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
    std::vector<Graph> searchMutation(const MetaGraph &metaGraph);

    void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
    void printMetaGraph(MetaGraph metaGraph);
    /**
     * @brief Check whether a multi-branch graph can be merged into a single
     * branch.
     */
    bool isMultiBranchMergable(const Graph graph);
    Graph fuseVertically(const Graph &graph);

    double getEstimatedGraphPerf(Graph graph);
};

} // namespace infini
@@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
class TensorObj : public TensorBaseObj {
  private:
    Shape shape;
    size_t _size; // Cache of Π(shape).
    Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                  // scratch have a new id.

    TensorType tensorType;
    void copyin(const void *ptr, size_t size) {
        runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
    }

@@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
    }

  public:
    TensorObj(Shape shape, DataType dtype, Runtime runtime);
    TensorObj(Shape shape, DataType dtype, Runtime runtime,
              TensorType tensorType = TensorType::Other);
    virtual ~TensorObj() {}
    string toString() const override;

@@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
    size_t getOffset(const vector<int> &ds) const;
    void dataMalloc();
    UidBaseType getFuid() const { return fuid; }
    TensorType getTensorType() const { return tensorType; }

    void load(std::string file_path);
    void save(std::string file_path);

@@ -69,6 +72,9 @@ class TensorObj : public TensorBaseObj {

    void copyData(const TensorObj *src);
    void copyData(const Tensor &src) { copyData(src.get()); }

    // FIXME: std::function copies the generator instead of passing it by ref.
    // Thus the internal state of generator cannot be updated.
    void setData(
        const std::function<void(void *, size_t, DataType)> &generator) const;
    Tensor clone() const {

@@ -92,29 +98,31 @@ class TensorObj : public TensorBaseObj {
    }

    void printData() const;
    bool equalData(const Tensor &rhs) const;
    bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;

    template <typename T> bool equalData(const vector<T> &dataVector) {
        IT_ASSERT(DataType::get<T>() == dtype);
        IT_ASSERT(size() == dataVector.size());
        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size(),
                             1e-6);
    }

    size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;

  private:
    void printDataFloat() const;
    void printDataUint32_t() const;
    void printDataFloat(float *ptr) const;
    void printDataUint32_t(uint32_t *ptr) const;

    template <typename T>
    bool equalDataImpl(const T *a, const T *b, size_t size) const {
    bool equalDataImpl(const T *a, const T *b, size_t size,
                       double relativeError) const {
        for (size_t i = 0; i < size; ++i) {
            if constexpr (std::is_integral_v<T>) {
                if (a[i] != b[i])
                    return false;
            } else if constexpr (std::is_floating_point_v<T>) {
                if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
                    1e-6) {
                    relativeError) {
                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                    return false;
                }
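`equalData` and `equalDataImpl` now take an explicit relative error instead of a hard-coded 1e-6. A hedged usage sketch with illustrative tensors and data:

```cpp
// Hedged sketch: compare tensor contents against a vector and another tensor.
vector<float> expected = {1.0f, 2.0f, 3.0f};
bool strict = output->equalData(expected);       // template overload, 1e-6
bool loose = output->equalData(reference, 1e-3); // |a-b| / max(|a|,|b|) <= 1e-3
```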
@@ -23,9 +23,8 @@
        const char *errName; \
        if (CUDA_SUCCESS != err) { \
            cuGetErrorString(err, &errName); \
            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    errName); \
            exit(EXIT_FAILURE); \
            IT_ASSERT(err == CUDA_SUCCESS, \
                      (string("CU error: ") + string(errName))); \
        } \
    }
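The driver-API check now funnels failures through `IT_ASSERT` instead of printing and calling `exit`, so errors propagate like other InfiniTensor assertion failures. A hedged usage sketch of the macro on ordinary CUDA driver calls:

```cpp
// Hedged sketch: a failing call now raises an IT_ASSERT error carrying the CU
// error name instead of terminating the process.
CUdevice device;
checkCUresult(cuInit(0));
checkCUresult(cuDeviceGet(&device, 0));
```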
@@ -11,18 +11,8 @@ class CudaRuntimeObj : public RuntimeObj {
    CudaPtr workspace;
    size_t workspaceSize;

  public:
    CUdevice cuDevice;
    CUcontext newContext;

  public:
    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
        // Prepare for nvrtc. cuCtxCreate should be called before others.
        // Otherwise it will result in strange failure, such as cuBLAS failed
        // on certain inputs.
        checkCUresult(cuInit(0));
        checkCUresult(cuDeviceGet(&cuDevice, 0));
        checkCUresult(cuCtxCreate(&newContext, 0, cuDevice));

        checkCudnnError(cudnnCreate(&cudnn));
        checkCublasError(cublasCreate(&cublas));

@@ -32,10 +22,13 @@ class CudaRuntimeObj : public RuntimeObj {
        workspace = alloc(workspaceSize);
    }
    virtual ~CudaRuntimeObj() {
        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
        checkCUresult(cuCtxDestroy(newContext));
        try {
            dealloc(workspace);
            checkCudnnError(cudnnDestroy(cudnn));
            checkCublasError(cublasDestroy(cublas));
        } catch (const std::exception &e) {
            std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
        }
    }
    string toString() const override;
@@ -0,0 +1,9 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"

namespace infini {
namespace callback {
void exportONNX(const Graph &graph, const string &path);
}
} // namespace infini
@@ -22,10 +22,11 @@ class HashVisitor : public Functor<HashType(void)> {
    HashType visit_(const Subscript &c) override;
    HashType visit_(const Tensor &c) override;
    HashType visit_(const Var &c) override;
    HashType visit_(const Func &c) override;

  public:
    HashVisitor(int _verobse = 0) : Functor(_verobse) {}
    HashType getHash(const Expr &c);
};

} // namespace nnet
} // namespace nnet
@@ -20,7 +20,13 @@ class MergeMemboundMutator : public Mutator {
     */
    MergeMemboundMutator(const VecExpr &kernels)
        : Mutator(), kernels(kernels), curDepth(kernels.size() - 1) {}
    Expr merge(bool allowEmptyMembound = false);

    /// @brief Merge multiple expressions into one with one or several stages.
    /// @param allowEmptyMembound
    /// @param allowFailure If true, return nullptr when merging fails. If
    /// false, assert will fail.
    /// @return
    Expr merge(bool allowEmptyMembound = false, bool allowFailure = false);
};

} // namespace nnet
} // namespace nnet
@@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
    string visit_(const Subscript &c) override;
    string visit_(const Var &c) override;
    string visit_(const Tensor &c) override;
    string visit_(const Func &c) override;
    string dispatchRoutine(const Routine &c);

    Expr buildExprTree(string key);
@@ -66,10 +66,11 @@ static inline HashType genhash(string s) {
}

#define nnet_unimplemented_halt() \
    { assert(!"Unimplemented"); }
    { IT_TODO_HALT(); }

#define nnet_unimplemented_continue() \
    { dbg("Unimplemented"); }
    {}
    // { dbg("Unimplemented"); }

#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))
@@ -104,10 +104,11 @@ enum class NodeType {
    FuncNodeType
};

enum class FuncType { Relu, Tanh };
enum class FuncType { Relu = 1000, Tanh, PRelu };

#define DEFINE_GETTYPE(CLASS) \
    NodeType getType() const override { return NodeType::CLASS##Type; }
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
    NodeType getType() const override { return NodeType::CLASS##Type; } \
    bool isScalar() const override { return isScalar_v; }

class ExprNode {
  public:

@@ -119,6 +120,7 @@ class ExprNode {
    friend std::ostream &operator<<(std::ostream &ios, const ExprNode &expr);

    virtual NodeType getType() const = 0;
    virtual bool isScalar() const = 0;
};

class VarNode : public ExprNode {

@@ -127,7 +129,7 @@ class VarNode : public ExprNode {
  public:
    VarNode(std::string _name) : name(_name){};
    virtual ~VarNode() {}
    DEFINE_GETTYPE(VarNode);
    DEFINE_GETTYPE(VarNode, true);

    const std::string &getName() const { return name; }
    HashType hash() const override { return genhash(name); };

@@ -152,7 +154,7 @@ class TensorNode : public ExprNode {
    TensorNode(string _name, vector<int> _shape, vector<int> _paddings = {},
               Routine _source = nullptr);
    virtual ~TensorNode() {}
    DEFINE_GETTYPE(TensorNode);
    DEFINE_GETTYPE(TensorNode, false);

    bool operator==(const string &rhs) { return name == rhs; }
    friend bool operator==(const string &lhs, const TensorNode &rhs) {

@@ -174,6 +176,7 @@ class TensorNode : public ExprNode {
    const Routine &getSource() const { return source; }
    int getData(const Ref<vector<int>> &data, const vector<int> &idx);
    size_t getOffset(const vector<int> &idx);
    bool hasPadding();
};

enum class OpType { Range, Add, Mul, Div, Mod, Sub };

@@ -203,7 +206,8 @@ struct IterationType {
    enum { Loop, Sum };
    constexpr static int NumIterationType = 2;
};
class RangeOpNode : public OperatorNode {
class RangeOpNode : public OperatorNode,
                    public std::enable_shared_from_this<RangeOpNode> {
  public:
    enum { Summand, END_POS };
    constexpr static int Loop = IterationType::Loop;

@@ -220,13 +224,14 @@ class RangeOpNode : public OperatorNode {
                const vector<int> &paddings)
        : OperatorNode(OpType::Range, {_summand}), vars{_loopIters, _sumIters},
          paddings(paddings){};
    DEFINE_GETTYPE(RangeOpNode);
    DEFINE_GETTYPE(RangeOpNode, false);

    virtual HashType hash() const override {
        nnet_unimplemented_halt();
        return 0;
    };
    string toReadable() const override;
    string getFullExpression();
    const Expr &getSummand() const { return subExprs[Summand]; }
    const vector<VarRangePair> &getVarRanges(int _index) const {
        return vars[_index];

@@ -289,7 +294,7 @@ class BinaryOpNode : public OperatorNode {
    BinaryOpNode(OpType _opType, Expr _lhs, Expr _rhs)
        : OperatorNode(_opType, {_lhs, _rhs}){};
    virtual ~BinaryOpNode() {}
    DEFINE_GETTYPE(BinaryOpNode);
    DEFINE_GETTYPE(BinaryOpNode, true);

    virtual HashType hash() const override {
        return genhash((HashType)opType,

@@ -314,7 +319,7 @@ class ConstantNode : public ExprNode {
    ConstantNode(int _val) : val(_val){};
    ConstantNode(const ConstantNode &rhs) : ExprNode(rhs), val(rhs.val){};
    virtual ~ConstantNode() {}
    DEFINE_GETTYPE(ConstantNode);
    DEFINE_GETTYPE(ConstantNode, true);

    int getValue() const { return val; }
    virtual HashType hash() const override { return genhash(val, 6214587); };

@@ -334,7 +339,7 @@ class SubscriptNode : public ExprNode {
    SubscriptNode(Expr _indexed, vector<Expr> _subExprs) : subExprs(_subExprs) {
        setObject(_indexed);
    };
    DEFINE_GETTYPE(SubscriptNode);
    DEFINE_GETTYPE(SubscriptNode, true);

    virtual HashType hash() const override {
        nnet_unimplemented_continue();

@@ -358,14 +363,15 @@ class SubscriptNode : public ExprNode {

class FuncNode : public ExprNode {
  protected:
    Subscript object;
    Expr object;
    FuncType funcType;

  public:
    FuncNode(Expr object, FuncType funcType) : funcType(funcType) {
        setObject(object);
    FuncNode(Expr object, FuncType funcType)
        : object(object), funcType(funcType) {
        nnet_assert(object->isScalar(), "FuncNode operates on a scalar");
    }
    DEFINE_GETTYPE(FuncNode);
    DEFINE_GETTYPE(FuncNode, true);

    virtual HashType hash() const override {
        nnet_unimplemented_continue();

@@ -373,7 +379,7 @@ class FuncNode : public ExprNode {
    };
    virtual string toReadable() const override;

    const Subscript &getObject() const { return object; }
    const Expr &getObject() const { return object; }
    void setObject(Expr e);

    FuncType getFuncType() const { return funcType; }
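`DEFINE_GETTYPE` now also emits an `isScalar` override, which the new `FuncNode` constructor relies on to assert that it only wraps scalar expressions. For illustration (not a standalone program), `DEFINE_GETTYPE(VarNode, true)` expands to roughly:

```cpp
// Illustrative expansion of DEFINE_GETTYPE(VarNode, true) inside VarNode.
NodeType getType() const override { return NodeType::VarNodeType; }
bool isScalar() const override { return true; }
```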
@@ -7,12 +7,13 @@ namespace infini {
class NMutator : public Mutator {
  public:
    enum class Mode { Normal, ToNaiveMembound, RuleBased };
    using NameNToTensorT = map<string, Tensor>;

  private:
    // Suffix -N: NNet objects.
    // Suffix -T: tpm objects.
    // Map: NNet tensors -> tpm tensor.
    std::map<std::string, Tensor> inputsNameNToTensorT;
    NameNToTensorT inputsNameNToTensorT;
    Mode mode;
    const double bandwidth = double(200) * 1024 * 1024 * 1024;
    // If in RuleBased mode, use derivationRules in derivator

@@ -20,19 +21,30 @@ class NMutator : public Mutator {

  public:
    NMutator(Mode mode = Mode::Normal);
    NMutator(const std::vector<int> &derivationRules);
    NMutator(Mode mode, const std::vector<int> &derivationRules);
    ~NMutator();

    vector<Graph> run(const Graph &in_graph) override;
    void setToNaiveMembound();
    Graph fuseVertically(const Graph &in_graph) override;

    void setToNaiveMembound();
    void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
    long long cntStates = 0;
    long long cntCandidates = 0;

    static void memboundToJson(const Graph &g, const string path);

  private:
    int maxDepth = 8;
    nnet::Expr opToExpression(Operator op);
    /// @brief
    /// @param op
    /// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
    static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
    static pair<nnet::Expr, NMutator::NameNToTensorT>
    generateUnaryExpr(const Operator &op);
    static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);

    void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);

    /**

@@ -48,11 +60,11 @@ class NMutator : public Mutator {

    // TODO: recover these rules
    // Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
    // Graph transformTConv1x1(Operator op);
    // Graph transformTConv3x3(Operator op);
    Graph transformConvtransposed1x1(Operator _op);
    // Graph transformConvtransposed(Operator op);
    // Graph transformDialtedConv(Operator op);
    // Graph transformConv1x1(Operator op);
    // Graph transformConv1xk(Operator op);
};

} // namespace infini
} // namespace infini
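Putting the refactored pieces together, a hedged sketch of driving the search with a rule-based `NMutator`; the rule ids, `runtime`, and `graph` are illustrative placeholders, not values taken from this diff:

```cpp
// Hedged sketch: search for an optimized graph with a rule-based NMutator.
std::vector<int> rules = {1, 2, 3}; // hypothetical derivation rule ids
auto mutator = make_ref<NMutator>(NMutator::Mode::RuleBased, rules);
SearchEngine searchEngine(runtime, mutator);
Graph optimized = searchEngine.run(graph);
```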
@@ -19,9 +19,15 @@ class MatmulObj : public OperatorObj {

  public:
    /**
     * @brief Construct a new Matmul object. This comments show how operators is
     * defined in InfiniTensor. The constructor can create output tensors for
     * the operator or not, which depends on `graph`.
     * @brief Matmul operator with batch broadcast and tensor transpose
     * support. Only one tensor with a single batch can be broadcasted due to the
     * BLAS interface restriction. Transpose indicates whether the last two
     * dimensions should be transposed before Matmul and does not affect other
     * leading dimensions.
     *
     * Matmul shows how operators are defined in InfiniTensor. The constructor of
     * an operator can create output tensors for the operator or not, which
     * depends on `graph`.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param A The input tensor.
@@ -6,10 +6,13 @@ namespace infini {

class MemBoundObj : public OperatorObj {
  private:
    std::vector<nnet::Tensor> nnetInputs;
    nnet::Expr expr;
    std::vector<nnet::Tensor>
        nnetInputs; // The order of inputs in nnetInputs should be consistent
                    // with inputs in infinitensor
    nnet::Expr expr, simplifiedExpr;
    double exec_time;
    std::string hint;
    HashType hash, simplifiedHash;
    int n, f, h, w;

  public:

@@ -26,11 +29,17 @@ class MemBoundObj : public OperatorObj {
    int numOutputs() const override { return outputs.size(); }
    const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
    const nnet::Expr getNnetExpr() const { return expr; }
    pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
        return {expr, hash};
    }
    double getEstimatedTime() const { return exec_time; }
    void saveAsJson(string path) const;

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    HashType getHash() const;
    static HashType calcHash(nnet::Expr expr);
    static bool checkOOB(nnet::Expr expr);
};

} // namespace infini
@@ -41,4 +41,5 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
DEFINE_UNARY_OBJ(Abs, OpType::Abs)
DEFINE_UNARY_OBJ(PRelu, OpType::PRelu)
}; // namespace infini
@@ -1,5 +1,7 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include <random>

namespace infini {

@@ -38,6 +40,34 @@ class IncrementalGenerator : public DataGenerator {
    void fill(float *data, size_t size) override { fill<float>(data, size); }
};

class RandomGenerator : public DataGenerator {
  private:
    double l, r;
    std::mt19937 e;
    std::uniform_int_distribution<int> di;
    std::uniform_real_distribution<float> dr;
    bool generateInteger;

  public:
    RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
                    bool generateInteger = false)
        : l(l), r(r), e(seed), di(l, r), dr(l, r),
          generateInteger(generateInteger) {}
    virtual ~RandomGenerator() {}

  private:
    void fill(uint32_t *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = di(e);
        }
    }
    void fill(float *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = (generateInteger) ? di(e) : dr(e);
        }
    }
};

template <int val> class ValGenerator : public DataGenerator {
  public:
    virtual ~ValGenerator() {}
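`RandomGenerator` plugs into the same generator interface that `TensorObj::setData` consumes. A hedged usage sketch; the tensor `t` is assumed to be allocated already:

```cpp
// Hedged sketch: fill tensors through the generator interface.
t->setData(RandomGenerator());                          // uniform floats in [0, 1)
t->setData(RandomGenerator(0, 100, /*seed=*/42,
                           /*generateInteger=*/true));  // integer-valued data
```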
@@ -1 +1,2 @@
from .gen_ansor_op import gen_ansor_op
from .gen_ansor_so import gen_ansor_so
@@ -3,19 +3,50 @@ import re
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
import os
import json
import logging

USE_CACHE = True
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.DEBUG)


def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, func_name, input_names, output_name):
def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f,
                 func_name, input_names, output_name, nnet_expression: str,
                 nnet_simplified_expression: str, hash_code=None):
    assert len(input_tensors) == len(input_dtypes)
    assert len(input_tensors) == len(input_names)

    logging.debug(f'Work on hash {hash_code}')

    dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
    func_code_fn = os.path.join(dir_name, "kernel.cu")
    invoke_code_fn = os.path.join(dir_name, "invoke.cpp")
    config_fn = os.path.join(dir_name, "config.json")

    if USE_CACHE and hash_code is not None:
        if os.path.exists(dir_name):
            print(f"Use cache in {dir_name}")
            with open(func_code_fn, "r") as func_code_fin:
                func_code = func_code_fin.read()
            with open(invoke_code_fn, "r") as invoke_code_fin:
                invoke_code = invoke_code_fin.read()
            with open(config_fn, "r") as config_fin:
                config = json.loads(config_fin.read().strip())
                conv_time = config["conv_time"]
                invoke_params = config["invoke_params"]

            logger.debug(f'Find tuning log for {hash_code}')
            return func_code, invoke_code, conv_time, invoke_params

    print("Generating Ansor op: ")
    print(f)

    @auto_scheduler.register_workload(func_name)
    def compute():
        _locals = locals()
        exec(f, {'tvm': tvm, 'te': te, 'tir': tir}, _locals)
        exec(f, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
        return _locals['ret']

    target = tvm.target.Target("cuda")

@@ -43,6 +74,28 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu
    # Kill the measurement process
    del measure_ctx

    def test_mutator():
        # test part
        tgt_temp = tvm.target.Target(target="llvm", host="llvm")
        all_tensors = compute()
        sch = te.create_schedule(all_tensors[0].op)
        args = all_tensors
        C0, K0, A0 = args
        func_temp = tvm.build(sch, args, tgt_temp, name="temp")

        # print result
        n, c, h, w, f, r, s = 1, 1, 2, 2, 1, 4, 4
        dev_temp = tvm.device(tgt_temp.kind.name, 0)
        A_temp = tvm.nd.array(
            np.arange(n*h*w*f).reshape(n, h, w, f).astype(A0.dtype), dev_temp)
        K_temp = tvm.nd.array(
            np.arange(f*r*s*c).reshape(f, r, s, c).astype(K0.dtype), dev_temp)
        C_temp = tvm.nd.array(
            np.zeros((1, 4, 4, 1)).astype(C0.dtype), dev_temp)
        func_temp(C_temp, K_temp, A_temp)
        print("================= Test Result =====================")
        print(C_temp)

    ir = str(tvm.lower(sch, args, simple_mode=True))
    thread_dim = [1, 1, 1]
    block_dim = [1, 1, 1]

@@ -83,11 +136,27 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu

    print("Func Code")
    # Attach TVM code behind func_code
    func_code += "\n/* " + f + "*/"
    func_code += "\n/* NNET tensor expression \n" + nnet_expression + "\n*/\n"
    func_code += "\n/* NNET simplified tensor expression \n" + \
        nnet_simplified_expression + "\n*/\n"
    func_code += "\n/* TVM compute\n" + f + "\n*/\n"
    print(func_code)
    print("Invoke Code")
    print(invoke_code)
    print("Time")
    print(conv_time)

    if hash_code is not None:
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(func_code_fn, "w") as func_code_fout:
            func_code_fout.write(func_code)
        with open(invoke_code_fn, "w") as invoke_code_fout:
            invoke_code_fout.write(invoke_code)
        with open(config_fn, "w") as config_fout:
            config_fout.write(json.dumps({
                "conv_time": conv_time,
                "invoke_params": invoke_params
            }, ensure_ascii=False, indent=2))

    return func_code, invoke_code, conv_time, invoke_params # ms
@@ -0,0 +1,131 @@
import os
import sys
import json
from contextlib import redirect_stdout
import time
import logging

import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi

USE_CACHE = True
logging.basicConfig()
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.INFO)


def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
                 tvm_code, func_name, nnet_expression: str,
                 nnet_simplified_expression: str, hash_code: str = None):
    assert len(input_tensors) == len(input_dtypes)

    logger.debug(f'Work on hash {hash_code}')
    dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))

    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    so_fn = os.path.join(dir_name, f"{func_name}.so")
    config_fn = os.path.join(dir_name, "config_so.json")
    desc_fn = os.path.join(dir_name, "desc.txt")
    log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
    out_fn = os.path.join(dir_name, "out.txt")

    logger.debug(f"Generating Ansor op: {tvm_code}")
    logger.debug(f"Input shape: {input_tensors}")
    logger.debug(f"Output shape: {output_tensor}")

    if USE_CACHE and hash_code is not None:
        if os.path.exists(dir_name) and \
           os.path.exists(so_fn) and \
           os.path.exists(config_fn):
            print(f"Use cache in {dir_name}")
            with open(config_fn, "r") as config_fin:
                config = json.loads(config_fin.read().strip())
                conv_time = config["conv_time"]

            logger.info(f'Find tuning log for {hash_code} in {so_fn}')
            return so_fn, conv_time
    logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")

    time_start = time.perf_counter()
    # Print descriptions of the task
    if USE_CACHE and hash_code is not None:
        with redirect_stdout(open(desc_fn, "w")):
            print("====NNET tensor expression====")
            print(nnet_expression+"\n")
            print("====NNET simplified tensor expression====")
            print(nnet_simplified_expression+"\n")
            print("====TVM compute====")
            print(tvm_code+"\n")
            print("Input shape: ", input_tensors)
            print("Output shape: ", output_tensor)

    @auto_scheduler.register_workload(func_name)
    def compute():
        _locals = locals()
        exec(tvm_code, {'tvm': tvm, 'te': te,
                        'tir': tir, 'topi': topi}, _locals)
        return _locals['ret']

    target = tvm.target.Target("cuda")

    task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)

    with redirect_stdout(open(out_fn, 'w')):
        # Inspect the computational graph
        print("Computational DAG:")
        print(task.compute_dag)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=10,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
            verbose=2,
        )

        # Run auto-tuning (search)
        task.tune(tune_option)
        # Apply the best schedule
        sch, args = task.apply_best(log_fn)

        # Kill the measurement process
        del measure_ctx

    func = tvm.build(sch, args, target, name=func_name)
    func.export_library(so_fn)

    ctx = tvm.cuda(0)
    input_a = []
    for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
        a_np = np.random.uniform(size=shape).astype(dtype)
        input_a.append(tvm.nd.array(a_np, ctx))
    a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
    func(a_out, *input_a)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
    conv_time = evaluator(a_out, *input_a).mean * 1e3

    time_end = time.perf_counter()

    if USE_CACHE and hash_code is not None:
        with open(config_fn, "w") as config_fout:
            config_fout.write(json.dumps({
                "conv_time": conv_time,
                "tuning_time": time_end - time_start,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
            }, ensure_ascii=False, indent=2))

    return so_fn, conv_time

# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.


def pipe_gen(fd: int):
    args = json.load(sys.stdin)  # read from pipe
    # print(args, f'fd={fd}')
    ret = gen_ansor_so(**args)
    with os.fdopen(fd, 'w') as f:
        print(ret[0], file=f, end='')  # write to pipe
@@ -0,0 +1,7 @@
import backend
from backend import *
import sys

sys.path.extend(__path__)

print("import backend: {}".format(backend))
@@ -0,0 +1,941 @@
import backend
import onnx
from onnx import (
    ModelProto,
    TensorProto,
    NodeProto,
    AttributeProto,
    TensorShapeProto,
    ValueInfoProto,
)
from onnx.helper import (
    make_node,
    make_tensor_value_info,
    make_tensor,
    make_graph,
    make_model,
)
from onnx.checker import (
    check_graph,
    check_model,
    check_node,
    check_value_info,
    check_tensor,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce


class OnnxStub:
    """
    The Onnx model imported into infinitensor.
    It can be generated from an Onnx model object.
    """

    # inputs: Dict[str, backend.Tensor] = {}
    # outputs: Dict[str, backend.Tensor] = {}
    initializer: Dict[int, TensorProto] = {}
    # handler: backend.GraphHandler

# def __init__(self, model: ModelProto, runtime):
|
||||
# model = infer_shapes(model)
|
||||
# self.handler = backend.GraphHandler(runtime)
|
||||
|
||||
# tensors: Dict[str, backend.Tensor] = dict()
|
||||
# data: Dict[str, TensorProto] = dict()
|
||||
|
||||
# for input in model.graph.input:
|
||||
# dims = _take_shape_dim(input.type.tensor_type.shape)
|
||||
# tensors[input.name] = self.handler.tensor(
|
||||
# dims, input.type.tensor_type.elem_type
|
||||
# )
|
||||
|
||||
# for output in model.graph.output:
|
||||
# dims = _take_shape_dim(output.type.tensor_type.shape)
|
||||
# tensors[output.name] = self.handler.tensor(
|
||||
# dims, output.type.tensor_type.elem_type
|
||||
# )
|
||||
|
||||
# for initializer in model.graph.initializer:
|
||||
# dims = [d for d in initializer.dims]
|
||||
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
|
||||
# data[initializer.name] = initializer
|
||||
|
||||
# for node in model.graph.node:
|
||||
# if node.op_type == "Conv":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (d, p, s) = (
|
||||
# attributes[name] for name in ["dilations", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors[node.input[0]], None, p, [-2, -1]
|
||||
# )
|
||||
# p = [0, 0, 0, 0]
|
||||
# else:
|
||||
# adapt = node.input[0]
|
||||
|
||||
# if len(node.input) > 2:
|
||||
# bias = "{}-bias".format(node.output[0])
|
||||
# reshape = "{}-reshape".format(node.output[0])
|
||||
# tensors[bias] = self.handler.conv(
|
||||
# tensors[adapt],
|
||||
# tensors[node.input[1]],
|
||||
# None,
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# )
|
||||
# tensors[reshape] = self.handler.reshape(
|
||||
# tensors[node.input[2]],
|
||||
# None,
|
||||
# [
|
||||
# 1,
|
||||
# reduce(
|
||||
# lambda acc, x: acc * x,
|
||||
# _search_shape(model, node.input[2]),
|
||||
# ),
|
||||
# 1,
|
||||
# 1,
|
||||
# ],
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.add(
|
||||
# tensors[bias],
|
||||
# tensors[reshape],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.conv(
|
||||
# tensors[adapt],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# )
|
||||
# elif node.op_type == "ConvTranspose":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0],
|
||||
# "strides": [1, 1],
|
||||
# "output_padding": [0, 0],
|
||||
# },
|
||||
# )
|
||||
# (d, p, s, op) = (
|
||||
# attributes[name]
|
||||
# for name in ["dilations", "pads", "strides", "output_padding"]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.convTransposed2d(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# op[0],
|
||||
# op[1],
|
||||
# )
|
||||
# elif node.op_type == "MatMul":
|
||||
# tensors[node.output[0]] = self.handler.matmul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# False,
|
||||
# False,
|
||||
# None,
|
||||
# backend.ActType.Linear,
|
||||
# )
|
||||
# elif node.op_type == "Gemm":
|
||||
# attributes = _parse_attribute(
|
||||
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
|
||||
# )
|
||||
# (alpha, beta, transA, transB) = (
|
||||
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
|
||||
# )
|
||||
# # FIXME unsupport attributes: `alpha` `beta`
|
||||
# assert alpha == 1.0
|
||||
# assert beta == 1.0
|
||||
# tensors[node.output[0]] = self.handler.matmul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# transA == 1,
|
||||
# transB == 1,
|
||||
# tensors[node.input[2]] if len(node.input) > 2 else None,
|
||||
# backend.ActType.Linear,
|
||||
# )
|
||||
# elif node.op_type == "BatchNormalization":
|
||||
# (input, mean, var, scale, bias) = (
|
||||
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
|
||||
# )
|
||||
# output = tensors.get(node.output[0])
|
||||
# attributes = _parse_attribute(
|
||||
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
|
||||
# )
|
||||
# (momentum, eps, training) = (
|
||||
# attributes[name]
|
||||
# for name in ["momentum", "epsilon", "training_mode"]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.batchNorm(
|
||||
# input, output, mean, var, scale, bias, momentum, eps, training != 0
|
||||
# )
|
||||
# elif node.op_type == "MaxPool":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "kernel_shape": None,
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (k, d, p, s) = (
|
||||
# attributes[name]
|
||||
# for name in ["kernel_shape", "dilations", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.maxPool(
|
||||
# tensors[adapt],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# 0,
|
||||
# 0,
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.maxPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# elif node.op_type == "AveragePool":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "kernel_shape": None,
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (k, p, s) = (
|
||||
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[adapt],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# 1,
|
||||
# 1,
|
||||
# 0,
|
||||
# 0,
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# 1,
|
||||
# 1,
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# elif node.op_type == "GlobalAveragePool":
|
||||
# [_, _, h, w] = _search_shape(model, node.input[0])
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# h,
|
||||
# w,
|
||||
# 1,
|
||||
# 1,
|
||||
# 0,
|
||||
# 0,
|
||||
# 1,
|
||||
# 1,
|
||||
# )
|
||||
# elif node.op_type == "Add":
|
||||
# tensors[node.output[0]] = self.handler.add(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sub":
|
||||
# tensors[node.output[0]] = self.handler.sub(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Mul":
|
||||
# tensors[node.output[0]] = self.handler.mul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Div":
|
||||
# tensors[node.output[0]] = self.handler.div(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Pow":
|
||||
# tensors[node.output[0]] = self.handler.pow(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Relu":
|
||||
# tensors[node.output[0]] = self.handler.relu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sigmoid":
|
||||
# tensors[node.output[0]] = self.handler.sigmoid(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Tanh":
|
||||
# tensors[node.output[0]] = self.handler.tanh(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Softmax":
|
||||
# tensors[node.output[0]] = self.handler.softmax(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Abs":
|
||||
# tensors[node.output[0]] = self.handler.abs(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Shape":
|
||||
# tensors[node.output[0]] = self.handler.shape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Identity":
|
||||
# tensors[node.output[0]] = self.handler.identity(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Flatten":
|
||||
# # FIXME axis must be 1
|
||||
# axis = next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"), None
|
||||
# )
|
||||
# assert axis == None or axis == 1
|
||||
# tensors[node.output[0]] = self.handler.flatten(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "PRelu":
|
||||
# tensors[node.output[0]] = self.handler.pRelu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Clip":
|
||||
# tensors[node.output[0]] = self.handler.clip(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(_parse_data(data[node.input[1]]).__iter__(), None)
|
||||
# if len(node.input) > 1
|
||||
# else None,
|
||||
# next(_parse_data(data[node.input[2]]).__iter__(), None)
|
||||
# if len(node.input) > 2
|
||||
# else None,
|
||||
# )
|
||||
# elif node.op_type == "Transpose":
|
||||
# perm = next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.transpose(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# perm,
|
||||
# )
|
||||
# elif node.op_type == "Reshape":
|
||||
# dims = _search_shape(model, node.input[0])
|
||||
# size = reduce(lambda acc, x: acc * x, dims)
|
||||
# input_shape = _parse_data(data[node.input[1]])
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if x == 0:
|
||||
# input_shape[i] = dims[i]
|
||||
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
|
||||
# if temp < 0:
|
||||
# input_shape[input_shape.index(-1)] = size // -temp
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Squeeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = set(
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# assert all(input_shape[d] == 1 for d in axes)
|
||||
# output_shape = []
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if i not in axes:
|
||||
# output_shape.append(x)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# output_shape,
|
||||
# )
|
||||
# elif node.op_type == "Unsqueeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = (
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# for i in axes:
|
||||
# input_shape.insert(i, 1)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Concat":
|
||||
# tensors[node.output[0]] = self.handler.concat(
|
||||
# [tensors[name] for name in node.input],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "Split":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.split(
|
||||
# tensors[node.input[0]],
|
||||
# None,
|
||||
# next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"),
|
||||
# 0,
|
||||
# ),
|
||||
# len(node.output),
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# elif node.op_type == "Gather":
|
||||
# tensors[node.output[0]] = self.handler.gather(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "ReduceMean":
|
||||
# tensors[node.output[0]] = self.handler.reduce_mean(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "axes"),
|
||||
# None,
|
||||
# ),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
|
||||
# != 0,
|
||||
# )
|
||||
# elif node.op_type == "Slice":
|
||||
# tensors[node.output[0]] = self.handler.slice(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[2]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
|
||||
# )
|
||||
# elif node.op_type == "Pad":
|
||||
# tensors[node.output[0]] = self.handler.pad(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# )
|
||||
# elif node.op_type == "Dropout":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.dropout(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
|
||||
# _parse_data(data[node.input[1]])[0]
|
||||
# if len(node.input) > 1
|
||||
# else 0.5,
|
||||
# _parse_data(data[node.input[2]])[0]
|
||||
# if len(node.input) > 2
|
||||
# else False,
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# else:
|
||||
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# for name, obj in tensors.items():
|
||||
# tensor = data.get(name)
|
||||
# if tensor == None:
|
||||
# if any(input.name == name for input in model.graph.input):
|
||||
# self.inputs[name] = obj
|
||||
# else:
|
||||
# self.initializer[obj.fuid()] = tensor
|
||||
# if tensor.data_type == TensorProto.INT32:
|
||||
# obj.copyin_int32(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.INT64:
|
||||
# obj.copyin_int64(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.FLOAT:
|
||||
# obj.copyin_float(_parse_data(tensor))
|
||||
# else:
|
||||
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
|
||||
|
||||
# for output in model.graph.output:
|
||||
# self.outputs[output.name] = tensors[output.name]
|
||||
|
||||
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
|
||||
class Context:
|
||||
# saves object names, including tensors and operators
|
||||
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
|
||||
# counts the occurrence times of each operator for naming
|
||||
count_op: Dict[backend.OpType, int] = dict()
|
||||
# counts input and output tensors for naming
|
||||
count_in, count_out = 0, 0
|
||||
# saves nodes (operators)
|
||||
nodes: List[NodeProto] = []
|
||||
# saves global input tensors
|
||||
inputs: List[ValueInfoProto] = []
|
||||
# saves global output tensors
|
||||
outputs: List[ValueInfoProto] = []
|
||||
# saves global input tensors
|
||||
initializers: List[TensorProto] = []
|
||||
|
||||
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
|
||||
ty = op.op_type()
|
||||
name = "{}_{}".format(ty.name, op.guid())
|
||||
self.names[op] = name
|
||||
self.count_op[ty] = self.count_op.get(ty, 0) + 1
|
||||
return ty, name
|
||||
|
||||
def push_output(self, name: str, tensor: backend.Tensor) -> str:
|
||||
self.names[tensor] = name
|
||||
if not tensor.has_target():
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.outputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_input(
|
||||
self, tensor: backend.Tensor, init: Optional[TensorProto]
|
||||
) -> str:
|
||||
name = self.names.get(tensor)
|
||||
# means that this input is a global input
|
||||
if name is None:
|
||||
self.count_in += 1
|
||||
name = "input_{}".format(tensor.guid())
|
||||
self.names[tensor] = name
|
||||
if init != None:
|
||||
init.name = name
|
||||
self.initializers.append(init)
|
||||
else:
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.inputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_data_input(
|
||||
self,
|
||||
node_name: str,
|
||||
attr_name: str,
|
||||
elem_type: int,
|
||||
shape: Sequence[int],
|
||||
vals: Any,
|
||||
) -> str:
|
||||
name = "{}_{}".format(node_name, attr_name)
|
||||
tensor = make_tensor(name, elem_type, shape, vals)
|
||||
check_tensor(tensor)
|
||||
self.initializers.append(tensor)
|
||||
return name
|
||||
|
||||
def push_node(self, node: NodeProto) -> None:
|
||||
# check_node(node)
|
||||
self.nodes.append(node)
|
||||
|
||||
def build(self, name: str) -> ModelProto:
|
||||
graph = make_graph(
|
||||
self.nodes, name, self.inputs, self.outputs, self.initializers
|
||||
)
|
||||
# check_graph(graph)
|
||||
|
||||
model = make_model(graph)
|
||||
# check_model(model)
|
||||
|
||||
return model
|
||||
|
||||
# topological sort
|
||||
if not g.topo_sort():
|
||||
raise Exception("Sorting fails")
|
||||
|
||||
ops = g.operators()  # all operators (nodes) in the graph
|
||||
|
||||
ctx = Context()
|
||||
|
||||
for op in ops:
|
||||
ty, name = ctx.name_op(op)
|
||||
inputs = [
|
||||
ctx.push_input(it, self.initializer.get(it.fuid()))
|
||||
for it in op.inputs()
|
||||
]
|
||||
outputs = [
|
||||
ctx.push_output("{}_{}_{}".format(
|
||||
name, i, tensor.guid()), tensor)
|
||||
for (i, tensor) in enumerate(op.outputs())
|
||||
]
|
||||
if ty == backend.OpType.Conv:
|
||||
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
group=op.inputs()[0].shape()[
|
||||
1] // op.inputs()[1].shape()[1],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTrans:
|
||||
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
|
||||
op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTransNHWC:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MemBound:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Membound",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Matmul:
|
||||
# transA, transB = backend.matmul_attrs_of(op)
|
||||
# HACK: recover this
|
||||
transA, transB = False, False
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Gemm", inputs, outputs, name, transA=transA, transB=transB
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.BatchNorm:
|
||||
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
|
||||
momentum, eps, training = backend.batch_norm_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"BatchNormalization",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
epsilon=eps,
|
||||
momentum=momentum,
|
||||
training_mode=training,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MaxPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
dilations=[dh, dw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.AvgPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"AveragePool",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty in [
|
||||
backend.OpType.Add,
|
||||
backend.OpType.Sub,
|
||||
backend.OpType.Mul,
|
||||
backend.OpType.Div,
|
||||
backend.OpType.Pow,
|
||||
backend.OpType.Relu,
|
||||
backend.OpType.Sigmoid,
|
||||
backend.OpType.Tanh,
|
||||
backend.OpType.Softmax,
|
||||
backend.OpType.Abs,
|
||||
backend.OpType.Identity,
|
||||
backend.OpType.PRelu,
|
||||
]:
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Flatten:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Transpose:
|
||||
perm = backend.transpose_permute_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, perm=perm))
|
||||
elif ty == backend.OpType.Reshape:
|
||||
shape = backend.reshape_shape_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"shape",
|
||||
TensorProto.INT64,
|
||||
[len(shape)],
|
||||
shape,
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Concat:
|
||||
axis = backend.concat_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.Split:
|
||||
axis = backend.split_axis_of(op)
|
||||
num_outputs = len(outputs)
|
||||
split = op.inputs()[0].shape()[axis] // num_outputs
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"split",
|
||||
TensorProto.INT64,
|
||||
[len(outputs)],
|
||||
[split for _ in range(0, num_outputs)],
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
axis=axis,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Gather:
|
||||
axis = backend.gather_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.ReduceMean:
|
||||
axes, keepdims = backend.reduce_mean_attrs_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "axes", TensorProto.INT64, [len(axes)], axes
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(ty.name, inputs, outputs,
|
||||
name, keepdims=keepdims)
|
||||
)
|
||||
elif ty == backend.OpType.Slice:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Pad:
|
||||
pads = backend.pad_pads_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "pads", TensorProto.INT64, [len(pads)], pads
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
# elif ty == backend.OpType.Clip:
|
||||
# min, max = backend.clip_attrs_of(op)
|
||||
# if min != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# if max != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
else:
|
||||
raise Exception("Unsupported OpType", ty)
|
||||
|
||||
model = ctx.build(name)
|
||||
onnx.save(model, path)
|
||||
return model
|
||||
|
||||
# def init(self) -> None:
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# def optimize(self) -> None:
|
||||
# self.handler.optimize()
|
||||
|
||||
# def run(self) -> None:
|
||||
# self.handler.run()
|
||||
|
||||
|
||||
# def from_onnx(model: ModelProto, runtime):
|
||||
# stub = OnnxStub(model, runtime)
|
||||
# return stub.inputs, stub.outputs, stub.handler
|
||||
|
||||
|
||||
# def _search_shape(model: ModelProto, name: str) -> List[int]:
|
||||
# ans = (
|
||||
# next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.value_info
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.input
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# [int(d) for d in tensor.dims]
|
||||
# for tensor in model.graph.initializer
|
||||
# if tensor.name == name
|
||||
# )
|
||||
# )
|
||||
# return ans
|
||||
|
||||
|
||||
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
|
||||
# for attr in node.attribute:
|
||||
# if attr.name in attrs:
|
||||
# if attr.type == AttributeProto.INT:
|
||||
# attrs[attr.name] = attr.i
|
||||
# elif attr.type == AttributeProto.INTS:
|
||||
# attrs[attr.name] = attr.ints
|
||||
# elif attr.type == AttributeProto.FLOAT:
|
||||
# attrs[attr.name] = attr.f
|
||||
# elif attr.type == AttributeProto.STRING:
|
||||
# attrs[attr.name] = attr.s
|
||||
# elif attr.type == AttributeProto.TENSOR:
|
||||
# attrs[attr.name] = attr.t
|
||||
# else:
|
||||
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
|
||||
# return attrs
|
||||
|
||||
|
||||
# def _parse_data(tensor: TensorProto) -> List[Any]:
|
||||
# return to_array(tensor).flatten().tolist()
|
||||
|
||||
|
||||
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
|
||||
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
|
||||
|
||||
def export_onnx(g: backend.Graph, path: str) -> None:
|
||||
stub = OnnxStub()
|
||||
stub.to_onnx(g, path)
|
|
@ -11,13 +11,11 @@ GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
|
|||
for (const auto &op : ops_in) {
|
||||
for (const auto &t : op->getInputs())
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = t->clone();
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
for (const auto &t : op->getOutputs())
|
||||
if (tensorPool.find(t->getFuid()) == tensorPool.end())
|
||||
tensorPool[t->getFuid()] = t->clone();
|
||||
tensorPool[t->getFuid()] = cloneTensor(t);
|
||||
}
|
||||
for (const auto &[_, t] : tensorPool)
|
||||
addTensor(t);
|
||||
// Clone operators and add connections
|
||||
for (const auto &op : ops_in) {
|
||||
TensorVec inputs, outputs;
|
||||
|
@ -122,13 +120,18 @@ void GraphObj::dataMalloc() {
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
||||
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
|
||||
return tensors.emplace_back(
|
||||
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(const Tensor &tensor) {
|
||||
IT_ASSERT(tensor->getRuntime() == runtime, "Tensor runtime mismatch");
|
||||
return tensors.emplace_back(tensor);
|
||||
IT_ASSERT(tensor->getRuntime() == runtime,
|
||||
std::string("Tensor runtime mismatch: cannot add a tenosr in ") +
|
||||
tensor->getRuntime()->toString() + " to " +
|
||||
runtime->toString());
|
||||
tensors.emplace_back(tensor);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
TensorVec GraphObj::addTensor(const TensorVec &tensors) {
|
||||
|
@ -145,4 +148,19 @@ OpVec GraphObj::getComputeOps() const {
|
|||
return opList;
|
||||
};
|
||||
|
||||
bool GraphObj::selfCheck(bool assert) const {
|
||||
std::set<UidBaseType> s;
|
||||
// check whether two tensors with the same FUID exist
|
||||
for (auto tensor : tensors) {
|
||||
int cnt = s.count(tensor->getFuid());
|
||||
|
||||
if (assert)
|
||||
IT_ASSERT(cnt == 0, std::to_string(tensor->getFuid()));
|
||||
else if (cnt > 0)
|
||||
return false;
|
||||
s.insert(tensor->getFuid());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace infini
@ -1,6 +1,7 @@
|
|||
#include "core/operator.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/hash.h"
|
||||
#include "nnet/dbg.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -24,8 +25,8 @@ bool OperatorObj::isConcatOp() const { return type == OpType::Concat; }
|
|||
|
||||
bool OperatorObj::isComputeOp() const {
|
||||
return type == OpType::Conv || type == OpType::Matmul ||
|
||||
type == OpType::ConvTrans || type == OpType::G2BMM ||
|
||||
type == OpType::GBMM;
|
||||
type == OpType::ConvTrans || type == OpType::ConvTransNHWC ||
|
||||
type == OpType::G2BMM || type == OpType::GBMM;
|
||||
}
|
||||
|
||||
bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }
|
||||
|
@ -34,7 +35,8 @@ bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }
|
|||
|
||||
bool OperatorObj::isMemBoundOp() const {
|
||||
return type == OpType::MemBound || type == OpType::Activation ||
|
||||
type == OpType::Transpose;
|
||||
type == OpType::Transpose || type == OpType::Relu ||
|
||||
type == OpType::Tanh;
|
||||
}
|
||||
|
||||
OpPerfKey OperatorObj::getOpPerfKey() const {
|
||||
|
@ -57,22 +59,29 @@ HashType OperatorObj::hash() const {
|
|||
|
||||
bool OperatorObj::checkValid(GraphObj *graph) {
|
||||
auto optShapes = inferShape();
|
||||
IT_ASSERT(optShapes);
|
||||
if (!optShapes) // shape inference failed
|
||||
return false;
|
||||
|
||||
const vector<Shape> &shapes = *optShapes;
|
||||
IT_ASSERT(shapes.size() == outputs.size());
|
||||
if (shapes.size() != outputs.size())
|
||||
return false;
|
||||
if (graph) { // if graph != nullptr, outputs should be created
|
||||
auto dataTypes = inferDataType();
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
IT_ASSERT(!outputs[i]);
|
||||
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
|
||||
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
|
||||
outputs[i] =
|
||||
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
|
||||
}
|
||||
} else { // if outputs have been created, check their shapes
|
||||
for (size_t i = 0; i < shapes.size(); ++i) {
|
||||
if (shapes[i] != outputs[i]->getDims())
|
||||
IT_ASSERT(shapes[i] == outputs[i]->getDims());
|
||||
if (shapes[i] != outputs[i]->getDims()) {
|
||||
dbg(shapes[i], outputs[i]->getDims());
|
||||
return false;
|
||||
}
|
||||
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
|
||||
}
|
||||
}
|
||||
return true;
|
@ -2,6 +2,8 @@
|
|||
#include "core/blob.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/perf_engine.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "operators/membound.h"
|
||||
#include "utils/data_generator.h"
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
|
@ -60,13 +62,35 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
|
|||
printProfilingData(totalTime, opTime, opCnt);
|
||||
}
|
||||
|
||||
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
|
||||
map<UidBaseType, bool>
|
||||
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
|
||||
map<UidBaseType, bool> ctcMap; // compile-time computable
|
||||
// Skip static computation
|
||||
bool status = graph->topo_sort();
|
||||
IT_ASSERT(status, "Topological sort failed");
|
||||
for (auto &op : graph->getOperators()) {
|
||||
bool compileTimeComputable = true;
|
||||
for (auto input : op->getInputs()) {
|
||||
// FIXME: propagate the tensor type. Currently, only the operators
// immediately after weights are compile-time computable.
|
||||
if (input->getTensorType() != TensorType::Initialized)
|
||||
compileTimeComputable = false;
|
||||
}
|
||||
ctcMap[op->getGuid()] = compileTimeComputable;
|
||||
}
|
||||
return ctcMap;
|
||||
}
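// An operator is treated as compile-time computable when every one of its
// inputs is an Initialized (weight) tensor, e.g. a Transpose applied directly
// to a convolution weight. Such operators can be folded offline, so
// getPerfTime() below charges them zero time.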
|
||||
|
||||
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
|
||||
bool allowEstimation) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// Statistics
|
||||
double totalTime = 0;
|
||||
std::map<OpType, double> opTime;
|
||||
std::map<OpType, int> opCnt;
|
||||
// compile-time computable
|
||||
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
|
||||
|
@ -74,11 +98,17 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
|
|||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
||||
PerfRecord record;
|
||||
// Tune the kernel if there is no record
|
||||
if (!perfData) {
|
||||
double time = -1e9;
|
||||
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
|
||||
time = 0;
|
||||
} else if (perfData) { // Tune the kernel if there is no record
|
||||
time = perfData->time;
|
||||
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
|
||||
time = as<MemBoundObj>(op)->getEstimatedTime();
|
||||
} else {
|
||||
// TODO: should tensors automatically allocate memory when their data is accessed?
|
||||
// allocate memory for empty tensors and release it after profiling
|
||||
// allocate memory for empty tensors and release it after
|
||||
// profiling
|
||||
TensorVec allocatedTensors;
|
||||
for (auto t : op->getInputs())
|
||||
if (!t->hasData())
|
||||
|
@ -92,21 +122,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
|
|||
}
|
||||
|
||||
// Profile operators and record the results
|
||||
record = kernel->tune(op, this);
|
||||
PerfRecord record = kernel->tune(op, this);
|
||||
time = record->time;
|
||||
perfEngine.setPerfData(perfKey, record);
|
||||
|
||||
// Free allocated memory
|
||||
for (auto t : allocatedTensors)
|
||||
t->freeData();
|
||||
} else
|
||||
record = perfData;
|
||||
}
|
||||
|
||||
double t = record->time;
|
||||
totalTime += t;
|
||||
totalTime += time;
|
||||
if (profiling) {
|
||||
op->print();
|
||||
printf(" op_time %lf\n", t);
|
||||
opTime[op->getOpType()] += t;
|
||||
printf(" op_time %lf\n", time);
|
||||
opTime[op->getOpType()] += time;
|
||||
opCnt[op->getOpType()]++;
|
||||
}
|
||||
}
|
||||
|
@ -164,4 +193,46 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
|
|||
|
||||
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
|
||||
|
||||
double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// compile-time computable
|
||||
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
|
||||
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
|
||||
bool status = graph->topo_sort();
|
||||
IT_ASSERT(status, "Topological sort failed");
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType(), DataType::Float32};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
if (perfData)
|
||||
kernel->compute(op, perfData, this);
|
||||
else
|
||||
kernel->compute(op, this);
|
||||
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
|
||||
if (op->getOpType() == OpType::Matmul)
|
||||
kernels.emplace_back(op, kernel, perfData);
|
||||
}
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
dbg(op);
|
||||
}
|
||||
cudaProfilerStart(); // HACK: Debug
|
||||
double ret = timeit(
|
||||
[&]() {
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
if (perfData)
|
||||
kernel->compute(op, perfData, this);
|
||||
else
|
||||
kernel->compute(op, this);
|
||||
}
|
||||
},
|
||||
[&]() { sync(); });
|
||||
cudaProfilerStop(); // HACK: Debug
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace infini
@ -8,7 +8,17 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
|
||||
using MetaGraph = SearchEngine::MetaGraph;
|
||||
|
||||
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
|
||||
: runtimeExec(runtime), mutator(mutator) {
|
||||
// Compare graph with estimated time
|
||||
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
|
||||
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
|
||||
};
|
||||
}
|
||||
|
||||
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
|
||||
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
|
||||
auto &node = metaGraph->nodes[i];
|
||||
std::cout << "id: " << i << std::endl;
|
||||
|
@ -32,8 +42,7 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
IT_ASSERT(runtimeExec == graph->getRuntime());
|
||||
std::cout << "[INFO] original graph: " << std::endl;
|
||||
std::cout << graph->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
|
||||
<< std::endl;
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
|
||||
|
||||
std::vector<Graph> partitions = partitionGraph(graph);
|
||||
|
||||
|
@ -65,9 +74,7 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
nextGraphs.emplace_back(tmp);
|
||||
}
|
||||
}
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
});
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
|
||||
if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
nextGraphs.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -81,10 +88,22 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
std::cout << "bestGraph " << i << ":" << std::endl;
|
||||
std::cout << bestGraphs[i]->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Fuse vertically and sort according to performance
|
||||
for (size_t i = 0; i < bestGraphs.size(); ++i) {
|
||||
// Debug
|
||||
bestGraphs[i] = fuseVertically(bestGraphs[i]);
|
||||
}
|
||||
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
|
||||
|
||||
std::cout << "[INFO] best fused graph: " << std::endl;
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
|
||||
<< std::endl;
|
||||
std::cout << bestGraphs[0] << std::endl;
|
||||
|
||||
return bestGraphs[0];
|
||||
}
|
||||
|
||||
|
@ -102,9 +121,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
|
|||
}
|
||||
}
|
||||
|
||||
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
}); // compare with perf time
|
||||
// compare with perf time
|
||||
std::sort(results.begin(), results.end(), graphTimeComparer);
|
||||
if (results.size() > GRAPH_SIZE) {
|
||||
results.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -112,9 +130,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
|
|||
}
|
||||
|
||||
// Build a meta graph from a graph; each operator becomes a node.
|
||||
std::shared_ptr<SearchEngine::MetaGraph>
|
||||
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
||||
auto metaGraph = std::make_shared<MetaGraph>();
|
||||
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
||||
auto metaGraph = make_ref<MetaGraphObj>();
|
||||
|
||||
int numOps = graph->getOperators().size();
|
||||
std::vector<int> cnt(numOps, 0);
|
||||
|
@ -123,7 +140,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
|||
std::vector<int> q(0);
|
||||
for (size_t i = 0; i < graph->getOperators().size(); i++) {
|
||||
auto &op = graph->getOperators()[i];
|
||||
MetaGraph::Node node;
|
||||
MetaGraphObj::Node node;
|
||||
std::vector<Operator> ops;
|
||||
ops.emplace_back(op);
|
||||
node.graph = make_ref<GraphObj>(runtimeExec, ops);
|
||||
|
@ -157,9 +174,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
|||
|
||||
// Build a meta graph from a graph and a plan; a plan specifies which ops
// belong to the same node.
|
||||
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
||||
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
|
||||
const std::vector<int> &plan) {
|
||||
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
|
||||
const std::vector<int> &plan) {
|
||||
int numGroups = 0;
|
||||
for (auto i : plan) {
|
||||
if (i > numGroups) {
|
||||
|
@ -172,12 +188,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
|||
groups[plan[i]].emplace_back(i);
|
||||
}
|
||||
|
||||
auto resultMetaGraph = make_ref<MetaGraph>();
|
||||
auto resultMetaGraph = make_ref<MetaGraphObj>();
|
||||
for (auto &group : groups) {
|
||||
std::vector<Operator> ops;
|
||||
std::unordered_set<int> preSet, sucSet;
|
||||
for (auto id : group) {
|
||||
MetaGraph::Node node;
|
||||
MetaGraphObj::Node node;
|
||||
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
|
@ -204,8 +220,7 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
|||
}
|
||||
|
||||
// Search how to merge multiple ops.
|
||||
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
|
||||
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
||||
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
|
||||
IT_ASSERT(metaGraph != nullptr);
|
||||
std::vector<int> plan(metaGraph->nodes.size());
|
||||
for (size_t i = 0; i < plan.size(); i++) {
|
||||
|
@ -222,7 +237,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
|||
std::unordered_set<HashType> planSet;
|
||||
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
|
||||
|
||||
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
|
||||
vector<MetaGraph> metaGraphs;
|
||||
for (auto &curPlan : plans) {
|
||||
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
|
||||
}
|
||||
|
@ -230,8 +245,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
|||
}
|
||||
|
||||
// DFS impl for search merge.
|
||||
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
|
||||
std::vector<int> &plan,
|
||||
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
|
||||
std::vector<int> &frontier,
|
||||
std::vector<std::vector<int>> &plans,
|
||||
std::unordered_set<uint64_t> &planSet) {
|
||||
|
@ -320,8 +334,7 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
|
|||
}
|
||||
|
||||
// Search mutation for each compute op.
|
||||
std::vector<Graph> SearchEngine::searchMutation(
|
||||
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
||||
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
|
||||
std::vector<Graph> graphs = {nullptr};
|
||||
// Append a node to all existing candidates
|
||||
for (auto &node : metaGraph->nodes) {
|
||||
|
@ -360,9 +373,7 @@ std::vector<Graph> SearchEngine::searchMutation(
|
|||
for (auto g : nextGraphs) {
|
||||
g->dataMalloc();
|
||||
}
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
});
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
|
||||
if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
nextGraphs.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -372,7 +383,7 @@ std::vector<Graph> SearchEngine::searchMutation(
|
|||
}
|
||||
|
||||
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
|
||||
return mutationEngine->isMultiBranchMergable(graph);
|
||||
return mutator->isMultiBranchMergable(graph);
|
||||
}
|
||||
|
||||
// Split a graph into multiple independent graphs. Search engine will search for
|
||||
|
@ -438,4 +449,60 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
|
|||
return partitions;
|
||||
}
|
||||
|
||||
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
|
||||
return runtimeExec->getPerfTime(graph, false, true);
|
||||
}
|
||||
|
||||
Graph SearchEngine::fuseVertically(const Graph &graph) {
|
||||
std::unordered_map<UidBaseType, int> visitTime;
|
||||
std::vector<Operator> ops;
|
||||
|
||||
graph->topo_sort();
|
||||
int cnt = 0;
|
||||
for (auto op : graph->getOperators()) {
|
||||
// Skip visited OP
|
||||
if (visitTime.find(op->getGuid()) != visitTime.end()) {
|
||||
continue;
|
||||
}
|
||||
// Skip compute OP and multi-input/output OP
|
||||
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
|
||||
op->getSuccessors().size() != 1)) {
|
||||
visitTime.emplace(op->getGuid(), ++cnt);
|
||||
ops.emplace_back(op);
|
||||
continue;
|
||||
}
|
||||
vector<Operator> chainOps;
|
||||
visitTime.emplace(op->getGuid(), ++cnt);
|
||||
|
||||
vector<Operator> tmp;
|
||||
auto cur = op;
|
||||
while (cur->getPredecessors().size() == 1 &&
|
||||
cur->getPredecessors()[0]->isMemBoundOp()) {
|
||||
cur = cur->getPredecessors()[0];
|
||||
tmp.emplace_back(cur);
|
||||
visitTime.emplace(cur->getGuid(), cnt);
|
||||
}
|
||||
for (int i = tmp.size() - 1; i >= 0; i--) {
|
||||
chainOps.emplace_back(tmp[i]);
|
||||
}
|
||||
chainOps.emplace_back(op);
|
||||
cur = op;
|
||||
while (cur->getSuccessors().size() == 1 &&
|
||||
cur->getSuccessors()[0]->isMemBoundOp()) {
|
||||
cur = cur->getSuccessors()[0];
|
||||
chainOps.emplace_back(cur);
|
||||
visitTime.emplace(cur->getGuid(), cnt);
|
||||
}
|
||||
make_ref<GraphObj>(runtimeExec, chainOps)->print();
|
||||
|
||||
Graph optGraph =
|
||||
mutator->fuseVertically(make_ref<GraphObj>(runtimeExec, chainOps));
|
||||
for (auto op : optGraph->getOperators()) {
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
}
|
||||
|
||||
return make_ref<GraphObj>(runtimeExec, ops);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -8,17 +8,26 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
|
||||
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
|
||||
TensorType tensorType)
|
||||
: TensorBaseObj(shape.size(), dtype, runtime), shape(std::move(shape_)),
|
||||
_size(shape.empty()
|
||||
? 0
|
||||
: std::accumulate(shape.begin(), shape.end(), 1,
|
||||
[](auto acc, auto x) { return acc * x; })) {}
|
||||
[](auto acc, auto x) { return acc * x; })),
|
||||
tensorType(tensorType) {}
|
||||
|
||||
string TensorObj::toString() const {
|
||||
// Convert data pointer to string
|
||||
std::stringstream ss;
|
||||
if (data != nullptr)
|
||||
ss << data->getPtr<void *>();
|
||||
else
|
||||
ss << "nullptr data";
|
||||
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
|
||||
std::to_string(fuid) + ", shape " + vecToString(shape) +
|
||||
", dtype " + dtype.toString();
|
||||
", dtype " + dtype.toString() + ", tensorType " +
|
||||
std::to_string(enum_to_underlying(tensorType));
|
||||
vector<UidBaseType> targetGuids;
|
||||
for (const auto &op : targets)
|
||||
targetGuids.emplace_back(op.lock()->getGuid());
|
||||
|
@ -27,6 +36,7 @@ string TensorObj::toString() const {
|
|||
else
|
||||
ret += ", source None";
|
||||
ret += ", targets " + vecToString(targetGuids);
|
||||
ret += ", " + runtime->toString() + ", " + ss.str();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -57,25 +67,36 @@ vector<size_t> TensorObj::getStride() const {
|
|||
|
||||
void TensorObj::printData() const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
if (!runtime->isCpu())
|
||||
IT_TODO_HALT();
|
||||
void *ptr = nullptr;
|
||||
Blob buffer;
|
||||
if (!runtime->isCpu()) {
|
||||
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
|
||||
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
|
||||
getRawDataPtr<void *>(), getBytes());
|
||||
ptr = buffer->getPtr<void *>();
|
||||
} else
|
||||
ptr = data->getPtr<float *>();
|
||||
if (dtype == DataType::Float32)
|
||||
printDataFloat();
|
||||
printDataFloat(static_cast<float *>(ptr));
|
||||
else if (dtype == DataType::UInt32)
|
||||
printDataUint32_t();
|
||||
printDataUint32_t(static_cast<uint32_t *>(ptr));
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
void TensorObj::printDataFloat() const {
|
||||
void TensorObj::printDataFloat(float *ptr) const {
|
||||
std::cout << "Tensor: " << guid << std::endl;
|
||||
auto numDims = shape.size();
|
||||
auto dimSzVec = std::vector<int>(numDims, 1);
|
||||
auto ptr = data->getPtr<float *>();
|
||||
dimSzVec[numDims - 1] = shape[numDims - 1];
|
||||
for (int i = numDims - 1; i != 0; --i)
|
||||
dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
|
||||
for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
|
||||
if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
|
||||
printf("... , ");
|
||||
i = iEnd - 20;
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 0; j < numDims; ++j) {
|
||||
if (i % dimSzVec[j] == 0) {
|
||||
std::cout << "[";
|
||||
|
@ -94,12 +115,11 @@ void TensorObj::printDataFloat() const {
|
|||
}
|
||||
}
|
||||
|
||||
void TensorObj::printDataUint32_t() const {
|
||||
void TensorObj::printDataUint32_t(uint32_t *ptr) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
std::cout << "Tensor: " << guid << std::endl;
|
||||
auto numDims = shape.size();
|
||||
auto dimSzVec = std::vector<int>(numDims, 1);
|
||||
auto ptr = data->getPtr<VType *>();
|
||||
dimSzVec[numDims - 1] = shape[numDims - 1];
|
||||
for (int i = numDims - 1; i != 0; --i)
|
||||
dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
|
||||
|
@ -122,7 +142,7 @@ void TensorObj::printDataUint32_t() const {
|
|||
}
|
||||
}
|
||||
|
||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||
bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
|
||||
IT_ASSERT(data != nullptr);
|
||||
IT_ASSERT(rhs->data != nullptr);
|
||||
IT_ASSERT(getDType() == rhs->getDType());
|
||||
|
@ -132,10 +152,11 @@ bool TensorObj::equalData(const Tensor &rhs) const {
|
|||
return false;
|
||||
if (getDType() == DataType::UInt32)
|
||||
return equalDataImpl(getRawDataPtr<uint32_t *>(),
|
||||
rhs->getRawDataPtr<uint32_t *>(), size());
|
||||
rhs->getRawDataPtr<uint32_t *>(), size(), 0);
|
||||
else if (getDType() == DataType::Float32)
|
||||
return equalDataImpl(getRawDataPtr<float *>(),
|
||||
rhs->getRawDataPtr<float *>(), size());
|
||||
rhs->getRawDataPtr<float *>(), size(),
|
||||
relativeError);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
@ -0,0 +1,22 @@
|
|||
#include "core/graph.h"
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace infini {
|
||||
|
||||
namespace callback {
|
||||
|
||||
using namespace py::literals;
|
||||
|
||||
static std::function<void(const Graph &, string)> exportONNXImpl;
|
||||
void exportONNX(const Graph &graph, const string &path) {
|
||||
IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
|
||||
static auto exportONNXImpl =
|
||||
py::module_::import("infinitensor.if_onnx").attr("export_onnx");
|
||||
exportONNXImpl(graph, path);
|
||||
}
|
||||
|
||||
} // namespace callback
|
||||
|
||||
} // namespace infini
|
|
@ -1,4 +1,7 @@
|
|||
#include "core/graph_handler.h"
|
||||
#include "core/mutator.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "operators/batch_norm.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
|
@ -53,6 +56,7 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Conv)
|
||||
.VALUE(OpType, Matmul)
|
||||
.VALUE(OpType, ConvTrans)
|
||||
.VALUE(OpType, ConvTransNHWC)
|
||||
.VALUE(OpType, G2BMM)
|
||||
.VALUE(OpType, GBMM)
|
||||
.VALUE(OpType, Pad)
|
||||
|
@ -82,6 +86,7 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Abs)
|
||||
.VALUE(OpType, Resize)
|
||||
.VALUE(OpType, MemBound)
|
||||
.VALUE(OpType, PRelu)
|
||||
.export_values();
|
||||
|
||||
#undef VALUE
|
||||
|
@ -176,6 +181,9 @@ void export_functions(py::module &m) {
|
|||
void init_graph_builder(py::module &m) {
|
||||
using Handler = GraphHandlerObj;
|
||||
|
||||
py::class_<Object, Ref<Object>>(m, "_Object")
|
||||
.def("__str__", &Object::toString)
|
||||
.def("guid", &Object::getGuid);
|
||||
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
|
||||
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
|
||||
RuntimeObj>(m, "CpuRuntime");
|
||||
|
@ -183,7 +191,7 @@ void init_graph_builder(py::module &m) {
|
|||
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
|
||||
m, "CudaRuntime");
|
||||
#endif
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
|
||||
.def("fuid", &TensorObj::getFuid, policy::automatic)
|
||||
.def("shape", &TensorObj::getDims, policy::move)
|
||||
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
|
||||
|
@ -194,8 +202,9 @@ void init_graph_builder(py::module &m) {
|
|||
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
|
||||
.def("has_target", &TensorObj::hasTarget, policy::automatic)
|
||||
.def("src", &TensorObj::getSource, policy::move)
|
||||
.def("printData", &TensorObj::printData, policy::automatic);
|
||||
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
|
||||
.def("print_data", &TensorObj::printData)
|
||||
.def("data_malloc", &TensorObj::dataMalloc);
|
||||
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
|
||||
.def("op_type", &OperatorObj::getOpType, policy::automatic)
|
||||
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
|
||||
policy::reference)
|
||||
|
@ -232,6 +241,36 @@ void init_graph_builder(py::module &m) {
|
|||
.def("operators", &Handler::operators, policy::move)
|
||||
.def("data_malloc", &Handler::data_malloc, policy::automatic)
|
||||
.def("run", &Handler::run, policy::automatic);
|
||||
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
|
||||
py::enum_<NMutator::Mode>(m, "NMutatorMode")
|
||||
.value("RuleBased", NMutator::Mode::RuleBased);
|
||||
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
|
||||
.def(py::init<NMutator::Mode>())
|
||||
.def(py::init<NMutator::Mode, vector<int>>())
|
||||
.def("run", &NMutator::run)
|
||||
.def_static("memboundToJson", &NMutator::memboundToJson);
|
||||
py::class_<SearchEngine>(m, "SearchEngine")
|
||||
.def(py::init<Runtime, Ref<Mutator>>())
|
||||
.def("run", &SearchEngine::run);
|
||||
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
|
||||
.def("tensors", &GraphObj::getTensors)
|
||||
.def("operators", &GraphObj::getOperators)
|
||||
.def("inputs", &GraphObj::getInputs)
|
||||
.def("outputs", &GraphObj::getOutputs)
|
||||
.def("print", &GraphObj::print)
|
||||
.def("topo_sort", &GraphObj::topo_sort);
|
||||
}
|
||||
|
||||
Graph getInfoGAN(int batch, Runtime runtime, int nLayers);
|
||||
vector<Tensor> runInfoGAN(int nLayers);
|
||||
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
|
||||
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning);
|
||||
void export_test_model(py::module &m) {
|
||||
m.def("runInfoGAN", &runInfoGAN);
|
||||
m.def("getInfoGAN", &getInfoGAN);
|
||||
m.def("getConvtransposedNHWC", &getConvtransposedNHWC);
|
||||
m.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
|
||||
"tuning"_a = false);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
@ -241,4 +280,5 @@ PYBIND11_MODULE(backend, m) {
|
|||
infini::export_values(m);
|
||||
infini::export_functions(m);
|
||||
infini::init_graph_builder(m);
|
||||
infini::export_test_model(m);
|
||||
}
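A sketch of driving these bindings from Python; the runtime getter, the import path, and the NMutator rule setup are assumptions, while SearchEngine, NMutator, NMutatorMode, getConvtransposedNHWC and the Graph methods are bound above:

from pyinfinitensor import backend                    # import path is an assumption

runtime = backend.cuda_runtime()                      # assumption: bound outside this diff
g = backend.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 0)  # shape value is illustrative
mutator = backend.NMutator(backend.NMutatorMode.RuleBased)     # rule configuration not shown here
engine = backend.SearchEngine(runtime, mutator)
best = engine.run(g)                                  # search + vertical fusion
best.topo_sort()
for op in best.operators():
    print(op)                                         # Operator inherits __str__ from _Object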
@ -50,10 +50,23 @@ class matmulCublas : public Kernel {
|
|||
// TODO:use compute type
|
||||
cublasStatus_t stat;
|
||||
if (b > 1) {
|
||||
// Support batch broadcast with zero stride
|
||||
int dimA = op->getInputs(0)->getDims().size();
|
||||
int dimB = op->getInputs(1)->getDims().size();
|
||||
long long strideA =
|
||||
(dimA == 2 ||
|
||||
(dimA == 3 && op->getInputs(0)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: m * k;
|
||||
long long strideB =
|
||||
(dimB == 2 ||
|
||||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: n * k;
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
|
||||
CUDA_R_32F, ldb, k * n, inAData, CUDA_R_32F, lda, m * k, &beta,
|
||||
outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
|
||||
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
|
||||
&beta, outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
|
||||
(cublasGemmAlgo_t)record->algo);
|
||||
} else {
|
||||
stat = cublasGemmEx(
|
||||
|
@ -61,6 +74,8 @@ class matmulCublas : public Kernel {
|
|||
CUDA_R_32F, ldb, inAData, CUDA_R_32F, lda, &beta, outData,
|
||||
CUDA_R_32F, ldc, CUDA_R_32F, (cublasGemmAlgo_t)record->algo);
|
||||
}
|
||||
// if (stat != CUBLAS_STATUS_SUCCESS)
|
||||
// cout << cublasGetErrorString(stat);
|
||||
return (stat == CUBLAS_STATUS_SUCCESS);
|
||||
}
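// With strideA or strideB set to 0 above, cuBLAS reuses the single batch
// of the 2-D (or batch-1) operand for every batch of the other operand,
// i.e. NumPy-style broadcasting: (1, m, k) x (b, k, n) -> (b, m, n),
// where each output batch i is computed from the same A and B[i].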
|
||||
|
||||
|
@ -79,6 +94,8 @@ class matmulCublas : public Kernel {
|
|||
const RuntimeObj *_context) const override {
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto op = as<MatmulObj>(_op);
|
||||
IT_ASSERT(context);
|
||||
IT_ASSERT(op);
|
||||
auto ret = make_ref<MatmulCublasPerfRecordObj>();
|
||||
ret->time = std::numeric_limits<double>::max();
|
||||
for (int i = 0; i < N_ALGO; i++) {
|
||||
|
@ -91,9 +108,8 @@ class matmulCublas : public Kernel {
|
|||
if (rcd->time < ret->time)
|
||||
ret = rcd;
|
||||
}
|
||||
IT_ASSERT(ret->time < std::numeric_limits<double>::max(), "No valid "
|
||||
"algorithm "
|
||||
"found");
|
||||
IT_ASSERT(ret->time < std::numeric_limits<double>::max(),
|
||||
"No valid algorithm found for " + op->toString());
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
#ifdef INFINI_USE_TVM
|
||||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "ffi/ffi_embed.h"
|
||||
#include "nnet/Visitor/AsTVMVisitor.h"
|
||||
#include "nnet/Visitor/CheckOOBVisitor.h"
|
||||
#include "nnet/Visitor/HashVisitor.h"
|
||||
#include "nnet/Visitor/MergeMemboundMutator.h"
|
||||
#include "nvrtc.h"
|
||||
#include "operators/membound.h"
|
||||
#include "operators/pooling.h"
|
||||
|
@ -17,11 +21,12 @@ class TVMRecordObj : public PerfRecordObj {
|
|||
std::string log, ptx;
|
||||
std::vector<int> invokeParams;
|
||||
std::string kernelName;
|
||||
HashType simplifiedExprHash;
|
||||
};
|
||||
|
||||
using TVMRecord = Ref<TVMRecordObj>;
|
||||
|
||||
class MemboundTVM : public Kernel {
|
||||
class MemboundTVMExtractSource : public Kernel {
|
||||
public:
|
||||
void compute(const Operator &_op, const PerfRecord &record,
|
||||
const RuntimeObj *_context) const override {
|
||||
|
@ -65,6 +70,11 @@ class MemboundTVM : public Kernel {
|
|||
return "var_" + std::to_string(t->getGuid());
|
||||
}
|
||||
|
||||
bool checkOOB(nnet::Expr expr) const {
|
||||
return nnet::CheckOOBVisitor().checkRangeOp(
|
||||
nnet::as<nnet::RangeOpNode>(expr));
|
||||
}
|
||||
|
||||
// Premise: op is idempotent since it is called multiple times.
|
||||
PerfRecord tune(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
|
@ -73,10 +83,18 @@ class MemboundTVM : public Kernel {
|
|||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
|
||||
// invoke Ansor to tune a membound kernel
|
||||
std::string func = "mem_bound_" + std::to_string(op->getGuid());
|
||||
std::string kernelName = func + "_kernel0";
|
||||
nnet::AsTVMVisitor visitor;
|
||||
visitor.dispatch(op->getNnetExpr());
|
||||
IT_ASSERT(!checkOOB(op->getNnetExpr()));
|
||||
// fuse stages in nnet expr to reduce kernels generated by TVM
|
||||
auto expr = op->getNnetExpr();
|
||||
if (auto mergedExpr =
|
||||
nnet::MergeMemboundMutator({expr}).merge(false, true))
|
||||
expr = mergedExpr;
|
||||
|
||||
nnet::HashVisitor hashVisitor;
|
||||
HashType hashCode = hashVisitor.getHash(expr);
|
||||
|
||||
visitor.dispatch(expr);
|
||||
auto &&stmts = visitor.getStmts();
|
||||
auto &&inShapes = visitor.getInputShapes();
|
||||
auto &&outShape = visitor.getOutputShape();
|
||||
|
@ -85,10 +103,14 @@ class MemboundTVM : public Kernel {
|
|||
for (auto &&in : op->getInputs()) {
|
||||
inputs.emplace_back(getVarName(in));
|
||||
}
|
||||
std::string output = getVarName(op->getOutput());
|
||||
const std::string output = getVarName(op->getOutput());
|
||||
|
||||
const std::string func = "membound_" + std::to_string(hashCode);
|
||||
const std::string kernelName = func + "_kernel0";
|
||||
auto res = getAnsorCode(
|
||||
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
|
||||
outShape, "float32", stmts, func, inputs, output);
|
||||
outShape, "float32", stmts, func, inputs, output, op->toString(),
|
||||
expr->toReadable(), hashCode);
|
||||
|
||||
// compile the kernel
|
||||
auto funcCode = res.first;
|
||||
|
@ -119,6 +141,7 @@ class MemboundTVM : public Kernel {
|
|||
nvrtcGetPTX(prog, ret->ptx.data());
|
||||
ret->invokeParams = invokeParams;
|
||||
ret->kernelName = kernelName;
|
||||
ret->simplifiedExprHash = hashCode;
|
||||
|
||||
// prepare for evaluation
|
||||
CUmodule module;
|
||||
|
@ -151,20 +174,43 @@ class MemboundTVM : public Kernel {
|
|||
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
|
||||
}
|
||||
|
||||
/// @brief
|
||||
/// @param inDims
|
||||
/// @param inDTypes
|
||||
/// @param outDims
|
||||
/// @param outDType
|
||||
/// @param lambda
|
||||
/// @param funcName Generated function name
|
||||
/// @param inputNames Input array names in the generated invocation code.
|
||||
/// @param outputName Output array name in the generated invocation code.
|
||||
/// @param nnetExpressionString Save expr in string for logging.
|
||||
/// @param nnetSimplifiedExprString Save simplified expr in string for
|
||||
/// logging.
|
||||
/// @param hashCode (optional) Hash code of the input expression for kernel
|
||||
/// cache.
|
||||
/// @return
|
||||
std::pair<std::string, std::vector<int>>
|
||||
getAnsorCode(const std::vector<std::vector<int>> &inDims,
|
||||
const std::vector<std::string> &inDTypes,
|
||||
const std::vector<int> &outDims, const std::string &outDType,
|
||||
const std::string &lambda, const std::string &funcName,
|
||||
const std::vector<std::string> &inputNames,
|
||||
const std::string &outputName) const {
|
||||
const std::string &outputName,
|
||||
const std::string &nnetExprString,
|
||||
const std::string &nnetSimplifiedExprString,
|
||||
const HashType hashCode) const {
|
||||
std::string funcCode;
|
||||
std::vector<int> invokeParams;
|
||||
try {
|
||||
start_interpreter();
|
||||
auto func = py::module::import("cpp_plugin").attr("gen_ansor_op");
|
||||
py::tuple code = func(inDims, inDTypes, outDims, outDType, lambda,
|
||||
funcName, inputNames, outputName);
|
||||
// Use static to avoid re-importing the module. Re-importing results
|
||||
// in a cuBLAS failure whose root cause has not been identified yet.
|
||||
static auto func =
|
||||
py::module::import("cpp_plugin").attr("gen_ansor_op");
|
||||
py::tuple code =
|
||||
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
|
||||
inputNames, outputName, nnetExprString,
|
||||
nnetSimplifiedExprString, std::to_string(hashCode));
|
||||
funcCode = py::str(code[0]);
|
||||
auto temp = py::list(code[3]);
|
||||
for (int i = 0; i < 6; ++i) {
|
||||
|
@ -183,6 +229,9 @@ class MemboundTVM : public Kernel {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVM,
|
||||
"Memobund_TVM_Ansor");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
|
||||
// MemboundTVMExtractSource,
|
||||
// "Memobund_TVM_Ansor_extract_source");
|
||||
}; // namespace infini
|
||||
|
||||
#endif
|
|
@ -0,0 +1,273 @@
|
|||
#ifdef INFINI_USE_TVM
|
||||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "dlpack/dlpack.h"
|
||||
#include "ffi/ffi_embed.h"
|
||||
#include "nnet/Visitor/AsTVMVisitor.h"
|
||||
#include "operators/membound.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "tvm/runtime/module.h"
|
||||
#include "tvm/runtime/packed_func.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace infini {
|
||||
|
||||
using DLTensorHolder = pair<DLTensor, Ref<vector<int64_t>>>;
|
||||
|
||||
class TVMRecordObj : public PerfRecordObj {
|
||||
public:
|
||||
std::string kernelName;
|
||||
HashType simplifiedExprHash;
|
||||
std::string dllPath;
|
||||
std::string funcName;
|
||||
std::vector<int> inputIdx;
|
||||
tvm::runtime::PackedFunc packedFunc;
|
||||
};
|
||||
|
||||
using TVMRecord = Ref<TVMRecordObj>;
|
||||
|
||||
class MemboundTVMPackedFunction : public Kernel {
|
||||
public:
|
||||
void compute(const Operator &_op, const PerfRecord &record,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<MemBoundObj>(_op);
|
||||
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
|
||||
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
|
||||
// IT_ASSERT(packedFunc != nullptr);
|
||||
|
||||
// prepare inputs and outputs
|
||||
vector<DLTensorHolder> inputsHolder;
|
||||
for (auto idx : tvmRecord->inputIdx) {
|
||||
inputsHolder.emplace_back(
|
||||
convertTensorToDLTensor(op->getInputs()[idx]));
|
||||
}
|
||||
DLTensorHolder outputHolder = convertTensorToDLTensor(op->getOutput());
|
||||
|
||||
// make tvm arg and rv
|
||||
pair<vector<TVMValue>, vector<int>> preArgs =
|
||||
convertInOutToTVMArgs(inputsHolder, outputHolder);
|
||||
tvm::runtime::TVMRetValue rv;
|
||||
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
|
||||
preArgs.first.size());
|
||||
|
||||
packedFunc.CallPacked(args, &rv);
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
IT_ASSERT(false, "A TVM record is required for membound kernel.");
|
||||
}
|
||||
|
||||
// Premise: op is idempotent since it is called multiple times.
|
||||
PerfRecord tune(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
TVMRecord ret = std::make_shared<TVMRecordObj>();
|
||||
auto op = as<MemBoundObj>(_op);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
|
||||
// invoke Ansor to tune a membound kernel
|
||||
auto [expr, hash] = op->getSimplifiedNnetExpr();
|
||||
nnet::AsTVMVisitor visitor;
|
||||
visitor.dispatch(expr);
|
||||
auto &&stmts = visitor.getStmts();
|
||||
auto &&inShapes = visitor.getInputShapes();
|
||||
auto &&outShape = visitor.getOutputShape();
|
||||
|
||||
const std::string func = "membound_" + std::to_string(hash);
|
||||
const std::string kernelName = func + "_kernel0";
|
||||
// Set the dllPath directly when debugging
|
||||
auto dllPath = getAnsorDLL(
|
||||
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
|
||||
outShape, "float32", stmts, func, op->toString(),
|
||||
expr->toReadable(), hash);
|
||||
|
||||
// remap input
|
||||
vector<int> inputIdx;
|
||||
int numInputs = op->getInputs().size();
|
||||
for (int i = 0; i < numInputs; ++i) {
|
||||
string inputName = visitor.getInputs()[i];
|
||||
int j = 0;
|
||||
for (; j < numInputs; ++j) {
|
||||
if (inputName == op->getNnetInputs()[j]->getName())
|
||||
break;
|
||||
}
|
||||
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
|
||||
inputIdx.emplace_back(j);
|
||||
}
|
||||
|
||||
tvm::runtime::PackedFunc packedFunc = getPackedFunction(dllPath, func);
|
||||
IT_ASSERT(packedFunc != nullptr);
|
||||
|
||||
// prepare inputs and outputs
|
||||
vector<DLTensorHolder> inputsHolder;
|
||||
for (auto idx : inputIdx) {
|
||||
inputsHolder.emplace_back(
|
||||
convertTensorToDLTensor(op->getInputs()[idx]));
|
||||
}
|
||||
DLTensorHolder outputHolder = convertTensorToDLTensor(op->getOutput());
|
||||
|
||||
// make tvm arg and rv
|
||||
pair<vector<TVMValue>, vector<int>> preArgs =
|
||||
convertInOutToTVMArgs(inputsHolder, outputHolder);
|
||||
tvm::runtime::TVMRetValue rv;
|
||||
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
|
||||
preArgs.first.size());
|
||||
|
||||
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
|
||||
[&]() { context->sync(); });
|
||||
ret->kernelName = kernelName;
|
||||
ret->dllPath = dllPath;
|
||||
ret->funcName = func;
|
||||
ret->inputIdx = inputIdx;
|
||||
ret->packedFunc = packedFunc;
|
||||
|
||||
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
|
||||
}
|
||||
|
||||
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
|
||||
const std::vector<std::string> &inDTypes,
|
||||
const std::vector<int> &outDims,
|
||||
const std::string &outDType,
|
||||
const std::string &lambda,
|
||||
const std::string &funcName,
|
||||
const std::string &nnetExprString,
|
||||
const std::string &nnetSimplifiedExprString,
|
||||
const HashType hashCode) const {
|
||||
json j;
|
||||
// Consistent with the Python API interface
|
||||
j["input_tensors"] = inDims;
|
||||
j["input_dtypes"] = inDTypes;
|
||||
j["output_tensor"] = outDims;
|
||||
j["output_dtype"] = outDType;
|
||||
j["tvm_code"] = lambda;
|
||||
j["func_name"] = funcName;
|
||||
j["nnet_expression"] = nnetExprString;
|
||||
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
|
||||
j["hash_code"] = std::to_string(hashCode);
|
||||
return j.dump();
|
||||
}
|
||||
|
||||
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
|
||||
const std::vector<std::string> &inDTypes,
|
||||
const std::vector<int> &outDims,
|
||||
const std::string &outDType,
|
||||
const std::string &lambda,
|
||||
const std::string &funcName,
|
||||
const std::string &nnetExprString,
|
||||
const std::string &nnetSimplifiedExprString,
|
||||
const HashType hashCode) const {
|
||||
int fdP2C[2], fdC2P[2];
|
||||
for (auto fd : {fdP2C, fdC2P}) {
|
||||
int status = pipe(fd);
|
||||
IT_ASSERT(status == 0, "pipe failed");
|
||||
}
|
||||
pid_t pid = fork();
|
||||
IT_ASSERT(pid >= 0, "fork failed");
|
||||
if (pid == 0) { // Child process
|
||||
close(fdP2C[1]);
|
||||
close(fdC2P[0]);
|
||||
|
||||
dup2(fdP2C[0], STDIN_FILENO);
|
||||
close(fdP2C[0]);
|
||||
|
||||
string cmd =
|
||||
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
|
||||
std::to_string(fdC2P[1]) + ")";
|
||||
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
|
||||
execvp("python3", const_cast<char *const *>(argv));
|
||||
} else { // Parent process
|
||||
close(fdP2C[0]);
|
||||
close(fdC2P[1]);
|
||||
|
||||
// Write to pipe
|
||||
string serializedArgs = serializeTVMArgs(
|
||||
inDims, inDTypes, outDims, outDType, lambda, funcName,
|
||||
nnetExprString, nnetSimplifiedExprString, hashCode);
|
||||
int status = -1;
|
||||
status =
|
||||
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
|
||||
IT_ASSERT((size_t)status == serializedArgs.size(),
|
||||
"Failed to write to pipe");
|
||||
close(fdP2C[1]);
|
||||
|
||||
// Wait for TVM
|
||||
waitpid(pid, &status, 0);
|
||||
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
|
||||
const int es = WEXITSTATUS(status);
|
||||
IT_ASSERT(es == 0,
|
||||
"TVM process exit with code " + std::to_string(es));
|
||||
|
||||
// Read from pipe
|
||||
FILE *stream;
|
||||
stream = fdopen(fdC2P[0], "r");
|
||||
char buf_read[257] = {0};
|
||||
status = std::fscanf(stream, "%256c", buf_read);
|
||||
IT_ASSERT(status == 1, "Failed to read from pipe");
|
||||
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
|
||||
fclose(stream);
|
||||
close(fdC2P[0]);
|
||||
return buf_read;
|
||||
}
|
||||
IT_ASSERT(false, "Should not reach here");
|
||||
return "";
|
||||
}
|
||||
|
||||
tvm::runtime::PackedFunc getPackedFunction(string path,
|
||||
string functionName) const {
|
||||
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(path);
|
||||
return mod.GetFunction(functionName);
|
||||
}
|
||||
|
||||
DLTensorHolder convertTensorToDLTensor(const Tensor &tensor) const {
|
||||
IT_ASSERT(tensor->getRuntime()->isCuda());
|
||||
// The lifecycle of shapeInt64 is managed by the caller.
|
||||
auto shapeInt64 = make_ref<vector<int64_t>>();
|
||||
for (auto v : tensor->getDims())
|
||||
shapeInt64->push_back(v);
|
||||
DLTensor ret{
|
||||
.data = tensor->getRawDataPtr<void *>(),
|
||||
.device = DLDevice{.device_type = kDLCUDA, .device_id = 0},
|
||||
.ndim = (int32_t)shapeInt64->size(),
|
||||
.dtype =
|
||||
DLDataType{.code = (uint8_t)kDLFloat, .bits = 32, .lanes = 1},
|
||||
.shape = static_cast<int64_t *>(shapeInt64->data()),
|
||||
.strides = nullptr,
|
||||
.byte_offset = 0,
|
||||
};
|
||||
return {ret, shapeInt64};
|
||||
}
|
||||
|
||||
pair<vector<TVMValue>, vector<int>>
|
||||
convertInOutToTVMArgs(const vector<DLTensorHolder> &inputs,
|
||||
const DLTensorHolder &output) const {
|
||||
vector<TVMValue> values;
|
||||
vector<int> type_codes;
|
||||
|
||||
// The order of inputs and outputs is consistent with the definition of TVM
|
||||
// computation in Python, which is determined by AsTVMVisitor.
|
||||
values.emplace_back(TVMValue{.v_handle = (void *)&output.first});
|
||||
type_codes.emplace_back(kTVMDLTensorHandle);
|
||||
|
||||
for (auto &in : inputs) {
|
||||
values.emplace_back(TVMValue{.v_handle = (void *)&in.first});
|
||||
type_codes.emplace_back(kTVMDLTensorHandle);
|
||||
}
|
||||
|
||||
return {values, type_codes};
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
|
||||
MemboundTVMPackedFunction,
|
||||
"Memobund_TVM_Ansor_packed_funciton");
|
||||
}; // namespace infini
|
||||
|
||||
#endif
@ -0,0 +1,237 @@
|
|||
#include "core/blob.h"
|
||||
#include "core/dummy_mutator.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "ffi/ffi_callback.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/unary.h"
|
||||
#include "test.h"
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
namespace infini {
|
||||
|
||||
// NHWC format
|
||||
Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
|
||||
IT_ASSERT(1 <= nLayers && nLayers <= 5);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
vector<Tensor> weights;
|
||||
vector<tuple<int, int, int, int, bool>> cs{
|
||||
// Channel, kernelSize, pad, stride, isTanh
|
||||
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
|
||||
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
|
||||
};
|
||||
|
||||
Tensor input =
|
||||
g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
|
||||
for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
|
||||
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
|
||||
int f = input->getDims()[3]; // n, h, w, f
|
||||
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
|
||||
DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
if (tanh) {
|
||||
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
|
||||
} else {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
|
||||
IT_ASSERT(0 <= layerId && layerId < 5);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
vector<Tensor> weights;
|
||||
vector<tuple<int, int, int, int, bool>> cs{
|
||||
// Channel, kernelSize, pad, stride, isTanh
|
||||
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
|
||||
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
|
||||
};
|
||||
|
||||
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
|
||||
for (int i = layerId; i < layerId + 1; ++i) {
|
||||
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
|
||||
int f = input->getDims()[3]; // n, h, w, f
|
||||
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
|
||||
DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
if (tanh) {
|
||||
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
|
||||
} else {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
void printGraph(Graph g) {
|
||||
g->print();
|
||||
puts("============ Data ============");
|
||||
for (auto t : g->getTensors()) {
|
||||
dbg(t);
|
||||
t->printData();
|
||||
}
|
||||
}
|
||||
|
||||
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
|
||||
auto mutator =
|
||||
make_ref<NMutator>(NMutator::Mode::RuleBased,
|
||||
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(runtime, mutator);
|
||||
bestGraphs.emplace_back(searchEngine.run(g));
|
||||
g->topo_sort();
|
||||
dbg(g, bestGraphs[0], bestGraphs.size());
|
||||
g->print();
|
||||
|
||||
g->dataMalloc();
|
||||
map<UidBaseType, Tensor> fuidToInputTensor;
|
||||
for (auto t : g->getInputs()) {
|
||||
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
|
||||
fuidToInputTensor[t->getFuid()] = t;
|
||||
}
|
||||
|
||||
auto gen = RandomGenerator(-0.1, 0.1, 0);
|
||||
for (auto t : g->getInputs()) {
|
||||
t->setData(gen);
|
||||
}
|
||||
for (auto t : g->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
runtime->run(g);
|
||||
dbg("Baseline graph");
|
||||
printGraph(g);
|
||||
dbg(runtime->getPerfTime(g, true));
|
||||
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
auto bestGraphCpu = bestGraphs[i];
|
||||
auto bestGraph =
|
||||
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
|
||||
bestGraph->topo_sort();
|
||||
|
||||
bestGraph->dataMalloc();
|
||||
// Initialize inputs with random data
|
||||
for (auto t : bestGraph->getInputs()) {
|
||||
t->copyData(fuidToInputTensor[t->getFuid()]);
|
||||
}
|
||||
|
||||
// Initialize outputs with zeros
|
||||
for (auto t : bestGraph->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
|
||||
dbg(bestGraph);
|
||||
dbg(bestGraph->getOutputs());
|
||||
|
||||
if (tuning) {
|
||||
runtime->run(bestGraph, true); // Tune kernels
|
||||
runtime->run(bestGraph, false); // Execute the transformed graph
|
||||
|
||||
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
dbg(go0->equalData(bgo0, 1e-3));
|
||||
dbg(runtime->getPerfTime(bestGraph, true));
|
||||
dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||
}
|
||||
|
||||
dbg("Best graph");
|
||||
printGraph(bestGraph);
|
||||
return bestGraph;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
vector<Tensor> runInfoGAN(int nLayers) {
|
||||
auto cuda = make_ref<CudaRuntimeObj>();
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
|
||||
Graph g = getInfoGAN(1, cuda, nLayers);
|
||||
|
||||
auto mutator =
|
||||
make_ref<NMutator>(NMutator::Mode::RuleBased,
|
||||
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
|
||||
// // Translate OP to membound without derivation
|
||||
// mutator->setToNaiveMembound();
|
||||
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(cuda, mutator);
|
||||
bestGraphs.emplace_back(searchEngine.run(g));
|
||||
g->topo_sort();
|
||||
dbg(g, bestGraphs[0], bestGraphs.size());
|
||||
g->print();
|
||||
|
||||
g->dataMalloc();
|
||||
map<UidBaseType, Tensor> fuidToInputTensor;
|
||||
for (auto t : g->getInputs()) {
|
||||
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
|
||||
fuidToInputTensor[t->getFuid()] = t;
|
||||
}
|
||||
|
||||
auto gen = RandomGenerator(-0.1, 0.1, 0);
|
||||
// auto gen = RandomGenerator(-5, 5, 0, true);
|
||||
for (auto t : g->getInputs()) {
|
||||
t->setData(gen);
|
||||
}
|
||||
for (auto t : g->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
cuda->run(g);
|
||||
dbg("Baseline graph");
|
||||
printGraph(g);
|
||||
dbg(cuda->getPerfTime(g, true));
|
||||
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
auto bestGraphCpu = bestGraphs[i];
|
||||
auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
|
||||
bestGraph->topo_sort();
|
||||
|
||||
bestGraph->dataMalloc();
|
||||
// Initialize inputs with random data
|
||||
for (auto t : bestGraph->getInputs()) {
|
||||
t->copyData(fuidToInputTensor[t->getFuid()]);
|
||||
}
|
||||
|
||||
// Initialize outputs with zeros
|
||||
for (auto t : bestGraph->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
|
||||
dbg(bestGraph);
|
||||
dbg(bestGraph->getOutputs());
|
||||
|
||||
cuda->run(bestGraph, true); // Tune kernels
|
||||
cuda->run(bestGraph, false); // Execute the transformed graph
|
||||
|
||||
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
|
||||
bgo0->printData();
|
||||
go0->printData();
|
||||
dbg(cuda->getPerfTime(bestGraph, true));
|
||||
|
||||
dbg("Best graph");
|
||||
printGraph(bestGraph);
|
||||
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
|
||||
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// TEST(ModelE2E, InfoGAN) { runInfoGAN(); }
|
||||
|
||||
} // namespace infini
|
|
@ -27,13 +27,16 @@ std::string AsTVMVisitor::visit_(const BinaryOp &c) {
|
|||
}
|
||||
}
|
||||
std::string AsTVMVisitor::visit_(const Func &c) {
|
||||
string nested = dispatch(c->getObject());
|
||||
switch (c->getFuncType()) {
|
||||
case FuncType::Relu:
|
||||
// TODO: Deduce the dtype
|
||||
return "te.max(" + dispatch(c->getObject()) +
|
||||
", tvm.tir.const(0, 'float32'))";
|
||||
return "te.max(" + nested + ", tvm.tir.const(0, 'float32'))";
|
||||
case FuncType::Tanh:
|
||||
return "te.tanh(" + dispatch(c->getObject()) + ")";
|
||||
return "te.tanh(" + nested + ")";
|
||||
case FuncType::PRelu:
|
||||
return "tir.if_then_else(0.0 < " + nested + ", " + nested +
|
||||
", (0.25 * " + nested + "))";
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
|
@ -114,6 +117,11 @@ std::string AsTVMVisitor::visit_(const Subscript &c) {
|
|||
str += " - " +
|
||||
std::to_string(rangeOp->getLoopVarRanges()[i].second.first -
|
||||
rangeOp->getPaddings(i));
|
||||
} else if (c->getObject()->getType() == NodeType::TensorNodeType) {
|
||||
auto tensor = as<TensorNode>(c->getObject());
|
||||
if (auto pad_i = tensor->getPadding(i); pad_i > 0) {
|
||||
str += " + " + std::to_string(pad_i);
|
||||
}
|
||||
}
|
||||
}
|
||||
str += "]";
|
||||
|
@ -138,6 +146,24 @@ std::string AsTVMVisitor::visit_(const Tensor &c) {
|
|||
}
|
||||
stmt += "), name='" + c->getName() + "')";
|
||||
stmts += stmt + "\n";
|
||||
|
||||
if (c->hasPadding()) {
|
||||
std::string name_after_pad = "pad_" + c->getName();
|
||||
pythonVars.emplace_back(name_after_pad);
|
||||
// inputs.emplace_back(name_after_pad);
|
||||
std::string pad_tuple = "(";
|
||||
for (auto pad : c->getPaddings()) {
|
||||
pad_tuple += std::to_string(pad) + ", ";
|
||||
}
|
||||
pad_tuple += ")";
|
||||
|
||||
std::string pad_stmt = name_after_pad + " = " + "topi.nn.pad(" +
|
||||
c->getName() + ", " + pad_tuple + ", " +
|
||||
pad_tuple + ", 0.0, \"" + name_after_pad + "\")";
|
||||
stmts += pad_stmt + "\n";
|
||||
return name_after_pad;
|
||||
}
|
||||
|
||||
return c->getName();
|
||||
}
|
||||
std::string AsTVMVisitor::getStmts() const {
|
||||
@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
|
|||
oss << "==> ROOT\n" << root->toReadable() << "\n";
|
||||
for (size_t i = 0; i < q.size(); ++i) {
|
||||
const auto &[name, routine, tensor] = q[i];
|
||||
oss << "==> " << name << " : ";
|
||||
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
|
||||
<< " : ";
|
||||
if (routine) {
|
||||
oss << routine->toReadable() << "\n";
|
||||
if (routine->getExpr()) {
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
namespace nnet {
|
||||
|
||||
Expr MergeMemboundMutator::merge(bool allowEmptyMembound) {
|
||||
Expr MergeMemboundMutator::merge(bool allowEmptyMembound, bool allowFailure) {
|
||||
// FIXME: fix empty expression in membound
|
||||
assert(kernels.size() >= 1);
|
||||
if (checkEmpty()) {
|
||||
|
@ -27,19 +27,30 @@ Expr MergeMemboundMutator::merge(bool allowEmptyMembound) {
|
|||
assert(CheckOOBVisitor().checkRangeOp(curRangeOp) == false);
|
||||
auto summand = curRangeOp->getSummand();
|
||||
if (auto subscriptOp = as<SubscriptNode>(summand)) {
|
||||
// Try merging the current and next stages
|
||||
if (auto mergedExpr = rule4StageMerging(*curExpr, true)) {
|
||||
// dbg(*curExpr, mergedExpr);
|
||||
*curExpr = mergedExpr;
|
||||
merged = true;
|
||||
break;
|
||||
}
|
||||
// If merging fails, try the next stage
|
||||
curExpr = subscriptOp->getObjectPtr();
|
||||
nnet_assert(*curExpr != nullptr, __LINE__);
|
||||
} else if (auto funcOp = as<FuncNode>(summand)) {
|
||||
// Relu({...}[i,j])
|
||||
curExpr = funcOp->getObject()->getObjectPtr();
|
||||
} else
|
||||
nnet_unimplemented_halt();
|
||||
// If the object of FuncNode is a subscript, like
|
||||
// Relu({...}[i,j]), we can further merge it. Otherwise, like
|
||||
// Relu(A[i]+B[j]), we cannot.
|
||||
if (auto sub = as<SubscriptNode>(funcOp->getObject()))
|
||||
curExpr = sub->getObjectPtr();
|
||||
else
|
||||
break;
|
||||
} else {
|
||||
if (allowFailure)
|
||||
return nullptr;
|
||||
else
|
||||
nnet_unimplemented_halt();
|
||||
}
|
||||
}
|
||||
} while (merged);
|
||||
return expr;
|
||||
|
|
|
@ -73,6 +73,14 @@ string Serializer::visit_(const Tensor &c) {
|
|||
return key;
|
||||
}
|
||||
|
||||
string Serializer::visit_(const Func &c) {
|
||||
const string key = std::to_string(id++);
|
||||
j[key]["type"] = c->getType();
|
||||
j[key]["funcType"] = c->getFuncType();
|
||||
j[key]["object"] = dispatch(c->getObject());
|
||||
return key;
|
||||
}
|
||||
|
||||
bool Serializer::serialize(const Expr &expr, const string &filePath,
|
||||
const string &msg) {
|
||||
// Metadata
|
||||
|
@ -180,6 +188,10 @@ Expr Serializer::buildExprTree(string key) {
|
|||
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
|
||||
j[key]["paddings"], source);
|
||||
}
|
||||
case NodeType::FuncNodeType: {
|
||||
auto object = buildExprTree(j[key]["object"]);
|
||||
return make_ref<FuncNode>(object, j[key]["funcType"]);
|
||||
}
|
||||
default: {
|
||||
nnet_unimplemented_halt();
|
||||
break;
|
||||
|
|
|
@ -153,4 +153,11 @@ HashType HashVisitor::visit_(const Var &c) {
|
|||
return varHash[c];
|
||||
}
|
||||
|
||||
HashType HashVisitor::visit_(const Func &c) {
|
||||
HashType objHash = dispatch(c->getObject());
|
||||
return hash(binPrefix,
|
||||
hash((((HashType)c->getFuncType()) + 10086), objHash));
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
|
@ -1,4 +1,5 @@
|
|||
#include "nnet/expr.h"
|
||||
#include "nnet/Visitor/FullPrinterVisitor.h"
|
||||
#include "nnet/Visitor/GetTensorsVisitor.h"
|
||||
|
||||
namespace nnet {
|
||||
|
@ -90,6 +91,14 @@ size_t TensorNode::getOffset(const vector<int> &idx) {
|
|||
return offset;
|
||||
}
|
||||
|
||||
bool TensorNode::hasPadding() {
|
||||
for (auto pad : paddings) {
|
||||
if (pad > 0)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
string RangeOpNode::toReadable() const {
|
||||
string ret;
|
||||
for (int i = 0; i < IterationType::NumIterationType; ++i) {
|
||||
|
@ -264,10 +273,15 @@ string FuncNode::toReadable() const {
|
|||
ret += "Relu";
|
||||
else if (funcType == FuncType::Tanh)
|
||||
ret += "Tanh";
|
||||
else if (funcType == FuncType::PRelu)
|
||||
ret += "PRelu";
|
||||
else
|
||||
nnet_unimplemented_halt();
|
||||
ret += "( ... " + serializeVec(object->getIndex()) + ")\n {" +
|
||||
object->getObject()->toReadable() + "}";
|
||||
if (auto sub = as<SubscriptNode>(object))
|
||||
ret += "( ... " + serializeVec(sub->getIndex()) + ")\n {" +
|
||||
sub->getObject()->toReadable() + "}";
|
||||
else
|
||||
ret += "(" + object->toReadable() + ")";
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -380,6 +394,7 @@ int64_t TensorNode::getSize() const {
|
|||
size *= len;
|
||||
return size;
|
||||
}
|
||||
|
||||
int RangeOpNode::getPaddings(int dim) const {
|
||||
return dim < (int)paddings.size() ? paddings[dim] : 0;
|
||||
}
|
||||
|
@ -445,8 +460,13 @@ vector<Range> RangeOpNode::getOutputRanges() const {
|
|||
}
|
||||
|
||||
void FuncNode::setObject(Expr e) {
|
||||
object = as<SubscriptNode>(e);
|
||||
nnet_assert(object, "Illegal subscripted object");
|
||||
nnet_assert(e->isScalar(), "FuncNode operates on scalars");
|
||||
object = e;
|
||||
}
|
||||
|
||||
string RangeOpNode::getFullExpression() {
|
||||
FullPrinterVisitor printer;
|
||||
return printer.print(this->shared_from_this());
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
|
|
|
@ -574,7 +574,8 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
|
|||
|
||||
auto subA = makeSubscript(A, {n, x1 + r - 1, y1 + s - 1, f});
|
||||
auto subK =
|
||||
makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
|
||||
// makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
|
||||
makeSubscript(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
|
||||
// x1=(h+1)//2, x2=(h+1)%2, y1=(w+1)//2
|
||||
|
||||
auto range1 = makeRangeOperator(
|
||||
|
|
|
@ -1,21 +1,27 @@
|
|||
#include "nnet/nmutator.h"
|
||||
#include "core/graph.h"
|
||||
#include "ffi/ffi_callback.h"
|
||||
#include "nnet/Visitor/FullPrinterVisitor.h"
|
||||
#include "nnet/Visitor/GetTensorsVisitor.h"
|
||||
#include "nnet/Visitor/MatchReshapeVisitor.h"
|
||||
#include "nnet/Visitor/MergeMemboundMutator.h"
|
||||
#include "nnet/derivator.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/membound.h"
|
||||
#include "operators/reshape.h"
|
||||
#include "operators/unary.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
NMutator::NMutator(Mode mode) : Mutator(10), mode{mode} {
|
||||
IT_ASSERT(mode != Mode::RuleBased, "Use RuleBased in the other ctor.");
|
||||
IT_ASSERT(mode != Mode::RuleBased, "Specify rules for the RuleBased mode.");
|
||||
}
|
||||
|
||||
NMutator::NMutator(const std::vector<int> &derivationRules)
|
||||
: Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {}
|
||||
NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules)
|
||||
: Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {
|
||||
IT_ASSERT(mode == Mode::RuleBased);
|
||||
}
|
||||
|
||||
NMutator::~NMutator() {}
|
||||
|
||||
|
@ -46,9 +52,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
|
|||
assert(computeOps.size() == 1);
|
||||
const auto &computeOp = computeOps[0];
|
||||
auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
|
||||
auto expr = opToExpression(computeOp);
|
||||
nnet::Expr expr = opToExpression(computeOp);
|
||||
auto inputsN = nnet::GetTensorsVisitor().get(expr);
|
||||
dbg(inputsN, expr);
|
||||
// dbg(inputsN, expr);
|
||||
IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
|
||||
"Which one is the second input tensor?");
|
||||
vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};
|
||||
|
@ -69,18 +75,17 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
|
|||
}
|
||||
|
||||
void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
|
||||
IT_TODO_HALT();
|
||||
// OpVec computeOps = in_graph->getComputeOps();
|
||||
OpVec computeOps = in_graph->getComputeOps();
|
||||
IT_ASSERT(computeOps.size() == 1);
|
||||
|
||||
// if (infini::Graph g = transformTConv1x1(computeOps[0])) {
|
||||
// out_graphs.emplace_back(g);
|
||||
// return;
|
||||
// }
|
||||
// // Commented for debug, not implemented yet
|
||||
// // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
|
||||
// // Graph graph = new Graph(g->getOperators());
|
||||
// // out_graphs.emplace_back(graph);
|
||||
// // return;
|
||||
// // }
|
||||
if (Graph g = transformConvtransposed1x1(computeOps[0])) {
|
||||
out_graphs.emplace_back(g);
|
||||
return;
|
||||
}
|
||||
// if (infini::Graph g = transformDialtedConv(computeOps[0])) {
|
||||
// out_graphs.emplace_back(g);
|
||||
// return;
|
||||
|
@ -96,38 +101,35 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
|
|||
// // return;
|
||||
// // }
|
||||
|
||||
// auto expr = opToExpression(computeOps[0]);
|
||||
// if (!expr)
|
||||
// return;
|
||||
auto expr = opToExpression(computeOps[0]);
|
||||
if (!expr)
|
||||
return;
|
||||
|
||||
// nnet::Derivator derivator(maxDepth);
|
||||
// nnet::Formula conv_9x9(expr, 0);
|
||||
// // const std::vector<int> rules{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90}; //
|
||||
// Tconv
|
||||
// // const std::vector<int> rules{1, 7, 7, 2, 8, 6, 6}; // G2BMM
|
||||
// if (mode == Mode::Normal) {
|
||||
// derivator.search(conv_9x9, 0);
|
||||
// } else if (mode == Mode::RuleBased) {
|
||||
// dbg(derivationRules);
|
||||
// derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
|
||||
// } else
|
||||
// nnet_assert(0, "Unknown mode");
|
||||
// const auto &candidates = derivator.getCandidates();
|
||||
nnet::Derivator derivator(maxDepth);
|
||||
nnet::Formula conv_9x9(expr, 0);
|
||||
if (mode == Mode::Normal) {
|
||||
derivator.search(conv_9x9, 0);
|
||||
} else if (mode == Mode::RuleBased) {
|
||||
// dbg(derivationRules);
|
||||
derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
|
||||
} else
|
||||
IT_TODO_HALT_MSG("Unknown NMutator search mode.");
|
||||
const auto &candidates = derivator.getCandidates();
|
||||
// dbg(candidates.size());
|
||||
// // derivator.print();
|
||||
// for (const auto &candidate : candidates) {
|
||||
// // dbg(nnet::FullPrinterVisitor().print(candidate.root));
|
||||
// if (auto g = expressionToGraph(candidate.root, in_graph)) {
|
||||
// out_graphs.emplace_back(g);
|
||||
// }
|
||||
// // break; // HACK:Debug only for the first subgraph
|
||||
// derivator.print();
|
||||
for (const auto &candidate : candidates) {
|
||||
// dbg(nnet::FullPrinterVisitor().print(candidate.root));
|
||||
if (auto g = expressionToGraph(candidate.root, in_graph)) {
|
||||
out_graphs.emplace_back(g);
|
||||
}
|
||||
// break; // HACK:Debug only for the first subgraph
|
||||
}
|
||||
// dbg(out_graphs);
|
||||
// for (auto graph : out_graphs) {
|
||||
// graph->print();
|
||||
// }
|
||||
// // dbg(out_graphs);
|
||||
// // for (auto graph : out_graphs) {
|
||||
// // graph->print();
|
||||
// // }
|
||||
// cntStates += derivator.getNumIntermediateStates();
|
||||
// cntCandidates += derivator.getNumCandidates();
|
||||
cntStates += derivator.getNumIntermediateStates();
|
||||
cntCandidates += derivator.getNumCandidates();
|
||||
}
|
||||
|
||||
void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
|
||||
|
@ -228,31 +230,38 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
|
|||
// }
|
||||
// }
|
||||
|
||||
nnet::Expr NMutator::opToExpression(Operator op) {
|
||||
// IT_TODO_HALT();
|
||||
if (auto convOp = as<ConvObj>(op)) {
|
||||
nnet::Expr NMutator::opToExpression(Operator opT) {
|
||||
auto [expr, mapNameNToTensorT] = extractOp(opT);
|
||||
for (auto &[name, tensorT] : mapNameNToTensorT) {
|
||||
IT_ASSERT(inputsNameNToTensorT.count(name) == 0);
|
||||
inputsNameNToTensorT[name] = tensorT;
|
||||
}
|
||||
return expr;
|
||||
}
|
||||
|
||||
pair<nnet::Expr, NMutator::NameNToTensorT> NMutator::extractOp(Operator opT) {
|
||||
if (auto convOp = as<ConvObj>(opT)) {
|
||||
const auto &inputs = convOp->getInputs();
|
||||
const auto &AT = inputs[0];
|
||||
const auto &KT = inputs[1];
|
||||
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
|
||||
if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
|
||||
return nullptr;
|
||||
return {};
|
||||
assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
|
||||
inputsNameNToTensorT["A"] = AT;
|
||||
inputsNameNToTensorT["K"] = KT;
|
||||
const auto A = nnet::makeTensor("A", AT->getDims(),
|
||||
std::vector<int>{0, 0, ph, pw});
|
||||
const auto K = nnet::makeTensor("K", KT->getDims());
|
||||
return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
|
||||
} else if (auto convOp = as<ConvTransposed2dObj>(op)) {
|
||||
return {nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s),
|
||||
{{"A", AT}, {"K", KT}}};
|
||||
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(opT)) {
|
||||
const auto &AT = convOp->getInputs()[0];
|
||||
const auto &KT = convOp->getInputs()[1];
|
||||
inputsNameNToTensorT["A"] = AT;
|
||||
inputsNameNToTensorT["K"] = KT;
|
||||
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
|
||||
IT_ASSERT_TODO(convOp->getNumGroups() == 1);
|
||||
if (r != 4)
|
||||
return {};
|
||||
IT_ASSERT_TODO(r == 4);
|
||||
IT_ASSERT_TODO(ph == pw);
|
||||
IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
|
||||
|
@ -264,8 +273,9 @@ nnet::Expr NMutator::opToExpression(Operator op) {
|
|||
const auto A = nnet::makeTensor(
|
||||
"A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
|
||||
const auto K = nnet::makeTensor("K", KT->getDims());
|
||||
return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
|
||||
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
|
||||
return {nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s),
|
||||
{{"A", AT}, {"K", KT}}};
|
||||
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(opT)) {
|
||||
// const auto &AT = g2bmmOp->getInputs()[0];
|
||||
// const auto &BT = g2bmmOp->getInputs()[1];
|
||||
// const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();
|
||||
|
@ -275,7 +285,7 @@ nnet::Expr NMutator::opToExpression(Operator op) {
|
|||
// inputsNameNToTensorT[inputsN.first->getName()] = AT;
|
||||
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
|
||||
// return expr;
|
||||
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
|
||||
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(opT)) {
|
||||
// const auto &AT = gbmmlOp->getInputs()[0];
|
||||
// const auto &BT = gbmmlOp->getInputs()[1];
|
||||
// const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();
|
||||
|
@ -285,118 +295,146 @@ nnet::Expr NMutator::opToExpression(Operator op) {
|
|||
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
|
||||
// dbg(b, m, w, k, dilation, expr);
|
||||
// return expr;
|
||||
} else if (auto matmulOp = as<MatmulObj>(op)) {
|
||||
} else if (auto matmulOp = as<MatmulObj>(opT)) {
|
||||
const auto &AT = matmulOp->getInputs()[0];
|
||||
const auto &BT = matmulOp->getInputs()[1];
|
||||
const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
|
||||
const auto &[expr, inputsN] =
|
||||
nnet::MatmulPattern::getExpr(transA, transB, b, m, n, k);
|
||||
inputsNameNToTensorT[inputsN.first->getName()] = AT;
|
||||
inputsNameNToTensorT[inputsN.second->getName()] = BT;
|
||||
// dbg(b, m, n, k, expr);
|
||||
return expr;
|
||||
return {
|
||||
expr,
|
||||
{{inputsN.first->getName(), AT}, {inputsN.second->getName(), BT}}};
|
||||
} else if (auto op = as<MemBoundObj>(opT)) {
|
||||
NameNToTensorT m;
|
||||
for (int i = 0; i < op->numInputs(); ++i)
|
||||
m[op->getNnetInputs()[i]->getName()] = opT->getInputs()[i];
|
||||
return {op->getNnetExpr(), m};
|
||||
} else if (opT->getOpType() == OpType::Relu ||
|
||||
opT->getOpType() == OpType::Tanh) {
|
||||
return generateUnaryExpr(opT);
|
||||
}
|
||||
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(op)) {
|
||||
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(opT)) {
|
||||
// // return transposeOpToExpression(transposeOp);
|
||||
// // }
|
||||
nnet_unimplemented_continue();
|
||||
return nullptr;
|
||||
IT_TODO_HALT_MSG("Cannot convert " + opT->toString() +
|
||||
" to an NNet expression");
|
||||
return {};
|
||||
}
|
||||
|
||||
infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
|
||||
IT_TODO_HALT();
|
||||
// auto g = make_ref<GraphObj>();
|
||||
// nnet::FullPrinterVisitor fullVisitor;
|
||||
// const auto &tensorQueueN = fullVisitor.traverse(expr);
|
||||
// // Build tensors: Skip the first one, which is output
|
||||
// auto nameNToTensorT = inputsNameNToTensorT;
|
||||
// for (size_t i = 1; i < tensorQueueN.size(); ++i) {
|
||||
// const auto &[nameN, routineN, tensorN] = tensorQueueN[i];
|
||||
// // dbg(nameN, routineN, tensorN);
|
||||
// if (!routineN) {
|
||||
// // This is an inputs
|
||||
// assert(nameNToTensorT.count(nameN));
|
||||
// } else {
|
||||
// assert(!nameNToTensorT.count(nameN));
|
||||
// nameNToTensorT[nameN] = g->addTensor(tensorN->getShape());
|
||||
// }
|
||||
// }
|
||||
// const auto &outputsPET = in_graph->getOutputs();
|
||||
// if (outputsPET.size() != 1) {
|
||||
// nnet_unimplemented_continue();
|
||||
// return nullptr;
|
||||
// }
|
||||
// nameNToTensorT[std::get<0>(tensorQueueN.at(0))] = outputsPET[0];
|
||||
// // Build computation graph in PET:
|
||||
// for (int i = tensorQueueN.size() - 1; i >= 0; --i) {
|
||||
// const auto &[outputNameN, routineN, tensorN] = tensorQueueN[i];
|
||||
// if (!routineN)
|
||||
// continue;
|
||||
// // dbg(outputNameN, routineN, tensorN, routineN->getType());
|
||||
// if (auto op = nnet::as<nnet::ConvNode>(routineN)) {
|
||||
// // g->conv(i8, w9, 2, 2);
|
||||
// std::vector<nnet::Tensor> inputsN = op->getInputs();
|
||||
// auto A = nameNToTensorT.at(inputsN[0]->getName());
|
||||
// auto K = nameNToTensorT.at(inputsN[1]->getName());
|
||||
// auto output = nameNToTensorT.at(outputNameN);
|
||||
// const auto &[ph, pw, sh, sw, dh, dw] = op->getArgs();
|
||||
// g->conv(A, K, output, ph, pw, sh, sw, dh, dw);
|
||||
// } else if (auto op = nnet::as<nnet::ElementWiseNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 1);
|
||||
// nnet::MatchReshapeVisitor matchReshapeVisitor;
|
||||
// if (matchReshapeVisitor(op->getExpr())) {
|
||||
// auto input =
|
||||
// nameNToTensorT.at(op->getInputs().at(0)->getName());
|
||||
// auto output = nameNToTensorT.at(outputNameN);
|
||||
// g->reshape(input, output);
|
||||
// } else {
|
||||
// TensorVec inputsPET;
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// for (const auto &inputN : op->getInputs())
|
||||
// inputsPET.emplace_back(
|
||||
// nameNToTensorT.at(inputN->getName()));
|
||||
// // Re-estimate time here.
|
||||
// ssize_t cnt = 0;
|
||||
// for (const auto tensor : inputsPET)
|
||||
// cnt += tensor->size();
|
||||
// for (const auto tensor : outputsPET)
|
||||
// cnt += tensor->size();
|
||||
// g->membound(inputsPET, outputsPET, op->getInputs(),
|
||||
// op->getExpr(), memboundTime(cnt));
|
||||
// }
|
||||
// } else if (auto op = nnet::as<nnet::MatmulNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 2);
|
||||
// nnet::Tensor AN = op->getInputs()[0];
|
||||
// nnet::Tensor BN = op->getInputs()[1];
|
||||
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
// nameNToTensorT.at(BN->getName())};
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// const auto &[b, m, n, k, transa, transb] = op->getArgs();
|
||||
// g->matmul(inputsPET[0], inputsPET[1], outputsPET[0], transa,
|
||||
// transb);
|
||||
// } else if (auto op = nnet::as<nnet::G2bmmNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 2);
|
||||
// nnet::Tensor AN = op->getInputs()[0];
|
||||
// nnet::Tensor BN = op->getInputs()[1];
|
||||
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
// nameNToTensorT.at(BN->getName())};
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// const auto &[b, m, w, k, dilation] = op->getArgs();
|
||||
// g->g2bmm(inputsPET[0], inputsPET[1], outputsPET[0], w, dilation);
|
||||
// } else if (auto op = nnet::as<nnet::GbmmNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 2);
|
||||
// nnet::Tensor AN = op->getInputs()[0];
|
||||
// nnet::Tensor BN = op->getInputs()[1];
|
||||
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
// nameNToTensorT.at(BN->getName())};
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// const auto &[b, m, w, n, dilation] = op->getArgs();
|
||||
// g->gbmml(inputsPET[0], inputsPET[1], outputsPET[0], dilation);
|
||||
// }
|
||||
// }
|
||||
// g->updateConnection();
|
||||
// Graph graph = new Graph(g->getOperators());
|
||||
// return graph;
|
||||
auto g = make_ref<GraphObj>(runtime);
|
||||
nnet::FullPrinterVisitor fullVisitor;
|
||||
// Get tensors in the reversed topological order
|
||||
const auto &tensorQueueN = fullVisitor.traverse(expr);
|
||||
// dbg(fullVisitor.print(expr));
|
||||
|
||||
// Build a map: name in nnet -> tensors in infini
|
||||
// Add input tensors to the map
|
||||
std::map<std::string, Tensor> nameNToTensorT;
|
||||
for (const auto &[k, v] : inputsNameNToTensorT)
|
||||
nameNToTensorT[k] = g->cloneTensor(v);
|
||||
|
||||
// Add output tensors to the map
|
||||
const auto &outputsT = in_graph->getOutputs();
|
||||
if (outputsT.size() != 1) {
|
||||
nnet_unimplemented_continue();
|
||||
return nullptr;
|
||||
}
|
||||
nameNToTensorT[std::get<0>(tensorQueueN.at(0))] =
|
||||
g->cloneTensor(outputsT[0]);
|
||||
// Skip the first tensor, which is the output and was created by cloning above
|
||||
for (size_t i = 1; i < tensorQueueN.size(); ++i) {
|
||||
const auto &[nameN, routineN, tensorN] = tensorQueueN[i];
|
||||
// dbg(nameN, routineN, tensorN);
|
||||
if (!routineN) {
|
||||
// This tensor is an input since it is not constructed by a routine
|
||||
IT_ASSERT(nameNToTensorT.count(nameN),
|
||||
"Missing an input tensor in graph or a rountine for this "
|
||||
"tensor.");
|
||||
} else { // this tensor is an intermediate result
|
||||
IT_ASSERT(!nameNToTensorT.count(nameN),
|
||||
"An NNET tensor appears twice or it is an input tensor "
|
||||
"with routine specified.");
|
||||
nameNToTensorT[nameN] = g->addTensor(tensorN->getShape());
|
||||
}
|
||||
}
|
||||
|
||||
// Build computation graph in InfiniTensor
|
||||
for (int i = tensorQueueN.size() - 1; i >= 0; --i) {
|
||||
const auto &[outputNameN, routineN, tensorN] = tensorQueueN[i];
|
||||
if (!routineN)
|
||||
continue;
|
||||
// dbg(outputNameN, routineN, tensorN, routineN->getType());
|
||||
if (auto op = nnet::as<nnet::ConvNode>(routineN)) {
|
||||
std::vector<nnet::Tensor> inputsN = op->getInputs();
|
||||
auto A = nameNToTensorT.at(inputsN[0]->getName());
|
||||
auto K = nameNToTensorT.at(inputsN[1]->getName());
|
||||
auto output = nameNToTensorT.at(outputNameN);
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = op->getArgs();
|
||||
g->addOpWithOutputs<ConvObj>(A, K, output, ph, pw, sh, sw, dh, dw);
|
||||
} else if (auto op = nnet::as<nnet::ElementWiseNode>(routineN)) {
|
||||
assert(op->getInputs().size() == 1);
|
||||
nnet::MatchReshapeVisitor matchReshapeVisitor;
|
||||
// If this routine only changes the shape, translate it to a Reshape
|
||||
if (matchReshapeVisitor(op->getExpr())) {
|
||||
auto input =
|
||||
nameNToTensorT.at(op->getInputs().at(0)->getName());
|
||||
auto output = nameNToTensorT.at(outputNameN);
|
||||
g->addOpWithOutputs<ReshapeObj>(input, output,
|
||||
output->getDims());
|
||||
} else {
|
||||
TensorVec inputsPET;
|
||||
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
for (const auto &inputN : op->getInputs())
|
||||
inputsPET.emplace_back(
|
||||
nameNToTensorT.at(inputN->getName()));
|
||||
// Re-estimate time here.
|
||||
ssize_t cnt = 0;
|
||||
for (const auto &tensor : inputsPET)
|
||||
cnt += tensor->size();
|
||||
for (const auto &tensor : outputsPET)
|
||||
cnt += tensor->size();
|
||||
// dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
|
||||
// memboundTime(cnt));
|
||||
g->addOpWithOutputs<MemBoundObj>(inputsPET, outputsPET,
|
||||
op->getInputs(), op->getExpr(),
|
||||
memboundTime(cnt));
|
||||
}
|
||||
} else if (auto op = nnet::as<nnet::MatmulNode>(routineN)) {
|
||||
assert(op->getInputs().size() == 2);
|
||||
nnet::Tensor AN = op->getInputs()[0];
|
||||
nnet::Tensor BN = op->getInputs()[1];
|
||||
TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
nameNToTensorT.at(BN->getName())};
|
||||
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
const auto &[b, m, n, k, transa, transb] = op->getArgs();
|
||||
g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
|
||||
outputsPET[0], transa, transb);
|
||||
}
|
||||
// TODO
|
||||
// else if (auto op = nnet::as<nnet::G2bmmNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 2);
|
||||
// nnet::Tensor AN = op->getInputs()[0];
|
||||
// nnet::Tensor BN = op->getInputs()[1];
|
||||
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
// nameNToTensorT.at(BN->getName())};
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// const auto &[b, m, w, k, dilation] = op->getArgs();
|
||||
// g->g2bmm(inputsPET[0], inputsPET[1], outputsPET[0], w, dilation);
|
||||
// } else if (auto op = nnet::as<nnet::GbmmNode>(routineN)) {
|
||||
// assert(op->getInputs().size() == 2);
|
||||
// nnet::Tensor AN = op->getInputs()[0];
|
||||
// nnet::Tensor BN = op->getInputs()[1];
|
||||
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
|
||||
// nameNToTensorT.at(BN->getName())};
|
||||
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
|
||||
// const auto &[b, m, w, n, dilation] = op->getArgs();
|
||||
// g->gbmml(inputsPET[0], inputsPET[1], outputsPET[0], dilation);
|
||||
// }
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
double NMutator::memboundTime(ssize_t cnt) {
|
||||
|
@ -475,43 +513,82 @@ double NMutator::memboundTime(const Shape &dims) {
|
|||
// return nullptr;
|
||||
// }
|
||||
|
||||
// Graph NMutator::transformTConv3x3(Operator op) {
|
||||
// if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
|
||||
// dbg(tconvOp->getInputs()[1]->getDims());
|
||||
// if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
|
||||
// tconvOp->getInputs()[1]->getDims()[0] == 3 &&
|
||||
// tconvOp->getInputs()[1]->getDims()[1] == 3) {
|
||||
// auto g = new infini::Graph();
|
||||
// auto inputDims = tconvOp->getInputs(0)->getDims();
|
||||
// auto weightDims = tconvOp->getInputs(1)->getDims();
|
||||
// auto outputDims = tconvOp->getOutput()->getDims();
|
||||
// // NHWF
|
||||
// auto newA = g->tensor(
|
||||
// {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
|
||||
// // RSFC
|
||||
// auto newW = g->tensor(
|
||||
// {weightDims[0] * weightDims[1] * weightDims[3],
|
||||
// weightDims[2]});
|
||||
// auto newO =
|
||||
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
Graph NMutator::transformConvtransposed1x1(Operator _op) {
|
||||
auto op = as<ConvTransposed2dNHWCObj>(_op);
|
||||
if (!op)
|
||||
return nullptr;
|
||||
const auto &A = op->getInputs()[0];
|
||||
const auto &W = op->getInputs()[1];
|
||||
const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
const Shape inputDims = op->getInputs(0)->getDims();
|
||||
const Shape weightDims = op->getInputs(1)->getDims();
|
||||
const Shape outputDims = op->getOutput()->getDims();
|
||||
const DataType dtype = A->getDType();
|
||||
IT_ASSERT_TODO(op->getNumGroups() == 1);
|
||||
if (h != 1 || w != 1)
|
||||
return {};
|
||||
IT_ASSERT_TODO(ph == pw);
|
||||
IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
|
||||
IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
|
||||
auto g = make_ref<GraphObj>(runtime);
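// With a 1x1 kernel and unit strides, the transposed convolution degenerates
// into a matmul over the flattened spatial positions: reshape A (NHWF) to
// (n*h*w, f), reshape W (FRSC) to (f, r*s*c), multiply, and reshape the
// product back to the operator's original output shape.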
|
||||
// NHWF
|
||||
auto newA = g->addTensor(
|
||||
{inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
|
||||
// FRSC
|
||||
auto newW = g->addTensor(
|
||||
{weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
|
||||
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
|
||||
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
|
||||
Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
|
||||
g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
|
||||
op->getOutput()->getDims());
|
||||
return g;
|
||||
}
|
||||
|
||||
// Graph NMutator::transformConvtransposed(Operator _op) {
|
||||
// auto op = as<ConvTransposed2dNHWCObj>(_op);
|
||||
// if (!op)
|
||||
// return nullptr;
|
||||
// const auto &AT = op->getInputs()[0];
|
||||
// const auto &KT = op->getInputs()[1];
|
||||
// const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
// const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
// IT_ASSERT_TODO(op->getNumGroups() == 1);
|
||||
// if (r != 4)
|
||||
// return {};
|
||||
// IT_ASSERT_TODO(ph == pw);
|
||||
// IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
|
||||
// IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
|
||||
|
||||
// auto g = make_ref<Graph>();
|
||||
// // TODO: implement transformation rules
|
||||
// // How to efficiently write an expression...
|
||||
// auto inputDims = op->getInputs(0)->getDims();
|
||||
// auto weightDims = op->getInputs(1)->getDims();
|
||||
// auto outputDims = op->getOutput()->getDims();
|
||||
// // NHWF
|
||||
// auto newA =
|
||||
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
// inputDims[3]});
|
||||
// // RSFC
|
||||
// auto newW = g->tensor(
|
||||
// {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
|
||||
// auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
|
||||
// weightDims[0] * weightDims[1] * weightDims[3]});
|
||||
// g->reshape(tconvOp->getInputs(0), newA);
|
||||
// g->reshape(tconvOp->getInputs(1), newW);
|
||||
// g->matmul(newA, newW, newO, 0, 1);
|
||||
// // g->reshape(newO, tconvOp->getOutput());
|
||||
// tconvOp->print();
|
||||
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
|
||||
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
|
||||
// g->membound(
|
||||
// {newO}, {tconvOp->getOutput()}, {}, nullptr,
|
||||
// g->reshape(op->getInputs(0), newA);
|
||||
// g->reshape(op->getInputs(1), newW);
|
||||
// g->matmul(newA, newW, newO, 0, 1);
|
||||
// // g->reshape(newO, tconvOp->getOutput());
|
||||
// tconvOp->print();
|
||||
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
|
||||
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
|
||||
// g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
|
||||
// memboundTime(newO->size() + tconvOp->getOutput()->size()),
|
||||
// "TConv3x3 reduce");
|
||||
// g->updateConnection();
|
||||
// Graph graph = new Graph(g->getOperators());
|
||||
// return graph;
|
||||
// }
|
||||
// }
|
||||
// return nullptr;
|
||||
// g->updateConnection();
|
||||
// Graph graph = new Graph(g->getOperators());
|
||||
// return graph;
|
||||
// }
|
||||
|
||||
// Graph NMutator::transformTConv1x1(Operator op) {
|
||||
|
@ -607,4 +684,95 @@ double NMutator::memboundTime(const Shape &dims) {
|
|||
// return graph;
|
||||
// }
|
||||
|
||||
Graph NMutator::fuseVertically(const Graph &inputGraph) {
|
||||
Graph optGraph = make_ref<GraphObj>(runtime);
|
||||
|
||||
auto chainOps = inputGraph->getOperators();
|
||||
IT_ASSERT(!chainOps.empty());
|
||||
for (auto &op : chainOps) {
|
||||
IT_ASSERT(op->isMemBoundOp());
|
||||
IT_ASSERT_TODO(op->getInputs().size() == 1);
|
||||
IT_ASSERT(op->getOutputs().size() == 1);
|
||||
}
|
||||
if (chainOps.size() == 1) {
|
||||
return make_ref<GraphObj>(runtime, chainOps);
|
||||
}
|
||||
std::vector<nnet::Expr> exprs;
|
||||
for (const auto &op : chainOps) {
|
||||
auto [expr, _] = extractOp(op);
|
||||
exprs.emplace_back(expr);
|
||||
// dbg(op, infini::as<nnet::RangeOpNode>(expr)->getFullExpression());
|
||||
}
|
||||
// double maxTime = getMaxPerf(std::make_shared<SubGraph>(chainOps));
|
||||
// Fuse a MemboundOp chain
|
||||
auto expr = nnet::MergeMemboundMutator(exprs).merge(true);
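// merge(true) allows empty membound expressions in the chain. The merged
// expression spans the whole chain, so below it is emitted as a single
// MemBound op that reads the first op's inputs and writes the last op's
// outputs.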
|
||||
auto inputNMap = nnet::GetTensorsVisitor().get(exprs.front());
|
||||
IT_ASSERT(inputNMap.size() == 1);
|
||||
vector<nnet::Tensor> inputsN;
|
||||
for (const auto &[name, t] : inputNMap) {
|
||||
inputsN.emplace_back(t);
|
||||
}
|
||||
optGraph->addOpWithOutputs<MemBoundObj>(chainOps.front()->getInputs(),
|
||||
chainOps.back()->getOutputs(),
|
||||
inputsN, expr, 0);
|
||||
// TODO: set time
|
||||
return optGraph;
|
||||
}
|
||||
|
||||
pair<nnet::Expr, NMutator::NameNToTensorT>
|
||||
NMutator::generateUnaryExpr(const Operator &op) {
|
||||
using namespace nnet;
|
||||
const map<OpType, nnet::FuncType> opTToFuncN = {
|
||||
{OpType::PRelu, nnet::FuncType::PRelu},
|
||||
{OpType::Relu, nnet::FuncType::Relu},
|
||||
{OpType::Tanh, nnet::FuncType::Tanh}};
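// The generated expression is an elementwise range over the input shape,
// e.g. for a 2-D input roughly: range over (i0, i1) of Relu(T[i0, i1]).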
|
||||
Shape shape = op->getInputs()[0]->getDims();
|
||||
nnet::FuncType type = opTToFuncN.at(op->getOpType());
|
||||
auto T = make_ref<TensorNode>("T", shape);
|
||||
VecExpr indices;
|
||||
for (size_t i = 0; i < shape.size(); ++i) {
|
||||
indices.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
|
||||
}
|
||||
auto sub = makeSubscript(T, indices);
|
||||
auto func = nnet::make_ref<FuncNode>(sub, type);
|
||||
vector<VarRangePair> varRanges;
|
||||
for (size_t i = 0; i < shape.size(); ++i) {
|
||||
varRanges.emplace_back(nnet::as<VarNode>(indices[i]),
|
||||
Range{0, shape[i]});
|
||||
}
|
||||
return {makeRangeOperator(varRanges, {}, func),
|
||||
NameNToTensorT{{"T", op->getInputs()[0]}}};
|
||||
}
|
||||
|
||||
void NMutator::memboundToJson(const Graph &g, const string path) {
|
||||
for (auto &_op : g->getOperators()) {
|
||||
if (auto op = as<MemBoundObj>(_op)) {
|
||||
op->saveAsJson(path + "/" + "membound_" +
|
||||
std::to_string(op->getGuid()) + ".json");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pair<nnet::Expr, vector<nnet::Tensor>> NMutator::generateRevert(Tensor in) {
|
||||
using namespace nnet;
|
||||
using infini::make_ref;
|
||||
const Shape &orignalShape = in->getDims();
|
||||
auto tensor = makeTensor("T", in->getDims());
|
||||
VecExpr iters;
|
||||
for (size_t i = 0; i < orignalShape.size(); ++i) {
|
||||
iters.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
|
||||
}
|
||||
|
||||
Shape newShape = orignalShape;
|
||||
std::reverse(newShape.begin(), newShape.end());
|
||||
auto sub = makeSubscript(tensor, iters);
|
||||
vector<VarRangePair> loopIters;
|
||||
for (int i = orignalShape.size() - 1; i >= 0; --i) {
|
||||
loopIters.emplace_back(infini::as<VarNode>(iters[i]),
|
||||
Range{0, orignalShape[i]});
|
||||
}
|
||||
auto range = makeRangeOperator(loopIters, {}, sub);
|
||||
return {range, {tensor}};
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -5,22 +5,24 @@ namespace infini {
|
|||
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
|
||||
bool transB, [[maybe_unused]] Tensor bias, ActType act)
|
||||
: OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
|
||||
act(act), b(1) {
|
||||
act(act) {
|
||||
auto shape_a = A->getDims();
|
||||
auto shape_b = B->getDims();
|
||||
IT_ASSERT(shape_a.size() == shape_b.size());
|
||||
switch (shape_a.size()) {
|
||||
case 0:
|
||||
case 1:
|
||||
IT_ASSERT(false);
|
||||
case 2:
|
||||
break;
|
||||
default:
|
||||
int dimA = shape_a.size(), dimB = shape_b.size();
|
||||
IT_ASSERT(dimA >= 2 && dimB >= 2);
|
||||
|
||||
b = 1;
|
||||
if (dimA <= 3 && dimB <= 3) {
|
||||
int b1 = dimA == 2 ? 1 : A->getDims()[0];
|
||||
int b2 = dimB == 2 ? 1 : B->getDims()[0];
|
||||
|
||||
b = std::max(b1, b2);
|
||||
} else {
|
||||
IT_ASSERT_TODO(dimA == dimB);
|
||||
for (size_t i = 0; i < shape_a.size() - 2; ++i) {
|
||||
IT_ASSERT(shape_a[i] == shape_b[i]);
|
||||
IT_ASSERT_TODO(shape_a[i] == shape_b[i]);
|
||||
b *= shape_a[i];
|
||||
}
|
||||
break;
|
||||
}
|
||||
m = *(transA ? shape_a.rbegin() : shape_a.rbegin() + 1);
|
||||
n = *(transB ? shape_b.rbegin() + 1 : shape_b.rbegin());
|
||||
|
@ -38,11 +40,44 @@ string MatmulObj::toString() const {
|
|||
}
|
||||
|
||||
optional<vector<Shape>> MatmulObj::inferShape(const TensorVec &inputs) const {
|
||||
auto shape_a = inputs[0]->getDims();
|
||||
auto it = shape_a.rbegin();
|
||||
*it++ = n;
|
||||
*it++ = m;
|
||||
return {{std::move(shape_a)}};
|
||||
auto A = inputs[0], B = inputs[1];
|
||||
int dimA = A->getDims().size(), dimB = B->getDims().size();
|
||||
|
||||
if (dimA > 3 || dimB > 3) {
|
||||
// no broadcast
|
||||
auto shape_a = inputs[0]->getDims();
|
||||
auto it = shape_a.rbegin();
|
||||
*it++ = n;
|
||||
*it++ = m;
|
||||
return {{std::move(shape_a)}};
|
||||
}
|
||||
|
||||
int b1 = dimA == 2 ? 1 : A->getDims()[0];
|
||||
int b2 = dimB == 2 ? 1 : B->getDims()[0];
|
||||
|
||||
int b = std::max(b1, b2);
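// Batch broadcasting as implemented here: a 2-D input contributes a batch of
// 1, which broadcasts against the other input's batch, e.g. {4,3,5} x {5,2}
// yields b = 4 and an output shape of {4, 3, 2}.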
|
||||
int m = transA ? A->getDims()[dimA - 1] : A->getDims()[dimA - 2];
|
||||
int n = transB ? B->getDims()[dimB - 2] : B->getDims()[dimB - 1];
|
||||
int kA = transA ? A->getDims()[dimA - 2] : A->getDims()[dimA - 1];
|
||||
int kB = transB ? B->getDims()[dimB - 1] : B->getDims()[dimB - 2];
|
||||
|
||||
if ((dimA != 2 && dimA != 3) || (dimB != 2 && dimB != 3)) {
|
||||
printf("Bad input dim: dimA = %d, dimB = %d\n", dimA, dimB);
|
||||
return {};
|
||||
}
|
||||
if (b1 != 1 && b2 != 1 && b1 != b2) {
|
||||
printf("Bad batch size b1 = %d, b2 = %d\n", b1, b2);
|
||||
return {};
|
||||
}
|
||||
if (kA != kB) {
|
||||
printf("Bad K: kA = %d, kB = %d\n", kA, kB);
|
||||
return {};
|
||||
}
|
||||
if (dimA == 2 && dimB == 2) {
|
||||
return {{{m, n}}};
|
||||
} else {
|
||||
return {{{b, m, n}}};
|
||||
}
|
||||
}
|
||||
|
||||
vector<int> MatmulObj::getWorkloadVector() const {
|
||||
@ -1,5 +1,8 @@
|
|||
#include "operators/membound.h"
|
||||
#include "nnet/Visitor/CheckOOBVisitor.h"
|
||||
#include "nnet/Visitor/HashVisitor.h"
|
||||
#include "nnet/Visitor/MergeMemboundMutator.h"
|
||||
#include "nnet/Visitor/Serializer.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -10,6 +13,19 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
|
|||
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
|
||||
expr(expr), exec_time(exec_time), hint(hint) {
|
||||
IT_ASSERT(checkValid(graph));
|
||||
IT_ASSERT(!checkOOB(expr));
|
||||
hash = calcHash(expr);
|
||||
|
||||
// fuse stages in nnet expr to reduce kernels generated by TVM
|
||||
if (auto mergedExpr =
|
||||
nnet::MergeMemboundMutator({expr}).merge(false, true)) {
|
||||
simplifiedExpr = mergedExpr;
|
||||
IT_ASSERT(!checkOOB(simplifiedExpr));
|
||||
simplifiedHash = calcHash(simplifiedExpr);
|
||||
} else {
|
||||
simplifiedExpr = expr;
|
||||
simplifiedHash = hash;
|
||||
}
|
||||
}
|
||||
|
||||
string MemBoundObj::toString() const {
|
||||
|
@ -30,9 +46,16 @@ string MemBoundObj::toString() const {
|
|||
os << "exec_time=" << exec_time << ", ";
|
||||
os << "NNet Inputs=[";
|
||||
for (const auto &tensor : nnetInputs)
|
||||
os << tensor->toReadable() << ",";
|
||||
os << "])";
|
||||
os << "\n" << (expr ? expr->toReadable() : "Empty expression") << "\n";
|
||||
os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
|
||||
os << "]";
|
||||
os << ", ExprHash=" << hash;
|
||||
os << ", SimplifiedExprHash=" << simplifiedHash;
|
||||
os << ")\n";
|
||||
os << ">>> Original expr\n"
|
||||
<< (expr ? expr->toReadable() : "Empty expression") << "\n";
|
||||
os << ">>> Simplified expr\n"
|
||||
<< (simplifiedExpr ? simplifiedExpr->toReadable() : "Empty expression")
|
||||
<< "\n";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
@ -47,13 +70,23 @@ optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
|
|||
}
|
||||
|
||||
vector<int> MemBoundObj::getWorkloadVector() const {
|
||||
return {enum_to_underlying(type), (int)getHash()};
|
||||
return {enum_to_underlying(type), (int)simplifiedHash};
|
||||
}
|
||||
|
||||
vector<int> MemBoundObj::getOpAttrVector() const { return getWorkloadVector(); }
|
||||
|
||||
HashType MemBoundObj::getHash() const {
|
||||
HashType MemBoundObj::calcHash(nnet::Expr expr) {
|
||||
return nnet::HashVisitor().dispatch(expr);
|
||||
}
|
||||
|
||||
bool MemBoundObj::checkOOB(nnet::Expr expr) {
|
||||
return nnet::CheckOOBVisitor().checkRangeOp(
|
||||
nnet::as<nnet::RangeOpNode>(expr));
|
||||
}
|
||||
|
||||
void MemBoundObj::saveAsJson(string path) const {
|
||||
bool status = nnet::Serializer().serialize(expr, path);
|
||||
IT_ASSERT(status);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -8,7 +8,7 @@ namespace infini {
|
|||
TEST(Hash, OperatorHash) {
|
||||
OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
|
||||
{ // build with addOpWithOutputs
|
||||
Graph g = make_ref<GraphObj>(nullptr);
|
||||
Graph g = make_ref<GraphObj>(NativeCpuRuntimeObj::getInstance());
|
||||
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
|
||||
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
|
||||
Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
|
||||
|
@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) {
|
|||
EXPECT_GT(key1.attrs.size(), (size_t)5);
|
||||
}
|
||||
{ // build with addOp
|
||||
Graph g = make_ref<GraphObj>(nullptr);
|
||||
Graph g = make_ref<GraphObj>(NativeCpuRuntimeObj::getInstance());
|
||||
Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
|
||||
Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
|
||||
auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);
|
||||
|
|
|
@@ -1,4 +1,3 @@

#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"

@@ -51,26 +50,38 @@ TEST(cuBLAS_Matmul, run) {
                   Shape{2, 3, 4}, Shape{2, 3, 2},
                   ExpectOutput{40, 52, 46, 61, 52, 70, 58, 79, 400, 448, 424,
                                475, 448, 502, 472, 529});
    testMatmulCuda(
        IncrementalGenerator(), IncrementalGenerator(), false, false,
        Shape{2, 3, 5}, Shape{5, 2},
        ExpectOutput{60, 70, 160, 195, 260, 320, 360, 445, 460, 570, 560, 695});
    testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true, false,
                   Shape{2, 5, 3}, Shape{5, 2},
                   ExpectOutput{180, 210, 200, 235, 220, 260, 480, 585, 500,
                                610, 520, 635});
    testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), false, false,
                   Shape{3, 5}, Shape{5, 2},
                   ExpectOutput{60, 70, 160, 195, 260, 320});
}

TEST(cuBLAS_Matmul, tune) {
    auto cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpuRuntime);
    auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32);
    auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32);
    gCpu->dataMalloc();
    ACpu->setData(IncrementalGenerator());
    BCpu->setData(IncrementalGenerator());

    // Matmul([A^T,B,act=0],A=597,B=595,C=598,bmnk=[1,4,4096,448])
    const int B = 1, M = 4, N = 4096, K = 448;
    const bool transA = true, transB = false;
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    auto gCuda = make_ref<GraphObj>(cudaRuntime);
    auto ACuda = gCuda->cloneTensor(ACpu);
    auto BCuda = gCuda->cloneTensor(BCpu);
    auto matmul = gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr);

    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto a = g->addTensor(transA ? Shape{B, K, M} : Shape{B, M, K});
    auto b = g->addTensor(transB ? Shape{B, N, K} : Shape{B, K, N});
    // allocate CUDA memory
    gCuda->dataMalloc();
    cudaRuntime->run(gCuda, true);
    g->dataMalloc();
    a->setData(IncrementalGenerator());
    b->setData(IncrementalGenerator());

    auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
    matmul->print();
    double time = cudaRuntime->getPerfTime(g);
    EXPECT_GT(time, 1e-3);
    EXPECT_LT(time, 1);
    cudaRuntime->run(g, true);
}

}; // namespace infini

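A quick sanity check on the expected values above: the final testMatmulCuda call multiplies a 3x5 matrix by a 5x2 matrix, both filled row-major with 0, 1, 2, ... by IncrementalGenerator, so the product can be reproduced with a few lines of NumPy (a standalone sketch, not part of the test suite):

import numpy as np

# IncrementalGenerator fills tensors row-major with 0, 1, 2, ...
a = np.arange(15, dtype=np.float32).reshape(3, 5)
b = np.arange(10, dtype=np.float32).reshape(5, 2)
print((a @ b).flatten())  # [ 60.  70. 160. 195. 260. 320.]

The batched {2, 3, 5} x {5, 2} case starts with the same six values because its first batch sees exactly this data, with the {5, 2} operand broadcast over the batch dimension.
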
@@ -0,0 +1,92 @@
import onnx
import torch
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import infinitensor as ft


def to_pytorch_tensor(tensor) -> torch.Tensor:
    data = tensor.copyout_float()
    tt = torch.tensor(data)
    return tt.reshape(tensor.shape())


def run_InfoGAN_return_tensor(n_layers: int):
    if_tensors = ft.runInfoGAN(n_layers)
    tensors = [to_pytorch_tensor(t) for t in if_tensors]
    return tensors


def read_and_check():
    for n_layers in range(1, 6):
        ans = torch.load(f'torch_{n_layers}layers_0.pt')
        x = torch.load(f'torch_{n_layers}layers_1.pt')
        print(f'=== {n_layers} layers ===')
        print(x.abs().max())


def run_e2e_InfoGAN():
    data = []
    for n_layers in range(5, 6):
        tensors = run_InfoGAN_return_tensor(n_layers)
        for i, t in enumerate(tensors):
            torch.save(t, f'torch_{n_layers}layers_{i}.pt')
        print(f'============ {n_layers} layers = = =')
        ans, x = tensors
        print(f'Allclose {torch.allclose(ans, x)}')

        # Print error numbers
        tot = np.product(ans.shape)
        data.append([])
        for i in range(0, 10):
            tol = 10**(-i)
            clo = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
            print(f'0.1^{i} close: {clo}/{tot} = {clo/tot}')
            data[-1].append(clo/tot)

        rel_err = torch.abs((ans-x)/ans)
        print(rel_err, rel_err.max())
        print(f'ans = {ans}')
        print(f'x = {x}')

        # # Plot CDF
        # fig, axes = plt.subplots(9,1)
        # print(axes)
        # for i, ax in enumerate(axes):
        #     print(i)
        #     ax:plt.Axes
        #     ax.hist(torch.flatten(rel_err), density=True, cumulative=True, label='CDF',
        #             histtype='step', alpha=0.8, color='k')
        #     ax.set_xlim(0, 10**(-i))
        #     # ax.set_title('')
        # plt.show()
        # plt.savefig('a.pdf')
        df = pd.DataFrame(data)
        print(df.to_string())
        df.set_axis([f'0.1^{i}' for i in range(0, 10)], axis=1, inplace=True)
        print(df.to_string())
        df.to_csv('a.csv')


def runSingleConvT():
    runtime = ft.cuda_runtime()
    g = ft.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)
    opt_g = ft.optimizeGraph(g, runtime)
    ft.if_onnx.export_onnx(opt_g, 'convtransposed.onnx')


def run_InfoGAN_without_tuning(tuning: bool):
    runtime = ft.cuda_runtime()
    g = ft.getInfoGAN(1, runtime, 5)
    # g = ft.getInfoGAN(1, runtime, 1)
    opt_g = ft.optimizeGraph(g, runtime, tuning)
    ft.if_onnx.export_onnx(opt_g, 'infogan_transformed.onnx')
    ft.NMutator.memboundToJson(opt_g, ".")


if __name__ == "__main__":
    # run_e2e_InfoGAN()
    run_InfoGAN_without_tuning(True)
    # runSingleConvT()
    # read_and_check()

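The tolerance sweep in run_e2e_InfoGAN counts, for each tolerance 0.1^i, how many elements of the InfiniTensor output x fall within that tolerance of the PyTorch reference ans, and records the fraction per tolerance. A minimal self-contained sketch of the same measurement on synthetic data (the tensors here are illustrative stand-ins, not produced by the repository):

import torch

ans = torch.randn(1000)
x = ans + 1e-4 * torch.randn(1000)  # perturbed copy standing in for the InfiniTensor output
for i in range(10):
    tol = 10 ** (-i)
    close = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
    print(f'0.1^{i} close: {close}/{ans.numel()} = {close / ans.numel()}')
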
@@ -388,4 +388,4 @@ TEST(Conv2conv, InfoGAN_ConvTranspose_3_OOB_Test) {
    dbg(expr);
    Derivator derivator;
    derivator.checkOOB(as<RangeOpNode>(expr));
}
}

@@ -7,7 +7,8 @@
#include "nnet/routine.h"
#include "nnet/test.h"
#include "operators/matmul.h"
#include <chrono>
#include "operators/membound.h"
#include "test.h"
using namespace infini;
using namespace std;

@@ -18,8 +19,8 @@ TEST(nnet, MemboundOpInterpretation) {
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
    Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
    g->dataMalloc();
    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    i0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
    NMutator nmutator(NMutator::Mode::ToNaiveMembound);
    auto mutations = nmutator.run(g);

@@ -36,7 +37,7 @@ TEST(nnet, MemboundOpInterpretation) {
    EXPECT_EQ(membound->getOpType(), OpType::MemBound);
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
    ans->dataMalloc();
    ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    ans->copyin(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
    EXPECT_TRUE(membound->getOutput()->equalData(ans));
}

@@ -49,8 +50,8 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32);
    Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32);
    g->dataMalloc();
    i0->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
    NMutator nmutator(NMutator::Mode::ToNaiveMembound);
    auto mutations = nmutator.run(g);

@@ -67,7 +68,7 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
    EXPECT_EQ(membound->getOpType(), OpType::MemBound);
    auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::Float32, cpu);
    ans->dataMalloc();
    ans->copyData(vector<float>{38, 44, 50, 56, 83, 98, 113, 128});
    ans->copyin(vector<float>{38, 44, 50, 56, 83, 98, 113, 128});

    auto oCpu = gCpu->cloneTensor(membound->getOutput());
    oCpu->printData();

@@ -77,3 +78,41 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
    // double time = timeit([&]() { runtime->run(gNew, false); }); // tune
    // kernels std::cout << "Time (ms):" << time << std::endl;
}

pair<std::vector<nnet::Tensor>, nnet::Expr> getPReluExpr(int size) {
    using namespace nnet;
    using nnet::make_ref;
    DEFINE_VAR(i);
    auto A = make_ref<TensorNode>("A", vector{size});
    auto B = make_ref<TensorNode>("B", vector{size});
    Expr e = make_ref<FuncNode>(makeSubscript(A, {i}) - makeSubscript(B, {i}),
                                FuncType::PRelu);
    Expr ret = makeRangeOperator({{i, {0, size}}}, {}, e);
    return {{A, B}, ret};
}

TEST(nnet, PRelu_Ansor_Codegen) {
    auto cuda = make_ref<CudaRuntimeObj>();
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(cuda);
    Tensor i0 = g->addTensor(vector{12});
    Tensor w0 = g->addTensor(vector{12});
    Tensor o0 = g->addTensor(vector{12});
    auto [nnetInputs, expr] = getPReluExpr(12);
    g->addOpWithOutputs<MemBoundObj>(vector{i0, w0}, vector{o0}, nnetInputs,
                                     expr, -1);
    g->dataMalloc();
    i0->setData(IncrementalGenerator());
    w0->setData(ValGenerator<5>());
    cuda->run(g, true); // tune kernels

    // check answer
    auto ans = make_ref<TensorObj>(Shape{12}, DataType::Float32, cpu);
    ans->dataMalloc();
    ans->copyin(
        vector<float>{-1.25, -1., -0.75, -0.5, -0.25, 0, 1, 2, 3, 4, 5, 6});

    Graph gCpu = make_ref<GraphObj>(cpu);
    auto oCpu = gCpu->cloneTensor(o0);
    EXPECT_TRUE(oCpu->equalData(ans));
}

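The expected answer in PRelu_Ansor_Codegen follows from the operands: i0 holds 0..11 from IncrementalGenerator, w0 holds the constant 5 from ValGenerator<5>, so the FuncNode input is i0 - w0 = -5..6, and the listed values are consistent with a PRelu slope of 0.25 on the negative side (the slope is inferred from the expected data, not stated in the diff). A small NumPy sketch reproducing the vector:

import numpy as np

x = np.arange(12, dtype=np.float32) - 5.0  # i0 - w0
slope = 0.25                               # assumed negative-side slope
prelu = np.where(x < 0, slope * x, x)
print(prelu)  # [-1.25 -1. -0.75 -0.5 -0.25 0. 1. 2. 3. 4. 5. 6.]
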
@@ -4,19 +4,16 @@
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/expr.h"
#include "nnet/test.h"
#include "gtest/gtest.h"
using namespace nnet;
using namespace std;
#define DEFINE_VAR(name) auto name = make_ref<VarNode>(#name);

TEST(FuseMembound, Relu) {
    const int n_heads = 8, seq_len = 10000, feat_len = 512;
    // dilation_heads = 2;
    const int Batch = n_heads, M = seq_len, K = feat_len, W = 32;
    DEFINE_VAR(b);
    DEFINE_VAR(m);
    DEFINE_VAR(w);
    DEFINE_VAR(k);
    DEFINE_VAR(b, m, w, k);
    auto A = make_ref<TensorNode>("A", vector<int>({Batch, M, K}),
                                  vector<int>{0, 0, 0});

@@ -35,10 +32,7 @@ TEST(FuseMembound, MemMemFusion) {
    const int n_heads = 8, seq_len = 100, feat_len = 100;
    // dilation_heads = 2;
    const int Batch = n_heads, M = seq_len, K = feat_len;
    DEFINE_VAR(b);
    DEFINE_VAR(m);
    DEFINE_VAR(w);
    DEFINE_VAR(k);
    DEFINE_VAR(b, m, w, k);
    auto A = make_ref<TensorNode>("A", vector<int>({Batch, M, K}),
                                  vector<int>{0, 0, 0});
    auto B = make_ref<TensorNode>("B", vector<int>({Batch, K, M}),

@@ -54,4 +48,26 @@ TEST(FuseMembound, MemMemFusion) {
    RangeOp ans = makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}},
                                    {{k, {0, K}}}, makeSubscript(A, {b, m, k}));
    EXPECT_EQ(HashVisitor().getHash(merged), HashVisitor().getHash(ans));
}
}

TEST(FuseMembound, mergeNestedStagesInRangeOp) {
    // Case in ConvTranspose to Matmul
    // L<f:0:448><i39:0:4096>Sum ... [i39,f]
    //   {L<i39:0:4096><f:0:448>Sum ... [f,(i39 / 1024),((i39 / 256) % 4),(i39
    //   % 256)] {K}}
    DEFINE_VAR(f, i);
    const int I = 4096, F = 448;
    auto K = make_ref<TensorNode>("K", vector<int>({448, 4, 4, 256}));

    auto subA = makeSubscript(K, {f, i / 1024, (i / 256) % 4, i % 256});
    auto range = makeRangeOperator({{i, {0, I}}, {f, {0, F}}}, {}, subA);
    auto outerRange = makeRangeOperator({{f, {0, F}}, {i, {0, I}}}, {},
                                        makeSubscript(range, {i, f}));
    auto merged = MergeMemboundMutator({outerRange}).merge();

    // Compare the result with answer
    RangeOp ans = makeRangeOperator(
        {{f, {0, F}}, {i, {0, I}}}, {},
        makeSubscript(K, {f, i / 1024, (i / 256) % 4, i % 256}));
    EXPECT_EQ(HashVisitor().getHash(merged), HashVisitor().getHash(ans));
}

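The subscript in mergeNestedStagesInRangeOp, [f, i / 1024, (i / 256) % 4, i % 256], is simply the row-major unflattening of a single index i in [0, 4096) into the last three dimensions of K, whose extents are 4 x 4 x 256 = 4096. A quick sketch checking that the decomposition round-trips:

# Row-major unflattening of i in [0, 4096) into coordinates of shape (4, 4, 256).
for i in range(4096):
    a, b, c = i // 1024, (i // 256) % 4, i % 256
    assert (a * 4 + b) * 256 + c == i  # re-flattening recovers i
print("decomposition round-trips for all 4096 indices")
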
@@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"

namespace infini {

TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
    // verifyNaiveMembound True: subgraph after transformation
    // verifyNaiveMembound False: subgraph of one single membound (eOP)
    Runtime runtime = NativeCpuRuntimeObj::getInstance();

@@ -55,46 +56,75 @@ TEST(Mutator, NaiveConvWithInterpreter) {
}

// FIXME: failed since implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
    // verifyNaiveMembound True: subgraph after transformation
    // verifyNaiveMembound False: subgraph of one single membound (eOP)
    // const bool verifyNaiveMembound = false;
TEST(NMutator, InfoGAN_TConv_3_correctness) {
    const bool useMutatorDirectly = false;
    Runtime runtime = make_ref<CudaRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);

    // {n, h, w, f} * {f, r, s, c}
    auto i0 = g->addTensor({1, 2, 2, 448});
    auto w0 = g->addTensor({448, 4, 4, 256});
    const int n = 1, c = 256, h = 2, w = 2, f = 448, r = 4, s = 4;
    // // Minimum config for test
    // const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
    // const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;

    auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
    auto w0 =
        g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
    g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);

    auto mutator = make_ref<NMutator>();
    mutator->setToNaiveMembound();
    SearchEngine searchEngine(runtime, mutator);
    auto bestGraph = searchEngine.run(g);
    bestGraph->print();
    printf("--- SearchEngine Finished ---\n");
    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    // // Translate OP to membound without derivation
    // mutator->setToNaiveMembound();

    vector<Graph> bestGraphs;
    if (useMutatorDirectly) { // Use mutator results
        bestGraphs = mutator->run(g);
    } else { // Use search engine results
        SearchEngine searchEngine(runtime, mutator);
        bestGraphs.emplace_back(searchEngine.run(g));
    }
    g->dataMalloc();
    bestGraph->dataMalloc();
    for (auto t : g->getTensors()) {
        if (t->getFuid() <= 2)
            t->setData(IncrementalGenerator());
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (auto t : g->getInputs()) {
        EXPECT_EQ(fuidToInputTensor.count(t->getFuid()), 0);
        fuidToInputTensor[t->getFuid()] = t;
    }
    for (auto t : bestGraph->getTensors()) {
        if (t->getFuid() <= 2)
            t->setData(IncrementalGenerator());

    std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph =
            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());

        auto gen = RandomGenerator(0.1, 0.1, i);
        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : g->getInputs()) {
            t->setData(gen);
        }
        for (auto t : bestGraph->getInputs()) {
            t->copyData(fuidToInputTensor[t->getFuid()]);
        }

        // Initialize outputs with zeros
        for (auto t : g->getOutputs()) {
            t->setData(ZeroGenerator());
        }
        for (auto t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }

        runtime->run(bestGraph, true); // Tune kernels
        runtime->run(g);
        runtime->run(bestGraph, false); // Execute the transformed graph

        auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
        auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
        EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
    }
    runtime->run(g);
    runtime->run(bestGraph);

    auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
    auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);

    EXPECT_TRUE(go0->equalData(bgo0));
    EXPECT_TRUE(g->getOutputs()[0]->getRawDataPtr<void *>() !=
                bestGraph->getOutputs()[0]->getRawDataPtr<void *>());
}

// TEST(Mutator, Conv9x9) {

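For reference, the transposed convolution in InfoGAN_TConv_3_correctness maps the {1, 2, 2, 448} NHWC input to a {1, 4, 4, 256} output under the usual transposed-convolution shape rule, assuming the trailing arguments of addOp<ConvTransposed2dNHWCObj> are padding 1, stride 2, and dilation 1 in both spatial dimensions (the argument order is an assumption, not stated in the diff):

# Standard transposed-convolution output-size rule (output_padding = 0):
# out = (in - 1) * stride - 2 * pad + dilation * (kernel - 1) + 1
def convtranspose_out(in_size, kernel, stride, pad, dilation=1):
    return (in_size - 1) * stride - 2 * pad + dilation * (kernel - 1) + 1

print(convtranspose_out(2, 4, 2, 1))  # 4, so spatial size 4 x 4 with 256 output channels from the {448, 4, 4, 256} weight

That 4 * 4 * 256 = 4096 output volume is also the index range i in [0, 4096) used in the FuseMembound mergeNestedStagesInRangeOp test and the N = 4096 in the cuBLAS tune test above.
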
@@ -76,12 +76,14 @@ TEST(Serializer, CompareTwoExprs) {
    auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
    auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
    auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
    auto funcA = make_ref<FuncNode>(subA, FuncType::Relu);
    auto subB = makeSubscript(B, {b, ((i3 + (2500 * i4)) + w), k});
    auto range = makeRangeOperator(
        {{i3, {0, 2500}}, {i4, {0, 4}}, {b, {0, 8}}, {w, {0, 65}}},
        {{k, {0, 512}}}, subA * subB);
        {{k, {0, 512}}}, funcA * subB);
    Serializer().serialize(range, "./test_serializer.json");
    auto expr = Serializer().deserialize("./test_serializer.json");
    dbg(expr);

    EXPECT_EQ(range->toReadable(), expr->toReadable());
}

@@ -90,11 +92,9 @@ TEST(Serializer, Serialization_NestedTensor) {
    FullPrinterVisitor printer;
    auto range = buildNestedExpr();
    auto ans = printer.print(range);
    dbg(ans);
    auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
    EXPECT_TRUE(isSuccessful);
    auto exprDeserialized = Serializer().deserialize("./test_serializer.json");
    auto output = printer.print(exprDeserialized);
    dbg(output);
    EXPECT_EQ(output, ans);
}