forked from jiuyuan/InfiniTensor
Compare commits
34 Commits
master...NNET_e2e_f
Author | SHA1
---|---
huangshuhong | 8c91faa948
huangshuhong | c0ae03a2d7
Liyan Zheng | 0cb8729bc1
YdrMaster | 8bc2d3e48d
YdrMaster | 28b123753e
Liyan Zheng | 94730d93b5
Liyan Zheng | 6d17c4caa2
Liyan Zheng | 15d0eb79cd
Liyan Zheng | 2a343e240e
Liyan Zheng | 34ca6bf149
YdrMaster | a6019e79e3
YdrMaster | 4e1cc8d3e4
YdrMaster | 725f9260cf
YdrMaster | 0edd138919
Liyan Zheng | 0b23a065ca
Liyan Zheng | e86e993ed4
Liyan Zheng | e4c20a9ae2
Liyan Zheng | 537b3b4ea4
Liyan Zheng | 2812900ea2
Liyan Zheng | 01fc19795d
Liyan Zheng | afc4123328
Liyan Zheng | b981951a47
Liyan Zheng | 99b5c95455
Liyan Zheng | 9d50b30af8
Liyan Zheng | bc31219bde
Liyan Zheng | edf4e33353
Liyan Zheng | 872f3504a9
Liyan Zheng | da49e91ab0
Liyan Zheng | a6b8f344d4
Liyan Zheng | 09293730ea
Liyan Zheng | 307614d95d
Liyan Zheng | f14edcd52f
Liyan Zheng | d2d49c5d4f
Liyan Zheng | e72fe79168
@@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
 endif()

 # Python bindings
-file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
+file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
 pybind11_add_module(backend MODULE ${FFIS})
 target_link_libraries(backend PRIVATE InfiniTensor)
@@ -168,6 +168,7 @@ endif()

 if(USE_CUDA)
   add_compile_definitions(USE_CUDA=1)
+  add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
   # Since enable_language only executes once, rerunning cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
   set(CMAKE_CUDA_HOST_COMPILER
       ${CMAKE_CXX_COMPILER}
@@ -81,7 +81,7 @@ import onnx
 from pyinfinitensor.onnx import OnnxStub
 from pyinfinitensor import backend

-stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
+stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
 ```

 [`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) is the loading function provided by onnx; it reads an onnx file into an onnx model held in memory.
@@ -201,7 +201,7 @@ def infer(model: ModelProto, input) -> dict:

 model0 = onnx.load(sys.argv[1])
-model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
+model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")

 input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
 input = numpy.random.random(input_shape).astype(numpy.float32)
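Read together, the two README hunks above give a complete round-trip script. A minimal sketch, assuming this branch's `pyinfinitensor` package is installed; note the README spells the constructor `OnnxStub.from_model` while the Python source later in this diff defines `from_onnx`, so use whichever name the installed version exposes:

```python
# Minimal round-trip sketch based on the README hunks above.
# Assumption: pyinfinitensor from this branch is importable and a model
# path is passed as the first CLI argument.
import sys
import numpy
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

model0 = onnx.load(sys.argv[1])
# Import into InfiniTensor, then export back to onnx ("from_model" per the
# README; the Python source in this diff names it "from_onnx").
model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")

# Build a random input matching the exported model's first input.
input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
input = numpy.random.random(input_shape).astype(numpy.float32)
```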
@@ -16,7 +16,8 @@ class GraphObj : public Object {
     string toString() const override;
     Runtime getRuntime() const { return runtime; }

-    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
+                     TensorType tensorType = TensorType::Other);
     Tensor addTensor(const Tensor &tensor);
     TensorVec addTensor(const TensorVec &tensors);
     /**
@@ -35,14 +35,22 @@ class GraphHandlerObj {
     Graph g;

   public:
-    GraphHandlerObj(Runtime runtime)
+    explicit GraphHandlerObj(Runtime runtime)
         : g(make_ref<GraphObj>(std::move(runtime))) {}
+    explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}

-    Tensor tensor(Shape dims, int dtype);
+    //------ tensors
+
+    vector<Tensor> inputs() { return g->getInputs(); }
+    vector<Tensor> outputs() { return g->getOutputs(); }
+    Tensor tensor(Shape dims, int dtype, TensorType ttype);

     //------ operators

-    inline OpVec operators() { return g->getOperators(); }
+    OpVec operators() { return g->getOperators(); }

     Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                 int sh, int sw, int dh, int dw);
@@ -90,18 +98,22 @@ class GraphHandlerObj {
                  const optional<vector<int>> &steps);
     Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
                const optional<vector<int>> &axes);
+    /// @brief Import a memBound operator from a json string
+    TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
+                       const string &jsonString);

     //------ modifiers

-    inline bool topo_sort() { return g->topo_sort(); }
+    bool topo_sort() { return g->topo_sort(); }

-    inline void optimize() { g->optimize(); }
+    void optimize() { g->optimize(); }

     //------ runtime

-    inline void data_malloc() { g->dataMalloc(); }
+    void data_malloc() { g->dataMalloc(); }

-    inline void run() { g->getRuntime()->run(g); }
+    void run() { g->getRuntime()->run(g); }
+    Graph getGraph() const { return g; }
 };

 } // namespace infini
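These handler methods are exposed to Python through the pybind bindings, and the test changes later in this diff exercise them. A minimal sketch of driving the new API from Python, assuming the `backend` module built from this branch (dtype code 12 is the onnx element type the tests use):

```python
# Sketch of the new GraphHandler API from Python; assumes the `backend`
# module built from this branch.
from pyinfinitensor import backend

handler = backend.GraphHandler(backend.cpu_runtime())
# tensor() now takes an explicit TensorType as its third argument.
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
x = handler.add(a, b, None)  # output tensor is created when None is passed
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
handler.reshape(x, y, [3, 2, 1])

handler.topo_sort()    # order the operators
handler.data_malloc()  # allocate tensor memory
handler.run()          # execute on the bound runtime
```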
@@ -30,6 +30,7 @@ class Mutator {
     virtual bool isMultiBranchMergable(const Graph &in_graph) {
         IT_TODO_HALT();
     }
+    virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
 };

 } // namespace infini
@@ -121,6 +121,7 @@ class OpRegistry {
         FOP(ConvBackwardData);
         FOP(Matmul);
         FOP(ConvTrans);
+        FOP(ConvTransNHWC);
         FOP(G2BMM);
         FOP(GBMM);
         FOP(Pad);
@@ -209,7 +210,8 @@ class OpRegistry {
         //
         FOP(MemBound);
     default:
-        IT_ASSERT(false);
+        IT_ASSERT(false, "Unknown OpType " +
+                             std::to_string(enum_to_underlying(opType)));
         break;
     }
 #undef FOP
@@ -1,5 +1,6 @@
 #pragma once
 #include "core/common.h"
 #include "core/object.h"
+#include "core/ref.h"
 #include <memory>
@@ -59,10 +60,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
      * execution happens.
      *
      * @param graph
-     * @param profiling Whether to print breakdown of time
+     * @param printProfiling Whether to print breakdown of time
      * @return double Return the sum of perf time for each operator
      */
-    double getPerfTime(const Graph &graph, bool profiling = false) const;
+    double getPerfTime(const Graph &graph, bool printProfiling = false,
+                       bool allowEstimation = false,
+                       bool ignoreMemboundOp = false) const;
     Blob allocBlob(size_t size);
     bool isCpu() const {
         return device == Device::CPU || device == Device::INTELCPU;
@@ -77,6 +80,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                             size_t bytes) const = 0;
     virtual string toString() const = 0;

+    map<UidBaseType, bool>
+    getCompileTimeComputableAttribute(const Graph &graph) const;
+
+    double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
+                               int repeat = 1000) const;
+
   protected:
     void printProfilingData(double totTime,
                             const std::map<OpType, double> &opTime,
@@ -4,19 +4,15 @@
 #include "graph.h"
 #include "mutator.h"
+#include <unordered_map>

 namespace infini {
 class SearchEngine {
   private:
     Runtime runtimeExec;
     Ref<Mutator> mutator;
+    std::function<bool(const Graph &, const Graph &)> graphTimeComparer;

   public:
-    SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
-        runtimeExec = _runtime;
-        mutator = _mutator;
-    }
+    SearchEngine(Runtime runtime, Ref<Mutator> mutator);
     ~SearchEngine() {}

   private: // Configurations
@@ -24,24 +20,17 @@ class SearchEngine {
         3; // cut nodes whose #in + #out >= partitionThreshold
     size_t GRAPH_SIZE = 16; // num of best graphs.

-  private: // Composed objects
-    std::shared_ptr<Mutator> mutationEngine;
-
-  public:
-    std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
     struct GroupEdge {
         int v, next;
         GroupEdge() = delete;
     };

-    struct Candidate { // a graph with perf
-        std::shared_ptr<Graph> graph;
-        double perf = INFINITY;
-    };
-    class MetaGraph { // a graph of subgraphs, for searching.
-      public:
-        MetaGraph() {}
-        ~MetaGraph() {}
+    // struct Candidate { // a graph with perf
+    //     Graph graph;
+    //     double perf = INFINITY;
+    // };
+    struct MetaGraphObj { // a graph of subgraphs, for searching.
         struct Node {
             Graph graph;
             std::vector<int> suc;
@@ -50,31 +39,33 @@ class SearchEngine {
         };
         std::vector<Node> nodes;
     };
+    using MetaGraph = Ref<MetaGraphObj>;

-    Graph run(const Graph graph); // entrance of search engine.
+    Graph run(const Graph graph); // entrance to search engine.
     std::vector<Graph> search(const Graph &graph); // search for a partition.

   private:
     std::vector<Graph> partitionGraph(const Graph graph);
-    std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
-    std::shared_ptr<MetaGraph>
-    buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
-                           const std::vector<int> &plan);
+    MetaGraph buildMetaGraphWithGraph(const Graph graph);
+    MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
+                                     const std::vector<int> &plan);
     // search horizontal merges
-    std::vector<std::shared_ptr<MetaGraph>>
-    searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
-    void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
-                        std::vector<int> &plan, std::vector<int> &frontier,
+    std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
+    void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
+                        std::vector<int> &frontier,
                         std::vector<std::vector<int>> &plans,
                         std::unordered_set<uint64_t> &planSet);
-    std::vector<Graph>
-    searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
+    std::vector<Graph> searchMutation(const MetaGraph &metaGraph);

-    void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
+    void printMetaGraph(MetaGraph metaGraph);
     /**
      * @brief Check whether a multi-branch graph can be merged into a single
      * branch.
      */
     bool isMultiBranchMergable(const Graph graph);
+    Graph fuseVertically(const Graph &graph);
+
+    double getEstimatedGraphPerf(Graph graph);
 };

 } // namespace infini
@@ -12,13 +12,14 @@ namespace infini {
 // TODO: how to deal with this
 using ShapeElem = int;
 using Shape = vector<ShapeElem>;
+enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
 class TensorObj : public TensorBaseObj {
   private:
     Shape shape;
     size_t _size; // Cache of Π(shape).
     Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                   // scratch have a new id.
+    TensorType tensorType;

     void copyin(const void *ptr, size_t size) {
         runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
     }
@@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
     }

   public:
-    TensorObj(Shape shape, DataType dtype, Runtime runtime);
+    TensorObj(Shape shape, DataType dtype, Runtime runtime,
+              TensorType tensorType = TensorType::Other);
     virtual ~TensorObj() {}
     string toString() const override;
@@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
     size_t getOffset(const vector<int> &ds) const;
     void dataMalloc();
     UidBaseType getFuid() const { return fuid; }
+    TensorType getTensorType() const { return tensorType; }

     void load(std::string file_path);
     void save(std::string file_path);
@@ -74,6 +77,7 @@ class TensorObj : public TensorBaseObj {
     // Thus the internal state of generator cannot be updated.
     void setData(
         std::function<void(void *, size_t, DataType)> const &generator) const;
+    void setData(const Blob &_blob) { data = _blob; }
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
@@ -106,13 +110,13 @@ class TensorObj : public TensorBaseObj {
     size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;

   private:
-    template <class T> string dataToString() const {
+    template <class T> string dataToString(void *rawPtr) const {
         std::stringstream builder;
         builder << "Tensor: " << guid << std::endl;

         auto numDims = shape.size();
         auto dimSzVec = vector<int>(numDims, 1);
-        auto ptr = data->getPtr<T *>();
+        T *ptr = (T *)rawPtr;
         dimSzVec[numDims - 1] = shape[numDims - 1];

         for (int i = numDims - 1; i != 0; --i)
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace infini {
+
+void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
+                         int n, int h, int w, int f, int r, int s, int oh,
+                         int ow, int ph, int pw, int sh, int sw, int dh,
+                         int dw);
+
+void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
+                                  int act, int n, int h, int w, int f, int r,
+                                  int s, int oh, int ow, int ph, int pw, int sh,
+                                  int sw, int dh, int dw);
+} // namespace infini
@@ -1,35 +1,22 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
+#include "nnet/dbg.h"

 namespace infini {

 class CudaRuntimeObj : public RuntimeObj {
   private:
+    cudaStream_t stream;
     cudnnHandle_t cudnn;
     cublasHandle_t cublas;
     CudaPtr workspace;
     size_t workspaceSize;
+    bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled

   public:
-    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
-        checkCudnnError(cudnnCreate(&cudnn));
-        checkCublasError(cublasCreate(&cublas));
-        // 10GB for Longformer
-        // size_t longformerNum = 3lu * (1 << 30);
-        workspaceSize = 7ll << 30; // 7 GB
-        workspace = alloc(workspaceSize);
-    }
-    virtual ~CudaRuntimeObj() {
-        try {
-            dealloc(workspace);
-            checkCudnnError(cudnnDestroy(cudnn));
-            checkCublasError(cublasDestroy(cublas));
-        } catch (const std::exception &e) {
-            std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
-        }
-    }
+    CudaRuntimeObj();
+    virtual ~CudaRuntimeObj();
     string toString() const override;

     void run(const Graph &graph, bool tune = false,
@@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {

     void runWithoutSync(const Graph &graph) const;

+    bool isInCudaGraph() const { return cudaGraphStatus; }
+    cudaStream_t getStream() const { return stream; }
+
+    double timeWithCudaGraph(Graph graph);
+
   private:
     void tune(const Graph &graph, bool profiling) const;
+
+    void beginCudaGraphStreamCapture();
+    tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
 };
 } // namespace infini
@@ -0,0 +1,9 @@
+#include "core/graph_handler.h"
+#include "core/mutator.h"
+#include "core/search_engine.h"
+
+namespace infini {
+namespace callback {
+void exportONNX(const Graph &graph, const string &path);
+}
+} // namespace infini
@@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
     string visit_(const Subscript &c) override;
     string visit_(const Var &c) override;
     string visit_(const Tensor &c) override;
+    string visit_(const Func &c) override;
     string dispatchRoutine(const Routine &c);

     Expr buildExprTree(string key);
@@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
     Serializer(int _verobse = 0);
     virtual ~Serializer();

+    /**
+     * @brief Serialize the given expression to a string
+     *
+     * @param expr The expression to be serialized
+     * @param msg Message of derivation
+     * @param inputs membound operator attributes
+     * @param exec_time membound operator attributes
+     * @param hint membound operator attributes
+     * @return The serialized string if the serialization succeeded
+     */
+    std::optional<std::string> toString(Expr const &expr,
+                                        const string &msg = "",
+                                        vector<Tensor> inputs = {},
+                                        double exec_time = -1e9,
+                                        string hint = "");
+
     /**
      * @brief Serialize the given expression to a json file
      *
      * @param expr The expression to be serialized
      * @param filePath The path of the json file to be output
      * @param msg Message of derivation
+     * @param inputs membound operator attributes
+     * @param exec_time membound operator attributes
+     * @param hint membound operator attributes
      * @return bool Whether the serialization succeeded
      */
-    bool serialize(const Expr &expr, const string &filePath,
-                   const string &msg = "");
+    bool toFile(const Expr &expr, const string &filePath,
+                const string &msg = "", vector<Tensor> inputs = {},
+                double exec_time = -1e9, string hint = "");
+
+    /**
+     * @brief Deserialize the given string to an expression
+     *
+     * @param text The text of the expr to be deserialized
+     * @return Expression deserialized from the given string
+     */
+    Expr fromString(const string &text);

     /**
      * @brief Deserialize the given json file to an expression
@@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
      * @param filePath The path to the file to be deserialized
      * @return Expression deserialized from the given json file
      */
-    Expr deserialize(const string &filePath);
+    Expr fromFile(const string &filePath);
+
+    tuple<Expr, vector<Tensor>, double, string>
+    deserializeAsMemobundOp(const string &filePath);
+
+    // FIXME: the order of elements in the tuple is not consistent with the
+    // memboundObj constructor
+    tuple<Expr, vector<Tensor>, double, string>
+    membundOpFromString(const string &data);
 };

 } // namespace nnet
@@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
     { IT_TODO_HALT(); }

 #define nnet_unimplemented_continue() \
-    { dbg("Unimplemented"); }
+    {}
+// { dbg("Unimplemented"); }

 #define nnet_assert(expr, msg) assert(((void)(msg), (expr)))
@@ -104,7 +104,7 @@ enum class NodeType {
     FuncNodeType
 };

-enum class FuncType { Relu, Tanh, PRelu };
+enum class FuncType { Relu = 1000, Tanh, PRelu };

 #define DEFINE_GETTYPE(CLASS, isScalar_v)                                      \
     NodeType getType() const override { return NodeType::CLASS##Type; }       \
@@ -206,7 +206,8 @@ struct IterationType {
     enum { Loop, Sum };
     constexpr static int NumIterationType = 2;
 };
-class RangeOpNode : public OperatorNode {
+class RangeOpNode : public OperatorNode,
+                    public std::enable_shared_from_this<RangeOpNode> {
   public:
     enum { Summand, END_POS };
     constexpr static int Loop = IterationType::Loop;
@@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
         return 0;
     };
     string toReadable() const override;
+    string getFullExpression();
     const Expr &getSummand() const { return subExprs[Summand]; }
     const vector<VarRangePair> &getVarRanges(int _index) const {
         return vars[_index];
@@ -7,12 +7,13 @@ namespace infini {
 class NMutator : public Mutator {
   public:
     enum class Mode { Normal, ToNaiveMembound, RuleBased };
+    using NameNToTensorT = map<string, Tensor>;

   private:
     // Suffix -N: NNet objects.
     // Suffix -T: tpm objects.
     // Map: NNet tensors -> tpm tensor.
-    std::map<std::string, Tensor> inputsNameNToTensorT;
+    NameNToTensorT inputsNameNToTensorT;
     Mode mode;
     const double bandwidth = double(200) * 1024 * 1024 * 1024;
     // If in RuleBased mode, use derivationRules in derivator
@@ -24,8 +25,9 @@ class NMutator : public Mutator {
     ~NMutator();

     vector<Graph> run(const Graph &in_graph) override;
-    void setToNaiveMembound();
+    Graph fuseVertically(const Graph &in_graph) override;

+    void setToNaiveMembound();
     void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
     long long cntStates = 0;
     long long cntCandidates = 0;
@@ -33,6 +35,14 @@ class NMutator : public Mutator {
   private:
     int maxDepth = 8;
     nnet::Expr opToExpression(Operator op);
+    /// @brief Extract an operator as an NNet expression.
+    /// @param op The operator to extract.
+    /// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
+    static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
+    static pair<nnet::Expr, NMutator::NameNToTensorT>
+    generateUnaryExpr(const Operator &op);
+    static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
+
     void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);

     /**
@@ -48,8 +58,8 @@ class NMutator : public Mutator {

     // TODO: recover these rules
     // Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
-    // Graph transformTConv1x1(Operator op);
     // Graph transformTConv3x3(Operator op);
+    Graph transformConvtransposed1x1(Operator _op);
     // Graph transformConvtransposed(Operator op);
     // Graph transformDialtedConv(Operator op);
     // Graph transformConv1x1(Operator op);
     // Graph transformConv1xk(Operator op);
@@ -6,12 +6,17 @@ namespace infini {

 class MemBoundObj : public OperatorObj {
   private:
-    std::vector<nnet::Tensor> nnetInputs;
-    nnet::Expr expr, simplifiedExpr;
+    nnet::Expr expr;
+    std::vector<nnet::Tensor>
+        nnetInputs; // The order of inputs in nnetInputs should be consistent
+                    // with inputs in infinitensor
     double exec_time;
     std::string hint;
-    HashType hash, simplifiedHash;
-    int n, f, h, w;
+
+    // Generated attributes
+    HashType hash;
+    nnet::Expr simplifiedExpr;
+    HashType simplifiedHash;

   public:
     MemBoundObj(GraphObj *graph, const TensorVec &input,
@@ -27,9 +32,12 @@ class MemBoundObj : public OperatorObj {
     int numOutputs() const override { return outputs.size(); }
     const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
     const nnet::Expr getNnetExpr() const { return expr; }
+    HashType getHash() const { return hash; }
     pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
         return {expr, hash};
     }
+    double getEstimatedTime() const { return exec_time; }
+    string toJson() const;

   private:
     vector<int> getWorkloadVector() const override;
@@ -46,10 +46,13 @@ class RandomGenerator : public DataGenerator {
     std::mt19937 e;
     std::uniform_int_distribution<int> di;
    std::uniform_real_distribution<float> dr;
+    bool generateInteger;

   public:
-    RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0)
-        : l(l), r(r), e(seed), di(l, r), dr(l, r) {}
+    RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
+                    bool generateInteger = false)
+        : l(l), r(r), e(seed), di(l, r), dr(l, r),
+          generateInteger(generateInteger) {}
     virtual ~RandomGenerator() {}

   private:
@@ -60,7 +63,7 @@ class RandomGenerator : public DataGenerator {
     }
     void fill(float *data, size_t size) override {
         for (size_t i = 0; i < size; i++) {
-            data[i] = dr(e);
+            data[i] = (generateInteger) ? di(e) : dr(e);
         }
     }
 };
@@ -37,29 +37,48 @@ class OnnxStub:
     outputs: Dict[str, backend.Tensor] = {}
     initializer: Dict[int, TensorProto] = {}
     handler: backend.GraphHandler
+    disable_check: bool

-    def __init__(self, model: ModelProto, runtime):
-        model = infer_shapes(model)
-        self.handler = backend.GraphHandler(runtime)
+    @classmethod
+    def from_onnx(cls, model: ModelProto, runtime, enable_onnx_shape_infernce=True):
+        if enable_onnx_shape_infernce:
+            model = infer_shapes(model)
+        ans = OnnxStub()
+        ans.handler = backend.GraphHandler(runtime)

         tensors: Dict[str, backend.Tensor] = dict()
         data: Dict[str, TensorProto] = dict()

+        cnt_infini_inputs = 0
         for input in model.graph.input:
             dims = _take_shape_dim(input.type.tensor_type.shape)
-            tensors[input.name] = self.handler.tensor(
-                dims, input.type.tensor_type.elem_type
+            if input.name.startswith('input'):
+                tensor_type = backend.TensorType.Input
+                cnt_infini_inputs += 1
+            else:
+                tensor_type = backend.TensorType.Initialized
+            tensors[input.name] = ans.handler.tensor(
+                dims,
+                input.type.tensor_type.elem_type,
+                tensor_type,
             )
+        assert cnt_infini_inputs == 1, f'{cnt_infini_inputs} tensor names start with "input" found.'

         for output in model.graph.output:
             dims = _take_shape_dim(output.type.tensor_type.shape)
-            tensors[output.name] = self.handler.tensor(
-                dims, output.type.tensor_type.elem_type
+            tensors[output.name] = ans.handler.tensor(
+                dims,
+                output.type.tensor_type.elem_type,
+                backend.TensorType.Other,
             )

         for initializer in model.graph.initializer:
             dims = [d for d in initializer.dims]
-            tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
+            tensors[initializer.name] = ans.handler.tensor(
+                dims,
+                initializer.data_type,
+                backend.TensorType.Initialized,
+            )
             data[initializer.name] = initializer

         for node in model.graph.node:
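Note the new import contract established above: exactly one graph input whose name starts with "input" is treated as the runtime input, and every other graph input is treated as initialized data. A sketch of the calling side, assuming a model that follows this naming rule:

```python
# Sketch of the new classmethod entry point; assumes "model.onnx" has
# exactly one graph input whose name starts with "input" (the assert in
# from_onnx above enforces this).
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

stub = OnnxStub.from_onnx(onnx.load("model.onnx"), backend.cpu_runtime())
print(stub.inputs)   # maps onnx input names -> backend tensors
print(stub.outputs)
```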
@@ -77,7 +96,7 @@ class OnnxStub:
                 )
                 if p[0] != p[2] or p[1] != p[3]:
                     adapt = "{}-adapt".format(node.output[0])
-                    tensors[adapt] = self.handler.pad(
+                    tensors[adapt] = ans.handler.pad(
                         tensors[node.input[0]], None, p, [-2, -1]
                     )
                     p = [0, 0, 0, 0]
@@ -87,7 +106,7 @@ class OnnxStub:
                 if len(node.input) > 2:
                     bias = "{}-bias".format(node.output[0])
                     reshape = "{}-reshape".format(node.output[0])
-                    tensors[bias] = self.handler.conv(
+                    tensors[bias] = ans.handler.conv(
                         tensors[adapt],
                         tensors[node.input[1]],
                         None,
@@ -98,7 +117,7 @@ class OnnxStub:
                         d[0],
                         d[1],
                     )
-                    tensors[reshape] = self.handler.reshape(
+                    tensors[reshape] = ans.handler.reshape(
                         tensors[node.input[2]],
                         None,
                         [
@@ -111,13 +130,13 @@ class OnnxStub:
                             1,
                         ],
                     )
-                    tensors[node.output[0]] = self.handler.add(
+                    tensors[node.output[0]] = ans.handler.add(
                         tensors[bias],
                         tensors[reshape],
                         tensors.get(node.output[0]),
                     )
                 else:
-                    tensors[node.output[0]] = self.handler.conv(
+                    tensors[node.output[0]] = ans.handler.conv(
                         tensors[adapt],
                         tensors[node.input[1]],
                         tensors.get(node.output[0]),
@@ -142,7 +161,7 @@ class OnnxStub:
                     attributes[name]
                     for name in ["dilations", "pads", "strides", "output_padding"]
                 )
-                tensors[node.output[0]] = self.handler.convTransposed2d(
+                tensors[node.output[0]] = ans.handler.convTransposed2d(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
@@ -156,7 +175,7 @@ class OnnxStub:
                     op[1],
                 )
             elif node.op_type == "MatMul":
-                tensors[node.output[0]] = self.handler.matmul(
+                tensors[node.output[0]] = ans.handler.matmul(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
@@ -175,7 +194,7 @@ class OnnxStub:
                 # FIXME unsupported attributes: `alpha` `beta`
                 assert alpha == 1.0
                 assert beta == 1.0
-                tensors[node.output[0]] = self.handler.matmul(
+                tensors[node.output[0]] = ans.handler.matmul(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
@@ -196,7 +215,7 @@ class OnnxStub:
                     attributes[name]
                     for name in ["momentum", "epsilon", "training_mode"]
                 )
-                tensors[node.output[0]] = self.handler.batchNorm(
+                tensors[node.output[0]] = ans.handler.batchNorm(
                     input, output, mean, var, scale, bias, momentum, eps, training != 0
                 )
             elif node.op_type == "MaxPool":
@@ -215,10 +234,10 @@ class OnnxStub:
                 )
                 if p[0] != p[2] or p[1] != p[3]:
                     adapt = "{}-adapt".format(node.output[0])
-                    tensors[adapt] = self.handler.pad(
+                    tensors[adapt] = ans.handler.pad(
                         tensors.get(node.input[0]), None, p, [-2, -1]
                     )
-                    tensors[node.output[0]] = self.handler.maxPool(
+                    tensors[node.output[0]] = ans.handler.maxPool(
                         tensors[adapt],
                         tensors.get(node.output[0]),
                         k[0],
@@ -231,7 +250,7 @@ class OnnxStub:
                         s[1],
                     )
                 else:
-                    tensors[node.output[0]] = self.handler.maxPool(
+                    tensors[node.output[0]] = ans.handler.maxPool(
                         tensors[node.input[0]],
                         tensors.get(node.output[0]),
                         k[0],
@@ -257,10 +276,10 @@ class OnnxStub:
                 )
                 if p[0] != p[2] or p[1] != p[3]:
                     adapt = "{}-adapt".format(node.output[0])
-                    tensors[adapt] = self.handler.pad(
+                    tensors[adapt] = ans.handler.pad(
                         tensors.get(node.input[0]), None, p, [-2, -1]
                     )
-                    tensors[node.output[0]] = self.handler.avgPool(
+                    tensors[node.output[0]] = ans.handler.avgPool(
                         tensors[adapt],
                         tensors.get(node.output[0]),
                         k[0],
@@ -273,7 +292,7 @@ class OnnxStub:
                         s[1],
                     )
                 else:
-                    tensors[node.output[0]] = self.handler.avgPool(
+                    tensors[node.output[0]] = ans.handler.avgPool(
                         tensors[node.input[0]],
                         tensors.get(node.output[0]),
                         k[0],
@@ -287,7 +306,7 @@ class OnnxStub:
                 )
             elif node.op_type == "GlobalAveragePool":
                 [_, _, h, w] = _search_shape(model, node.input[0])
-                tensors[node.output[0]] = self.handler.avgPool(
+                tensors[node.output[0]] = ans.handler.avgPool(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     h,
@@ -300,52 +319,52 @@ class OnnxStub:
                     1,
                 )
             elif node.op_type == "Add":
-                tensors[node.output[0]] = self.handler.add(
+                tensors[node.output[0]] = ans.handler.add(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Sub":
-                tensors[node.output[0]] = self.handler.sub(
+                tensors[node.output[0]] = ans.handler.sub(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Mul":
-                tensors[node.output[0]] = self.handler.mul(
+                tensors[node.output[0]] = ans.handler.mul(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Div":
-                tensors[node.output[0]] = self.handler.div(
+                tensors[node.output[0]] = ans.handler.div(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Pow":
-                tensors[node.output[0]] = self.handler.pow(
+                tensors[node.output[0]] = ans.handler.pow(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Relu":
-                tensors[node.output[0]] = self.handler.relu(
+                tensors[node.output[0]] = ans.handler.relu(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Sigmoid":
-                tensors[node.output[0]] = self.handler.sigmoid(
+                tensors[node.output[0]] = ans.handler.sigmoid(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Tanh":
-                tensors[node.output[0]] = self.handler.tanh(
+                tensors[node.output[0]] = ans.handler.tanh(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Softmax":
-                tensors[node.output[0]] = self.handler.softmax(
+                tensors[node.output[0]] = ans.handler.softmax(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     next(
@@ -353,34 +372,34 @@ class OnnxStub:
                     ),
                 )
             elif node.op_type == "Abs":
-                tensors[node.output[0]] = self.handler.abs(
+                tensors[node.output[0]] = ans.handler.abs(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Shape":
-                tensors[node.output[0]] = self.handler.shape(
+                tensors[node.output[0]] = ans.handler.shape(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Identity":
-                tensors[node.output[0]] = self.handler.identity(
+                tensors[node.output[0]] = ans.handler.identity(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Flatten":
-                tensors[node.output[0]] = self.handler.flatten(
+                tensors[node.output[0]] = ans.handler.flatten(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "axis")),
                 )
             elif node.op_type == "PRelu":
-                tensors[node.output[0]] = self.handler.pRelu(
+                tensors[node.output[0]] = ans.handler.pRelu(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                 )
             elif node.op_type == "Clip":
-                tensors[node.output[0]] = self.handler.clip(
+                tensors[node.output[0]] = ans.handler.clip(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     next(_parse_data(data[node.input[1]]).__iter__(), None)
@@ -394,7 +413,7 @@ class OnnxStub:
                 perm = next(
                     (attr.ints for attr in node.attribute if attr.name == "perm"), None
                 )
-                tensors[node.output[0]] = self.handler.transpose(
+                tensors[node.output[0]] = ans.handler.transpose(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     perm,
@@ -409,7 +428,7 @@ class OnnxStub:
                 temp = reduce(lambda acc, x: acc * x, input_shape, 1)
                 if temp < 0:
                     input_shape[input_shape.index(-1)] = size // -temp
-                tensors[node.output[0]] = self.handler.reshape(
+                tensors[node.output[0]] = ans.handler.reshape(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     input_shape,
@@ -426,7 +445,7 @@ class OnnxStub:
                 for i, x in enumerate(input_shape):
                     if i not in axes:
                         output_shape.append(x)
-                tensors[node.output[0]] = self.handler.reshape(
+                tensors[node.output[0]] = ans.handler.reshape(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     output_shape,
@@ -440,13 +459,13 @@ class OnnxStub:
                 )
                 for i in axes:
                     input_shape.insert(i, 1)
-                tensors[node.output[0]] = self.handler.reshape(
+                tensors[node.output[0]] = ans.handler.reshape(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     input_shape,
                 )
             elif node.op_type == "Concat":
-                tensors[node.output[0]] = self.handler.concat(
+                tensors[node.output[0]] = ans.handler.concat(
                     [tensors[name] for name in node.input],
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "axis")),
@@ -454,7 +473,7 @@ class OnnxStub:
             elif node.op_type == "Split":
                 for name, tensor in zip(
                     node.output,
-                    self.handler.split(
+                    ans.handler.split(
                         tensors[node.input[0]],
                         None,
                         next(
@@ -466,14 +485,14 @@ class OnnxStub:
                 ):
                     tensors[name] = tensor
             elif node.op_type == "Gather":
-                tensors[node.output[0]] = self.handler.gather(
+                tensors[node.output[0]] = ans.handler.gather(
                     tensors[node.input[0]],
                     tensors[node.input[1]],
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "axis")),
                 )
             elif node.op_type == "ReduceMean":
-                tensors[node.output[0]] = self.handler.reduce_mean(
+                tensors[node.output[0]] = ans.handler.reduce_mean(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     tensors[node.input[1]] if len(node.input) > 1 else None,
@@ -481,7 +500,7 @@ class OnnxStub:
                     != 0,
                 )
             elif node.op_type == "Slice":
-                tensors[node.output[0]] = self.handler.slice(
+                tensors[node.output[0]] = ans.handler.slice(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     _parse_data(data[node.input[1]]),
@@ -490,7 +509,7 @@ class OnnxStub:
                     _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
                 )
             elif node.op_type == "Pad":
-                tensors[node.output[0]] = self.handler.pad(
+                tensors[node.output[0]] = ans.handler.pad(
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                     _parse_data(data[node.input[1]]),
@@ -499,7 +518,7 @@ class OnnxStub:
             elif node.op_type == "Dropout":
                 for name, tensor in zip(
                     node.output,
-                    self.handler.dropout(
+                    ans.handler.dropout(
                         tensors[node.input[0]],
                         tensors.get(node.output[0]),
                         tensors.get(node.output[1]) if len(node.output) > 1 else None,
@@ -512,18 +531,33 @@ class OnnxStub:
                     ),
                 ):
                     tensors[name] = tensor
+            elif node.op_type == "MemBound":
+                attributes = _parse_attribute(node, {"expr": None})
+                expr: str = attributes["expr"]
+                assert expr is not None
+                assert (
+                    len(node.output) == 1
+                ), """MemBound with multiple
+                    outputs requires rewriting the logic of tensor creation"""
+                outputs = ans.handler.memBound(
+                    [tensors[name] for name in node.input],
+                    tensors.get(node.output[0]),
+                    expr,
+                )
+                for name, tensor in zip(node.output, outputs):
+                    tensors[name] = tensor
             else:
                 raise Exception('Unsupported operator "{}"'.format(node.op_type))

-        self.handler.data_malloc()
+        ans.handler.data_malloc()

         for name, obj in tensors.items():
             tensor = data.get(name)
             if tensor == None:
                 if any(input.name == name for input in model.graph.input):
-                    self.inputs[name] = obj
+                    ans.inputs[name] = obj
             else:
-                self.initializer[obj.fuid()] = tensor
+                ans.initializer[obj.fuid()] = tensor
                 if tensor.data_type == TensorProto.INT32:
                     obj.copyin_int32(_parse_data(tensor))
                 elif tensor.data_type == TensorProto.INT64:
@@ -533,8 +567,19 @@ class OnnxStub:
                 else:
                     assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)

         for output in model.graph.output:
-            self.outputs[output.name] = tensors[output.name]
+            ans.outputs[output.name] = tensors[output.name]
+        return ans
+
+    @classmethod
+    def from_graph(cls, g: backend.Graph):
+        ans = OnnxStub()
+        handler = backend.GraphHandler(g)
+        for i, tensor in enumerate(handler.inputs()):
+            ans.inputs["input{}".format(i)] = tensor
+        for i, tensor in enumerate(handler.outputs()):
+            ans.outputs["output{}".format(i)] = tensor
+        ans.handler = handler
+        ans.disable_check = True
+        return ans

     def to_onnx(self, name: str) -> ModelProto:
         class Context:
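`from_graph` wraps an already built `backend.Graph` (for example, the optimized graph returned by the C++ search engine) so it can be exported. A minimal sketch, assuming `g` is such a graph exposed through the bindings:

```python
# Sketch: wrap an in-memory backend.Graph and export it to onnx.
# Assumption: `g` is a backend.Graph produced elsewhere (e.g. by the
# C++ search engine) and exposed through the pybind bindings.
import onnx

stub = OnnxStub.from_graph(g)
model = stub.to_onnx("optimized")  # disable_check=True skips the onnx checkers
onnx.save(model, "optimized.onnx")
```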
@@ -552,6 +597,13 @@ class OnnxStub:
             outputs: List[ValueInfoProto] = []
             # saves global input tensors
             initializers: List[TensorProto] = []
+            # saves global output tensors
+            value_info: List[ValueInfoProto] = []
+
+            enable_check = False
+
+            def __init__(self, enable_check):
+                self.enable_check = enable_check

             def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
                 ty = op.op_type()
@@ -562,12 +614,15 @@ class OnnxStub:

             def push_output(self, name: str, tensor: backend.Tensor) -> str:
                 self.names[tensor] = name
-                if not tensor.has_target():
-                    shape = tensor.shape()
-                    dtype = backend.tensor_dtype(tensor)
-                    value_info = make_tensor_value_info(name, dtype, shape)
-                    check_value_info(value_info)
+                shape = tensor.shape()
+                dtype = backend.tensor_dtype(tensor)
+                value_info = make_tensor_value_info(name, dtype, shape)
+                check_value_info(value_info)
+                if not tensor.has_target():  # if this output is a global output
                     self.outputs.append(value_info)
+                else:  # if this output is a local output
+                    self.value_info.append(value_info)
                 return name

             def push_input(
@@ -577,7 +632,10 @@ class OnnxStub:
                 # means that this input is a global input
                 if name is None:
                     self.count_in += 1
-                    name = "input{}".format(self.count_in)
+                    if tensor.getTensorType() == backend.TensorType.Input:
+                        name = "input{}".format(self.count_in)
+                    else:
+                        name = "weight{}".format(self.count_in)
                 self.names[tensor] = name
                 if init != None:
                     init.name = name
@@ -605,17 +663,25 @@ class OnnxStub:
                 return name

             def push_node(self, node: NodeProto) -> None:
-                check_node(node)
+                if self.enable_check:
+                    check_node(node)
                 self.nodes.append(node)

             def build(self, name: str) -> ModelProto:
                 graph = make_graph(
-                    self.nodes, name, self.inputs, self.outputs, self.initializers
+                    self.nodes,
+                    name,
+                    self.inputs,
+                    self.outputs,
+                    self.initializers,
+                    value_info=self.value_info,
                 )
-                check_graph(graph)
+                if self.enable_check:
+                    check_graph(graph)

                 model = make_model(graph)
-                check_model(model)
+                if self.enable_check:
+                    check_model(model)

                 return model
@@ -625,7 +691,7 @@ class OnnxStub:

         ops = self.handler.operators()  # all operators (nodes) in the graph

-        ctx = Context()
+        ctx = Context(not self.disable_check)

         for op in ops:
             ty, name = ctx.name_op(op)
@@ -812,6 +878,18 @@ class OnnxStub:
                     ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
                 )
                 ctx.push_node(make_node(ty.name, inputs, outputs, name))
+            elif ty == backend.OpType.MemBound:
+                ctx.push_node(
+                    make_node(
+                        ty.name,
+                        inputs,
+                        outputs,
+                        name,
+                        domain="nnet",
+                        expr=backend.membound_expr_of(op),
+                        hash=str(backend.membound_hash_of(op)),
+                    )
+                )
             else:
                 raise Exception("Unsupported OpType", ty)
@@ -828,7 +906,7 @@ class OnnxStub:


 def from_onnx(model: ModelProto, runtime):
-    stub = OnnxStub(model, runtime)
+    stub = OnnxStub.from_onnx(model, runtime)
     return stub.inputs, stub.outputs, stub.handler
@@ -40,9 +40,9 @@ class TestStringMethods(unittest.TestCase):
                 file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
             )
         )
-        model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
-            "new"
-        )
+        model = OnnxStub.from_onnx(
+            onnx.load(model_file), backend.cpu_runtime()
+        ).to_onnx("new")
         model = infer_shapes(model)

     def test_tensor(self):
@@ -304,16 +304,16 @@ class TestStringMethods(unittest.TestCase):

     def test_frontend(self):
         handler = backend.GraphHandler(backend.cpu_runtime())
-        a = handler.tensor([1, 2, 3], 12)
-        b = handler.tensor([1, 2, 3], 12)
-        c = handler.tensor([1, 2, 3], 12)
-        d = handler.tensor([1, 2, 3], 12)
-        e = handler.tensor([1, 2, 3], 12)
+        a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
+        b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
+        c = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
+        d = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
+        e = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)

         x = handler.add(
             handler.add(handler.add(handler.add(a, b, None), c, None), d, None), e, None
         )
-        y = handler.tensor([3, 2, 1], 12)
+        y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
         handler.reshape(x, y, [3, 2, 1])
@@ -1,106 +1,131 @@
 import re
+import os
+import sys
+import json
+from contextlib import redirect_stdout
+import time
+import logging

 import numpy as np
 import tvm
 from tvm import te, tir, auto_scheduler, topi
-import os
-import json
-import logging

 USE_CACHE = True
+logging.basicConfig()
 logger = logging.getLogger('InfiniTensor')
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)


 def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
                  tvm_code, func_name, nnet_expression: str,
-                 nnet_simplified_expression: str, hash_code=None):
+                 nnet_simplified_expression: str, hash_code: str = None):
     assert len(input_tensors) == len(input_dtypes)

-    logging.debug(f'Work on hash {hash_code}')
+    logger.debug(f'Work on hash {hash_code}')
    dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))

     if not os.path.exists(dir_name):
         os.makedirs(dir_name)

     so_fn = os.path.join(dir_name, f"{func_name}.so")
     config_fn = os.path.join(dir_name, "config_so.json")
-
-    print("Generating Ansor op: ")
-    print(tvm_code)
-
-    print("Input shape: ")
-    print(input_tensors)
-    print("Output shape: ")
-    print(output_tensor)
+    desc_fn = os.path.join(dir_name, "desc.txt")
+    log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
+    out_fn = os.path.join(dir_name, "out.txt")
+
+    logger.debug(f"Generating Ansor op: {tvm_code}")
+    logger.debug(f"Input shape: {input_tensors}")
+    logger.debug(f"Output shape: {output_tensor}")

     if USE_CACHE and hash_code is not None:
         if os.path.exists(dir_name) and \
                 os.path.exists(so_fn) and \
                 os.path.exists(config_fn):
             print(f"Use cache in {dir_name}")
             with open(config_fn, "r") as config_fin:
                 config = json.loads(config_fin.read().strip())
                 conv_time = config["conv_time"]
-            logger.debug(f'Find tuning log for {hash_code}')
+            logger.info(f'Find tuning log for {hash_code} in {so_fn}')
             return so_fn, conv_time

+    logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
+
+    time_start = time.perf_counter()
+    # Print descriptions of the task
+    if USE_CACHE and hash_code is not None:
+        with redirect_stdout(open(desc_fn, "w")):
+            print("====NNET tensor expression====")
+            print(nnet_expression+"\n")
+            print("====NNET simplified tensor expression====")
+            print(nnet_simplified_expression+"\n")
+            print("====TVM compute====")
+            print(tvm_code+"\n")
+            print("Input shape: ", input_tensors)
+            print("Output shape: ", output_tensor)
+
     @auto_scheduler.register_workload(func_name)
     def compute():
         _locals = locals()
-        exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
+        exec(tvm_code, {'tvm': tvm, 'te': te,
+             'tir': tir, 'topi': topi}, _locals)
         return _locals['ret']

     target = tvm.target.Target("cuda")

     task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)

-    # Inspect the computational graph
-    print("Computational DAG:")
-    print(task.compute_dag)
+    with redirect_stdout(open(out_fn, 'w')):
+        # Inspect the computational graph
+        print("Computational DAG:")
+        print(task.compute_dag)

-    log_file = f"ansor_{func_name}_log.json"
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
-    tune_option = auto_scheduler.TuningOptions(
-        num_measure_trials=10,
-        runner=measure_ctx.runner,
-        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        verbose=2,
-    )
+        measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
+        tune_option = auto_scheduler.TuningOptions(
+            num_measure_trials=10,
+            runner=measure_ctx.runner,
+            measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
+            verbose=2,
+        )

-    # Run auto-tuning (search)
-    task.tune(tune_option)
-    # Apply the best schedule
-    sch, args = task.apply_best(log_file)
+        # Run auto-tuning (search)
+        task.tune(tune_option)
+        # Apply the best schedule
+        sch, args = task.apply_best(log_fn)

-    # Kill the measurement process
-    del measure_ctx
-
-    func = tvm.build(sch, args, target, name=func_name)
-    func.export_library(so_fn)
-
-    ctx = tvm.cuda(0)
-    input_a = []
-    for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
-        a_np = np.random.uniform(size=shape).astype(dtype)
-        input_a.append(tvm.nd.array(a_np, ctx))
-    a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
-    func(a_out, *input_a)
-    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
-    conv_time = evaluator(a_out, *input_a).mean * 1e3
+    time_end = time.perf_counter()
+
+    # Kill the measurement process
+    del measure_ctx
+
+    func = tvm.build(sch, args, target, name=func_name)
+    func.export_library(so_fn)
+
+    ctx = tvm.cuda(0)
+    input_a = []
+    for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
+        a_np = np.random.uniform(size=shape).astype(dtype)
+        input_a.append(tvm.nd.array(a_np, ctx))
+    a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
+    func(a_out, *input_a)
+    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
+    conv_time = evaluator(a_out, *input_a).mean * 1e3

-    print("====NNET tensor expression====")
-    print(nnet_expression+"\n")
-    print("====NNET simplified tensor expression====")
-    print(nnet_simplified_expression+"\n")
-    print("====Time====")
-    print(conv_time)

     if USE_CACHE and hash_code is not None:
         with open(config_fn, "w") as config_fout:
             config_fout.write(json.dumps({
                 "conv_time": conv_time,
                 "tuning_time": time_end - time_start,
                 "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
             }, ensure_ascii=False, indent=2))

     return so_fn, conv_time

+# Read arguments from pipe, which is redirected to stdin.
+# Write generated library path to pipe.
+
+
+def pipe_gen(fd: int):
+    args = json.load(sys.stdin)  # read from pipe
+    # print(args, f'fd={fd}')
+    ret = gen_ansor_so(**args)
+    with os.fdopen(fd, 'w') as f:
+        print(ret[0], file=f, end='')  # write to pipe
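With caching enabled, each generated kernel lives under .cache/generated_kernels/&lt;hash&gt;/ together with its tuning log, task description, and timing metadata. A sketch of a direct call; the `tvm_code` string here is a hypothetical one-operator compute written only to show the expected arguments (real calls arrive through `pipe_gen` via stdin):

```python
# Sketch of calling gen_ansor_so directly; the compute string must define
# `ret`, which compute() above returns after exec'ing the code.
tvm_code = (
    "A = te.placeholder((16, 16), name='A', dtype='float32')\n"
    "B = topi.nn.relu(A)\n"
    "ret = [A, B]\n"
)
so_path, ms = gen_ansor_so(
    input_tensors=[[16, 16]], input_dtypes=["float32"],
    output_tensor=[16, 16], output_dtype="float32",
    tvm_code=tvm_code, func_name="relu_demo",
    nnet_expression="<expr>", nnet_simplified_expression="<expr>",
    hash_code="demo",
)
print(so_path, ms)  # path to the built .so and measured kernel time in ms
```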
@@ -0,0 +1,7 @@
+import backend
+from backend import *
+import sys
+
+sys.path.extend(__path__)
+
+print("import backend: {}".format(backend))
@@ -0,0 +1,941 @@
+import backend
+import onnx
+from onnx import (
+    ModelProto,
+    TensorProto,
+    NodeProto,
+    AttributeProto,
+    TensorShapeProto,
+    ValueInfoProto,
+)
+from onnx.helper import (
+    make_node,
+    make_tensor_value_info,
+    make_tensor,
+    make_graph,
+    make_model,
+)
+from onnx.checker import (
+    check_graph,
+    check_model,
+    check_node,
+    check_value_info,
+    check_tensor,
+)
+from onnx.shape_inference import infer_shapes
+from onnx.numpy_helper import to_array
+from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
+from functools import reduce
+
+
+class OnnxStub:
+    """
+    The Onnx model imported into infinitensor.
+    It can be generated from an Onnx model object.
+    """
+
+    # inputs: Dict[str, backend.Tensor] = {}
+    # outputs: Dict[str, backend.Tensor] = {}
+    initializer: Dict[int, TensorProto] = {}
+    # handler: backend.GraphHandler
+
+    # def __init__(self, model: ModelProto, runtime):
+    #     model = infer_shapes(model)
+    #     self.handler = backend.GraphHandler(runtime)
+
+    #     tensors: Dict[str, backend.Tensor] = dict()
+    #     data: Dict[str, TensorProto] = dict()
+
+    #     for input in model.graph.input:
+    #         dims = _take_shape_dim(input.type.tensor_type.shape)
+    #         tensors[input.name] = self.handler.tensor(
+    #             dims, input.type.tensor_type.elem_type
+    #         )
+
+    #     for output in model.graph.output:
+    #         dims = _take_shape_dim(output.type.tensor_type.shape)
+    #         tensors[output.name] = self.handler.tensor(
+    #             dims, output.type.tensor_type.elem_type
+    #         )
+
+    #     for initializer in model.graph.initializer:
+    #         dims = [d for d in initializer.dims]
+    #         tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
+    #         data[initializer.name] = initializer
+
+    #     for node in model.graph.node:
+    #         if node.op_type == "Conv":
+    #             attributes = _parse_attribute(
+    #                 node,
+    #                 {
+    #                     "dilations": [1, 1],
+    #                     "pads": [0, 0, 0, 0],
+    #                     "strides": [1, 1],
+    #                 },
+    #             )
+    #             (d, p, s) = (
+    #                 attributes[name] for name in ["dilations", "pads", "strides"]
+    #             )
+    #             if p[0] != p[2] or p[1] != p[3]:
+    #                 adapt = "{}-adapt".format(node.output[0])
+    #                 tensors[adapt] = self.handler.pad(
+    #                     tensors[node.input[0]], None, p, [-2, -1]
+    #                 )
+    #                 p = [0, 0, 0, 0]
+    #             else:
+    #                 adapt = node.input[0]
+
+    #             if len(node.input) > 2:
+    #                 bias = "{}-bias".format(node.output[0])
+    #                 reshape = "{}-reshape".format(node.output[0])
+    #                 tensors[bias] = self.handler.conv(
+    #                     tensors[adapt],
+    #                     tensors[node.input[1]],
+    #                     None,
+    #                     p[0],
+    #                     p[1],
+    #                     s[0],
+    #                     s[1],
+    #                     d[0],
+    #                     d[1],
+    #                 )
+    #                 tensors[reshape] = self.handler.reshape(
+    #                     tensors[node.input[2]],
+    #                     None,
+    #                     [
+    #                         1,
+    #                         reduce(
+    #                             lambda acc, x: acc * x,
+    #                             _search_shape(model, node.input[2]),
+    #                         ),
+    #                         1,
+    #                         1,
+    #                     ],
+    #                 )
+    #                 tensors[node.output[0]] = self.handler.add(
+    #                     tensors[bias],
+    #                     tensors[reshape],
+    #                     tensors.get(node.output[0]),
+    #                 )
+    #             else:
+    #                 tensors[node.output[0]] = self.handler.conv(
+    #                     tensors[adapt],
+    #                     tensors[node.input[1]],
+    #                     tensors.get(node.output[0]),
+    #                     p[0],
+    #                     p[1],
+    #                     s[0],
+    #                     s[1],
+    #                     d[0],
+    #                     d[1],
+    #                 )
+    #         elif node.op_type == "ConvTranspose":
+    #             attributes = _parse_attribute(
+    #                 node,
+    #                 {
+    #                     "dilations": [1, 1],
+    #                     "pads": [0, 0],
+    #                     "strides": [1, 1],
+    #                     "output_padding": [0, 0],
+    #                 },
+    #             )
+    #             (d, p, s, op) = (
+    #                 attributes[name]
+    #                 for name in ["dilations", "pads", "strides", "output_padding"]
+    #             )
+    #             tensors[node.output[0]] = self.handler.convTransposed2d(
+    #                 tensors[node.input[0]],
+    #                 tensors[node.input[1]],
+    #                 tensors.get(node.output[0]),
+    #                 p[0],
+    #                 p[1],
+    #                 s[0],
+    #                 s[1],
+    #                 d[0],
+    #                 d[1],
+    #                 op[0],
+    #                 op[1],
+    #             )
+    #         elif node.op_type == "MatMul":
+    #             tensors[node.output[0]] = self.handler.matmul(
+    #                 tensors[node.input[0]],
+    #                 tensors[node.input[1]],
+    #                 tensors.get(node.output[0]),
+    #                 False,
+    #                 False,
+    #                 None,
+    #                 backend.ActType.Linear,
+    #             )
+    #         elif node.op_type == "Gemm":
+    #             attributes = _parse_attribute(
+    #                 node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
+    #             )
+    #             (alpha, beta, transA, transB) = (
+    #                 attributes[name] for name in ["alpha", "beta", "transA", "transB"]
+    #             )
+    #             # FIXME unsupport attributes: `alpha` `beta`
+    #             assert alpha == 1.0
+    #             assert beta == 1.0
+    #             tensors[node.output[0]] = self.handler.matmul(
+    #                 tensors[node.input[0]],
+    #                 tensors[node.input[1]],
+    #                 tensors.get(node.output[0]),
+    #                 transA == 1,
+    #                 transB == 1,
+    #                 tensors[node.input[2]] if len(node.input) > 2 else None,
+    #                 backend.ActType.Linear,
+    #             )
+    #         elif node.op_type == "BatchNormalization":
+    #             (input, mean, var, scale, bias) = (
+    #                 tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
+    #             )
+    #             output = tensors.get(node.output[0])
+    #             attributes = _parse_attribute(
+    #                 node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
+    #             )
+    #             (momentum, eps, training) = (
+    #                 attributes[name]
+    #                 for name in ["momentum", "epsilon", "training_mode"]
+    #             )
+    #             tensors[node.output[0]] = self.handler.batchNorm(
+    #                 input, output, mean, var, scale, bias, momentum, eps, training != 0
+    #             )
+    #         elif node.op_type == "MaxPool":
+    #             attributes = _parse_attribute(
+    #                 node,
+    #                 {
+    #                     "kernel_shape": None,
+    #                     "dilations": [1, 1],
+    #                     "pads": [0, 0, 0, 0],
+    #                     "strides": [1, 1],
+    #                 },
+    #             )
+    #             (k, d, p, s) = (
+    #                 attributes[name]
+    #                 for name in ["kernel_shape", "dilations", "pads", "strides"]
+    #             )
+    #             if p[0] != p[2] or p[1] != p[3]:
+    #                 adapt = "{}-adapt".format(node.output[0])
+    #                 tensors[adapt] = self.handler.pad(
+    #                     tensors.get(node.input[0]), None, p, [-2, -1]
+    #                 )
+    #                 tensors[node.output[0]] = self.handler.maxPool(
+    #                     tensors[adapt],
+    #                     tensors.get(node.output[0]),
+    #                     k[0],
+    #                     k[1],
+    #                     d[0],
+    #                     d[1],
+    #                     0,
+    #                     0,
+    #                     s[0],
+    #                     s[1],
+    #                 )
+    #             else:
+    #                 tensors[node.output[0]] = self.handler.maxPool(
+    #                     tensors[node.input[0]],
+    #                     tensors.get(node.output[0]),
+    #                     k[0],
+    #                     k[1],
+    #                     d[0],
+    #                     d[1],
+    #                     p[0],
+    #                     p[1],
+    #                     s[0],
+    #                     s[1],
+    #                 )
+    #         elif node.op_type == "AveragePool":
+    #             attributes = _parse_attribute(
+    #                 node,
+    #                 {
+    #                     "kernel_shape": None,
+    #                     "pads": [0, 0, 0, 0],
+    #                     "strides": [1, 1],
+    #                 },
+    #             )
+    #             (k, p, s) = (
+    #                 attributes[name] for name in ["kernel_shape", "pads", "strides"]
+    #             )
+    #             if p[0] != p[2] or p[1] != p[3]:
+    #                 adapt = "{}-adapt".format(node.output[0])
+    #                 tensors[adapt] = self.handler.pad(
+    #                     tensors.get(node.input[0]), None, p, [-2, -1]
+    #                 )
+    #                 tensors[node.output[0]] = self.handler.avgPool(
+    #                     tensors[adapt],
+    #                     tensors.get(node.output[0]),
+    #                     k[0],
+    #                     k[1],
+    #                     1,
+    #                     1,
+    #                     0,
+    #                     0,
+    #                     s[0],
+    #                     s[1],
+    #                 )
+    #             else:
+    #                 tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# 1,
|
||||
# 1,
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# elif node.op_type == "GlobalAveragePool":
|
||||
# [_, _, h, w] = _search_shape(model, node.input[0])
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# h,
|
||||
# w,
|
||||
# 1,
|
||||
# 1,
|
||||
# 0,
|
||||
# 0,
|
||||
# 1,
|
||||
# 1,
|
||||
# )
|
||||
# elif node.op_type == "Add":
|
||||
# tensors[node.output[0]] = self.handler.add(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sub":
|
||||
# tensors[node.output[0]] = self.handler.sub(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Mul":
|
||||
# tensors[node.output[0]] = self.handler.mul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Div":
|
||||
# tensors[node.output[0]] = self.handler.div(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Pow":
|
||||
# tensors[node.output[0]] = self.handler.pow(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Relu":
|
||||
# tensors[node.output[0]] = self.handler.relu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sigmoid":
|
||||
# tensors[node.output[0]] = self.handler.sigmoid(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Tanh":
|
||||
# tensors[node.output[0]] = self.handler.tanh(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Softmax":
|
||||
# tensors[node.output[0]] = self.handler.softmax(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Abs":
|
||||
# tensors[node.output[0]] = self.handler.abs(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Shape":
|
||||
# tensors[node.output[0]] = self.handler.shape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Identity":
|
||||
# tensors[node.output[0]] = self.handler.identity(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Flatten":
|
||||
# # FIXME axis must be 1
|
||||
# axis = next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"), None
|
||||
# )
|
||||
# assert axis == None or axis == 1
|
||||
# tensors[node.output[0]] = self.handler.flatten(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "PRelu":
|
||||
# tensors[node.output[0]] = self.handler.pRelu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Clip":
|
||||
# tensors[node.output[0]] = self.handler.clip(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(_parse_data(data[node.input[1]]).__iter__(), None)
|
||||
# if len(node.input) > 1
|
||||
# else None,
|
||||
# next(_parse_data(data[node.input[2]]).__iter__(), None)
|
||||
# if len(node.input) > 2
|
||||
# else None,
|
||||
# )
|
||||
# elif node.op_type == "Transpose":
|
||||
# perm = next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.transpose(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# perm,
|
||||
# )
|
||||
# elif node.op_type == "Reshape":
|
||||
# dims = _search_shape(model, node.input[0])
|
||||
# size = reduce(lambda acc, x: acc * x, dims)
|
||||
# input_shape = _parse_data(data[node.input[1]])
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if x == 0:
|
||||
# input_shape[i] = dims[i]
|
||||
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
|
||||
# if temp < 0:
|
||||
# input_shape[input_shape.index(-1)] = size // -temp
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Squeeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = set(
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# assert all(input_shape[d] == 1 for d in axes)
|
||||
# output_shape = []
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if i not in axes:
|
||||
# output_shape.append(x)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# output_shape,
|
||||
# )
|
||||
# elif node.op_type == "Unsqueeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = (
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# for i in axes:
|
||||
# input_shape.insert(i, 1)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Concat":
|
||||
# tensors[node.output[0]] = self.handler.concat(
|
||||
# [tensors[name] for name in node.input],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "Split":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.split(
|
||||
# tensors[node.input[0]],
|
||||
# None,
|
||||
# next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"),
|
||||
# 0,
|
||||
# ),
|
||||
# len(node.output),
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# elif node.op_type == "Gather":
|
||||
# tensors[node.output[0]] = self.handler.gather(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "ReduceMean":
|
||||
# tensors[node.output[0]] = self.handler.reduce_mean(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "axes"),
|
||||
# None,
|
||||
# ),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
|
||||
# != 0,
|
||||
# )
|
||||
# elif node.op_type == "Slice":
|
||||
# tensors[node.output[0]] = self.handler.slice(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[2]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
|
||||
# )
|
||||
# elif node.op_type == "Pad":
|
||||
# tensors[node.output[0]] = self.handler.pad(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# )
|
||||
# elif node.op_type == "Dropout":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.dropout(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
|
||||
# _parse_data(data[node.input[1]])[0]
|
||||
# if len(node.input) > 1
|
||||
# else 0.5,
|
||||
# _parse_data(data[node.input[2]])[0]
|
||||
# if len(node.input) > 2
|
||||
# else False,
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# else:
|
||||
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# for name, obj in tensors.items():
|
||||
# tensor = data.get(name)
|
||||
# if tensor == None:
|
||||
# if any(input.name == name for input in model.graph.input):
|
||||
# self.inputs[name] = obj
|
||||
# else:
|
||||
# self.initializer[obj.fuid()] = tensor
|
||||
# if tensor.data_type == TensorProto.INT32:
|
||||
# obj.copyin_int32(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.INT64:
|
||||
# obj.copyin_int64(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.FLOAT:
|
||||
# obj.copyin_float(_parse_data(tensor))
|
||||
# else:
|
||||
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
|
||||
|
||||
# for output in model.graph.output:
|
||||
# self.outputs[output.name] = tensors[output.name]
|
||||
|
||||
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
|
||||
class Context:
|
||||
# saves object names, including tensors and operators
|
||||
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
|
||||
# counts the occurrence times of each operator for naming
|
||||
count_op: Dict[backend.OpType, int] = dict()
|
||||
# counts input and output tensors for naming
|
||||
count_in, count_out = 0, 0
|
||||
# saves nodes (operators)
|
||||
nodes: List[NodeProto] = []
|
||||
# saves global input tensors
|
||||
inputs: List[ValueInfoProto] = []
|
||||
# saves global output tensors
|
||||
outputs: List[ValueInfoProto] = []
|
||||
# saves global input tensors
|
||||
initializers: List[TensorProto] = []
|
||||
|
||||
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
|
||||
ty = op.op_type()
|
||||
name = "{}_{}".format(ty.name, op.guid())
|
||||
self.names[op] = name
|
||||
self.count_op[ty] = self.count_op.get(ty, 0) + 1
|
||||
return ty, name
|
||||
|
||||
def push_output(self, name: str, tensor: backend.Tensor) -> str:
|
||||
self.names[tensor] = name
|
||||
if not tensor.has_target():
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.outputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_input(
|
||||
self, tensor: backend.Tensor, init: Optional[TensorProto]
|
||||
) -> str:
|
||||
name = self.names.get(tensor)
|
||||
# means that this input is a global input
|
||||
if name is None:
|
||||
self.count_in += 1
|
||||
name = "input_{}".format(tensor.guid())
|
||||
self.names[tensor] = name
|
||||
if init != None:
|
||||
init.name = name
|
||||
self.initializers.append(init)
|
||||
else:
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.inputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_data_input(
|
||||
self,
|
||||
node_name: str,
|
||||
attr_name: str,
|
||||
elem_type: int,
|
||||
shape: Sequence[int],
|
||||
vals: Any,
|
||||
) -> str:
|
||||
name = "{}_{}".format(node_name, attr_name)
|
||||
tensor = make_tensor(name, elem_type, shape, vals)
|
||||
check_tensor(tensor)
|
||||
self.initializers.append(tensor)
|
||||
return name
|
||||
|
||||
def push_node(self, node: NodeProto) -> None:
|
||||
# check_node(node)
|
||||
self.nodes.append(node)
|
||||
|
||||
def build(self, name: str) -> ModelProto:
|
||||
graph = make_graph(
|
||||
self.nodes, name, self.inputs, self.outputs, self.initializers
|
||||
)
|
||||
# check_graph(graph)
|
||||
|
||||
model = make_model(graph)
|
||||
# check_model(model)
|
||||
|
||||
return model
|
||||
|
||||
# 拓扑排序
|
||||
if not g.topo_sort():
|
||||
raise Exception("Sorting fails")
|
||||
|
||||
ops = g.operators() # 图中所有算子(节点)
|
||||
|
||||
ctx = Context()
|
||||
|
||||
for op in ops:
|
||||
ty, name = ctx.name_op(op)
|
||||
inputs = [
|
||||
ctx.push_input(it, self.initializer.get(it.fuid()))
|
||||
for it in op.inputs()
|
||||
]
|
||||
outputs = [
|
||||
ctx.push_output("{}_{}_{}".format(
|
||||
name, i, tensor.guid()), tensor)
|
||||
for (i, tensor) in enumerate(op.outputs())
|
||||
]
|
||||
if ty == backend.OpType.Conv:
|
||||
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
group=op.inputs()[0].shape()[
|
||||
1] // op.inputs()[1].shape()[1],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTrans:
|
||||
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
|
||||
op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTransNHWC:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MemBound:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Membound",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Matmul:
|
||||
# transA, transB = backend.matmul_attrs_of(op)
|
||||
# HACK: recover this
|
||||
transA, transB = False, False
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Gemm", inputs, outputs, name, transA=transA, transB=transB
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.BatchNorm:
|
||||
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
|
||||
momentum, eps, training = backend.batch_norm_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"BatchNormalization",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
epsilon=eps,
|
||||
momentum=momentum,
|
||||
training_mode=training,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MaxPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
dilations=[dh, dw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.AvgPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"AveragePool",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty in [
|
||||
backend.OpType.Add,
|
||||
backend.OpType.Sub,
|
||||
backend.OpType.Mul,
|
||||
backend.OpType.Div,
|
||||
backend.OpType.Pow,
|
||||
backend.OpType.Relu,
|
||||
backend.OpType.Sigmoid,
|
||||
backend.OpType.Tanh,
|
||||
backend.OpType.Softmax,
|
||||
backend.OpType.Abs,
|
||||
backend.OpType.Identity,
|
||||
backend.OpType.PRelu,
|
||||
]:
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Flatten:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Transpose:
|
||||
perm = backend.transpose_permute_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, perm=perm))
|
||||
elif ty == backend.OpType.Reshape:
|
||||
shape = backend.reshape_shape_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"shape",
|
||||
TensorProto.INT64,
|
||||
[len(shape)],
|
||||
shape,
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Concat:
|
||||
axis = backend.concat_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.Split:
|
||||
axis = backend.split_axis_of(op)
|
||||
num_outputs = len(outputs)
|
||||
split = op.inputs()[0].shape()[axis] // num_outputs
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"split",
|
||||
TensorProto.INT64,
|
||||
[len(outputs)],
|
||||
[split for _ in range(0, num_outputs)],
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
axis=axis,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Gather:
|
||||
axis = backend.gather_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.ReduceMean:
|
||||
axes, keepdims = backend.reduce_mean_attrs_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "axes", TensorProto.INT64, [len(axes)], axes
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(ty.name, inputs, outputs,
|
||||
name, keepdims=keepdims)
|
||||
)
|
||||
elif ty == backend.OpType.Slice:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Pad:
|
||||
pads = backend.pad_pads_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "pads", TensorProto.INT64, [len(pads)], pads
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
# elif ty == backend.OpType.Clip:
|
||||
# min, max = backend.clip_attrs_of(op)
|
||||
# if min != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# if max != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
else:
|
||||
raise Exception("Unsupported OpType", ty)
|
||||
|
||||
model = ctx.build(name)
|
||||
onnx.save(model, path)
|
||||
return model
|
||||
|
||||
# def init(self) -> None:
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# def optimize(self) -> None:
|
||||
# self.handler.optimize()
|
||||
|
||||
# def run(self) -> None:
|
||||
# self.handler.run()
|
||||
|
||||
|
||||
# def from_onnx(model: ModelProto, runtime):
|
||||
# stub = OnnxStub(model, runtime)
|
||||
# return stub.inputs, stub.outputs, stub.handler
|
||||
|
||||
|
||||
# def _search_shape(model: ModelProto, name: str) -> List[int]:
|
||||
# ans = (
|
||||
# next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.value_info
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.input
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# [int(d) for d in tensor.dims]
|
||||
# for tensor in model.graph.initializer
|
||||
# if tensor.name == name
|
||||
# )
|
||||
# )
|
||||
# return ans
|
||||
|
||||
|
||||
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
|
||||
# for attr in node.attribute:
|
||||
# if attr.name in attrs:
|
||||
# if attr.type == AttributeProto.INT:
|
||||
# attrs[attr.name] = attr.i
|
||||
# elif attr.type == AttributeProto.INTS:
|
||||
# attrs[attr.name] = attr.ints
|
||||
# elif attr.type == AttributeProto.FLOAT:
|
||||
# attrs[attr.name] = attr.f
|
||||
# elif attr.type == AttributeProto.STRING:
|
||||
# attrs[attr.name] = attr.s
|
||||
# elif attr.type == AttributeProto.TENSOR:
|
||||
# attrs[attr.name] = attr.t
|
||||
# else:
|
||||
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
|
||||
# return attrs
|
||||
|
||||
|
||||
# def _parse_data(tensor: TensorProto) -> List[Any]:
|
||||
# return to_array(tensor).flatten().tolist()
|
||||
|
||||
|
||||
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
|
||||
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
|
||||
|
||||
def export_onnx(g: backend.Graph, path: str) -> None:
|
||||
stub = OnnxStub()
|
||||
stub.to_onnx(g, path)
|
|
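The `export_onnx` entry point above is also what the C++ side calls back into (see the new `ffi_callback` file further down in this diff). A minimal sketch of driving the exporter by hand, assuming this fork's module layout and the `matmul`/`tensor` binding signatures shown in the pybind11 hunks below (dtype code 1 is ONNX `FLOAT`):

```python
from pyinfinitensor import backend
from pyinfinitensor.onnx import OnnxStub

runtime = backend.cpu_runtime()
handler = backend.GraphHandler(runtime)
# TensorType.Other marks ordinary activation tensors
a = handler.tensor([2, 3], 1, backend.TensorType.Other)
b = handler.tensor([3, 4], 1, backend.TensorType.Other)
handler.matmul(a, b, None, False, False, None, backend.ActType.Linear)
# getGraph() hands the underlying backend.Graph to the exporter
OnnxStub().to_onnx(handler.getGraph(), "matmul.onnx", name="demo")
```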
@@ -129,8 +129,9 @@ void GraphObj::dataMalloc() {
    }
}

Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
    return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
    return tensors.emplace_back(
        make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}

Tensor GraphObj::addTensor(const Tensor &tensor) {
@@ -1,10 +1,12 @@
#include "core/graph_handler.h"
#include "nnet/Visitor/Serializer.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"

@@ -19,8 +21,8 @@ namespace infini {

static DataType dtype_repr_convert(int);

Tensor GraphHandlerObj::tensor(Shape dims, int dtype) {
    return g->addTensor(std::move(dims), dtype_repr_convert(dtype));
Tensor GraphHandlerObj::tensor(Shape dims, int dtype, TensorType ttype) {
    return g->addTensor(std::move(dims), dtype_repr_convert(dtype), ttype);
}

Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,

@@ -291,6 +293,22 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
    }
}

TensorVec GraphHandlerObj::memBound(const TensorVec &inputs,
                                    const Tensor &output,
                                    const string &jsonString) {
    const auto &[expr, nnetInputs, execTime, hint] =
        nnet::Serializer().membundOpFromString(jsonString);
    if (output) {
        g->addOpWithOutputs<MemBoundObj>(std::move(inputs), TensorVec{output},
                                         nnetInputs, expr, execTime, hint);
        return {output};
    } else
        return g
            ->addOp<MemBoundObj>(std::move(inputs), TensorVec{nullptr},
                                 nnetInputs, expr, execTime, hint)
            ->getOutputs();
}

static DataType dtype_repr_convert(int dtype) {
    switch ((OnnxDType)dtype) {
    case OnnxDType::FLOAT:
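`memBound` lets a membound operator cross the Python/C++ boundary as a plain JSON string holding the serialized NNet expression. A hedged sketch of the intended round trip, assuming `op` is a MemBound operator taken from an optimized graph (both `membound_expr_of` and `memBound` are exported in the pybind11 hunk at the end of this diff):

```python
from pyinfinitensor import backend

def clone_membound(handler, op):
    # Serialize the NNet expression of an existing MemBound operator ...
    json_string = backend.membound_expr_of(op)
    # ... and rebuild an equivalent operator in another graph from the string;
    # passing None as the output lets the handler infer and create it
    return handler.memBound(op.inputs(), None, json_string)
```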
@@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"

namespace infini {

@@ -34,7 +35,8 @@ bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }

bool OperatorObj::isMemBoundOp() const {
    return type == OpType::MemBound || type == OpType::Activation ||
           type == OpType::Transpose;
           type == OpType::Transpose || type == OpType::Relu ||
           type == OpType::Tanh;
}

void OperatorObj::removePredecessors(const Operator &op) {

@@ -83,22 +85,29 @@ HashType OperatorObj::hash() const {

bool OperatorObj::checkValid(GraphObj *graph) {
    auto optShapes = inferShape();
    IT_ASSERT(optShapes);
    if (!optShapes) // shape inference failed
        return false;

    const vector<Shape> &shapes = *optShapes;
    IT_ASSERT(shapes.size() == outputs.size());
    if (shapes.size() != outputs.size())
        return false;
    if (graph) { // if graph != nullptr, outputs should be created
        auto dataTypes = inferDataType();
        for (size_t i = 0; i < outputs.size(); i++) {
            IT_ASSERT(!outputs[i], "Find empty output while operator creation");
            outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
            outputs[i] =
                graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
        }
    } else { // if outputs have been created, check their shapes
        for (size_t i = 0; i < shapes.size(); ++i) {
            if (shapes[i] != outputs[i]->getDims())
                IT_ASSERT(shapes[i] == outputs[i]->getDims());
            if (shapes[i] != outputs[i]->getDims()) {
                dbg(shapes[i], outputs[i]->getDims());
                return false;
            }
            IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
        }
    }
    return true;
@@ -2,9 +2,15 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>

#ifdef USE_CUDA
#include "cuda_profiler_api.h"
#endif

namespace infini {
void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
    if (!tune && profiling)

@@ -56,13 +62,36 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
        printProfilingData(totalTime, opTime, opCnt);
}

double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
    map<UidBaseType, bool> ctcMap; // compile-time computable
    // Skip static computation
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");
    for (auto &op : graph->getOperators()) {
        bool compileTimeComputable = true;
        for (auto input : op->getInputs()) {
            // FIXME: propagate the tensor type. Currently only the first
            // operator after the weights is compile-time computable.
            if (input->getTensorType() != TensorType::Initialized)
                compileTimeComputable = false;
        }
        ctcMap[op->getGuid()] = compileTimeComputable;
    }
    return ctcMap;
}

double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
                               bool allowEstimation,
                               bool ignoreMemboundOp) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    // Statistics
    double totalTime = 0;
    std::map<OpType, double> opTime;
    std::map<OpType, int> opCnt;
    // compile-time computable
    map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);

    for (auto &op : graph->getOperators()) {
        auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};

@@ -70,11 +99,21 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);

        PerfRecord record;
        // Tune the kernel if there is no record
        if (!perfData) {
        double time = -1e9;
        if (ctcMap[op->getGuid()]) { // Compile-time computable operators
            time = 0;
        } else if (op->getOpType() == OpType::Reshape) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
            time = 0;
        } else if (op->getOpType() == OpType::MemBound && allowEstimation) {
            time = as<MemBoundObj>(op)->getEstimatedTime();
        } else if (perfData) { // Tune the kernel if there is no record
            time = perfData->time;
        } else {
            // TODO: should tensors allocate automatically when their data is
            // accessed? Allocate memory for empty tensors and release it
            // after profiling.
            TensorVec allocatedTensors;
            for (auto t : op->getInputs())
                if (!t->hasData())

@@ -88,21 +127,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
            }

            // Profile operators and record the results
            record = kernel->tune(op, this);
            PerfRecord record = kernel->tune(op, this);
            time = record->time;
            perfEngine.setPerfData(perfKey, record);

            // Free allocated memory
            for (auto t : allocatedTensors)
                t->freeData();
        } else
            record = perfData;
        }

        double t = record->time;
        totalTime += t;
        totalTime += time;
        if (profiling) {
            op->print();
            printf(" op_time %lf\n", t);
            opTime[op->getOpType()] += t;
            printf(" op_time %lf\n", time);
            opTime[op->getOpType()] += time;
            opCnt[op->getOpType()]++;
        }
    }

@@ -160,4 +198,44 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,

string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }

double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
                                       int repeat) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    // compile-time computable
    map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
    vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");

    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
        if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
            kernels.emplace_back(op, kernel, perfData);
    }
    for (auto &[op, kernel, perfData] : kernels) {
        dbg(op);
    }
    double ret = timeit(
        [&]() {
            for (auto &[op, kernel, perfData] : kernels) {
                if (perfData)
                    kernel->compute(op, perfData, this);
                else
                    kernel->compute(op, this);
            }
        },
        [&]() { sync(); }, warmup, repeat);
    return ret;
}

} // namespace infini
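Both new entry points are exported to Python in the pybind11 hunk at the end of this diff. A short usage sketch, assuming `graph` is a `backend.Graph` whose weights are tagged `TensorType::Initialized`:

```python
from pyinfinitensor import backend

runtime = backend.cpu_runtime()
# Estimated time: no profiling output, estimate MemBound ops from their
# expressions (allowEstimation=True), but still charge them (ignore=False)
estimated = runtime.getPerfTime(graph, False, True, False)
# Wall-clock time of the non-compile-time-computable operators only
# (10 warmup runs, 100 timed repeats)
measured = runtime.timeNonCtcOperators(graph, 10, 100)
print(estimated, measured)
```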
@@ -1,6 +1,7 @@
#include "core/search_engine.h"
#include "core/hash.h"
#include "core/runtime.h"
#include "nnet/dbg.h"

#include <algorithm>
#include <iostream>

@@ -8,7 +9,17 @@

namespace infini {

void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
using MetaGraph = SearchEngine::MetaGraph;

SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
    : runtimeExec(runtime), mutator(mutator) {
    // Compare graphs by estimated time
    graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
        return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
    };
}

void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
    for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
        auto &node = metaGraph->nodes[i];
        std::cout << "id: " << i << std::endl;

@@ -32,8 +43,7 @@ Graph SearchEngine::run(const Graph graph) {
    IT_ASSERT(runtimeExec == graph->getRuntime());
    std::cout << "[INFO] original graph: " << std::endl;
    std::cout << graph->toString();
    std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
              << std::endl;
    std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;

    std::vector<Graph> partitions = partitionGraph(graph);

@@ -45,7 +55,6 @@ Graph SearchEngine::run(const Graph graph) {
        std::vector<Graph> candidates = search(subGraph);
        std::cout << "[INFO] size: " << candidates.size() << std::endl;
        IT_ASSERT(candidates.size() > 0);
        std::cout << subGraph->toString() << std::endl;
        std::vector<Graph> nextGraphs;
        for (auto lastGraph : bestGraphs) {
            for (auto thisGraph : candidates) {

@@ -65,9 +74,9 @@ Graph SearchEngine::run(const Graph graph) {
                nextGraphs.emplace_back(tmp);
            }
        }
        std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
            return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
        });
        dbg("===Num" + std::to_string(nextGraphs.size()));
        std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);

        if (nextGraphs.size() > GRAPH_SIZE) {
            nextGraphs.resize(GRAPH_SIZE);
        }

@@ -81,10 +90,22 @@ Graph SearchEngine::run(const Graph graph) {
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        std::cout << "bestGraph " << i << ":" << std::endl;
        std::cout << bestGraphs[i]->toString();
        std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
        std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
                  << std::endl;
    }

    // Fuse vertically and sort according to performance
    for (size_t i = 0; i < bestGraphs.size(); ++i) {
        // Debug
        bestGraphs[i] = fuseVertically(bestGraphs[i]);
    }
    std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);

    std::cout << "[INFO] best fused graph: " << std::endl;
    std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
              << std::endl;
    std::cout << bestGraphs[0] << std::endl;

    return bestGraphs[0];
}

@@ -102,9 +123,9 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
        }
    }

    sort(results.begin(), results.end(), [&](Graph x, Graph y) {
        return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
    }); // compare with perf time
    // compare with perf time
    dbg("===Num" + std::to_string(results.size()));
    std::sort(results.begin(), results.end(), graphTimeComparer);
    if (results.size() > GRAPH_SIZE) {
        results.resize(GRAPH_SIZE);
    }

@@ -112,9 +133,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}

// Build metagraph with a graph, each operator is a node.
std::shared_ptr<SearchEngine::MetaGraph>
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
    auto metaGraph = std::make_shared<MetaGraph>();
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
    auto metaGraph = make_ref<MetaGraphObj>();

    int numOps = graph->getOperators().size();
    std::vector<int> cnt(numOps, 0);

@@ -123,7 +143,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
    std::vector<int> q(0);
    for (size_t i = 0; i < graph->getOperators().size(); i++) {
        auto &op = graph->getOperators()[i];
        MetaGraph::Node node;
        MetaGraphObj::Node node;
        std::vector<Operator> ops;
        ops.emplace_back(op);
        node.graph = make_ref<GraphObj>(runtimeExec, ops);

@@ -157,9 +177,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {

// Build a metagraph with graph and a plan, a plan is which ops should be a
// node.
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
    const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
    const std::vector<int> &plan) {
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
                                               const std::vector<int> &plan) {
    int numGroups = 0;
    for (auto i : plan) {
        if (i > numGroups) {

@@ -172,12 +191,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
        groups[plan[i]].emplace_back(i);
    }

    auto resultMetaGraph = make_ref<MetaGraph>();
    auto resultMetaGraph = make_ref<MetaGraphObj>();
    for (auto &group : groups) {
        std::vector<Operator> ops;
        std::unordered_set<int> preSet, sucSet;
        for (auto id : group) {
            MetaGraph::Node node;
            MetaGraphObj::Node node;
            for (auto op : metaGraph->nodes[id].graph->getOperators()) {
                ops.emplace_back(op);
            }

@@ -204,8 +223,7 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
}

// Search how to merge multiple ops.
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
    IT_ASSERT(metaGraph != nullptr);
    std::vector<int> plan(metaGraph->nodes.size());
    for (size_t i = 0; i < plan.size(); i++) {

@@ -222,7 +240,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
    std::unordered_set<HashType> planSet;
    searchMergeDfs(metaGraph, plan, frontier, plans, planSet);

    std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
    vector<MetaGraph> metaGraphs;
    for (auto &curPlan : plans) {
        metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
    }

@@ -230,8 +248,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
}

// DFS impl for search merge.
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
                                  std::vector<int> &plan,
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
                                  std::vector<int> &frontier,
                                  std::vector<std::vector<int>> &plans,
                                  std::unordered_set<uint64_t> &planSet) {

@@ -320,14 +337,16 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
}

// Search mutation for each compute op.
std::vector<Graph> SearchEngine::searchMutation(
    const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
    std::vector<Graph> graphs = {nullptr};
    // Append a node to all existing candidates
    for (auto &node : metaGraph->nodes) {
        std::vector<Graph> nextGraphs;
        if (node.type == 1) { // If it has computing OPs
            auto mutatedGraphs = mutator->run(node.graph);
            // HACK: only try the first two candidates for debugging
            if (mutatedGraphs.size() > 2)
                mutatedGraphs.resize(2);
            for (auto graph : graphs) {
                for (auto mutatedGraph : mutatedGraphs) {
                    std::vector<Operator> ops;

@@ -360,9 +379,8 @@ std::vector<Graph> SearchEngine::searchMutation(
    for (auto g : nextGraphs) {
        g->dataMalloc();
    }
    std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
        return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
    });
    dbg("===Num" + std::to_string(nextGraphs.size()));
    std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
    if (nextGraphs.size() > GRAPH_SIZE) {
        nextGraphs.resize(GRAPH_SIZE);
    }

@@ -372,7 +390,7 @@ std::vector<Graph> SearchEngine::searchMutation(
}

bool SearchEngine::isMultiBranchMergable(const Graph graph) {
    return mutationEngine->isMultiBranchMergable(graph);
    return mutator->isMultiBranchMergable(graph);
}

// Split a graph into multiple independent graphs. Search engine will search for

@@ -438,4 +456,60 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
    return partitions;
}

double SearchEngine::getEstimatedGraphPerf(Graph graph) {
    return runtimeExec->getPerfTime(graph, false, true, true);
}

Graph SearchEngine::fuseVertically(const Graph &graph) {
    std::unordered_map<UidBaseType, int> visitTime;
    std::vector<Operator> ops;

    graph->topo_sort();
    int cnt = 0;
    for (auto op : graph->getOperators()) {
        // Skip visited OP
        if (visitTime.find(op->getGuid()) != visitTime.end()) {
            continue;
        }
        // Skip compute OP and multi-input/output OP
        if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
                                    op->getSuccessors().size() != 1)) {
            visitTime.emplace(op->getGuid(), ++cnt);
            ops.emplace_back(op);
            continue;
        }
        vector<Operator> chainOps;
        visitTime.emplace(op->getGuid(), ++cnt);

        vector<Operator> tmp;
        auto cur = op;
        while (cur->getPredecessors().size() == 1 &&
               cur->getPredecessors()[0]->isMemBoundOp()) {
            cur = cur->getPredecessors()[0];
            tmp.emplace_back(cur);
            visitTime.emplace(cur->getGuid(), cnt);
        }
        for (int i = tmp.size() - 1; i >= 0; i--) {
            chainOps.emplace_back(tmp[i]);
        }
        chainOps.emplace_back(op);
        cur = op;
        while (cur->getSuccessors().size() == 1 &&
               cur->getSuccessors()[0]->isMemBoundOp()) {
            cur = cur->getSuccessors()[0];
            chainOps.emplace_back(cur);
            visitTime.emplace(cur->getGuid(), cnt);
        }
        make_ref<GraphObj>(runtimeExec, chainOps)->print();

        Graph optGraph =
            mutator->fuseVertically(make_ref<GraphObj>(runtimeExec, chainOps));
        for (auto op : optGraph->getOperators()) {
            ops.emplace_back(op);
        }
    }

    return make_ref<GraphObj>(runtimeExec, ops);
}

} // namespace infini
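The new comparer means every sort in the search ranks candidates by estimated rather than measured time. Driving the search from Python, using only bindings that appear in this diff (a sketch; how `graph` is built is assumed):

```python
from pyinfinitensor import backend

runtime = backend.cpu_runtime()
mutator = backend.NMutator(backend.NMutatorMode.RuleBased)
engine = backend.SearchEngine(runtime, mutator)
best = engine.run(graph)  # graph: a backend.Graph built elsewhere
best.print()              # inspect the best fused candidate
```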
@@ -8,12 +8,14 @@

namespace infini {

TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
                     TensorType tensorType)
    : TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
      _size(shape.empty()
                ? 0
                : std::accumulate(shape.begin(), shape.end(), 1,
                                  [](auto acc, auto x) { return acc * x; })) {}
                                  [](auto acc, auto x) { return acc * x; })),
      tensorType(tensorType) {}

string TensorObj::toString() const {
    // Convert data pointer to string

@@ -24,8 +26,8 @@ string TensorObj::toString() const {
        ss << "nullptr data";
    string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
                 std::to_string(fuid) + ", shape " + vecToString(shape) +
                 ", dtype " + dtype.toString() + ", " + runtime->toString() +
                 ", " + ss.str() + "\n";
                 ", dtype " + dtype.toString() + ", tensorType " +
                 std::to_string(enum_to_underlying(tensorType));
    vector<UidBaseType> targetGuids;
    for (const auto &op : targets)
        targetGuids.emplace_back(op.lock()->getGuid());

@@ -34,6 +36,7 @@ string TensorObj::toString() const {
    else
        ret += ", source None";
    ret += ", targets " + vecToString(targetGuids);
    ret += ", " + runtime->toString() + ", " + ss.str();
    return ret;
}

@@ -64,12 +67,19 @@ vector<size_t> TensorObj::getStride() const {

void TensorObj::printData() const {
    IT_ASSERT(data != nullptr);
    if (!runtime->isCpu())
        IT_TODO_HALT();
    void *ptr = nullptr;
    Blob buffer;
    if (!runtime->isCpu()) { // copy data to main memory
        buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
        runtime->copyBlobToCPU(buffer->getPtr<void *>(),
                               getRawDataPtr<void *>(), getBytes());
        ptr = buffer->getPtr<void *>();
    } else
        ptr = data->getPtr<float *>();

#define TRY_PRINT(N)                                                           \
    if (dtype == DataType(N))                                                  \
        std::cout << dataToString<DT<N>::t>() << std::endl;
        std::cout << dataToString<DT<N>::t>(ptr) << std::endl;

    TRY_PRINT(0) // fmt: new line
    else TRY_PRINT(1) //
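With this change, `printData` stages device memory through a CPU blob first, so it also works for tensors living on a CUDA runtime; the Python binding is renamed to `print_data` in the FFI hunk below. A small sketch, assuming `t` is a `backend.Tensor` with 6 elements (e.g. shape `[2, 3]`):

```python
t.data_malloc()            # make sure the tensor owns a buffer
t.copyin_float([0.5] * 6)  # stage host data into it
t.print_data()             # copies back to a CPU blob first if the
                           # runtime is not a CPU runtime
```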
@@ -2,10 +2,56 @@
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "cuda_profiler_api.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef INFINI_USE_TVM
#include "tvm/runtime/device_api.h"
#endif
namespace infini {

CudaRuntimeObj::CudaRuntimeObj()
    : RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
      cudaGraphStatus(false) {
    checkCudnnError(cudnnCreate(&cudnn));
    checkCublasError(cublasCreate(&cublas));
    checkCudnnError(cudnnSetStream(cudnn, stream));
    checkCublasError(cublasSetStream(cublas, stream));
    // 10GB for Longformer
    // size_t longformerNum = 3lu * (1 << 30);
    workspaceSize = 7ll << 30; // 7 GB
    workspace = alloc(workspaceSize);
}

CudaRuntimeObj::~CudaRuntimeObj() {
    try {
        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
    } catch (const std::exception &e) {
        std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
    }
}

void CudaRuntimeObj::beginCudaGraphStreamCapture() {
    enum cudaStreamCaptureStatus pCaptureStatus;
    checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
    IT_ASSERT(pCaptureStatus == cudaStreamCaptureStatusNone);
    cudaGraphStatus = true;
    checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}

tuple<cudaGraphExec_t, size_t> CudaRuntimeObj::endCudaGraphStreamCapture() {
    cudaGraph_t cudaGraph;
    cudaGraphExec_t instance;
    checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
    cudaGraphStatus = false;
    size_t numCudaGraphNodes;
    checkCudaError(cudaGraphGetNodes(cudaGraph, nullptr, &numCudaGraphNodes));
    checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
    return {instance, numCudaGraphNodes};
}

void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();

@@ -75,4 +121,57 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }

string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }

double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    // compile-time computable
    map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
    vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");

    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
        if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
            kernels.emplace_back(op, kernel, perfData);
    }
    for (auto &[op, kernel, perfData] : kernels) {
        dbg(op);
    }

    // Init TVM stream
#ifdef INFINI_USE_TVM
    DLDevice tvm_device_id = {kDLCUDA, 0};
    auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
    tvm_device->SetStream(tvm_device_id, getStream());
#endif

    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
    }
    auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
    IT_ASSERT(numCudaGraphNodes == kernels.size(),
              std::to_string(numCudaGraphNodes) +
                  " != " + std::to_string(kernels.size()));
    return timeit(
        [&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
            checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
        },
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
        1000);
}

} // namespace infini
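`timeWithCudaGraph` captures the non-compile-time-computable kernels into a single CUDA graph and times its replay (1000 warmups, 1000 repeats), which removes per-kernel launch overhead from the measurement. From Python it is one call on the CUDA runtime, via the binding added in the pybind11 hunk below (a sketch; how the `cuda_runtime` instance and `graph` are obtained is assumed, since their construction is not part of this diff):

```python
# USE_CUDA builds only; the returned value is whatever timeit reports
replay_time = cuda_runtime.timeWithCudaGraph(graph)
print("CUDA-graph replay time:", replay_time)
```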
@@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>

namespace py = pybind11;

namespace infini {

namespace callback {

using namespace py::literals;

static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
    IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
    static auto exportONNXImpl =
        py::module_::import("infinitensor.if_onnx").attr("export_onnx");
    exportONNXImpl(graph, path);
}

} // namespace callback

} // namespace infini
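The callback resolves `infinitensor.if_onnx.export_onnx` lazily at first use, so the Python side only has to expose a function with that name and signature. A minimal counterpart (a sketch, mirroring the `export_onnx` defined in the Python hunk above; the exact package layout is an assumption):

```python
# infinitensor/if_onnx.py -- module path expected by the C++ callback
def export_onnx(graph, path: str) -> None:
    from pyinfinitensor.onnx import OnnxStub
    OnnxStub().to_onnx(graph, path)
```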
@ -1,9 +1,13 @@
|
|||
#include "core/graph_handler.h"
|
||||
#include "core/mutator.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "operators/batch_norm.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/membound.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/reduce_mean.h"
|
||||
|
@ -63,6 +67,7 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Conv)
|
||||
.VALUE(OpType, Matmul)
|
||||
.VALUE(OpType, ConvTrans)
|
||||
.VALUE(OpType, ConvTransNHWC)
|
||||
.VALUE(OpType, G2BMM)
|
||||
.VALUE(OpType, GBMM)
|
||||
.VALUE(OpType, Pad)
|
||||
|
@ -94,8 +99,13 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Abs)
|
||||
.VALUE(OpType, Resize)
|
||||
.VALUE(OpType, Dropout)
|
||||
.VALUE(OpType, MemBound)
|
||||
.export_values();
|
||||
|
||||
py::enum_<TensorType>(m, "TensorType")
|
||||
.VALUE(TensorType, Input)
|
||||
.VALUE(TensorType, Initialized)
|
||||
.VALUE(TensorType, Other);
|
||||
#undef VALUE
|
||||
}
|
||||
|
||||
|
@ -224,6 +234,10 @@ static vector<int> transpose_permute_of(Operator op) {
|
|||
return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
|
||||
}
|
||||
|
||||
static string membound_expr_of(Operator op) {
|
||||
return as<MemBoundObj>(op)->toJson();
|
||||
}
|
||||
|
||||
void export_functions(py::module &m) {
|
||||
#define FUNCTION(NAME) def(#NAME, &NAME)
|
||||
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
|
||||
|
@ -252,25 +266,37 @@ void export_functions(py::module &m) {
        .FUNCTION(transpose_permute_of)
        .FUNCTION(concat_axis_of)
        .FUNCTION(split_axis_of)
        .FUNCTION(gather_axis_of);
        .FUNCTION(gather_axis_of)
        .FUNCTION(membound_expr_of)
        .def("membound_hash_of",
             [](Operator op) { return as<MemBoundObj>(op)->getHash(); });
#undef FUNCTION
}

void init_graph_builder(py::module &m) {
    using Handler = GraphHandlerObj;

    py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
    py::class_<Object, Ref<Object>>(m, "_Object")
        .def("__str__", &Object::toString)
        .def("guid", &Object::getGuid);
    py::class_<RuntimeObj, Ref<RuntimeObj>>(m, "Runtime")
        .def("run", &RuntimeObj::run, "graph"_a, "tune"_a = false,
             "profiling"_a = false)
        .def("getPerfTime", &RuntimeObj::getPerfTime, "graph"_a, "profiling"_a,
             "allowEstimation"_a, "ignoreMemboundOp"_a)
        .def("timeNonCtcOperators", &RuntimeObj::timeNonCtcOperators);
    py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
               RuntimeObj>(m, "CpuRuntime");
#ifdef USE_CUDA
    py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
        m, "CudaRuntime");
    py::class_<CudaRuntimeObj, Ref<CudaRuntimeObj>, RuntimeObj>(m,
                                                                "CudaRuntime")
        .def("timeWithCudaGraph", &CudaRuntimeObj::timeWithCudaGraph);
#endif
#ifdef USE_BANG
    py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
        m, "BangRuntime");
#endif
    py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
    py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
        .def("fuid", &TensorObj::getFuid, policy::automatic)
        .def("shape", &TensorObj::getDims, policy::move)
        .def("copyin_float", &TensorObj::copyin<float>, policy::move)
@ -281,8 +307,10 @@ void init_graph_builder(py::module &m) {
        .def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
        .def("has_target", &TensorObj::hasTarget, policy::automatic)
        .def("src", &TensorObj::getSource, policy::move)
        .def("printData", &TensorObj::printData, policy::automatic);
    py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
        .def("print_data", &TensorObj::printData)
        .def("data_malloc", &TensorObj::dataMalloc)
        .def("getTensorType", &TensorObj::getTensorType);
    py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
        .def("op_type", &OperatorObj::getOpType, policy::automatic)
        .def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
             policy::reference)
@ -291,6 +319,9 @@ void init_graph_builder(py::module &m) {
             policy::reference);
    py::class_<Handler>(m, "GraphHandler")
        .def(py::init<Runtime>())
        .def(py::init<Graph>())
        .def("inputs", &Handler::inputs, policy::move)
        .def("outputs", &Handler::outputs, policy::move)
        .def("tensor", &Handler::tensor, policy::move)
        .def("conv", &Handler::conv, policy::move)
        .def("convTransposed2d", &Handler::convTransposed2d, policy::move)
@ -321,13 +352,47 @@ void init_graph_builder(py::module &m) {
        .def("reduce_mean", &Handler::reduceMean, policy::move)
        .def("slice", &Handler::slice, policy::move)
        .def("pad", &Handler::pad, policy::move)
        .def("memBound", &Handler::memBound, policy::move)
        .def("topo_sort", &Handler::topo_sort, policy::automatic)
        .def("optimize", &Handler::optimize, policy::automatic)
        .def("operators", &Handler::operators, policy::move)
        .def("data_malloc", &Handler::data_malloc, policy::automatic)
        .def("run", &Handler::run, policy::automatic);
        .def("run", &Handler::run, policy::automatic)
        .def("getGraph", &Handler::getGraph);
    py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
    py::enum_<NMutator::Mode>(m, "NMutatorMode")
        .value("RuleBased", NMutator::Mode::RuleBased);
    py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
        .def(py::init<NMutator::Mode>())
        .def(py::init<NMutator::Mode, vector<int>>())
        .def("run", &NMutator::run);
    py::class_<SearchEngine>(m, "SearchEngine")
        .def(py::init<Runtime, Ref<Mutator>>())
        .def("run", &SearchEngine::run);
    py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
        .def("tensors", &GraphObj::getTensors)
        .def("operators", &GraphObj::getOperators)
        .def("inputs", &GraphObj::getInputs)
        .def("outputs", &GraphObj::getOutputs)
        .def("print", &GraphObj::print)
        .def("topo_sort", &GraphObj::topo_sort);
}

#ifdef USE_CUDA
Graph getInfoGAN(int batch, Runtime runtime, int nLayers);
vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning);

void export_test_model(py::module &m) {
    m.def("runInfoGAN", &runInfoGAN);
    m.def("getInfoGAN", &getInfoGAN);
    m.def("getConvtransposedNHWC", &getConvtransposedNHWC);
    m.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
          "tuning"_a = false);
}
#endif

} // namespace infini

PYBIND11_MODULE(backend, m) {
@ -335,4 +400,5 @@ PYBIND11_MODULE(backend, m) {
    infini::export_values(m);
    infini::export_functions(m);
    infini::init_graph_builder(m);
    infini::export_test_model(m);
}
@ -0,0 +1,171 @@
#include "cuda/cuda_common.h"

using dtype = float;

__global__ void conv2dreduce_kernel_(float *__restrict__ input,
                                     float *__restrict__ bias,
                                     float *__restrict__ output,
                                     const bool PReLU, const int n, const int f,
                                     const int h, const int w, const int oh,
                                     const int ow, const int r, const int s,
                                     const int ph, const int pw, const int dh,
                                     const int dw, const int sh, const int sw) {
    // output shape: (n, oh, ow, f)
    // input shape: (n, h, w, f, r, s)
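    // Launch mapping (see conv2dreduce_kernel below): one thread block per
    // (n, f) pair and one thread per (oh, ow) output pixel, so oh * ow must
    // fit within the CUDA per-block thread limit.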
    int nid = blockIdx.x, fid = blockIdx.y;
    int hid = threadIdx.x, wid = threadIdx.y;
    const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
              nchunck = n * hchunk;
    float *nfinput = input + nid * nchunck + fid * fchunck;
    if (nid < n && fid < f && hid < oh && wid < ow) {
        float imm = 0.0;
        int ihst = hid * sh - ph;
        int iwst = wid * sw - pw;
        for (int ri = 0; ri < r; ++ri) {
            for (int si = 0; si < s; ++si) {
                int ihid = ihst + ri * dh;
                int iwid = iwst + si * dw;
                if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
                    imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
                             si);
                }
            }
        }
        if (bias) {
            imm += bias[fid];
        }
        if (PReLU) {
            imm = imm > 0.0 ? imm : 0.0;
        }
        output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
    }
}

__global__ void convTranspose2dreduce_kernel_(
    float *__restrict__ input, float *__restrict__ bias,
    float *__restrict__ output, const bool PReLU, const int n, const int f,
    const int h, const int w, const int oh, const int ow, const int r,
    const int s, const int ph, const int pw, const int dh, const int dw,
    const int sh, const int sw) {
    // assert dh = dw = 1
    int nid = blockIdx.x, fid = blockIdx.y;
    int hid = threadIdx.x, wid = threadIdx.y;
    const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
              nchunck = n * hchunk;
    float *nfinput = input + nid * nchunck + fid * fchunck;
    // view as conv, the true ph and pw
    int tph = r - ph - 1, tpw = s - pw - 1;
    int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
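    // The transposed convolution is evaluated as an ordinary convolution over
    // a virtually stride-dilated input of size (th, tw) with a flipped
    // kernel, which is why the loop below indexes r - ri - 1 / s - si - 1 and
    // only accepts positions divisible by the stride.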
    if (nid < n && fid < f && hid < oh && wid < ow) {
        float imm = 0.0;
        int ihst = hid - tph;
        int iwst = wid - tpw;
        for (int ri = 0; ri < r; ++ri) {
            for (int si = 0; si < s; ++si) {
                int ihid = ihst + r - ri - 1;
                int iwid = iwst + s - si - 1;
                if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
                    (ihid % sh == 0) && (iwid % sw == 0)) {
                    imm += *(nfinput + (ihid / sh) * hchunk +
                             (iwid / sw) * wchunk + ri * s + si);
                }
            }
        }
        if (bias) {
            imm += bias[fid];
        }
        if (PReLU) {
            imm = imm > 0.0 ? imm : 0.0;
        }
        output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
    }
}

// nhwrsc -> nhwc
__global__ void reduce_4x4(dtype *in, dtype *out, int act, const int N,
                           const int F, const int H, const int W, const int IH,
                           const int IW) {
// #define in_index(n, h, w, r, s, f) \
//     ((((((n)*IH + h) * IW + w) * R + r) * S + s) * F + f)
#define in_index(n, h, w, f, r, s) \
    ((((((n)*IH + h) * IW + w) * F + f) * R + r) * S + s)
#define out_index(n, h, w, f) (((((n)*H) + (h)) * W + (w)) * F + (f))
    const int R = 4, S = 4;
    const int n_tasks = N * F * H * W;
    int start = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
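    // Grid-stride loop: each thread walks the flattened N*F*H*W output space.
    // Specialized for the 4x4-kernel, stride-2 transposed conv: for a given
    // output (h, w) only kernel taps of matching parity contribute, hence the
    // r/s loops below start at (h + 1) & 1 and (w + 1) & 1 and step by 2.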
    for (int i = start; i < n_tasks; i += stride) {
        int t = i, n, f, h, w;
        f = t % F;
        t /= F;
        w = t % W;
        t /= W;
        h = t % H;
        t /= H;
        n = t;

        // unroll this 2-iter loop
        float sum = 0;
        int x, y;
        for (int r = (h + 1) & 1; r < R; r += 2) {
            x = (h + 1 - r) / 2;
            if (x >= 0 && x < IH) {
                for (int s = (w + 1) & 1; s < S; s += 2) {
                    y = (w + 1 - s) / 2;
                    if (y >= 0 && y < IW) {
                        sum += in[in_index(n, x, y, f, r, s)];
                        // if (i==0)
                        //     printf("TTT nhwf= %d,%d,%d,%d x=%d y=%d, v=%f,
                        //     index=%d, rsf %d %d %d\n", n, h, w,
                        //     f, x, y, in[in_index(n, x, y, r, s, f)],
                        //     in_index(n, x, y, r, s, f), r,s,f);
                    }
                }
            }
        }
        if (act == 0) {
            out[out_index(n, h, w, f)] = sum;
        } else if (act == 1) { // Relu
            out[out_index(n, h, w, f)] = sum > 0 ? sum : 0;
        } else if (act == 2) {
            out[out_index(n, h, w, f)] = tanhf(sum);
        }
    }
#undef in_index
#undef out_index
}

namespace infini {

void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
                         int n, int h, int w, int f, int r, int s, int oh,
                         int ow, int ph, int pw, int sh, int sw, int dh,
                         int dw) {
    dim3 grid(n, f);
    dim3 block(oh, ow);
    // cudaStream_t stream(cudaStreamPerThread);
    conv2dreduce_kernel_<<<grid, block, 0>>>(input, bias, output, PReLU, n, f,
                                             h, w, oh, ow, r, s, ph, pw, dh,
                                             dw, sh, sw);
}

void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
                                  int act, int n, int h, int w, int f, int r,
                                  int s, int oh, int ow, int ph, int pw, int sh,
                                  int sw, int dh, int dw) {
    dim3 grid(n, f);
    dim3 block(oh, ow);
    // cudaStream_t stream(cudaStreamPerThread);
    // puts("convTranspose2dreduce_kernel is executed");
    if (r == 4 && s == 4 && sh == 2 && sw == 2) {
        const int M = r * s * f, N = n * h * w;
        reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
                                                 ow, h, w);
    } else {
        puts("why use this conv2dreduce");
        convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
            input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
            dh, dw, sh, sw);
    }
}
} // namespace infini
@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
        const float alpha = 1.f, beta = 0.f;
        // TODO: use compute type
        cublasStatus_t stat;
        if (b > 1) {
        if (b >= 1) {
            // Support batch broadcast with zero stride
            int dimA = op->getInputs(0)->getDims().size();
            int dimB = op->getInputs(1)->getDims().size();

@ -63,6 +63,11 @@ class matmulCublas : public Kernel {
                (dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
                    ? 0 // Broadcast the batch dimension if batch size is 1
                    : n * k;
            // printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f, B "
            //        "%d %lld, A %d %lld, C %d %d, b %d %d\n",
            //        opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
            //        m * n, b, record->algo);
            stat = cublasGemmStridedBatchedEx(
                context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
                CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
@ -1,5 +1,6 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"

@ -8,6 +9,13 @@
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
#include <nlohmann/json.hpp>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

using json = nlohmann::json;

namespace py = pybind11;
@ -22,6 +30,8 @@ class TVMRecordObj : public PerfRecordObj {
    std::string dllPath;
    std::string funcName;
    std::vector<int> inputIdx;
    tvm::runtime::PackedFunc packedFunc;
    bool useExistingKernel = false;
};

using TVMRecord = Ref<TVMRecordObj>;
@ -33,9 +43,15 @@ class MemboundTVMPackedFunction : public Kernel {
        auto op = as<MemBoundObj>(_op);
        // auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
        tvm::runtime::PackedFunc packedFunc =
            getPackedFunction(tvmRecord->dllPath, tvmRecord->funcName);
        IT_ASSERT(packedFunc != nullptr);

        // Use user-defined kernels
        if (tvmRecord->useExistingKernel) {
            bool success = useExistingKernels(op);
            IT_ASSERT(success);
            return;
        }

        tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;

        // prepare inputs and outputs
        vector<DLTensorHolder> inputsHolder;
@ -63,10 +79,18 @@ class MemboundTVMPackedFunction : public Kernel {
    // Premise: op is idempotent since it is called multiple times.
    PerfRecord tune(const Operator &_op,
                    const RuntimeObj *_context) const override {
        TVMRecord ret = std::make_shared<TVMRecordObj>();
        auto op = as<MemBoundObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);

        // If hash matches, use user-defined kernels
        if (useExistingKernels(op)) {
            TVMRecord ret = std::make_shared<TVMRecordObj>();
            ret->time = timeit([&]() { useExistingKernels(op); },
                               [&]() { context->sync(); });
            ret->useExistingKernel = true;
            return ret;
        }

        // invoke Ansor to tune a membound kernel
        auto [expr, hash] = op->getSimplifiedNnetExpr();
        nnet::AsTVMVisitor visitor;
@ -93,6 +117,7 @@ class MemboundTVMPackedFunction : public Kernel {
            if (inputName == op->getNnetInputs()[j]->getName())
                break;
        }
        IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
        inputIdx.emplace_back(j);
    }
@ -114,29 +139,41 @@ class MemboundTVMPackedFunction : public Kernel {
        tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
                                   preArgs.first.size());

        TVMRecord ret = std::make_shared<TVMRecordObj>();
        ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
                           [&]() { context->sync(); });
        ret->kernelName = kernelName;
        ret->dllPath = dllPath;
        ret->funcName = func;
        ret->inputIdx = inputIdx;
        ret->packedFunc = packedFunc;

        return std::dynamic_pointer_cast<PerfRecordObj>(ret);
        return ret;
    }

    std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
                                 const std::vector<std::string> &inDTypes,
                                 const std::vector<int> &outDims,
                                 const std::string &outDType,
                                 const std::string &lambda,
                                 const std::string &funcName,
                                 const std::string &nnetExprString,
                                 const std::string &nnetSimplifiedExprString,
                                 const HashType hashCode) const {
        json j;
        // Consistent with the Python API interface
        j["input_tensors"] = inDims;
        j["input_dtypes"] = inDTypes;
        j["output_tensor"] = outDims;
        j["output_dtype"] = outDType;
        j["tvm_code"] = lambda;
        j["func_name"] = funcName;
        j["nnet_expression"] = nnetExprString;
        j["nnet_simplified_expression"] = nnetSimplifiedExprString;
        j["hash_code"] = std::to_string(hashCode);
        return j.dump();
    }

    /// @brief
    /// @param inDims
    /// @param inDTypes
    /// @param outDims
    /// @param outDType
    /// @param lambda
    /// @param funcName Generated function name
    /// @param nnetExprString Save expr in string for logging.
    /// @param nnetSimplifiedExprString Save simplified expr in string for
    /// logging.
    /// @param hashCode (optional) Hash code of the input expression for kernel
    /// cache.
    /// @return
    std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
                            const std::vector<std::string> &inDTypes,
                            const std::vector<int> &outDims,
@ -146,29 +183,60 @@ class MemboundTVMPackedFunction : public Kernel {
                            const std::string &nnetExprString,
                            const std::string &nnetSimplifiedExprString,
                            const HashType hashCode) const {
        std::string dllPath;
        try {
            start_interpreter();
            // Use static to avoid re-importing the module. Re-importing results
            // in cuBLAS failure, whose root cause is not identified yet.
            static auto func =
                py::module::import("cpp_plugin").attr("gen_ansor_so");
            py::tuple code =
                func(inDims, inDTypes, outDims, outDType, lambda, funcName,
                     nnetExprString, nnetSimplifiedExprString,
                     std::to_string(hashCode));
            dllPath = py::str(code[0]);
        } catch (py::error_already_set &e) {
            if (e.matches(PyExc_ImportError)) {
                std::cerr << "Import Error. Don't forget to set environment "
                             "variable PYTHONPATH to contain "
                             "<repo-root>/python"
                          << std::endl;
            }
            throw;
        int fdP2C[2], fdC2P[2];
        for (auto fd : {fdP2C, fdC2P}) {
            int status = pipe(fd);
            IT_ASSERT(status == 0, "pipe failed");
        }
        pid_t pid = fork();
        IT_ASSERT(pid >= 0, "fork failed");
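        // Protocol: the parent serializes the TVM arguments as JSON and
        // writes them to the child's stdin; the child runs pipe_gen in a
        // fresh python3 process (isolating Ansor from the embedded
        // interpreter) and writes the resulting dll path back through the
        // second pipe.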
        if (pid == 0) { // Child process
            close(fdP2C[1]);
            close(fdC2P[0]);

        return dllPath;
            dup2(fdP2C[0], STDIN_FILENO);
            close(fdP2C[0]);

            string cmd =
                "from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
                std::to_string(fdC2P[1]) + ")";
            const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
            execvp("python3", const_cast<char *const *>(argv));
        } else { // Parent process
            close(fdP2C[0]);
            close(fdC2P[1]);

            // Write to pipe
            string serializedArgs = serializeTVMArgs(
                inDims, inDTypes, outDims, outDType, lambda, funcName,
                nnetExprString, nnetSimplifiedExprString, hashCode);
            int status = -1;
            status =
                write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
            IT_ASSERT((size_t)status == serializedArgs.size(),
                      "Failed to write to pipe");
            close(fdP2C[1]);

            // Wait for TVM
            waitpid(pid, &status, 0);
            IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
            const int es = WEXITSTATUS(status);
            IT_ASSERT(es == 0,
                      "TVM process exit with code " + std::to_string(es));

            // Read from pipe
            FILE *stream;
            stream = fdopen(fdC2P[0], "r");
            char buf_read[257] = {0};
            status = std::fscanf(stream, "%256c", buf_read);
            IT_ASSERT(status == 1, "Failed to read from pipe");
            IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
            fclose(stream);
            close(fdC2P[0]);
            return buf_read;
        }
        IT_ASSERT(false, "Should not reach here");
        return "";
    }

    tvm::runtime::PackedFunc getPackedFunction(string path,
@ -214,6 +282,34 @@ class MemboundTVMPackedFunction : public Kernel {

        return {values, type_codes};
    }

    bool useExistingKernels(Ref<MemBoundObj> op) const {
        const map<HashType, tuple<int, int, int, int, int, int, int, int, int,
                                  int, int, int, int, int, int>>
            hashMap = {
                // clang-format off
            {18446744073661354550ULL, {1, 1, 2, 2, 256, 4, 4, 4, 4, 1, 1, 2, 2, 1, 1}},
            {124145340ULL, {1, 1, 4, 4, 128, 4, 4, 8, 8, 1, 1, 2, 2, 1, 1}},
            {18446744073695718019ULL, {1, 1, 8, 8, 64, 4, 4, 16, 16, 1, 1, 2, 2, 1, 1}},
            {515085072ULL, {2, 1, 16, 16, 3, 4, 4, 32, 32, 1, 1, 2, 2, 1, 1}}
            }; // clang-format on
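        // Keyed by the MemBound expression hash; each tuple unpacks below as
        // (PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw).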
        float *input = op->getInputs(0)->getRawDataPtr<float *>();
        float *bias = nullptr;
        float *output = op->getOutput()->getRawDataPtr<float *>();
        if (auto it = hashMap.find(op->getHash()); it != hashMap.end()) {
            auto &[PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw] =
                it->second;
            IT_ASSERT(op->getInputs(0)->size() ==
                      size_t(n) * h * w * f * r * s);
            IT_ASSERT(op->getOutput()->size() == size_t(n) * oh * ow * f);
            convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
                                         r, s, oh, ow, ph, pw, sh, sw, dh, dw);
            return true;
        }
        // conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
        //                     oh, ow, ph, pw, sh, sw, dh, dw);
        return false;
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
@ -4,10 +4,13 @@ namespace infini {
class CopyCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &op,
                 const RuntimeObj *_context) const override {
        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
        cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
                        cudaMemcpyDeviceToDevice);
        // auto inData = op->getInputs(0)->getRawDataPtr<void *>();
        // auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
        // cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
        //                 cudaMemcpyDeviceToDevice);

        // HACK: optimization
        op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
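        // Sharing the input's data blob makes reshape/flatten/identity
        // zero-copy; the trade-off is that the output now aliases the input's
        // device memory.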
    }
};
// reshape/flatten/identity all act as copying from input to output.
@ -0,0 +1,239 @@
#ifdef USE_CUDA
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>

namespace infini {

// NHWC format
Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
    IT_ASSERT(1 <= nLayers && nLayers <= 5);
    Graph g = make_ref<GraphObj>(runtime);
    vector<Tensor> weights;
    vector<tuple<int, int, int, int, bool>> cs{
        // Channel, kernelSize, pad, stride, isTanh
        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
        {64, 4, 1, 2, false},  {3, 4, 1, 2, true},
    };

    Tensor input =
        g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
    for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
        auto [channel, kernelSize, pad, stride, tanh] = cs[i];
        int f = input->getDims()[3]; // n, h, w, f
        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
                                   DataType::Float32,
                                   TensorType::Initialized); // f, r, s, c
        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
                                                  pad, stride, stride, 1, 1)
                    ->getOutput();
        if (tanh) {
            input = g->addOp<TanhObj>(input, nullptr)->getOutput();
        } else {
            input = g->addOp<ReluObj>(input, nullptr)->getOutput();
        }
    }
    return g;
}

Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
    IT_ASSERT(0 <= layerId && layerId < 5);
    Graph g = make_ref<GraphObj>(runtime);
    vector<Tensor> weights;
    vector<tuple<int, int, int, int, bool>> cs{
        // Channel, kernelSize, pad, stride, isTanh
        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
        {64, 4, 1, 2, false},  {3, 4, 1, 2, true},
    };

    Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
    for (int i = layerId; i < layerId + 1; ++i) {
        auto [channel, kernelSize, pad, stride, tanh] = cs[i];
        int f = input->getDims()[3]; // n, h, w, f
        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
                                   DataType::Float32,
                                   TensorType::Initialized); // f, r, s, c
        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
                                                  pad, stride, stride, 1, 1)
                    ->getOutput();
        if (tanh) {
            input = g->addOp<TanhObj>(input, nullptr)->getOutput();
        } else {
            input = g->addOp<ReluObj>(input, nullptr)->getOutput();
        }
    }
    return g;
}

void printGraph(Graph g) {
    g->print();
    puts("============ Data ============");
    for (auto t : g->getTensors()) {
        dbg(t);
        t->printData();
    }
}

Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
    auto runtime = as<CudaRuntimeObj>(_runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);

    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    vector<Graph> bestGraphs;
    SearchEngine searchEngine(runtime, mutator);
    bestGraphs.emplace_back(searchEngine.run(g));
    g->topo_sort();
    dbg(g, bestGraphs[0], bestGraphs.size());
    g->print();

    g->dataMalloc();
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (auto t : g->getInputs()) {
        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
        fuidToInputTensor[t->getFuid()] = t;
    }
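    // Keyed by fuid so the transformed graph's inputs (cloned tensors that
    // share the fuid) can be filled with the same data for output comparison.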
    auto gen = RandomGenerator(-0.1, 0.1, 0);
    for (auto t : g->getInputs()) {
        t->setData(gen);
    }
    for (auto t : g->getOutputs()) {
        t->setData(ZeroGenerator());
    }
    runtime->run(g);
    dbg("Baseline graph");
    printGraph(g);
    // dbg(runtime->getPerfTime(g, true));

    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph =
            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
        bestGraph->topo_sort();

        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : bestGraph->getInputs()) {
            t->copyData(fuidToInputTensor[t->getFuid()]);
        }

        // Initialize outputs with zeros
        for (auto t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }

        dbg(bestGraph);
        dbg(bestGraph->getOutputs());

        if (tuning) {
            runtime->run(bestGraph, true);  // Tune kernels
            runtime->run(bestGraph, false); // Execute the transformed graph

            auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
            auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
            // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
            dbg(go0->equalData(bgo0, 1e-3));
            dbg(runtime->getPerfTime(bestGraph, true));
            dbg(runtime->timeNonCtcOperators(bestGraph));
            // dbg(runtime->timeWithCudaGraph(bestGraph));
        }

        dbg("Best graph");
        printGraph(bestGraph);
        return bestGraph;
    }
    return nullptr;
}

vector<Tensor> runInfoGAN(int nLayers) {
    auto cuda = make_ref<CudaRuntimeObj>();
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);

    Graph g = getInfoGAN(1, cuda, nLayers);

    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    // // Translate OP to membound without derivation
    // mutator->setToNaiveMembound();

    vector<Graph> bestGraphs;
    SearchEngine searchEngine(cuda, mutator);
    bestGraphs.emplace_back(searchEngine.run(g));
    g->topo_sort();
    dbg(g, bestGraphs[0], bestGraphs.size());
    g->print();

    g->dataMalloc();
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (auto t : g->getInputs()) {
        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
        fuidToInputTensor[t->getFuid()] = t;
    }

    auto gen = RandomGenerator(-0.1, 0.1, 0);
    // auto gen = RandomGenerator(-5, 5, 0, true);
    for (auto t : g->getInputs()) {
        t->setData(gen);
    }
    for (auto t : g->getOutputs()) {
        t->setData(ZeroGenerator());
    }
    cuda->run(g);
    dbg("Baseline graph");
    printGraph(g);
    dbg(cuda->getPerfTime(g, true));

    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
        bestGraph->topo_sort();

        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : bestGraph->getInputs()) {
            t->copyData(fuidToInputTensor[t->getFuid()]);
        }

        // Initialize outputs with zeros
        for (auto t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }

        dbg(bestGraph);
        dbg(bestGraph->getOutputs());

        cuda->run(bestGraph, true);  // Tune kernels
        cuda->run(bestGraph, false); // Execute the transformed graph

        auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
        auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
        // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
        std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
        bgo0->printData();
        go0->printData();
        dbg(cuda->getPerfTime(bestGraph, true));

        dbg("Best graph");
        printGraph(bestGraph);
        callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
        return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
    }
    return {};
}

} // namespace infini
#endif
@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
    oss << "==> ROOT\n" << root->toReadable() << "\n";
    for (size_t i = 0; i < q.size(); ++i) {
        const auto &[name, routine, tensor] = q[i];
        oss << "==> " << name << " : ";
        oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
            << " : ";
        if (routine) {
            oss << routine->toReadable() << "\n";
            if (routine->getExpr()) {
@ -73,17 +73,52 @@ string Serializer::visit_(const Tensor &c) {
    return key;
}

bool Serializer::serialize(const Expr &expr, const string &filePath,
                           const string &msg) {
string Serializer::visit_(const Func &c) {
    const string key = std::to_string(id++);
    j[key]["type"] = c->getType();
    j[key]["funcType"] = c->getFuncType();
    j[key]["object"] = dispatch(c->getObject());
    return key;
}

std::optional<std::string> Serializer::toString(const Expr &expr,
                                                const string &msg,
                                                vector<Tensor> inputs,
                                                double exec_time, string hint) {
    // Metadata
    j["Version"] = VERSION;
    j["Msg"] = msg;
    j["exec_time"] = exec_time;
    j["hint"] = hint;
    // Expressions and routines
    id = 0;
    dispatch(expr);
    std::ofstream fout(filePath);
    fout << std::setw(4) << j << std::endl;
    return true;

    // Input tensors
    vector<string> inputsIndices;
    for (const auto &tensor : inputs) {
        inputsIndices.emplace_back(std::to_string(id));
        dispatch(tensor);
    }
    j["nnetInputs"] = inputsIndices;

    // To string
    std::stringstream ss;
    ss << std::setw(4) << j << std::endl;
    return {ss.str()};
}

bool Serializer::toFile(const Expr &expr, const string &filePath,
                        const string &msg, vector<Tensor> inputs,
                        double exec_time, string hint) {
    if (auto s = toString(expr, msg, inputs, exec_time, hint)) {
        // Write to file
        std::ofstream fout(filePath);
        fout << *s;
        return true;
    } else {
        return false;
    }
}

string Serializer::dispatchRoutine(const Routine &c) {
@ -129,7 +164,15 @@ string Serializer::dispatchRoutine(const Routine &c) {
    return key;
}

Expr Serializer::deserialize(const string &filePath) {
Expr Serializer::fromString(const string &text) {
    std::stringstream str;
    str << text;
    str >> j;
    assert(j["Version"] == VERSION);
    return buildExprTree("0");
}

Expr Serializer::fromFile(const string &filePath) {
    std::ifstream fin(filePath);
    fin >> j;
    assert(j["Version"] == VERSION);
@ -180,6 +223,10 @@ Expr Serializer::buildExprTree(string key) {
        return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
                                    j[key]["paddings"], source);
    }
    case NodeType::FuncNodeType: {
        auto object = buildExprTree(j[key]["object"]);
        return make_ref<FuncNode>(object, j[key]["funcType"]);
    }
    default: {
        nnet_unimplemented_halt();
        break;
@ -242,4 +289,25 @@ Routine Serializer::buildRoutine(string key) {
    return nullptr;
}

} // namespace nnet
tuple<Expr, vector<Tensor>, double, string>
Serializer::deserializeAsMemobundOp(const string &filePath) {
    std::ifstream fin(filePath);
    fin >> j;
    assert(j["Version"] == VERSION);
    vector<Tensor> inputs;
    for (const auto &input : j["nnetInputs"])
        inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
    return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}

tuple<Expr, vector<Tensor>, double, string>
Serializer::membundOpFromString(const string &data) {
    j = json::parse(data);
    assert(j["Version"] == VERSION);
    vector<Tensor> inputs;
    for (const auto &input : j["nnetInputs"])
        inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
    return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}

} // namespace nnet
@ -499,8 +499,8 @@ void Derivator::printIntermediateStates() {
        std::cout << FullPrinterVisitor().print(intermediateStates[i]) << endl;
        if (logMode == LogMode::DumpFristCandiate) {
            Serializer serializer;
            serializer.serialize(intermediateStates[i],
                                 logFnPrefix + to_string(i) + ".expr", msg);
            serializer.toFile(intermediateStates[i],
                              logFnPrefix + to_string(i) + ".expr", msg);
        }
    }
    for (size_t i = 0; i < intermediateStates.size(); ++i) {
@ -1,4 +1,5 @@
#include "nnet/expr.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"

namespace nnet {

@ -463,4 +464,9 @@ void FuncNode::setObject(Expr e) {
    object = e;
}

string RangeOpNode::getFullExpression() {
    FullPrinterVisitor printer;
    return printer.print(this->shared_from_this());
}

} // namespace nnet
@ -1,13 +1,16 @@
#include "nnet/nmutator.h"
#include "core/graph.h"
#include "ffi/ffi_callback.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
#include "nnet/Visitor/MatchReshapeVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/derivator.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/reshape.h"
#include "operators/unary.h"

namespace infini {

@ -49,9 +52,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
    assert(computeOps.size() == 1);
    const auto &computeOp = computeOps[0];
    auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
    auto expr = opToExpression(computeOp);
    nnet::Expr expr = opToExpression(computeOp);
    auto inputsN = nnet::GetTensorsVisitor().get(expr);
    dbg(inputsN, expr);
    // dbg(inputsN, expr);
    IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
              "Which one is the second input tensor?");
    vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};

@ -74,17 +77,18 @@
void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
    OpVec computeOps = in_graph->getComputeOps();
    IT_ASSERT(computeOps.size() == 1);
    // HACK: remove this
    if (auto op = as<ConvTransposed2dNHWCObj>(computeOps[0]); !op)
        return;

    /* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
    // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
    //     out_graphs.emplace_back(g);
    //     return;
    // }
    // // Commented for debug, not implemented yet
    // // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
    // //     Graph graph = new Graph(g->getOperators());
    // //     out_graphs.emplace_back(graph);
    // //     return;
    // // }
    if (Graph g = transformConvtransposed1x1(computeOps[0])) {
        out_graphs.emplace_back(g);
        return;
    }
    // if (infini::Graph g = transformDialtedConv(computeOps[0])) {
    //     out_graphs.emplace_back(g);
    //     return;

@ -99,7 +103,6 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
    // //     out_graphs.emplace_back(graph);
    // //     return;
    // // }
    */

    auto expr = opToExpression(computeOps[0]);
    if (!expr)

@ -107,18 +110,15 @@
    nnet::Derivator derivator(maxDepth);
    nnet::Formula conv_9x9(expr, 0);
    // const std::vector<int> rules{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90};
    // ConvTranspose
    // const std::vector<int> rules{1, 7, 7, 2, 8, 6, 6}; // G2BMM
    if (mode == Mode::Normal) {
        derivator.search(conv_9x9, 0);
    } else if (mode == Mode::RuleBased) {
        dbg(derivationRules);
        // dbg(derivationRules);
        derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
    } else
        IT_TODO_HALT_MSG("Unknown NMutator search mode.");
    const auto &candidates = derivator.getCandidates();
    dbg(candidates.size());
    // dbg(candidates.size());
    // derivator.print();
    for (const auto &candidate : candidates) {
        // dbg(nnet::FullPrinterVisitor().print(candidate.root));

@ -233,31 +238,38 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
//     }
// }

nnet::Expr NMutator::opToExpression(Operator op) {
    // IT_TODO_HALT();
    if (auto convOp = as<ConvObj>(op)) {
nnet::Expr NMutator::opToExpression(Operator opT) {
    auto [expr, mapNameNToTensorT] = extractOp(opT);
    for (auto &[name, tensorT] : mapNameNToTensorT) {
        IT_ASSERT(inputsNameNToTensorT.count(name) == 0);
        inputsNameNToTensorT[name] = tensorT;
    }
    return expr;
}

pair<nnet::Expr, NMutator::NameNToTensorT> NMutator::extractOp(Operator opT) {
    if (auto convOp = as<ConvObj>(opT)) {
        const auto &inputs = convOp->getInputs();
        const auto &AT = inputs[0];
        const auto &KT = inputs[1];
        const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
        const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
        if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
            return nullptr;
            return {};
        assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
        inputsNameNToTensorT["A"] = AT;
        inputsNameNToTensorT["K"] = KT;
        const auto A = nnet::makeTensor("A", AT->getDims(),
                                        std::vector<int>{0, 0, ph, pw});
        const auto K = nnet::makeTensor("K", KT->getDims());
        return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
    } else if (auto convOp = as<ConvTransposed2dNHWCObj>(op)) {
        return {nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s),
                {{"A", AT}, {"K", KT}}};
    } else if (auto convOp = as<ConvTransposed2dNHWCObj>(opT)) {
        const auto &AT = convOp->getInputs()[0];
        const auto &KT = convOp->getInputs()[1];
        inputsNameNToTensorT["A"] = AT;
        inputsNameNToTensorT["K"] = KT;
        const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
        const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
        IT_ASSERT_TODO(convOp->getNumGroups() == 1);
        if (r != 4)
            return {};
        IT_ASSERT_TODO(r == 4);
        IT_ASSERT_TODO(ph == pw);
        IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));

@ -269,8 +276,9 @@ nnet::Expr NMutator::opToExpression(Operator op) {
        const auto A = nnet::makeTensor(
            "A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
        const auto K = nnet::makeTensor("K", KT->getDims());
        return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
        // } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
        return {nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s),
                {{"A", AT}, {"K", KT}}};
        // } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(opT)) {
        //     const auto &AT = g2bmmOp->getInputs()[0];
        //     const auto &BT = g2bmmOp->getInputs()[1];
        //     const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();

@ -280,7 +288,7 @@
        //     inputsNameNToTensorT[inputsN.first->getName()] = AT;
        //     inputsNameNToTensorT[inputsN.second->getName()] = BT;
        //     return expr;
        // } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
        // } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(opT)) {
        //     const auto &AT = gbmmlOp->getInputs()[0];
        //     const auto &BT = gbmmlOp->getInputs()[1];
        //     const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();

@ -290,22 +298,30 @@
        //     inputsNameNToTensorT[inputsN.second->getName()] = BT;
        //     dbg(b, m, w, k, dilation, expr);
        //     return expr;
    } else if (auto matmulOp = as<MatmulObj>(op)) {
    } else if (auto matmulOp = as<MatmulObj>(opT)) {
        const auto &AT = matmulOp->getInputs()[0];
        const auto &BT = matmulOp->getInputs()[1];
        const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
        const auto &[expr, inputsN] =
            nnet::MatmulPattern::getExpr(transA, transB, b, m, n, k);
        inputsNameNToTensorT[inputsN.first->getName()] = AT;
        inputsNameNToTensorT[inputsN.second->getName()] = BT;
        // dbg(b, m, n, k, expr);
        return expr;
        return {
            expr,
            {{inputsN.first->getName(), AT}, {inputsN.second->getName(), BT}}};
    } else if (auto op = as<MemBoundObj>(opT)) {
        NameNToTensorT m;
        for (int i = 0; i < op->numInputs(); ++i)
            m[op->getNnetInputs()[i]->getName()] = opT->getInputs()[i];
        return {op->getNnetExpr(), m};
    } else if (opT->getOpType() == OpType::Relu ||
               opT->getOpType() == OpType::Tanh) {
        return generateUnaryExpr(opT);
    }
    // // else if (auto transposeOp = dynamic_cast<TransposeOp *>(op)) {
    // // else if (auto transposeOp = dynamic_cast<TransposeOp *>(opT)) {
    // //     return transposeOpToExpression(transposeOp);
    // // }
    nnet_unimplemented_continue();
    return nullptr;
    IT_TODO_HALT_MSG("Cannot convert " + opT->toString() +
                     " to an NNet expression");
    return {};
}

infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {

@ -313,7 +329,7 @@
    nnet::FullPrinterVisitor fullVisitor;
    // Get tensors in the reversed topological order
    const auto &tensorQueueN = fullVisitor.traverse(expr);
    dbg(fullVisitor.print(expr));
    // dbg(fullVisitor.print(expr));

    // Build a map: name in nnet -> tensors in infini
    // Add input tensors to the map

@ -381,8 +397,8 @@
                cnt += tensor->size();
            for (const auto &tensor : outputsPET)
                cnt += tensor->size();
            dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
                memboundTime(cnt));
            // dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
            //     memboundTime(cnt));
            g->addOpWithOutputs<MemBoundObj>(inputsPET, outputsPET,
                                             op->getInputs(), op->getExpr(),
                                             memboundTime(cnt));

@ -395,6 +411,9 @@
                                    nameNToTensorT.at(BN->getName())};
            TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
            const auto &[b, m, n, k, transa, transb] = op->getArgs();
            // HACK: pruning for debug
            if (!((transa == 0) && (transb == 1)))
                return nullptr;
            g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
                                           outputsPET[0], transa, transb);
        }

@ -500,43 +519,86 @@ double NMutator::memboundTime(const Shape &dims) {
//     return nullptr;
// }

// Graph NMutator::transformTConv3x3(Operator op) {
//     if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
//         dbg(tconvOp->getInputs()[1]->getDims());
//         if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
//             tconvOp->getInputs()[1]->getDims()[0] == 3 &&
//             tconvOp->getInputs()[1]->getDims()[1] == 3) {
//             auto g = new infini::Graph();
//             auto inputDims = tconvOp->getInputs(0)->getDims();
//             auto weightDims = tconvOp->getInputs(1)->getDims();
//             auto outputDims = tconvOp->getOutput()->getDims();
//             // NHWF
//             auto newA = g->tensor(
//                 {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
//             // RSFC
//             auto newW = g->tensor(
//                 {weightDims[0] * weightDims[1] * weightDims[3],
//                 weightDims[2]});
//             auto newO =
//                 g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
Graph NMutator::transformConvtransposed1x1(Operator _op) {
    auto op = as<ConvTransposed2dNHWCObj>(_op);
    if (!op)
        return nullptr;
    const auto &A = op->getInputs()[0];
    const auto &W = op->getInputs()[1];
    const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
    const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
    const Shape inputDims = op->getInputs(0)->getDims();
    const Shape weightDims = op->getInputs(1)->getDims();
    const Shape outputDims = op->getOutput()->getDims();
    const DataType dtype = A->getDType();
    IT_ASSERT_TODO(op->getNumGroups() == 1);
    if (h != 1 || w != 1)
        return {};
    IT_ASSERT_TODO(ph == pw);
    IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
    IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
    auto g = make_ref<GraphObj>(runtime);
    // NHWF
    auto newA = g->addTensor(
        {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
    // FRSC
    // auto newW = g->addTensor(
    //     {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
    //     dtype);
    // HACK: without transpose
    auto newW = g->addTensor(
        {weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
    g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
                                    op->getOutput()->getDims());
    return g;
}

// Graph NMutator::transformConvtransposed(Operator _op) {
//     auto op = as<ConvTransposed2dNHWCObj>(_op);
//     if (!op)
//         return nullptr;
//     const auto &AT = op->getInputs()[0];
//     const auto &KT = op->getInputs()[1];
//     const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
//     const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
//     IT_ASSERT_TODO(op->getNumGroups() == 1);
//     if (r != 4)
//         return {};
//     IT_ASSERT_TODO(ph == pw);
//     IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
//     IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));

//     auto g = make_ref<Graph>();
//     // TODO: implement transformation rules
//     // How to efficiently write an expression...
//     auto inputDims = op->getInputs(0)->getDims();
//     auto weightDims = op->getInputs(1)->getDims();
//     auto outputDims = op->getOutput()->getDims();
//     // NHWF
//     auto newA =
//         g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
//                    inputDims[3]});
//     // RSFC
//     auto newW = g->tensor(
//         {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
//     auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
//                            weightDims[0] * weightDims[1] * weightDims[3]});
//     g->reshape(tconvOp->getInputs(0), newA);
//     g->reshape(tconvOp->getInputs(1), newW);
//     g->matmul(newA, newW, newO, 0, 1);
//     // g->reshape(newO, tconvOp->getOutput());
//     tconvOp->print();
//     dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
//     assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
//     g->membound(
//         {newO}, {tconvOp->getOutput()}, {}, nullptr,
//     g->reshape(op->getInputs(0), newA);
//     g->reshape(op->getInputs(1), newW);
//     g->matmul(newA, newW, newO, 0, 1);
//     // g->reshape(newO, tconvOp->getOutput());
//     tconvOp->print();
//     dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
//     assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
//     g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
//                 memboundTime(newO->size() + tconvOp->getOutput()->size()),
//                 "TConv3x3 reduce");
//     g->updateConnection();
//     Graph graph = new Graph(g->getOperators());
//     return graph;
//     }
// }
// return nullptr;
//     g->updateConnection();
//     Graph graph = new Graph(g->getOperators());
//     return graph;
// }

// Graph NMutator::transformTConv1x1(Operator op) {

@ -632,4 +694,86 @@ double NMutator::memboundTime(const Shape &dims) {
//     return graph;
// }

Graph NMutator::fuseVertically(const Graph &inputGraph) {
    Graph optGraph = make_ref<GraphObj>(runtime);

    auto chainOps = inputGraph->getOperators();
    IT_ASSERT(!chainOps.empty());
    for (auto &op : chainOps) {
        IT_ASSERT(op->isMemBoundOp());
        IT_ASSERT_TODO(op->getInputs().size() == 1);
        IT_ASSERT(op->getOutputs().size() == 1);
    }
    if (chainOps.size() == 1) {
        return make_ref<GraphObj>(runtime, chainOps);
    }
    std::vector<nnet::Expr> exprs;
    for (const auto &op : chainOps) {
        auto [expr, _] = extractOp(op);
        exprs.emplace_back(expr);
        // dbg(op, infini::as<nnet::RangeOpNode>(expr)->getFullExpression());
    }
    // double maxTime = getMaxPerf(std::make_shared<SubGraph>(chainOps));
    // Fuse a MemboundOp chain
    auto expr = nnet::MergeMemboundMutator(exprs).merge(true);
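    // MergeMemboundMutator folds the chain's expressions into one nested
    // expression, so the whole chain can run as a single MemBound operator.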
    auto inputNMap = nnet::GetTensorsVisitor().get(exprs.front());
    IT_ASSERT(inputNMap.size() == 1);
    vector<nnet::Tensor> inputsN;
    for (const auto &[name, t] : inputNMap) {
        inputsN.emplace_back(t);
    }
    optGraph->addOpWithOutputs<MemBoundObj>(chainOps.front()->getInputs(),
                                            chainOps.back()->getOutputs(),
                                            inputsN, expr, 0);
    // TODO: set time
    return optGraph;
}

pair<nnet::Expr, NMutator::NameNToTensorT>
NMutator::generateUnaryExpr(const Operator &op) {
    using namespace nnet;
    const map<OpType, nnet::FuncType> opTToFuncN = {
        {OpType::PRelu, nnet::FuncType::PRelu},
        {OpType::Relu, nnet::FuncType::Relu},
        {OpType::Tanh, nnet::FuncType::Tanh}};
    Shape shape = op->getInputs()[0]->getDims();
    nnet::FuncType type = opTToFuncN.at(op->getOpType());
    auto T = make_ref<TensorNode>("T", shape);
    VecExpr indices;
    for (size_t i = 0; i < shape.size(); ++i) {
        indices.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
    }
    auto sub = makeSubscript(T, indices);
    auto func = nnet::make_ref<FuncNode>(sub, type);
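    // The RangeOp built below loops one iterator per input dimension and
    // applies the FuncNode (Relu/Tanh/PRelu) elementwise to T.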
    vector<VarRangePair> varRanges;
    for (size_t i = 0; i < shape.size(); ++i) {
        varRanges.emplace_back(nnet::as<VarNode>(indices[i]),
                               Range{0, shape[i]});
    }
    return {makeRangeOperator(varRanges, {}, func),
            NameNToTensorT{{"T", op->getInputs()[0]}}};
}

pair<nnet::Expr, vector<nnet::Tensor>> NMutator::generateRevert(Tensor in) {
    using namespace nnet;
    using infini::make_ref;
    const Shape &orignalShape = in->getDims();
    auto tensor = makeTensor("T", in->getDims());
    VecExpr iters;
    for (size_t i = 0; i < orignalShape.size(); ++i) {
        iters.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
    }

    Shape newShape = orignalShape;
    std::reverse(newShape.begin(), newShape.end());
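    // Iterators are emitted in reverse dimension order below, so the produced
    // RangeOp writes an output whose shape is the input's shape reversed.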
    auto sub = makeSubscript(tensor, iters);
    vector<VarRangePair> loopIters;
    for (int i = orignalShape.size() - 1; i >= 0; --i) {
        loopIters.emplace_back(infini::as<VarNode>(iters[i]),
                               Range{0, orignalShape[i]});
    }
    auto range = makeRangeOperator(loopIters, {}, sub);
    return {range, {tensor}};
}

} // namespace infini
@ -6,7 +6,7 @@
namespace nnet {

int matchExprResult(Derivator &derivator, string fn) {
    auto ans = Serializer().deserialize(fn);
    auto ans = Serializer().fromFile(fn);
    auto hashAns = HashVisitor()(ans);
    int match = 0;
    for (const auto &candidate : derivator.getCandidates()) {

@ -19,14 +19,14 @@ int matchExprResult(Derivator &derivator, string fn) {
bool checkExprLogSame(string fnPrefix, int start, int end) {
    Serializer serializer;
    string fn0 = fnPrefix + to_string(start) + ".expr";
    Expr expr0 = serializer.deserialize(fn0);
    Expr expr0 = serializer.fromFile(fn0);
    RangeOp range0 = as<RangeOpNode>(expr0);
    Interpreter interpreter(range0);
    auto ans0 = interpreter.interpretUniformSample(range0);
    dbg(expr0, ans0);
    for (int i = start + 1; i < end; ++i) {
        string fn1 = fnPrefix + to_string(i) + ".expr";
        Expr expr1 = serializer.deserialize(fn1);
        Expr expr1 = serializer.fromFile(fn1);
        RangeOp range1 = as<RangeOpNode>(expr1);
        dbg(fn1, expr1);
        auto ans1 = interpreter.interpretUniformSample(range1);

@ -67,4 +67,4 @@ bool checkExprsEquvivalence(VecExpr exprs) {
    return true;
}

} // namespace nnet
} // namespace nnet
@ -2,6 +2,7 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/Visitor/Serializer.h"

namespace infini {

@ -9,8 +10,8 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
                         const TensorVec &output,
                         const std::vector<nnet::Tensor> &nnetInputs,
                         nnet::Expr expr, double exec_time, std::string hint)
    : OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
      expr(expr), exec_time(exec_time), hint(hint) {
    : OperatorObj(OpType::MemBound, input, output), expr(expr),
      nnetInputs(nnetInputs), exec_time(exec_time), hint(hint) {
    IT_ASSERT(checkValid(graph));
    IT_ASSERT(!checkOOB(expr));
    hash = calcHash(expr);

@ -45,7 +46,7 @@ string MemBoundObj::toString() const {
    os << "exec_time=" << exec_time << ", ";
    os << "NNet Inputs=[";
    for (const auto &tensor : nnetInputs)
        os << tensor->toReadable() << ",";
        os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
    os << "]";
    os << ", ExprHash=" << hash;
    os << ", SimplifiedExprHash=" << simplifiedHash;

@ -60,11 +61,18 @@ string MemBoundObj::toString() const {

optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
    // inputs have to match nnetInputs exactly
    if (inputs.size() != nnetInputs.size())
    if (inputs.size() != nnetInputs.size()) {
        std::cout << "Num mismatch " << inputs.size() << " "
                  << nnetInputs.size();
        return {};
    }
    for (size_t i = 0; i < inputs.size(); ++i)
        if (inputs[i]->getDims() != nnetInputs[i]->getShape())
        if (inputs[i]->getDims() != nnetInputs[i]->getShape()) {
            std::cout << "Shape mismatch " << inputs[i]
                      << vecToString(inputs[i]->getDims()) << " "
                      << vecToString(nnetInputs[i]->getShape());
            return {};
        }
    return {{nnet::as<nnet::RangeOpNode>(expr)->getOutputShape()}};
}

@ -83,4 +91,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
                                     nnet::as<nnet::RangeOpNode>(expr));
}

string MemBoundObj::toJson() const {
|
||||
return *nnet::Serializer().toString(expr, "MemBoundObj::toJson", nnetInputs,
|
||||
exec_time, hint);
|
||||
}
|
||||
|
||||
} // namespace infini

@ -7,9 +7,10 @@
TEST(Handler, matmul) {
    auto runtime = NativeCpuRuntimeObj::getInstance();
    auto handler = make_ref<GraphHandlerObj>(runtime);
    auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32);
    auto w = handler->tensor({1, 3, 4}, OnnxDType::UINT32);
    auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32);
    auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32, TensorType::Input);
    auto w =
        handler->tensor({1, 3, 4}, OnnxDType::UINT32, TensorType::Initialized);
    auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32, TensorType::Other);
    handler->matmul(i, w, o, false, false, nullptr, ActType::None);
}
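
The third tensor() argument is the TensorType introduced in this changeset; judging by the names, Input marks graph inputs, Initialized marks weights/initializers, and Other covers intermediate results, so later passes can presumably treat the three classes differently (the CUDA test below tags conv inputs and weights the same way).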

@ -0,0 +1,58 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include "nnet/nmutator.h"
#include "operators/matmul.h"
#include "test.h"

namespace infini {

TEST(TestCudaRuntime, CudaGraph) {
    auto runtime = make_ref<CudaRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);

    const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
    auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
    auto w0 =
        g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
    g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
    g->dataMalloc();
    runtime->run(g, true);
    runtime->run(g, false);
    runtime->getPerfTime(g);

    auto time = runtime->timeWithCudaGraph(g);
    EXPECT_GE(time, 0.01);
}

TEST(TestCudaRuntime, CudaGraphMembound) {
    auto runtime = make_ref<CudaRuntimeObj>();
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);
    Graph g = make_ref<GraphObj>(runtime);

    Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
    Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32);
    Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32);
    g->dataMalloc();
    i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
    NMutator nmutator(NMutator::Mode::ToNaiveMembound);
    auto mutations = nmutator.run(g);
    ASSERT_EQ(mutations.size(), 2u);
    Graph gNew = mutations[1];
    gNew->print();
    gNew->dataMalloc();

    runtime->run(gNew, true); // tune kernels
    runtime->run(gNew, false);
    runtime->getPerfTime(gNew);

    runtime->timeWithCudaGraph(gNew);
}
} // namespace infini
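
timeWithCudaGraph is new in this changeset and its implementation is not part of this diff, but the standard CUDA stream-capture pattern it presumably builds on records every kernel launched on a stream and then replays them as one unit. A sketch in plain CUDA runtime API (note that work on the legacy default stream cannot be captured, so capture-based timing needs explicit streams or a per-thread default stream):

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    // ... launch every kernel of the graph on `stream`; nothing executes
    // yet, the launches are only recorded ...
    cudaGraph_t graph;
    cudaStreamEndCapture(stream, &graph);
    cudaGraphExec_t instance;
    cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
    cudaGraphLaunch(instance, stream); // replays all kernels in one launch
    cudaStreamSynchronize(stream);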

@ -10,8 +10,8 @@ int main(int argc, char *argv[]) {
        printf("Usage: %s <log>\n", argv[0]);
        return 1;
    }
    auto expr = Serializer().deserialize(argv[1]);
    auto expr = Serializer().fromFile(argv[1]);
    cout << FullPrinterVisitor().print(expr);
    cout << endl << "Hash = " << HashVisitor().getHash(expr) << endl;
    return 0;
}
}

@ -0,0 +1,113 @@
import onnx
import torch
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pyinfinitensor as pit
from pyinfinitensor import backend as ft
from pyinfinitensor.onnx import OnnxStub


def to_pytorch_tensor(tensor) -> torch.Tensor:
    data = tensor.copyout_float()
    tt = torch.tensor(data)
    return tt.reshape(tensor.shape())


def run_InfoGAN_return_tensor(n_layers: int):
    if_tensors = ft.runInfoGAN(n_layers)
    tensors = [to_pytorch_tensor(t) for t in if_tensors]
    return tensors


def read_and_check():
    for n_layers in range(1, 6):
        ans = torch.load(f'torch_{n_layers}layers_0.pt')
        x = torch.load(f'torch_{n_layers}layers_1.pt')
        print(f'=== {n_layers} layers ===')
        print(x.abs().max())


def run_e2e_InfoGAN():
    data = []
    for n_layers in range(5, 6):
        tensors = run_InfoGAN_return_tensor(n_layers)
        for i, t in enumerate(tensors):
            torch.save(t, f'torch_{n_layers}layers_{i}.pt')
        print(f'============ {n_layers} layers ============')
        ans, x = tensors
        print(f'Allclose {torch.allclose(ans, x)}')

        # Print error numbers
        tot = np.prod(ans.shape)
        data.append([])
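        # For each tolerance 0.1^i, record the fraction of elements where
        # ans and x agree within atol = rtol = 0.1^i.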
        for i in range(0, 10):
            tol = 10**(-i)
            clo = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
            print(f'0.1^{i} close: {clo}/{tot} = {clo/tot}')
            data[-1].append(clo/tot)

        rel_err = torch.abs((ans-x)/ans)
        print(rel_err, rel_err.max())
        print(f'ans = {ans}')
        print(f'x = {x}')

        # # Plot CDF
        # fig, axes = plt.subplots(9,1)
        # print(axes)
        # for i, ax in enumerate(axes):
        #     print(i)
        #     ax:plt.Axes
        #     ax.hist(torch.flatten(rel_err), density=True, cumulative=True, label='CDF',
        #             histtype='step', alpha=0.8, color='k')
        #     ax.set_xlim(0, 10**(-i))
        #     # ax.set_title('')
        # plt.show()
        # plt.savefig('a.pdf')
    df = pd.DataFrame(data)
    print(df.to_string())
    df = df.set_axis([f'0.1^{i}' for i in range(0, 10)], axis=1)
    print(df.to_string())
    df.to_csv('a.csv')


def runSingleConvT():
    runtime = ft.cuda_runtime()
    g = ft.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)
    opt_g = ft.optimizeGraph(g, runtime)
    ft.if_onnx.export_onnx(opt_g, 'convtransposed.onnx')


def run_InfoGAN_without_tuning(runtime, tuning: bool):
    g = ft.getInfoGAN(1, runtime, 5)
    # g = ft.getInfoGAN(1, runtime, 1)
    opt_g = ft.optimizeGraph(g, runtime, tuning)
    stub = OnnxStub.from_graph(opt_g)
    with open("optimized.onnx", "wb") as f:
        f.write(stub.to_onnx("optimized").SerializeToString())
    return opt_g


def load_onnx(runtime) -> ft.Graph:
    stub = OnnxStub.from_onnx(onnx.load("optimized.onnx"), runtime, False)
    return stub.handler.getGraph()


def run_and_evaluate(runtime, g):
    runtime.run(g, True)
    print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}')
    print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 1000, 1000)}')
    print(f'Cuda graph time = {runtime.timeWithCudaGraph(g)}')


if __name__ == "__main__":
    # run_e2e_InfoGAN()
    # runSingleConvT()
    # read_and_check()

    runtime = ft.cuda_runtime()
    if True:
        g = run_InfoGAN_without_tuning(runtime, False)
    else:
        g = load_onnx(runtime)
    run_and_evaluate(runtime, g)

@ -388,4 +388,4 @@ TEST(Conv2conv, InfoGAN_ConvTranspose_3_OOB_Test) {
    dbg(expr);
    Derivator derivator;
    derivator.checkOOB(as<RangeOpNode>(expr));
}
}

@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"

namespace infini {

TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
    // verifyNaiveMembound True: subgraph after transformation
    // verifyNaiveMembound False: subgraph of one single membound (eOP)
    Runtime runtime = NativeCpuRuntimeObj::getInstance();

@ -54,9 +55,8 @@ TEST(Mutator, NaiveConvWithInterpreter) {
        gg->getOutputs()[0]->getRawDataPtr<void *>());
}

// FIXME: failed since implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
    const bool useMutatorDirectly = true;
TEST(NMutator, InfoGAN_TConv_3_correctness) {
    const bool useMutatorDirectly = false;
    Runtime runtime = make_ref<CudaRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton

@ -67,8 +67,9 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
    // const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
    // const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;

    auto i0 = g->addTensor({n, h, w, f});
    auto w0 = g->addTensor({f, r, s, c});
    auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
    auto w0 =
        g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
    g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);

    auto mutator =

@ -91,12 +92,13 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
        fuidToInputTensor[t->getFuid()] = t;
    }

    std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph =
            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());

        auto gen = RandomGenerator(0, 1, i);
        auto gen = RandomGenerator(0.1, 0.1, i);
        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : g->getInputs()) {

@ -120,6 +122,7 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {

        auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
        auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
        dbg(runtime->getPerfTime(bestGraph, true));
        EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
    }
}

@ -1,9 +1,11 @@
#include "core/graph.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/Serializer.h"
#include "nnet/test.h"
#include "operators/membound.h"
#include "gtest/gtest.h"
using namespace nnet;
using namespace std;
#define DEFINE_VAR(name) auto name = make_ref<VarNode>(#name);

//{L<i3:0:2500><i4:0:4><b:0:8><w:0:65>Sum<k:0:512>
//{({A}[b, (i3 + (2500 * i4)), k] * {B<pad=0,128,0>}[b, ((i3 + (2500 * i4)) +

@ -12,11 +14,7 @@ using namespace std;
// ==> B : Input Tensor shape=[8,10000,512] pad=[0,128,0]

Expr buildSimpleExpr() {
    DEFINE_VAR(b);
    DEFINE_VAR(w);
    DEFINE_VAR(k);
    DEFINE_VAR(i3);
    DEFINE_VAR(i4);
    DEFINE_VAR(b, w, k, i3, i4);
    auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
    auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
    auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
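
The five single-name DEFINE_VAR lines collapse into one call here, so the macro now pulled in from nnet/test.h must accept a variable number of names. One way to build such a macro, shown as a sketch rather than the project's actual definition, is arity dispatch over __VA_ARGS__:

    #define DEFINE_VAR_1(a) auto a = make_ref<VarNode>(#a);
    #define DEFINE_VAR_2(a, ...) DEFINE_VAR_1(a) DEFINE_VAR_1(__VA_ARGS__)
    #define DEFINE_VAR_3(a, ...) DEFINE_VAR_1(a) DEFINE_VAR_2(__VA_ARGS__)
    #define DEFINE_VAR_4(a, ...) DEFINE_VAR_1(a) DEFINE_VAR_3(__VA_ARGS__)
    #define DEFINE_VAR_5(a, ...) DEFINE_VAR_1(a) DEFINE_VAR_4(__VA_ARGS__)
    #define DEFINE_VAR_PICK(_1, _2, _3, _4, _5, NAME, ...) NAME
    #define DEFINE_VAR(...)                                                \
        DEFINE_VAR_PICK(__VA_ARGS__, DEFINE_VAR_5, DEFINE_VAR_4,           \
                        DEFINE_VAR_3, DEFINE_VAR_2, DEFINE_VAR_1)          \
        (__VA_ARGS__)
    // DEFINE_VAR(b, w, k, i3, i4) then expands to five VarNode definitions.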

@ -28,9 +26,7 @@ Expr buildSimpleExpr() {
}

Expr buildNestedExpr() {
    DEFINE_VAR(j1);
    DEFINE_VAR(j2);
    DEFINE_VAR(j3);
    DEFINE_VAR(j1, j2, j3);
    // Build a Matmul to verify.
    const int M = 10000, N = 512, K = 3;
    auto C = make_ref<TensorNode>("C", vector<int>({M, K}));

@ -46,11 +42,7 @@ Expr buildNestedExpr() {
    auto E = make_ref<TensorNode>("E", shapeE, shapeE, ele2);
    auto ele1 = make_ref<ElementWiseNode>(expr, vector{E}, shapeE);

    DEFINE_VAR(b);
    DEFINE_VAR(w);
    DEFINE_VAR(k);
    DEFINE_VAR(i3);
    DEFINE_VAR(i4);
    DEFINE_VAR(b, w, k, i3, i4);
    auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0}, matmul);
    auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0}, ele1);
    auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});

@ -63,25 +55,23 @@ Expr buildNestedExpr() {

TEST(Serializer, Serialization) {
    auto range = buildSimpleExpr();
    auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
    auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
    EXPECT_TRUE(isSuccessful);
}

TEST(Serializer, CompareTwoExprs) {
    DEFINE_VAR(b);
    DEFINE_VAR(w);
    DEFINE_VAR(k);
    DEFINE_VAR(i3);
    DEFINE_VAR(i4);
    DEFINE_VAR(b, w, k, i3, i4);
    auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
    auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
    auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
    auto funcA = make_ref<FuncNode>(subA, FuncType::Relu);
    auto subB = makeSubscript(B, {b, ((i3 + (2500 * i4)) + w), k});
    auto range = makeRangeOperator(
        {{i3, {0, 2500}}, {i4, {0, 4}}, {b, {0, 8}}, {w, {0, 65}}},
        {{k, {0, 512}}}, subA * subB);
    Serializer().serialize(range, "./test_serializer.json");
    auto expr = Serializer().deserialize("./test_serializer.json");
        {{k, {0, 512}}}, funcA * subB);
    Serializer().toFile(range, "./test_serializer.json");
    auto expr = Serializer().fromFile("./test_serializer.json");
    dbg(expr);

    EXPECT_EQ(range->toReadable(), expr->toReadable());
}

@ -90,11 +80,37 @@ TEST(Serializer, Serialization_NestedTensor) {
    FullPrinterVisitor printer;
    auto range = buildNestedExpr();
    auto ans = printer.print(range);
    dbg(ans);
    auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
    auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
    EXPECT_TRUE(isSuccessful);
    auto exprDeserialized = Serializer().deserialize("./test_serializer.json");
    auto exprDeserialized = Serializer().fromFile("./test_serializer.json");
    auto output = printer.print(exprDeserialized);
    dbg(output);
    EXPECT_EQ(output, ans);
}
}

TEST(Serializer, Serialization_memboundOp) {
    auto expr = buildSimpleExpr();
    auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
    auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
    // using namespace infini;
    auto runtime = infini::NativeCpuRuntimeObj::getInstance();
    auto g = infini::make_ref<infini::GraphObj>(runtime);
    auto AT = g->addTensor({8, 10000, 512});
    auto BT = g->addTensor({8, 10000, 512});
    auto CT = g->addTensor({2500, 4, 8, 65});

    vector<Tensor> nnetInputs{A, B};
    double execTime = 1;
    string hint = "test";
    infini::MemBoundObj memboundOp(nullptr, {AT, BT}, {CT}, nnetInputs, expr,
                                   execTime, hint);
    auto str = memboundOp.toJson();
    auto [exprLoaded, nnetInputsLoaded, execTimeLoaded, hintLoaded] =
        Serializer().membundOpFromString(str);
    EXPECT_EQ(expr->toReadable(), exprLoaded->toReadable());
    EXPECT_EQ(execTime, execTimeLoaded);
    EXPECT_EQ(nnetInputs.size(), nnetInputsLoaded.size());
    for (size_t i = 0; i < nnetInputs.size(); ++i)
        EXPECT_EQ(nnetInputs[i]->toReadable(),
                  nnetInputsLoaded[i]->toReadable());
    EXPECT_EQ(hint, hintLoaded);
}