Compare commits

...

34 Commits

Author SHA1 Message Date
huangshuhong 8c91faa948 remove expect 2023-04-21 00:17:04 +08:00
huangshuhong c0ae03a2d7 fix tvm stream 2023-04-21 00:09:47 +08:00
Liyan Zheng 0cb8729bc1 Add: different ONNX names for inputs and weights 2023-04-20 21:51:47 +08:00
YdrMaster 8bc2d3e48d fix: test graph handler
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
YdrMaster 28b123753e feat: import the Tensor type
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
Liyan Zheng 94730d93b5 Add: hash match for membound kernels 2023-04-20 17:16:01 +08:00
Liyan Zheng 6d17c4caa2 Add: getPerfTime in run_models_nnet 2023-04-20 10:54:49 +08:00
Liyan Zheng 15d0eb79cd Add: import ONNX with membound Op 2023-04-20 10:45:28 +08:00
Liyan Zheng 2a343e240e Add: shape of intermediate tensor in exported ONNX 2023-04-20 10:45:28 +08:00
Liyan Zheng 34ca6bf149 Fix: skip check when Graph is exported to ONNX 2023-04-20 10:45:28 +08:00
YdrMaster a6019e79e3 feat(py): support creating an OnnxStub directly from a Graph
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:45:28 +08:00
YdrMaster 4e1cc8d3e4 refactor(py): create OnnxStub via factory methods
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 725f9260cf feat: support exporting membound
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 0edd138919 feat: split (de)serialization into to-string and to-file variants
fix: set the `USE_CUDA` cfg correctly

todo: test_search does not pass

Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
Liyan Zheng 0b23a065ca Add: debug hacks for InfoGAN 2023-04-20 10:42:56 +08:00
Liyan Zheng e86e993ed4 Add: CUDA graph stream capture (MemboundOp fails) 2023-04-19 16:32:16 +08:00
Liyan Zheng e4c20a9ae2 Add: warmup and repeat args in timeNonCtcOperators 2023-04-19 16:22:59 +08:00
Liyan Zheng 537b3b4ea4 Add: Membound operator serialization 2023-04-18 21:53:48 +08:00
Liyan Zheng 2812900ea2 Fix: OpType and print device tensors 2023-04-18 20:28:08 +08:00
Liyan Zheng 01fc19795d Add: time non-compile-time-computable operators 2023-04-18 17:21:16 +08:00
Liyan Zheng afc4123328 Chore: remove deprecated function 2023-04-18 17:21:16 +08:00
Liyan Zheng b981951a47 Add: NMutator::memboundToJson to export memboundOp 2023-04-18 17:21:16 +08:00
Liyan Zheng 99b5c95455 Add: nnet::Serializer supports FuncNode 2023-04-18 17:21:16 +08:00
Liyan Zheng 9d50b30af8 Chore: disable nnet_unimplemented_continue output 2023-04-18 17:21:16 +08:00
Liyan Zheng bc31219bde Add: exclude compile-time computable operator time 2023-04-18 17:21:16 +08:00
Liyan Zheng edf4e33353 Add: C++ callback to export ONNX 2023-04-18 17:19:05 +08:00
Liyan Zheng 872f3504a9 Add: RangeOpNode::getFullExpression() 2023-04-18 17:19:05 +08:00
Liyan Zheng da49e91ab0 Add: fuse membound operators 2023-04-18 17:19:05 +08:00
Liyan Zheng a6b8f344d4 Chore: simplify type names 2023-04-18 17:19:05 +08:00
Liyan Zheng 09293730ea Add: export to ONNX with custom operators 2023-04-18 17:19:05 +08:00
Liyan Zheng 307614d95d Add: infogan python interface 2023-04-18 17:16:25 +08:00
Liyan Zheng f14edcd52f Fix: avoid reload library 2023-04-18 17:16:25 +08:00
Liyan Zheng d2d49c5d4f Add: invoke TVM through pipe 2023-04-18 17:16:25 +08:00
Liyan Zheng e72fe79168 Add: search engine uses estimated time 2023-04-18 17:16:25 +08:00
51 changed files with 2931 additions and 466 deletions

View File

@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
endif()
# Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS})
target_link_libraries(backend PRIVATE InfiniTensor)
@ -168,6 +168,7 @@ endif()
if(USE_CUDA)
add_compile_definitions(USE_CUDA=1)
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
set(CMAKE_CUDA_HOST_COMPILER
${CMAKE_CXX_COMPILER}

View File

@ -81,7 +81,7 @@ import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend
stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
```
[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) is the loader provided by onnx; it reads an onnx file into an onnx model held in memory.
@ -201,7 +201,7 @@ def infer(model: ModelProto, input) -> dict:
model0 = onnx.load(sys.argv[1])
model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")
input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
input = numpy.random.random(input_shape).astype(numpy.float32)
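
To make the new workflow concrete, here is a minimal sketch of the factory-method usage described above. It assumes the `from_onnx` factory and the `backend.cpu_runtime()` binding defined in the Python frontend later in this diff; the model file name is a placeholder.

```python
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

# Import an ONNX model through the factory method instead of the
# old OnnxStub(model, runtime) constructor.
stub = OnnxStub.from_onnx(onnx.load("model_file"), backend.cpu_runtime())

# Named boundary tensors are exposed as dictionaries on the stub.
print(list(stub.inputs.keys()), list(stub.outputs.keys()))

# Export the imported graph back to an in-memory ONNX model.
new_model = stub.to_onnx("new")
```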

View File

@ -16,7 +16,8 @@ class GraphObj : public Object {
string toString() const override;
Runtime getRuntime() const { return runtime; }
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
TensorType tensorType = TensorType::Other);
Tensor addTensor(const Tensor &tensor);
TensorVec addTensor(const TensorVec &tensors);
/**

View File

@ -35,14 +35,22 @@ class GraphHandlerObj {
Graph g;
public:
GraphHandlerObj(Runtime runtime)
explicit GraphHandlerObj(Runtime runtime)
: g(make_ref<GraphObj>(std::move(runtime))) {}
Tensor tensor(Shape dims, int dtype);
explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}
//------ tensors
vector<Tensor> inputs() { return g->getInputs(); }
vector<Tensor> outputs() { return g->getOutputs(); }
Tensor tensor(Shape dims, int dtype, TensorType ttype);
//------ operators
inline OpVec operators() { return g->getOperators(); }
OpVec operators() { return g->getOperators(); }
Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw);
@ -90,18 +98,22 @@ class GraphHandlerObj {
const optional<vector<int>> &steps);
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
/// @brief Import a memBound operator from a JSON string
TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
const string &jsonString);
//------ modifiers
inline bool topo_sort() { return g->topo_sort(); }
bool topo_sort() { return g->topo_sort(); }
inline void optimize() { g->optimize(); }
void optimize() { g->optimize(); }
//------ runtime
inline void data_malloc() { g->dataMalloc(); }
void data_malloc() { g->dataMalloc(); }
inline void run() { g->getRuntime()->run(g); }
void run() { g->getRuntime()->run(g); }
Graph getGraph() const { return g; }
};
} // namespace infini
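
A minimal sketch of the extended handler interface through the Python bindings, assuming the `backend` module exposes `GraphHandler`, `TensorType`, and `cpu_runtime` as it does in the frontend tests below:

```python
from pyinfinitensor import backend

handler = backend.GraphHandler(backend.cpu_runtime())

# tensor() now takes a TensorType in addition to shape and dtype
# (dtype code 12, as used in the frontend test below).
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Initialized)
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)

x = handler.add(a, b, None)        # build an Add operator
handler.reshape(x, y, [3, 2, 1])   # reshape its result into y

handler.data_malloc()

# Boundary tensors of the graph are now accessible directly.
print(len(handler.inputs()), len(handler.outputs()))
```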

View File

@ -30,6 +30,7 @@ class Mutator {
virtual bool isMultiBranchMergable(const Graph &in_graph) {
IT_TODO_HALT();
}
virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
};
} // namespace infini

View File

@ -121,6 +121,7 @@ class OpRegistry {
FOP(ConvBackwardData);
FOP(Matmul);
FOP(ConvTrans);
FOP(ConvTransNHWC);
FOP(G2BMM);
FOP(GBMM);
FOP(Pad);
@ -209,7 +210,8 @@ class OpRegistry {
//
FOP(MemBound);
default:
IT_ASSERT(false);
IT_ASSERT(false, "Unknown OpType " +
std::to_string(enum_to_underlying(opType)));
break;
}
#undef FOP

View File

@ -1,5 +1,6 @@
#pragma once
#include "core/common.h"
#include "core/object.h"
#include "core/ref.h"
#include <memory>
@ -59,10 +60,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* execution happens.
*
* @param graph
* @param profiling Whether to print breakdown of time
* @param printProfiling Whether to print breakdown of time
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool profiling = false) const;
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false,
bool ignoreMemboundOp = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::INTELCPU;
@ -77,6 +80,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
size_t bytes) const = 0;
virtual string toString() const = 0;
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
int repeat = 1000) const;
protected:
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,

View File

@ -4,19 +4,15 @@
#include "graph.h"
#include "mutator.h"
#include <unordered_map>
namespace infini {
class SearchEngine {
private:
Runtime runtimeExec;
Ref<Mutator> mutator;
std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
public:
SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
runtimeExec = _runtime;
mutator = _mutator;
}
SearchEngine(Runtime runtime, Ref<Mutator> mutator);
~SearchEngine() {}
private: // Configurations
@ -24,24 +20,17 @@ class SearchEngine {
3; // cut nodes whose #in + #out >= partitionThreshold
size_t GRAPH_SIZE = 16; // num of best graphs.
private: // Composed objects
std::shared_ptr<Mutator> mutationEngine;
public:
std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
struct GroupEdge {
int v, next;
GroupEdge() = delete;
};
struct Candidate { // a graph with perf
std::shared_ptr<Graph> graph;
double perf = INFINITY;
};
class MetaGraph { // a graph of subgraphs, for searching.
public:
MetaGraph() {}
~MetaGraph() {}
// struct Candidate { // a graph with perf
// Graph graph;
// double perf = INFINITY;
// };
struct MetaGraphObj { // a graph of subgraphs, for searching.
struct Node {
Graph graph;
std::vector<int> suc;
@ -50,31 +39,33 @@ class SearchEngine {
};
std::vector<Node> nodes;
};
using MetaGraph = Ref<MetaGraphObj>;
Graph run(const Graph graph); // entrance of search engine.
Graph run(const Graph graph); // entrance to search engine.
std::vector<Graph> search(const Graph &graph); // search for a partition.
private:
std::vector<Graph> partitionGraph(const Graph graph);
std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
std::shared_ptr<MetaGraph>
buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
const std::vector<int> &plan);
MetaGraph buildMetaGraphWithGraph(const Graph graph);
MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan);
// search horizontal merges
std::vector<std::shared_ptr<MetaGraph>>
searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan, std::vector<int> &frontier,
std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet);
std::vector<Graph>
searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
std::vector<Graph> searchMutation(const MetaGraph &metaGraph);
void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
void printMetaGraph(MetaGraph metaGraph);
/**
* @brief Check whether a multi-branch graph can be merged into a single
* branch.
*/
bool isMultiBranchMergable(const Graph graph);
Graph fuseVertically(const Graph &graph);
double getEstimatedGraphPerf(Graph graph);
};
} // namespace infini

View File

@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
class TensorObj : public TensorBaseObj {
private:
Shape shape;
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
TensorObj(Shape shape, DataType dtype, Runtime runtime,
TensorType tensorType = TensorType::Other);
virtual ~TensorObj() {}
string toString() const override;
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
TensorType getTensorType() const { return tensorType; }
void load(std::string file_path);
void save(std::string file_path);
@ -74,6 +77,7 @@ class TensorObj : public TensorBaseObj {
// Thus the internal state of generator cannot be updated.
void setData(
std::function<void(void *, size_t, DataType)> const &generator) const;
void setData(const Blob &_blob) { data = _blob; }
Tensor clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
@ -106,13 +110,13 @@ class TensorObj : public TensorBaseObj {
size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
private:
template <class T> string dataToString() const {
template <class T> string dataToString(void *rawPtr) const {
std::stringstream builder;
builder << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = vector<int>(numDims, 1);
auto ptr = data->getPtr<T *>();
T *ptr = (T *)rawPtr;
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)

View File

@ -0,0 +1,14 @@
#pragma once
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw);
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw);
} // namespace infini

View File

@ -1,35 +1,22 @@
#pragma once
#include "core/runtime.h"
#include "cuda/cuda_common.h"
#include "nnet/dbg.h"
namespace infini {
class CudaRuntimeObj : public RuntimeObj {
private:
cudaStream_t stream;
cudnnHandle_t cudnn;
cublasHandle_t cublas;
CudaPtr workspace;
size_t workspaceSize;
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
public:
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
virtual ~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
CudaRuntimeObj();
virtual ~CudaRuntimeObj();
string toString() const override;
void run(const Graph &graph, bool tune = false,
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
bool isInCudaGraph() const { return cudaGraphStatus; }
cudaStream_t getStream() const { return stream; }
double timeWithCudaGraph(Graph graph);
private:
void tune(const Graph &graph, bool profiling) const;
void beginCudaGraphStreamCapture();
tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
};
} // namespace infini

View File

@ -0,0 +1,9 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
namespace infini {
namespace callback {
void exportONNX(const Graph &graph, const string &path);
}
} // namespace infini

View File

@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
string visit_(const Subscript &c) override;
string visit_(const Var &c) override;
string visit_(const Tensor &c) override;
string visit_(const Func &c) override;
string dispatchRoutine(const Routine &c);
Expr buildExprTree(string key);
@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
Serializer(int _verobse = 0);
virtual ~Serializer();
/**
* @brief Serialize the given expression to string
*
* @param expr The expression to be serialized
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return The serialized string, or std::nullopt if serialization fails
*/
std::optional<std::string> toString(Expr const &expr,
const string &msg = "",
vector<Tensor> inputs = {},
double exec_time = -1e9,
string hint = "");
/**
* @brief Serialize the given expression to json file
*
* @param expr The expression to be serialized
* @param filePath The path of json file to be output
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return bool Whether the serialization succeeds
*/
bool serialize(const Expr &expr, const string &filePath,
const string &msg = "");
bool toFile(const Expr &expr, const string &filePath,
const string &msg = "", vector<Tensor> inputs = {},
double exec_time = -1e9, string hint = "");
/**
* @brief Deserialize the given string to an expression
*
* @param text The text of the expr to be deserialized
* @return Expression deserialized from the given string
*/
Expr fromString(const string &text);
/**
* @brief Deserialize the given json file to expression
@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
* @param filePath The path to file to be deserialized
* @return Expression deserialized from the given json file
*/
Expr deserialize(const string &filePath);
Expr fromFile(const string &filePath);
tuple<Expr, vector<Tensor>, double, string>
deserializeAsMemobundOp(const string &filePath);
// FIXME: the order of elements in tuple is not consistent with memboundObj
// constructor
tuple<Expr, vector<Tensor>, double, string>
membundOpFromString(const string &data);
};
} // namespace nnet
} // namespace nnet

View File

@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
{ IT_TODO_HALT(); }
#define nnet_unimplemented_continue() \
{ dbg("Unimplemented"); }
{}
// { dbg("Unimplemented"); }
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))

View File

@ -104,7 +104,7 @@ enum class NodeType {
FuncNodeType
};
enum class FuncType { Relu, Tanh, PRelu };
enum class FuncType { Relu = 1000, Tanh, PRelu };
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
NodeType getType() const override { return NodeType::CLASS##Type; } \
@ -206,7 +206,8 @@ struct IterationType {
enum { Loop, Sum };
constexpr static int NumIterationType = 2;
};
class RangeOpNode : public OperatorNode {
class RangeOpNode : public OperatorNode,
public std::enable_shared_from_this<RangeOpNode> {
public:
enum { Summand, END_POS };
constexpr static int Loop = IterationType::Loop;
@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
return 0;
};
string toReadable() const override;
string getFullExpression();
const Expr &getSummand() const { return subExprs[Summand]; }
const vector<VarRangePair> &getVarRanges(int _index) const {
return vars[_index];

View File

@ -7,12 +7,13 @@ namespace infini {
class NMutator : public Mutator {
public:
enum class Mode { Normal, ToNaiveMembound, RuleBased };
using NameNToTensorT = map<string, Tensor>;
private:
// Suffix -N: NNet objects.
// Suffix -T: tpm objects.
// Map: NNet tensors -> tpm tensor.
std::map<std::string, Tensor> inputsNameNToTensorT;
NameNToTensorT inputsNameNToTensorT;
Mode mode;
const double bandwidth = double(200) * 1024 * 1024 * 1024;
// If in RuleBased mode, use derivationRules in derivator
@ -24,8 +25,9 @@ class NMutator : public Mutator {
~NMutator();
vector<Graph> run(const Graph &in_graph) override;
void setToNaiveMembound();
Graph fuseVertically(const Graph &in_graph) override;
void setToNaiveMembound();
void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
long long cntStates = 0;
long long cntCandidates = 0;
@ -33,6 +35,14 @@ class NMutator : public Mutator {
private:
int maxDepth = 8;
nnet::Expr opToExpression(Operator op);
/// @brief Extract the NNet expression and tensor mapping from an operator
/// @param op The operator to be converted
/// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
static pair<nnet::Expr, NMutator::NameNToTensorT>
generateUnaryExpr(const Operator &op);
static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
/**
@ -48,8 +58,8 @@ class NMutator : public Mutator {
// TODO: recover these rules
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
// Graph transformTConv1x1(Operator op);
// Graph transformTConv3x3(Operator op);
Graph transformConvtransposed1x1(Operator _op);
// Graph transformConvtransposed(Operator op);
// Graph transformDialtedConv(Operator op);
// Graph transformConv1x1(Operator op);
// Graph transformConv1xk(Operator op);

View File

@ -6,12 +6,17 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr, simplifiedExpr;
nnet::Expr expr;
std::vector<nnet::Tensor>
nnetInputs; // The order of inputs in nnetInputs should be consistent
// with inputs in infinitensor
double exec_time;
std::string hint;
HashType hash, simplifiedHash;
int n, f, h, w;
// Generated attributes
HashType hash;
nnet::Expr simplifiedExpr;
HashType simplifiedHash;
public:
MemBoundObj(GraphObj *graph, const TensorVec &input,
@ -27,9 +32,12 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
HashType getHash() const { return hash; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {expr, hash};
}
double getEstimatedTime() const { return exec_time; }
string toJson() const;
private:
vector<int> getWorkloadVector() const override;

View File

@ -46,10 +46,13 @@ class RandomGenerator : public DataGenerator {
std::mt19937 e;
std::uniform_int_distribution<int> di;
std::uniform_real_distribution<float> dr;
bool generateInteger;
public:
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0)
: l(l), r(r), e(seed), di(l, r), dr(l, r) {}
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
bool generateInteger = false)
: l(l), r(r), e(seed), di(l, r), dr(l, r),
generateInteger(generateInteger) {}
virtual ~RandomGenerator() {}
private:
@ -60,7 +63,7 @@ class RandomGenerator : public DataGenerator {
}
void fill(float *data, size_t size) override {
for (size_t i = 0; i < size; i++) {
data[i] = dr(e);
data[i] = (generateInteger) ? di(e) : dr(e);
}
}
};

View File

@ -37,29 +37,48 @@ class OnnxStub:
outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
handler: backend.GraphHandler
disable_check: bool
def __init__(self, model: ModelProto, runtime):
model = infer_shapes(model)
self.handler = backend.GraphHandler(runtime)
@classmethod
def from_onnx(cls, model: ModelProto, runtime, enable_onnx_shape_infernce=True):
if enable_onnx_shape_infernce:
model = infer_shapes(model)
ans = OnnxStub()
ans.handler = backend.GraphHandler(runtime)
tensors: Dict[str, backend.Tensor] = dict()
data: Dict[str, TensorProto] = dict()
cnt_infini_inputs = 0
for input in model.graph.input:
dims = _take_shape_dim(input.type.tensor_type.shape)
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
if input.name.startswith('input'):
tensor_type = backend.TensorType.Input
cnt_infini_inputs += 1
else:
tensor_type = backend.TensorType.Initialized
tensors[input.name] = ans.handler.tensor(
dims,
input.type.tensor_type.elem_type,
tensor_type,
)
assert cnt_infini_inputs == 1, f'{cnt_infini_inputs} tensor names start with "input" found.'
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
dims, output.type.tensor_type.elem_type
tensors[output.name] = ans.handler.tensor(
dims,
output.type.tensor_type.elem_type,
backend.TensorType.Other,
)
for initializer in model.graph.initializer:
dims = [d for d in initializer.dims]
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
tensors[initializer.name] = ans.handler.tensor(
dims,
initializer.data_type,
backend.TensorType.Initialized,
)
data[initializer.name] = initializer
for node in model.graph.node:
@ -77,7 +96,7 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors[node.input[0]], None, p, [-2, -1]
)
p = [0, 0, 0, 0]
@ -87,7 +106,7 @@ class OnnxStub:
if len(node.input) > 2:
bias = "{}-bias".format(node.output[0])
reshape = "{}-reshape".format(node.output[0])
tensors[bias] = self.handler.conv(
tensors[bias] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
None,
@ -98,7 +117,7 @@ class OnnxStub:
d[0],
d[1],
)
tensors[reshape] = self.handler.reshape(
tensors[reshape] = ans.handler.reshape(
tensors[node.input[2]],
None,
[
@ -111,13 +130,13 @@ class OnnxStub:
1,
],
)
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[bias],
tensors[reshape],
tensors.get(node.output[0]),
)
else:
tensors[node.output[0]] = self.handler.conv(
tensors[node.output[0]] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -142,7 +161,7 @@ class OnnxStub:
attributes[name]
for name in ["dilations", "pads", "strides", "output_padding"]
)
tensors[node.output[0]] = self.handler.convTransposed2d(
tensors[node.output[0]] = ans.handler.convTransposed2d(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -156,7 +175,7 @@ class OnnxStub:
op[1],
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -175,7 +194,7 @@ class OnnxStub:
# FIXME unsupport attributes: `alpha` `beta`
assert alpha == 1.0
assert beta == 1.0
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -196,7 +215,7 @@ class OnnxStub:
attributes[name]
for name in ["momentum", "epsilon", "training_mode"]
)
tensors[node.output[0]] = self.handler.batchNorm(
tensors[node.output[0]] = ans.handler.batchNorm(
input, output, mean, var, scale, bias, momentum, eps, training != 0
)
elif node.op_type == "MaxPool":
@ -215,10 +234,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -231,7 +250,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -257,10 +276,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -273,7 +292,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -287,7 +306,7 @@ class OnnxStub:
)
elif node.op_type == "GlobalAveragePool":
[_, _, h, w] = _search_shape(model, node.input[0])
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
h,
@ -300,52 +319,52 @@ class OnnxStub:
1,
)
elif node.op_type == "Add":
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sub":
tensors[node.output[0]] = self.handler.sub(
tensors[node.output[0]] = ans.handler.sub(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Mul":
tensors[node.output[0]] = self.handler.mul(
tensors[node.output[0]] = ans.handler.mul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Div":
tensors[node.output[0]] = self.handler.div(
tensors[node.output[0]] = ans.handler.div(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Pow":
tensors[node.output[0]] = self.handler.pow(
tensors[node.output[0]] = ans.handler.pow(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Relu":
tensors[node.output[0]] = self.handler.relu(
tensors[node.output[0]] = ans.handler.relu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sigmoid":
tensors[node.output[0]] = self.handler.sigmoid(
tensors[node.output[0]] = ans.handler.sigmoid(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Tanh":
tensors[node.output[0]] = self.handler.tanh(
tensors[node.output[0]] = ans.handler.tanh(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Softmax":
tensors[node.output[0]] = self.handler.softmax(
tensors[node.output[0]] = ans.handler.softmax(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(
@ -353,34 +372,34 @@ class OnnxStub:
),
)
elif node.op_type == "Abs":
tensors[node.output[0]] = self.handler.abs(
tensors[node.output[0]] = ans.handler.abs(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Shape":
tensors[node.output[0]] = self.handler.shape(
tensors[node.output[0]] = ans.handler.shape(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Identity":
tensors[node.output[0]] = self.handler.identity(
tensors[node.output[0]] = ans.handler.identity(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Flatten":
tensors[node.output[0]] = self.handler.flatten(
tensors[node.output[0]] = ans.handler.flatten(
tensors[node.input[0]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "PRelu":
tensors[node.output[0]] = self.handler.pRelu(
tensors[node.output[0]] = ans.handler.pRelu(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Clip":
tensors[node.output[0]] = self.handler.clip(
tensors[node.output[0]] = ans.handler.clip(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(_parse_data(data[node.input[1]]).__iter__(), None)
@ -394,7 +413,7 @@ class OnnxStub:
perm = next(
(attr.ints for attr in node.attribute if attr.name == "perm"), None
)
tensors[node.output[0]] = self.handler.transpose(
tensors[node.output[0]] = ans.handler.transpose(
tensors[node.input[0]],
tensors.get(node.output[0]),
perm,
@ -409,7 +428,7 @@ class OnnxStub:
temp = reduce(lambda acc, x: acc * x, input_shape, 1)
if temp < 0:
input_shape[input_shape.index(-1)] = size // -temp
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
@ -426,7 +445,7 @@ class OnnxStub:
for i, x in enumerate(input_shape):
if i not in axes:
output_shape.append(x)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
output_shape,
@ -440,13 +459,13 @@ class OnnxStub:
)
for i in axes:
input_shape.insert(i, 1)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
)
elif node.op_type == "Concat":
tensors[node.output[0]] = self.handler.concat(
tensors[node.output[0]] = ans.handler.concat(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
@ -454,7 +473,7 @@ class OnnxStub:
elif node.op_type == "Split":
for name, tensor in zip(
node.output,
self.handler.split(
ans.handler.split(
tensors[node.input[0]],
None,
next(
@ -466,14 +485,14 @@ class OnnxStub:
):
tensors[name] = tensor
elif node.op_type == "Gather":
tensors[node.output[0]] = self.handler.gather(
tensors[node.output[0]] = ans.handler.gather(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "ReduceMean":
tensors[node.output[0]] = self.handler.reduce_mean(
tensors[node.output[0]] = ans.handler.reduce_mean(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors[node.input[1]] if len(node.input) > 1 else None,
@ -481,7 +500,7 @@ class OnnxStub:
!= 0,
)
elif node.op_type == "Slice":
tensors[node.output[0]] = self.handler.slice(
tensors[node.output[0]] = ans.handler.slice(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -490,7 +509,7 @@ class OnnxStub:
_parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
)
elif node.op_type == "Pad":
tensors[node.output[0]] = self.handler.pad(
tensors[node.output[0]] = ans.handler.pad(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -499,7 +518,7 @@ class OnnxStub:
elif node.op_type == "Dropout":
for name, tensor in zip(
node.output,
self.handler.dropout(
ans.handler.dropout(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors.get(node.output[1]) if len(node.output) > 1 else None,
@ -512,18 +531,33 @@ class OnnxStub:
),
):
tensors[name] = tensor
elif node.op_type == "MemBound":
attributes = _parse_attribute(node, {"expr": None})
expr: str = attributes["expr"]
assert expr is not None
assert (
len(node.output) == 1
), """MemBound with multiple
outputs requires rewrite the logic of tensor creation"""
outputs = ans.handler.memBound(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
expr,
)
for name, tensor in zip(node.output, outputs):
tensors[name] = tensor
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
self.handler.data_malloc()
ans.handler.data_malloc()
for name, obj in tensors.items():
tensor = data.get(name)
if tensor == None:
if any(input.name == name for input in model.graph.input):
self.inputs[name] = obj
ans.inputs[name] = obj
else:
self.initializer[obj.fuid()] = tensor
ans.initializer[obj.fuid()] = tensor
if tensor.data_type == TensorProto.INT32:
obj.copyin_int32(_parse_data(tensor))
elif tensor.data_type == TensorProto.INT64:
@ -533,8 +567,19 @@ class OnnxStub:
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
for output in model.graph.output:
self.outputs[output.name] = tensors[output.name]
return ans
@classmethod
def from_graph(cls, g: backend.Graph):
ans = OnnxStub()
handler = backend.GraphHandler(g)
for i, tensor in enumerate(handler.inputs()):
ans.inputs["input{}".format(i)] = tensor
for i, tensor in enumerate(handler.outputs()):
ans.inputs["output{}".format(i)] = tensor
ans.handler = handler
ans.disable_check = True
return ans
def to_onnx(self, name: str) -> ModelProto:
class Context:
@ -552,6 +597,13 @@ class OnnxStub:
outputs: List[ValueInfoProto] = []
# saves global input tensors
initializers: List[TensorProto] = []
# saves shape information of intermediate (local) tensors
value_info: List[ValueInfoProto] = []
enable_check = False
def __init__(self, enable_check):
self.enable_check = enable_check
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
@ -562,12 +614,15 @@ class OnnxStub:
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
if not tensor.has_target(): # if this output is a global output
self.outputs.append(value_info)
else: # if this output is a local output
self.value_info.append(value_info)
return name
def push_input(
@ -577,7 +632,10 @@ class OnnxStub:
# means that this input is a global input
if name is None:
self.count_in += 1
name = "input{}".format(self.count_in)
if tensor.getTensorType() == backend.TensorType.Input:
name = "input{}".format(self.count_in)
else:
name = "weight{}".format(self.count_in)
self.names[tensor] = name
if init != None:
init.name = name
@ -605,17 +663,25 @@ class OnnxStub:
return name
def push_node(self, node: NodeProto) -> None:
check_node(node)
if self.enable_check:
check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
self.nodes,
name,
self.inputs,
self.outputs,
self.initializers,
value_info=self.value_info,
)
check_graph(graph)
if self.enable_check:
check_graph(graph)
model = make_model(graph)
check_model(model)
if self.enable_check:
check_model(model)
return model
@ -625,7 +691,7 @@ class OnnxStub:
ops = self.handler.operators() # all operators (nodes) in the graph
ctx = Context()
ctx = Context(not self.disable_check)
for op in ops:
ty, name = ctx.name_op(op)
@ -812,6 +878,18 @@ class OnnxStub:
ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.MemBound:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
expr=backend.membound_expr_of(op),
hash=str(backend.membound_hash_of(op)),
)
)
else:
raise Exception("Unsupported OpType", ty)
@ -828,7 +906,7 @@ class OnnxStub:
def from_onnx(model: ModelProto, runtime):
stub = OnnxStub(model, runtime)
stub = OnnxStub.from_onnx(model, runtime)
return stub.inputs, stub.outputs, stub.handler
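
A hedged sketch of the two factory constructors added in this change set; `optimized_graph` is a placeholder for a `backend.Graph` produced elsewhere (e.g. by the C++ search engine):

```python
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

# Import an ONNX model; this replaces the old OnnxStub(model, runtime) call.
stub = OnnxStub.from_onnx(onnx.load("model.onnx"), backend.cpu_runtime())

# Export the imported graph back to an ONNX ModelProto.
model = stub.to_onnx("new")

# Wrap an existing backend.Graph without going through ONNX; the ONNX checker
# calls are skipped for such stubs because the graph may contain custom
# operators such as MemBound.
# stub2 = OnnxStub.from_graph(optimized_graph)
# model2 = stub2.to_onnx("optimized")
```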

View File

@ -40,9 +40,9 @@ class TestStringMethods(unittest.TestCase):
file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
)
)
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
"new"
)
model = OnnxStub.from_onnx(
onnx.load(model_file), backend.cpu_runtime()
).to_onnx("new")
model = infer_shapes(model)
def test_tensor(self):
@ -304,16 +304,16 @@ class TestStringMethods(unittest.TestCase):
def test_frontend(self):
handler = backend.GraphHandler(backend.cpu_runtime())
a = handler.tensor([1, 2, 3], 12)
b = handler.tensor([1, 2, 3], 12)
c = handler.tensor([1, 2, 3], 12)
d = handler.tensor([1, 2, 3], 12)
e = handler.tensor([1, 2, 3], 12)
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
c = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
d = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
e = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
x = handler.add(
handler.add(handler.add(handler.add(a, b, None), c, None), d, None), e, None
)
y = handler.tensor([3, 2, 1], 12)
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
handler.reshape(x, y, [3, 2, 1])

View File

@ -1,106 +1,131 @@
import re
import os
import sys
import json
from contextlib import redirect_stdout
import time
import logging
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
import os
import json
import logging
USE_CACHE = True
logging.basicConfig()
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
tvm_code, func_name, nnet_expression: str,
nnet_simplified_expression: str, hash_code=None):
nnet_simplified_expression: str, hash_code: str = None):
assert len(input_tensors) == len(input_dtypes)
logging.debug(f'Work on hash {hash_code}')
logger.debug(f'Work on hash {hash_code}')
dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
if not os.path.exists(dir_name):
os.makedirs(dir_name)
so_fn = os.path.join(dir_name, f"{func_name}.so")
config_fn = os.path.join(dir_name, "config_so.json")
print("Generating Ansor op: ")
print(tvm_code)
print("Input shape: ")
print(input_tensors)
print("Output shape: ")
print(output_tensor)
desc_fn = os.path.join(dir_name, "desc.txt")
log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
out_fn = os.path.join(dir_name, "out.txt")
logger.debug(f"Generating Ansor op: {tvm_code}")
logger.debug(f"Input shape: {input_tensors}")
logger.debug(f"Output shape: {output_tensor}")
if USE_CACHE and hash_code is not None:
if os.path.exists(dir_name) and \
os.path.exists(so_fn) and \
os.path.exists(config_fn):
os.path.exists(so_fn) and \
os.path.exists(config_fn):
print(f"Use cache in {dir_name}")
with open(config_fn, "r") as config_fin:
config = json.loads(config_fin.read().strip())
conv_time = config["conv_time"]
logger.debug(f'Find tuning log for {hash_code}')
logger.info(f'Find tuning log for {hash_code} in {so_fn}')
return so_fn, conv_time
logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
time_start = time.perf_counter()
# Print descriptions of the task
if USE_CACHE and hash_code is not None:
with redirect_stdout(open(desc_fn, "w")):
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====TVM compute====")
print(tvm_code+"\n")
print("Input shape: ", input_tensors)
print("Output shape: ", output_tensor)
@auto_scheduler.register_workload(func_name)
def compute():
_locals = locals()
exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
exec(tvm_code, {'tvm': tvm, 'te': te,
'tir': tir, 'topi': topi}, _locals)
return _locals['ret']
target = tvm.target.Target("cuda")
task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
with redirect_stdout(open(out_fn, 'w')):
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
log_file = f"ansor_{func_name}_log.json"
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
verbose=2,
)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
verbose=2,
)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_fn)
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
time_end = time.perf_counter()
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====Time====")
print(conv_time)
if USE_CACHE and hash_code is not None:
with open(config_fn, "w") as config_fout:
config_fout.write(json.dumps({
"conv_time": conv_time,
"tuning_time": time_end - time_start,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
}, ensure_ascii=False, indent=2))
return so_fn, conv_time
# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.
def pipe_gen(fd: int):
args = json.load(sys.stdin) # read from pipe
# print(args, f'fd={fd}')
ret = gen_ansor_so(**args)
with os.fdopen(fd, 'w') as f:
print(ret[0], file=f, end='') # write to pipe
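
For reference, a hedged sketch of the argument object that `pipe_gen` reads from the pipe and expands into `gen_ansor_so(**args)`; the shapes and the TVM compute string are illustrative placeholders, and actually running the tuning requires TVM with a CUDA device:

```python
import json

# Keys mirror the keyword parameters of gen_ansor_so above.
args = {
    "input_tensors": [[1, 3, 224, 224]],   # one shape per input tensor
    "input_dtypes": ["float32"],
    "output_tensor": [1, 3, 224, 224],
    "output_dtype": "float32",
    # The compute body is exec-ed with tvm/te/tir/topi in scope and must
    # bind the list of TE tensors to a variable named `ret`.
    "tvm_code": (
        "A = te.placeholder((1, 3, 224, 224), name='A')\n"
        "B = topi.nn.relu(A)\n"
        "ret = [A, B]"
    ),
    "func_name": "relu_example",
    "nnet_expression": "<nnet expr>",
    "nnet_simplified_expression": "<simplified nnet expr>",
    "hash_code": "example_hash",
}

# The caller writes this object to the pipe as JSON; pipe_gen then calls
# gen_ansor_so(**args) and writes the generated .so path back.
print(json.dumps(args, indent=2))
```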

View File

@ -0,0 +1,7 @@
import backend
from backend import *
import sys
sys.path.extend(__path__)
print("import backend: {}".format(backend))

View File

@ -0,0 +1,941 @@
import backend
import onnx
from onnx import (
ModelProto,
TensorProto,
NodeProto,
AttributeProto,
TensorShapeProto,
ValueInfoProto,
)
from onnx.helper import (
make_node,
make_tensor_value_info,
make_tensor,
make_graph,
make_model,
)
from onnx.checker import (
check_graph,
check_model,
check_node,
check_value_info,
check_tensor,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
class OnnxStub:
"""
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
# inputs: Dict[str, backend.Tensor] = {}
# outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
# handler: backend.GraphHandler
# def __init__(self, model: ModelProto, runtime):
# model = infer_shapes(model)
# self.handler = backend.GraphHandler(runtime)
# tensors: Dict[str, backend.Tensor] = dict()
# data: Dict[str, TensorProto] = dict()
# for input in model.graph.input:
# dims = _take_shape_dim(input.type.tensor_type.shape)
# tensors[input.name] = self.handler.tensor(
# dims, input.type.tensor_type.elem_type
# )
# for output in model.graph.output:
# dims = _take_shape_dim(output.type.tensor_type.shape)
# tensors[output.name] = self.handler.tensor(
# dims, output.type.tensor_type.elem_type
# )
# for initializer in model.graph.initializer:
# dims = [d for d in initializer.dims]
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
# data[initializer.name] = initializer
# for node in model.graph.node:
# if node.op_type == "Conv":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (d, p, s) = (
# attributes[name] for name in ["dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors[node.input[0]], None, p, [-2, -1]
# )
# p = [0, 0, 0, 0]
# else:
# adapt = node.input[0]
# if len(node.input) > 2:
# bias = "{}-bias".format(node.output[0])
# reshape = "{}-reshape".format(node.output[0])
# tensors[bias] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# None,
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# tensors[reshape] = self.handler.reshape(
# tensors[node.input[2]],
# None,
# [
# 1,
# reduce(
# lambda acc, x: acc * x,
# _search_shape(model, node.input[2]),
# ),
# 1,
# 1,
# ],
# )
# tensors[node.output[0]] = self.handler.add(
# tensors[bias],
# tensors[reshape],
# tensors.get(node.output[0]),
# )
# else:
# tensors[node.output[0]] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# elif node.op_type == "ConvTranspose":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0],
# "strides": [1, 1],
# "output_padding": [0, 0],
# },
# )
# (d, p, s, op) = (
# attributes[name]
# for name in ["dilations", "pads", "strides", "output_padding"]
# )
# tensors[node.output[0]] = self.handler.convTransposed2d(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# op[0],
# op[1],
# )
# elif node.op_type == "MatMul":
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# False,
# False,
# None,
# backend.ActType.Linear,
# )
# elif node.op_type == "Gemm":
# attributes = _parse_attribute(
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
# )
# (alpha, beta, transA, transB) = (
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
# )
# # FIXME unsupport attributes: `alpha` `beta`
# assert alpha == 1.0
# assert beta == 1.0
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# transA == 1,
# transB == 1,
# tensors[node.input[2]] if len(node.input) > 2 else None,
# backend.ActType.Linear,
# )
# elif node.op_type == "BatchNormalization":
# (input, mean, var, scale, bias) = (
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
# )
# output = tensors.get(node.output[0])
# attributes = _parse_attribute(
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
# )
# (momentum, eps, training) = (
# attributes[name]
# for name in ["momentum", "epsilon", "training_mode"]
# )
# tensors[node.output[0]] = self.handler.batchNorm(
# input, output, mean, var, scale, bias, momentum, eps, training != 0
# )
# elif node.op_type == "MaxPool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, d, p, s) = (
# attributes[name]
# for name in ["kernel_shape", "dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "AveragePool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, p, s) = (
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "GlobalAveragePool":
# [_, _, h, w] = _search_shape(model, node.input[0])
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# h,
# w,
# 1,
# 1,
# 0,
# 0,
# 1,
# 1,
# )
# elif node.op_type == "Add":
# tensors[node.output[0]] = self.handler.add(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sub":
# tensors[node.output[0]] = self.handler.sub(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Mul":
# tensors[node.output[0]] = self.handler.mul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Div":
# tensors[node.output[0]] = self.handler.div(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Pow":
# tensors[node.output[0]] = self.handler.pow(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Relu":
# tensors[node.output[0]] = self.handler.relu(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sigmoid":
# tensors[node.output[0]] = self.handler.sigmoid(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Tanh":
# tensors[node.output[0]] = self.handler.tanh(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Softmax":
# tensors[node.output[0]] = self.handler.softmax(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Abs":
# tensors[node.output[0]] = self.handler.abs(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Shape":
# tensors[node.output[0]] = self.handler.shape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Identity":
# tensors[node.output[0]] = self.handler.identity(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Flatten":
# # FIXME axis must be 1
# axis = next(
# (attr.i for attr in node.attribute if attr.name == "axis"), None
# )
# assert axis == None or axis == 1
# tensors[node.output[0]] = self.handler.flatten(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "PRelu":
# tensors[node.output[0]] = self.handler.pRelu(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Clip":
# tensors[node.output[0]] = self.handler.clip(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(_parse_data(data[node.input[1]]).__iter__(), None)
# if len(node.input) > 1
# else None,
# next(_parse_data(data[node.input[2]]).__iter__(), None)
# if len(node.input) > 2
# else None,
# )
# elif node.op_type == "Transpose":
# perm = next(
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
# )
# tensors[node.output[0]] = self.handler.transpose(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# perm,
# )
# elif node.op_type == "Reshape":
# dims = _search_shape(model, node.input[0])
# size = reduce(lambda acc, x: acc * x, dims)
# input_shape = _parse_data(data[node.input[1]])
# for i, x in enumerate(input_shape):
# if x == 0:
# input_shape[i] = dims[i]
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
# if temp < 0:
# input_shape[input_shape.index(-1)] = size // -temp
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Squeeze":
# input_shape = _search_shape(model, node.input[0])
# axes = set(
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# assert all(input_shape[d] == 1 for d in axes)
# output_shape = []
# for i, x in enumerate(input_shape):
# if i not in axes:
# output_shape.append(x)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# output_shape,
# )
# elif node.op_type == "Unsqueeze":
# input_shape = _search_shape(model, node.input[0])
# axes = (
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# for i in axes:
# input_shape.insert(i, 1)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Concat":
# tensors[node.output[0]] = self.handler.concat(
# [tensors[name] for name in node.input],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "Split":
# for name, tensor in zip(
# node.output,
# self.handler.split(
# tensors[node.input[0]],
# None,
# next(
# (attr.i for attr in node.attribute if attr.name == "axis"),
# 0,
# ),
# len(node.output),
# ),
# ):
# tensors[name] = tensor
# elif node.op_type == "Gather":
# tensors[node.output[0]] = self.handler.gather(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "ReduceMean":
# tensors[node.output[0]] = self.handler.reduce_mean(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(
# (attr.ints for attr in node.attribute if attr.name == "axes"),
# None,
# ),
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
# != 0,
# )
# elif node.op_type == "Slice":
# tensors[node.output[0]] = self.handler.slice(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[2]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
# )
# elif node.op_type == "Pad":
# tensors[node.output[0]] = self.handler.pad(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# )
# elif node.op_type == "Dropout":
# for name, tensor in zip(
# node.output,
# self.handler.dropout(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
# _parse_data(data[node.input[1]])[0]
# if len(node.input) > 1
# else 0.5,
# _parse_data(data[node.input[2]])[0]
# if len(node.input) > 2
# else False,
# ),
# ):
# tensors[name] = tensor
# else:
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
# self.handler.data_malloc()
# for name, obj in tensors.items():
# tensor = data.get(name)
# if tensor == None:
# if any(input.name == name for input in model.graph.input):
# self.inputs[name] = obj
# else:
# self.initializer[obj.fuid()] = tensor
# if tensor.data_type == TensorProto.INT32:
# obj.copyin_int32(_parse_data(tensor))
# elif tensor.data_type == TensorProto.INT64:
# obj.copyin_int64(_parse_data(tensor))
# elif tensor.data_type == TensorProto.FLOAT:
# obj.copyin_float(_parse_data(tensor))
# else:
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
# for output in model.graph.output:
# self.outputs[output.name] = tensors[output.name]
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
class Context:
# saves object names, including tensors and operators
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
# counts the occurrence times of each operator for naming
count_op: Dict[backend.OpType, int] = dict()
# counts input and output tensors for naming
count_in, count_out = 0, 0
# saves nodes (operators)
nodes: List[NodeProto] = []
# saves global input tensors
inputs: List[ValueInfoProto] = []
# saves global output tensors
outputs: List[ValueInfoProto] = []
# saves weight/initializer tensors
initializers: List[TensorProto] = []
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
name = "{}_{}".format(ty.name, op.guid())
self.names[op] = name
self.count_op[ty] = self.count_op.get(ty, 0) + 1
return ty, name
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.outputs.append(value_info)
return name
def push_input(
self, tensor: backend.Tensor, init: Optional[TensorProto]
) -> str:
name = self.names.get(tensor)
# a tensor without a recorded name is either a graph-level input or an initializer
if name is None:
self.count_in += 1
name = "input_{}".format(tensor.guid())
self.names[tensor] = name
if init != None:
init.name = name
self.initializers.append(init)
else:
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.inputs.append(value_info)
return name
def push_data_input(
self,
node_name: str,
attr_name: str,
elem_type: int,
shape: Sequence[int],
vals: Any,
) -> str:
name = "{}_{}".format(node_name, attr_name)
tensor = make_tensor(name, elem_type, shape, vals)
check_tensor(tensor)
self.initializers.append(tensor)
return name
def push_node(self, node: NodeProto) -> None:
# check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
)
# check_graph(graph)
model = make_model(graph)
# check_model(model)
return model
# topological sort
if not g.topo_sort():
raise Exception("Sorting fails")
ops = g.operators()  # all operators (nodes) in the graph
ctx = Context()
for op in ops:
ty, name = ctx.name_op(op)
inputs = [
ctx.push_input(it, self.initializer.get(it.fuid()))
for it in op.inputs()
]
outputs = [
ctx.push_output("{}_{}_{}".format(
name, i, tensor.guid()), tensor)
for (i, tensor) in enumerate(op.outputs())
]
if ty == backend.OpType.Conv:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
pads=[ph, pw, ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
group=op.inputs()[0].shape()[
1] // op.inputs()[1].shape()[1],
)
)
elif ty == backend.OpType.ConvTrans:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
pads=[ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
output_padding=[oph, opw],
)
)
elif ty == backend.OpType.ConvTransNHWC:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.MemBound:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"Membound",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.Matmul:
# transA, transB = backend.matmul_attrs_of(op)
# HACK: recover this
transA, transB = False, False
ctx.push_node(
make_node(
"Gemm", inputs, outputs, name, transA=transA, transB=transB
)
)
elif ty == backend.OpType.BatchNorm:
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
momentum, eps, training = backend.batch_norm_attrs_of(op)
ctx.push_node(
make_node(
"BatchNormalization",
inputs,
outputs,
name,
epsilon=eps,
momentum=momentum,
training_mode=training,
)
)
elif ty == backend.OpType.MaxPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
dilations=[dh, dw],
strides=[sh, sw],
)
)
elif ty == backend.OpType.AvgPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
"AveragePool",
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
strides=[sh, sw],
)
)
elif ty in [
backend.OpType.Add,
backend.OpType.Sub,
backend.OpType.Mul,
backend.OpType.Div,
backend.OpType.Pow,
backend.OpType.Relu,
backend.OpType.Sigmoid,
backend.OpType.Tanh,
backend.OpType.Softmax,
backend.OpType.Abs,
backend.OpType.Identity,
backend.OpType.PRelu,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Flatten:
raise Exception("TODO")
elif ty == backend.OpType.Transpose:
perm = backend.transpose_permute_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, perm=perm))
elif ty == backend.OpType.Reshape:
shape = backend.reshape_shape_of(op)
inputs.append(
ctx.push_data_input(
name,
"shape",
TensorProto.INT64,
[len(shape)],
shape,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.Split:
axis = backend.split_axis_of(op)
num_outputs = len(outputs)
split = op.inputs()[0].shape()[axis] // num_outputs
inputs.append(
ctx.push_data_input(
name,
"split",
TensorProto.INT64,
[len(outputs)],
[split for _ in range(0, num_outputs)],
)
)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
axis=axis,
)
)
elif ty == backend.OpType.Gather:
axis = backend.gather_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.ReduceMean:
axes, keepdims = backend.reduce_mean_attrs_of(op)
inputs.append(
ctx.push_data_input(
name, "axes", TensorProto.INT64, [len(axes)], axes
)
)
ctx.push_node(
make_node(ty.name, inputs, outputs,
name, keepdims=keepdims)
)
elif ty == backend.OpType.Slice:
raise Exception("TODO")
elif ty == backend.OpType.Pad:
pads = backend.pad_pads_of(op)
inputs.append(
ctx.push_data_input(
name, "pads", TensorProto.INT64, [len(pads)], pads
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
# elif ty == backend.OpType.Clip:
# min, max = backend.clip_attrs_of(op)
# if min != None:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
# )
# if max != None:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
# )
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
else:
raise Exception("Unsupported OpType", ty)
model = ctx.build(name)
onnx.save(model, path)
return model
# def init(self) -> None:
# self.handler.data_malloc()
# def optimize(self) -> None:
# self.handler.optimize()
# def run(self) -> None:
# self.handler.run()
# def from_onnx(model: ModelProto, runtime):
# stub = OnnxStub(model, runtime)
# return stub.inputs, stub.outputs, stub.handler
# def _search_shape(model: ModelProto, name: str) -> List[int]:
# ans = (
# next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.value_info
# if tensor.name == name
# ),
# None,
# )
# or next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.input
# if tensor.name == name
# ),
# None,
# )
# or next(
# [int(d) for d in tensor.dims]
# for tensor in model.graph.initializer
# if tensor.name == name
# )
# )
# return ans
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
# for attr in node.attribute:
# if attr.name in attrs:
# if attr.type == AttributeProto.INT:
# attrs[attr.name] = attr.i
# elif attr.type == AttributeProto.INTS:
# attrs[attr.name] = attr.ints
# elif attr.type == AttributeProto.FLOAT:
# attrs[attr.name] = attr.f
# elif attr.type == AttributeProto.STRING:
# attrs[attr.name] = attr.s
# elif attr.type == AttributeProto.TENSOR:
# attrs[attr.name] = attr.t
# else:
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
# return attrs
# def _parse_data(tensor: TensorProto) -> List[Any]:
# return to_array(tensor).flatten().tolist()
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
def export_onnx(g: backend.Graph, path: str) -> None:
stub = OnnxStub()
stub.to_onnx(g, path)
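A minimal usage sketch of the export helper above (hedged: it assumes the package is importable as pyinfinitensor, that the backend module exposes cuda_runtime() alongside the cpu_runtime() binding shown later in this diff, and a CUDA build so that getInfoGAN is compiled in):

from pyinfinitensor import backend
from pyinfinitensor.onnx import export_onnx   # module path assumed from the repo layout

runtime = backend.cuda_runtime()              # assumed binding, mirrors cpu_runtime()
g = backend.getInfoGAN(1, runtime, 5)         # InfoGAN test graph added later in this diff
export_onnx(g, "infogan.onnx")                # wraps OnnxStub().to_onnx(g, path)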

View File

@ -129,8 +129,9 @@ void GraphObj::dataMalloc() {
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}
Tensor GraphObj::addTensor(const Tensor &tensor) {

View File

@ -1,10 +1,12 @@
#include "core/graph_handler.h"
#include "nnet/Visitor/Serializer.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -19,8 +21,8 @@ namespace infini {
static DataType dtype_repr_convert(int);
Tensor GraphHandlerObj::tensor(Shape dims, int dtype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype));
Tensor GraphHandlerObj::tensor(Shape dims, int dtype, TensorType ttype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype), ttype);
}
Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,
@ -291,6 +293,22 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
}
}
TensorVec GraphHandlerObj::memBound(const TensorVec &inputs,
const Tensor &output,
const string &jsonString) {
const auto &[expr, nnetInputs, execTime, hint] =
nnet::Serializer().membundOpFromString(jsonString);
if (output) {
g->addOpWithOutputs<MemBoundObj>(std::move(inputs), TensorVec{output},
nnetInputs, expr, execTime, hint);
return {output};
} else
return g
->addOp<MemBoundObj>(std::move(inputs), TensorVec{nullptr},
nnetInputs, expr, execTime, hint)
->getOutputs();
}
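The handler-level entry point above takes the operator's nnet expression as a JSON string, which is how the ONNX importer with membound support (see the "Add: import ONNX with membound Op" commit) is expected to call it; a hedged Python-side sketch:

# json_str comes from the node's membound attribute (exact attribute name not shown in this
# diff), or from an existing operator via backend.membound_expr_of(op).
outputs = handler.memBound(
    [tensors[name] for name in node.input],  # input tensors already registered with the handler
    tensors.get(node.output[0]),             # pre-created output tensor, or None to let the handler infer it
    json_str,
)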
static DataType dtype_repr_convert(int dtype) {
switch ((OnnxDType)dtype) {
case OnnxDType::FLOAT:

View File

@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"
namespace infini {
@ -34,7 +35,8 @@ bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }
bool OperatorObj::isMemBoundOp() const {
return type == OpType::MemBound || type == OpType::Activation ||
type == OpType::Transpose;
type == OpType::Transpose || type == OpType::Relu ||
type == OpType::Tanh;
}
void OperatorObj::removePredecessors(const Operator &op) {
@ -83,22 +85,29 @@ HashType OperatorObj::hash() const {
bool OperatorObj::checkValid(GraphObj *graph) {
auto optShapes = inferShape();
IT_ASSERT(optShapes);
if (!optShapes) // shape inference failed
return false;
const vector<Shape> &shapes = *optShapes;
IT_ASSERT(shapes.size() == outputs.size());
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
outputs[i] =
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
}
} else { // if outputs have been created, check their shapes
for (size_t i = 0; i < shapes.size(); ++i) {
if (shapes[i] != outputs[i]->getDims())
IT_ASSERT(shapes[i] == outputs[i]->getDims());
if (shapes[i] != outputs[i]->getDims()) {
dbg(shapes[i], outputs[i]->getDims());
return false;
}
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
}
}
return true;

View File

@ -2,9 +2,15 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>
#ifdef USE_CUDA
#include "cuda_profiler_api.h"
#endif
namespace infini {
void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
if (!tune && profiling)
@ -56,13 +62,36 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
printProfilingData(totalTime, opTime, opCnt);
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
map<UidBaseType, bool> ctcMap; // compile-time computable
// Skip static computation
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
bool compileTimeComputable = true;
for (auto input : op->getInputs()) {
// FIXME: propagate the tensor type. Currently only the first operator
// after the weights is treated as compile-time computable.
if (input->getTensorType() != TensorType::Initialized)
compileTimeComputable = false;
}
ctcMap[op->getGuid()] = compileTimeComputable;
}
return ctcMap;
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@ -70,11 +99,21 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
PerfRecord record;
// Tune the kernel if there is no record
if (!perfData) {
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::Reshape) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Use the cached performance record
time = perfData->time;
} else {
// TODO: should tensors allocate memory automatically when their data is accessed?
// allocate memory for empty tensors and release it after profiling
// allocate memory for empty tensors and release it after
// profiling
TensorVec allocatedTensors;
for (auto t : op->getInputs())
if (!t->hasData())
@ -88,21 +127,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
}
// Profile operators and record the results
record = kernel->tune(op, this);
PerfRecord record = kernel->tune(op, this);
time = record->time;
perfEngine.setPerfData(perfKey, record);
// Free allocated memory
for (auto t : allocatedTensors)
t->freeData();
} else
record = perfData;
}
double t = record->time;
totalTime += t;
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", t);
opTime[op->getOpType()] += t;
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
}
}
@ -160,4 +198,44 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
},
[&]() { sync(); }, warmup, repeat);
return ret;
}
} // namespace infini
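Both timing entry points are exposed to Python further down in this diff; a minimal sketch of how they are meant to be combined (assuming g is a backend.Graph and runtime a CUDA runtime):

# Estimated time: 0 for compile-time computable ops and Reshape, analytical estimates
# for MemBound ops, cached or tuned kernel times for everything else.
est = runtime.getPerfTime(g, False, True, False)     # profiling, allowEstimation, ignoreMemboundOp

# Measured wall-clock time of the non-compile-time-computable operators only.
measured = runtime.timeNonCtcOperators(g, 100, 1000)  # warmup, repeat (values are illustrative)
print(est, measured)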

View File

@ -1,6 +1,7 @@
#include "core/search_engine.h"
#include "core/hash.h"
#include "core/runtime.h"
#include "nnet/dbg.h"
#include <algorithm>
#include <iostream>
@ -8,7 +9,17 @@
namespace infini {
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
using MetaGraph = SearchEngine::MetaGraph;
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
: runtimeExec(runtime), mutator(mutator) {
// Compare graphs by estimated execution time
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
};
}
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
auto &node = metaGraph->nodes[i];
std::cout << "id: " << i << std::endl;
@ -32,8 +43,7 @@ Graph SearchEngine::run(const Graph graph) {
IT_ASSERT(runtimeExec == graph->getRuntime());
std::cout << "[INFO] original graph: " << std::endl;
std::cout << graph->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
<< std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
std::vector<Graph> partitions = partitionGraph(graph);
@ -45,7 +55,6 @@ Graph SearchEngine::run(const Graph graph) {
std::vector<Graph> candidates = search(subGraph);
std::cout << "[INFO] size: " << candidates.size() << std::endl;
IT_ASSERT(candidates.size() > 0);
std::cout << subGraph->toString() << std::endl;
std::vector<Graph> nextGraphs;
for (auto lastGraph : bestGraphs) {
for (auto thisGraph : candidates) {
@ -65,9 +74,9 @@ Graph SearchEngine::run(const Graph graph) {
nextGraphs.emplace_back(tmp);
}
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -81,10 +90,22 @@ Graph SearchEngine::run(const Graph graph) {
for (size_t i = 0; i < bestGraphs.size(); i++) {
std::cout << "bestGraph " << i << ":" << std::endl;
std::cout << bestGraphs[i]->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
<< std::endl;
}
// Fuse vertically and sort according to performance
for (size_t i = 0; i < bestGraphs.size(); ++i) {
// Debug
bestGraphs[i] = fuseVertically(bestGraphs[i]);
}
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
std::cout << "[INFO] best fused graph: " << std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
<< std::endl;
std::cout << bestGraphs[0] << std::endl;
return bestGraphs[0];
}
@ -102,9 +123,9 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
}
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
}); // compare with perf time
// compare with perf time
dbg("===Num" + std::to_string(results.size()));
std::sort(results.begin(), results.end(), graphTimeComparer);
if (results.size() > GRAPH_SIZE) {
results.resize(GRAPH_SIZE);
}
@ -112,9 +133,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
// Build a metagraph from a graph; each operator becomes a node.
std::shared_ptr<SearchEngine::MetaGraph>
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = std::make_shared<MetaGraph>();
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = make_ref<MetaGraphObj>();
int numOps = graph->getOperators().size();
std::vector<int> cnt(numOps, 0);
@ -123,7 +143,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
std::vector<int> q(0);
for (size_t i = 0; i < graph->getOperators().size(); i++) {
auto &op = graph->getOperators()[i];
MetaGraph::Node node;
MetaGraphObj::Node node;
std::vector<Operator> ops;
ops.emplace_back(op);
node.graph = make_ref<GraphObj>(runtimeExec, ops);
@ -157,9 +177,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
// Build a metagraph from a graph and a plan; a plan specifies which ops are
// grouped into a node.
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
const std::vector<int> &plan) {
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan) {
int numGroups = 0;
for (auto i : plan) {
if (i > numGroups) {
@ -172,12 +191,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
groups[plan[i]].emplace_back(i);
}
auto resultMetaGraph = make_ref<MetaGraph>();
auto resultMetaGraph = make_ref<MetaGraphObj>();
for (auto &group : groups) {
std::vector<Operator> ops;
std::unordered_set<int> preSet, sucSet;
for (auto id : group) {
MetaGraph::Node node;
MetaGraphObj::Node node;
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
ops.emplace_back(op);
}
@ -204,8 +223,7 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
}
// Search how to merge multiple ops.
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
IT_ASSERT(metaGraph != nullptr);
std::vector<int> plan(metaGraph->nodes.size());
for (size_t i = 0; i < plan.size(); i++) {
@ -222,7 +240,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::unordered_set<HashType> planSet;
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
vector<MetaGraph> metaGraphs;
for (auto &curPlan : plans) {
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
}
@ -230,8 +248,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
}
// DFS impl for search merge.
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan,
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet) {
@ -320,14 +337,16 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
}
// Search mutation for each compute op.
std::vector<Graph> SearchEngine::searchMutation(
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
std::vector<Graph> graphs = {nullptr};
// Append a node to all existing candidates
for (auto &node : metaGraph->nodes) {
std::vector<Graph> nextGraphs;
if (node.type == 1) { // If it has computing OPs
auto mutatedGraphs = mutator->run(node.graph);
// HACK: only keep the first two mutated graphs for debugging
if (mutatedGraphs.size() > 2)
mutatedGraphs.resize(2);
for (auto graph : graphs) {
for (auto mutatedGraph : mutatedGraphs) {
std::vector<Operator> ops;
@ -360,9 +379,8 @@ std::vector<Graph> SearchEngine::searchMutation(
for (auto g : nextGraphs) {
g->dataMalloc();
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -372,7 +390,7 @@ std::vector<Graph> SearchEngine::searchMutation(
}
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
return mutationEngine->isMultiBranchMergable(graph);
return mutator->isMultiBranchMergable(graph);
}
// Split a graph into multiple independent graphs. Search engine will search for
@ -438,4 +456,60 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
return partitions;
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
return runtimeExec->getPerfTime(graph, false, true, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {
std::unordered_map<UidBaseType, int> visitTime;
std::vector<Operator> ops;
graph->topo_sort();
int cnt = 0;
for (auto op : graph->getOperators()) {
// Skip visited OP
if (visitTime.find(op->getGuid()) != visitTime.end()) {
continue;
}
// Skip compute OP and multi-input/output OP
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
op->getSuccessors().size() != 1)) {
visitTime.emplace(op->getGuid(), ++cnt);
ops.emplace_back(op);
continue;
}
vector<Operator> chainOps;
visitTime.emplace(op->getGuid(), ++cnt);
vector<Operator> tmp;
auto cur = op;
while (cur->getPredecessors().size() == 1 &&
cur->getPredecessors()[0]->isMemBoundOp()) {
cur = cur->getPredecessors()[0];
tmp.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
for (int i = tmp.size() - 1; i >= 0; i--) {
chainOps.emplace_back(tmp[i]);
}
chainOps.emplace_back(op);
cur = op;
while (cur->getSuccessors().size() == 1 &&
cur->getSuccessors()[0]->isMemBoundOp()) {
cur = cur->getSuccessors()[0];
chainOps.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
make_ref<GraphObj>(runtimeExec, chainOps)->print();
Graph optGraph =
mutator->fuseVertically(make_ref<GraphObj>(runtimeExec, chainOps));
for (auto op : optGraph->getOperators()) {
ops.emplace_back(op);
}
}
return make_ref<GraphObj>(runtimeExec, ops);
}
} // namespace infini
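With the bindings added later in this diff, the search pipeline above can be driven from Python; a hedged sketch (the rule list is the one used by runInfoGAN below, cuda_runtime() is an assumed binding mirroring cpu_runtime()):

runtime = backend.cuda_runtime()
mutator = backend.NMutator(backend.NMutatorMode.RuleBased,
                           [3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90])  # derivation rules
engine = backend.SearchEngine(runtime, mutator)
g = backend.getInfoGAN(1, runtime, 5)
best = engine.run(g)                                   # returns the best, vertically fused graph
print(runtime.getPerfTime(best, False, True, True))    # estimated time, membound ops ignored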

View File

@ -8,12 +8,14 @@
namespace infini {
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}
string TensorObj::toString() const {
// Convert data pointer to string
@ -24,8 +26,8 @@ string TensorObj::toString() const {
ss << "nullptr data";
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString() + ", " + runtime->toString() +
", " + ss.str() + "\n";
", dtype " + dtype.toString() + ", tensorType " +
std::to_string(enum_to_underlying(tensorType));
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());
@ -34,6 +36,7 @@ string TensorObj::toString() const {
else
ret += ", source None";
ret += ", targets " + vecToString(targetGuids);
ret += ", " + runtime->toString() + ", " + ss.str();
return ret;
}
@ -64,12 +67,19 @@ vector<size_t> TensorObj::getStride() const {
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
void *ptr = nullptr;
Blob buffer;
if (!runtime->isCpu()) { // copy data to main memory
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
getRawDataPtr<void *>(), getBytes());
ptr = buffer->getPtr<void *>();
} else
ptr = data->getPtr<float *>();
#define TRY_PRINT(N) \
if (dtype == DataType(N)) \
std::cout << dataToString<DT<N>::t>() << std::endl;
std::cout << dataToString<DT<N>::t>(ptr) << std::endl;
TRY_PRINT(0) // fmt: new line
else TRY_PRINT(1) //

View File

@ -2,10 +2,56 @@
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "cuda_profiler_api.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef INFINI_USE_TVM
#include "tvm/runtime/device_api.h"
#endif
namespace infini {
CudaRuntimeObj::CudaRuntimeObj()
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
cudaGraphStatus(false) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
checkCudnnError(cudnnSetStream(cudnn, stream));
checkCublasError(cublasSetStream(cublas, stream));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
CudaRuntimeObj::~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
enum cudaStreamCaptureStatus pCaptureStatus;
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
IT_ASSERT(pCaptureStatus == cudaStreamCaptureStatusNone);
cudaGraphStatus = true;
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}
tuple<cudaGraphExec_t, size_t> CudaRuntimeObj::endCudaGraphStreamCapture() {
cudaGraph_t cudaGraph;
cudaGraphExec_t instance;
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
cudaGraphStatus = false;
size_t numCudaGraphNodes;
checkCudaError(cudaGraphGetNodes(cudaGraph, nullptr, &numCudaGraphNodes));
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
return {instance, numCudaGraphNodes};
}
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -75,4 +121,57 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
// Init tvm stream
#ifdef INFINI_USE_TVM
DLDevice tvm_device_id = {kDLCUDA, 0};
auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
tvm_device->SetStream(tvm_device_id, getStream());
#endif
beginCudaGraphStreamCapture();
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
IT_ASSERT(numCudaGraphNodes == kernels.size(),
std::to_string(numCudaGraphNodes) +
" != " + std::to_string(kernels.size()));
return timeit(
[&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
},
[&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
1000);
}
} // namespace infini
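timeWithCudaGraph is also exported to Python (see the pybind changes below); a minimal sketch, assuming cuda_rt is a CudaRuntime and g already has its memory allocated:

cuda_rt.run(g, True)              # tune kernels so every op has a PerfRecord
t = cuda_rt.timeWithCudaGraph(g)  # captures non-CTC kernels into one CUDA graph and replays it
print("CUDA-graph time:", t)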

22
src/ffi/ffi_callback.cc Normal file
View File

@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>
namespace py = pybind11;
namespace infini {
namespace callback {
using namespace py::literals;
static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
static auto exportONNXImpl =
py::module_::import("infinitensor.if_onnx").attr("export_onnx");
exportONNXImpl(graph, path);
}
} // namespace callback
} // namespace infini
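The callback above expects an importable module infinitensor.if_onnx that provides export_onnx(graph, path). A plausible minimal implementation is just a re-export of the helper defined in the Python file earlier in this diff (the module and package names here are assumptions taken from the import string above):

# infinitensor/if_onnx.py (sketch)
from pyinfinitensor.onnx import export_onnx  # re-export the helper defined earlier in this diff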

View File

@ -1,9 +1,13 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -63,6 +67,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Conv)
.VALUE(OpType, Matmul)
.VALUE(OpType, ConvTrans)
.VALUE(OpType, ConvTransNHWC)
.VALUE(OpType, G2BMM)
.VALUE(OpType, GBMM)
.VALUE(OpType, Pad)
@ -94,8 +99,13 @@ void export_values(py::module &m) {
.VALUE(OpType, Abs)
.VALUE(OpType, Resize)
.VALUE(OpType, Dropout)
.VALUE(OpType, MemBound)
.export_values();
py::enum_<TensorType>(m, "TensorType")
.VALUE(TensorType, Input)
.VALUE(TensorType, Initialized)
.VALUE(TensorType, Other);
#undef VALUE
}
@ -224,6 +234,10 @@ static vector<int> transpose_permute_of(Operator op) {
return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
}
static string membound_expr_of(Operator op) {
return as<MemBoundObj>(op)->toJson();
}
void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
@ -252,25 +266,37 @@ void export_functions(py::module &m) {
.FUNCTION(transpose_permute_of)
.FUNCTION(concat_axis_of)
.FUNCTION(split_axis_of)
.FUNCTION(gather_axis_of);
.FUNCTION(gather_axis_of)
.FUNCTION(membound_expr_of)
.def("membound_hash_of",
[](Operator op) { return as<MemBoundObj>(op)->getHash(); });
#undef FUNCTION
}
void init_graph_builder(py::module &m) {
using Handler = GraphHandlerObj;
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
py::class_<Object, Ref<Object>>(m, "_Object")
.def("__str__", &Object::toString)
.def("guid", &Object::getGuid);
py::class_<RuntimeObj, Ref<RuntimeObj>>(m, "Runtime")
.def("run", &RuntimeObj::run, "graph"_a, "tune"_a = false,
"profiling"_a = false)
.def("getPerfTime", &RuntimeObj::getPerfTime, "graph"_a, "profiling"_a,
"allowEstimation"_a, "ignoreMemboundOp"_a)
.def("timeNonCtcOperators", &RuntimeObj::timeNonCtcOperators);
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
RuntimeObj>(m, "CpuRuntime");
#ifdef USE_CUDA
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
m, "CudaRuntime");
py::class_<CudaRuntimeObj, Ref<CudaRuntimeObj>, RuntimeObj>(m,
"CudaRuntime")
.def("timeWithCudaGraph", &CudaRuntimeObj::timeWithCudaGraph);
#endif
#ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
@ -281,8 +307,10 @@ void init_graph_builder(py::module &m) {
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("print_data", &TensorObj::printData)
.def("data_malloc", &TensorObj::dataMalloc)
.def("getTensorType", &TensorObj::getTensorType);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
policy::reference)
@ -291,6 +319,9 @@ void init_graph_builder(py::module &m) {
policy::reference);
py::class_<Handler>(m, "GraphHandler")
.def(py::init<Runtime>())
.def(py::init<Graph>())
.def("inputs", &Handler::inputs, policy::move)
.def("outputs", &Handler::outputs, policy::move)
.def("tensor", &Handler::tensor, policy::move)
.def("conv", &Handler::conv, policy::move)
.def("convTransposed2d", &Handler::convTransposed2d, policy::move)
@ -321,13 +352,47 @@ void init_graph_builder(py::module &m) {
.def("reduce_mean", &Handler::reduceMean, policy::move)
.def("slice", &Handler::slice, policy::move)
.def("pad", &Handler::pad, policy::move)
.def("memBound", &Handler::memBound, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, policy::automatic)
.def("run", &Handler::run, policy::automatic);
.def("run", &Handler::run, policy::automatic)
.def("getGraph", &Handler::getGraph);
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
py::enum_<NMutator::Mode>(m, "NMutatorMode")
.value("RuleBased", NMutator::Mode::RuleBased);
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
.def(py::init<NMutator::Mode>())
.def(py::init<NMutator::Mode, vector<int>>())
.def("run", &NMutator::run);
py::class_<SearchEngine>(m, "SearchEngine")
.def(py::init<Runtime, Ref<Mutator>>())
.def("run", &SearchEngine::run);
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
.def("tensors", &GraphObj::getTensors)
.def("operators", &GraphObj::getOperators)
.def("inputs", &GraphObj::getInputs)
.def("outputs", &GraphObj::getOutputs)
.def("print", &GraphObj::print)
.def("topo_sort", &GraphObj::topo_sort);
}
#ifdef USE_CUDA
Graph getInfoGAN(int batch, Runtime runtime, int nLayers);
vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning);
void export_test_model(py::module &m) {
m.def("runInfoGAN", &runInfoGAN);
m.def("getInfoGAN", &getInfoGAN);
m.def("getConvtransposedNHWC", &getConvtransposedNHWC);
m.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
"tuning"_a = false);
}
#endif
} // namespace infini
PYBIND11_MODULE(backend, m) {
@ -335,4 +400,5 @@ PYBIND11_MODULE(backend, m) {
infini::export_values(m);
infini::export_functions(m);
infini::init_graph_builder(m);
infini::export_test_model(m);
}
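Taken together, the new bindings let the InfoGAN experiments run entirely from Python; a hedged end-to-end sketch (CUDA build only, cuda_runtime() assumed to mirror the cpu_runtime() binding above):

from pyinfinitensor import backend

runtime = backend.cuda_runtime()
g = backend.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)  # single ConvTransposed layer
best = backend.optimizeGraph(g, runtime, tuning=False)         # search + vertical fusion, no kernel tuning
for op in best.operators():
    print(op)                                                   # Object.__str__ is bound above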

View File

@ -0,0 +1,171 @@
#include "cuda/cuda_common.h"
using dtype = float;
__global__ void conv2dreduce_kernel_(float *__restrict__ input,
float *__restrict__ bias,
float *__restrict__ output,
const bool PReLU, const int n, const int f,
const int h, const int w, const int oh,
const int ow, const int r, const int s,
const int ph, const int pw, const int dh,
const int dw, const int sh, const int sw) {
// output shape: (n, oh, ow, f)
// input shape: (n, h, w, f, r, s)
int nid = blockIdx.x, fid = blockIdx.y;
int hid = threadIdx.x, wid = threadIdx.y;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = n * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid * sh - ph;
int iwst = wid * sw - pw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + ri * dh;
int iwid = iwst + si * dw;
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
__global__ void convTranspose2dreduce_kernel_(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw) {
// assert dh = dw = 1
int nid = blockIdx.x, fid = blockIdx.y;
int hid = threadIdx.x, wid = threadIdx.y;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = n * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
// Viewed as a convolution: the effective ph and pw
int tph = r - ph - 1, tpw = s - pw - 1;
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid - tph;
int iwst = wid - tpw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + r - ri - 1;
int iwid = iwst + s - si - 1;
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
(ihid % sh == 0) && (iwid % sw == 0)) {
imm += *(nfinput + (ihid / sh) * hchunk +
(iwid / sw) * wchunk + ri * s + si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
// nhwrsc -> nhwc
__global__ void reduce_4x4(dtype *in, dtype *out, int act, const int N,
const int F, const int H, const int W, const int IH,
const int IW) {
// #define in_index(n, h, w, r, s, f) \
// ((((((n)*IH + h) * IW + w) * R + r) * S + s) * F + f)
#define in_index(n, h, w, f, r, s) \
((((((n)*IH + h) * IW + w) * F + f) * R + r) * S + s)
#define out_index(n, h, w, f) (((((n)*H) + (h)) * W + (w)) * F + (f))
const int R = 4, S = 4;
const int n_tasks = N * F * H * W;
int start = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = start; i < n_tasks; i += stride) {
int t = i, n, f, h, w;
f = t % F;
t /= F;
w = t % W;
t /= W;
h = t % H;
t /= H;
n = t;
// unroll this 2-iter loop
float sum = 0;
int x, y;
for (int r = (h + 1) & 1; r < R; r += 2) {
x = (h + 1 - r) / 2;
if (x >= 0 && x < IH) {
for (int s = (w + 1) & 1; s < S; s += 2) {
y = (w + 1 - s) / 2;
if (y >= 0 && y < IW) {
sum += in[in_index(n, x, y, f, r, s)];
// if (i==0)
// printf("TTT nhwf= %d,%d,%d,%d x=%d y=%d, v=%f,
// index=%d, rsf %d %d %d\n", n, h, w,
// f, x, y, in[in_index(n, x, y, r, s, f)],
// in_index(n, x, y, r, s, f), r,s,f);
}
}
}
}
if (act == 0) {
out[out_index(n, h, w, f)] = sum;
} else if (act == 1) { // Relu
out[out_index(n, h, w, f)] = sum > 0 ? sum : 0;
} else if (act == 2) {
out[out_index(n, h, w, f)] = tanhf(sum);
}
}
#undef in_index
#undef out_index
}
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
conv2dreduce_kernel_<<<grid, block, 0>>>(input, bias, output, PReLU, n, f,
h, w, oh, ow, r, s, ph, pw, dh, dw,
sh, sw);
}
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
// puts("convTranspose2dreduce_kernel is executed");
if (r == 4 && s == 4 && sh == 2 && sw == 2) {
const int M = r * s * f, N = n * h * w;
reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
ow, h, w);
} else {
puts("why use this conv2dreduce");
convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
dh, dw, sh, sw);
}
}
} // namespace infini
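For reference, the reduction performed by conv2dreduce_kernel_ can be written in a few lines of NumPy; a sketch under the layout stated in the kernel comments (input (n, h, w, f, r, s), output (n, oh, ow, f); note the kernel's PReLU flag actually applies a plain ReLU), useful as a correctness oracle in tests:

import numpy as np

def conv2dreduce_ref(x, bias, oh, ow, ph, pw, sh, sw, dh, dw, relu=False):
    """Reference for conv2dreduce_kernel_: x has shape (n, h, w, f, r, s)."""
    n, h, w, f, r, s = x.shape
    y = np.zeros((n, oh, ow, f), dtype=x.dtype)
    for hid in range(oh):
        for wid in range(ow):
            for ri in range(r):
                for si in range(s):
                    ih = hid * sh - ph + ri * dh
                    iw = wid * sw - pw + si * dw
                    if 0 <= ih < h and 0 <= iw < w:
                        y[:, hid, wid, :] += x[:, ih, iw, :, ri, si]
    if bias is not None:
        y += bias                      # broadcast over the channel (f) dimension
    return np.maximum(y, 0.0) if relu else y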

View File

@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
const float alpha = 1.f, beta = 0.f;
// TODO:use compute type
cublasStatus_t stat;
if (b > 1) {
if (b >= 1) {
// Support batch broadcast with zero stride
int dimA = op->getInputs(0)->getDims().size();
int dimB = op->getInputs(1)->getDims().size();
@ -63,6 +63,11 @@ class matmulCublas : public Kernel {
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: n * k;
// printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f,
// B "
// "%d %lld, A %d %lld, C %d %d, b %d %d\n",
// opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
// m * n, b, record->algo);
stat = cublasGemmStridedBatchedEx(
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,

View File

@ -1,5 +1,6 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
@ -8,6 +9,13 @@
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
#include <nlohmann/json.hpp>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
using json = nlohmann::json;
namespace py = pybind11;
@ -22,6 +30,8 @@ class TVMRecordObj : public PerfRecordObj {
std::string dllPath;
std::string funcName;
std::vector<int> inputIdx;
tvm::runtime::PackedFunc packedFunc;
bool useExistingKernel = false;
};
using TVMRecord = Ref<TVMRecordObj>;
@ -33,9 +43,15 @@ class MemboundTVMPackedFunction : public Kernel {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc =
getPackedFunction(tvmRecord->dllPath, tvmRecord->funcName);
IT_ASSERT(packedFunc != nullptr);
// Use user-defined kernels
if (tvmRecord->useExistingKernel) {
bool success = useExistingKernels(op);
IT_ASSERT(success);
return;
}
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;
@ -63,10 +79,18 @@ class MemboundTVMPackedFunction : public Kernel {
// Premise: op is idempotent since it is called multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// If hash matches, use user-defined kernels
if (useExistingKernels(op)) {
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { useExistingKernels(op); },
[&]() { context->sync(); });
ret->useExistingKernel = true;
return ret;
}
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
@ -93,6 +117,7 @@ class MemboundTVMPackedFunction : public Kernel {
if (inputName == op->getNnetInputs()[j]->getName())
break;
}
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
inputIdx.emplace_back(j);
}
@ -114,29 +139,41 @@ class MemboundTVMPackedFunction : public Kernel {
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
preArgs.first.size());
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
[&]() { context->sync(); });
ret->kernelName = kernelName;
ret->dllPath = dllPath;
ret->funcName = func;
ret->inputIdx = inputIdx;
ret->packedFunc = packedFunc;
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
return ret;
}
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
const std::string &outDType,
const std::string &lambda,
const std::string &funcName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
json j;
// Consistent with the Python API interface
j["input_tensors"] = inDims;
j["input_dtypes"] = inDTypes;
j["output_tensor"] = outDims;
j["output_dtype"] = outDType;
j["tvm_code"] = lambda;
j["func_name"] = funcName;
j["nnet_expression"] = nnetExprString;
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
j["hash_code"] = std::to_string(hashCode);
return j.dump();
}
/// @brief Generate an Ansor-tuned kernel DLL in a separate Python process.
/// @param inDims Shapes of the input tensors
/// @param inDTypes Data types of the input tensors
/// @param outDims Shape of the output tensor
/// @param outDType Data type of the output tensor
/// @param lambda TVM compute expression to be tuned
/// @param funcName Generated function name
/// @param nnetExpressionString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return Path of the generated DLL
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
@ -146,29 +183,60 @@ class MemboundTVMPackedFunction : public Kernel {
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string dllPath;
try {
start_interpreter();
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_so");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString,
std::to_string(hashCode));
dllPath = py::str(code[0]);
} catch (py::error_already_set &e) {
if (e.matches(PyExc_ImportError)) {
std::cerr << "Import Error. Don't forget to set environment "
"variable PYTHONPATH to contain "
"<repo-root>/python"
<< std::endl;
}
throw;
int fdP2C[2], fdC2P[2];
for (auto fd : {fdP2C, fdC2P}) {
int status = pipe(fd);
IT_ASSERT(status == 0, "pipe failed");
}
pid_t pid = fork();
IT_ASSERT(pid >= 0, "fork failed");
if (pid == 0) { // Child process
close(fdP2C[1]);
close(fdC2P[0]);
return dllPath;
dup2(fdP2C[0], STDIN_FILENO);
close(fdP2C[0]);
string cmd =
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
std::to_string(fdC2P[1]) + ")";
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
execvp("python3", const_cast<char *const *>(argv));
} else { // Parent process
close(fdP2C[0]);
close(fdC2P[1]);
// Write to pipe
string serializedArgs = serializeTVMArgs(
inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString, hashCode);
int status = -1;
status =
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
IT_ASSERT((size_t)status == serializedArgs.size(),
"Failed to write to pipe");
close(fdP2C[1]);
// Wait for TVM
waitpid(pid, &status, 0);
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
const int es = WEXITSTATUS(status);
IT_ASSERT(es == 0,
"TVM process exit with code " + std::to_string(es));
// Read from pipe
FILE *stream;
stream = fdopen(fdC2P[0], "r");
char buf_read[257] = {0};
status = std::fscanf(stream, "%256c", buf_read);
IT_ASSERT(status == 1, "Failed to read from pipe");
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
fclose(stream);
close(fdC2P[0]);
return buf_read;
}
IT_ASSERT(false, "Should not reach here");
return "";
}
tvm::runtime::PackedFunc getPackedFunction(string path,
@ -214,6 +282,34 @@ class MemboundTVMPackedFunction : public Kernel {
return {values, type_codes};
}
bool useExistingKernels(Ref<MemBoundObj> op) const {
const map<HashType, tuple<int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int>>
hashMap = {
// clang-format off
{18446744073661354550ULL, {1, 1, 2, 2, 256, 4, 4, 4, 4, 1, 1, 2, 2, 1, 1}},
{124145340ULL, {1, 1, 4, 4, 128, 4, 4, 8, 8, 1, 1, 2, 2, 1, 1}},
{18446744073695718019ULL, {1, 1, 8, 8, 64, 4, 4, 16, 16, 1, 1, 2, 2, 1, 1}},
{515085072ULL, {2, 1, 16, 16, 3, 4, 4, 32, 32, 1, 1, 2, 2, 1, 1}}
}; // clang-format on
float *input = op->getInputs(0)->getRawDataPtr<float *>();
float *bias = nullptr;
float *output = op->getOutput()->getRawDataPtr<float *>();
if (auto it = hashMap.find(op->getHash()); it != hashMap.end()) {
auto &[PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw] =
it->second;
IT_ASSERT(op->getInputs(0)->size() ==
size_t(n) * h * w * f * r * s);
IT_ASSERT(op->getOutput()->size() == size_t(n) * oh * ow * f);
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
return true;
}
// conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
// oh, ow, ph, pw, sh, sw, dh, dw);
return false;
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,

View File

@ -4,10 +4,13 @@ namespace infini {
class CopyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
auto inData = op->getInputs(0)->getRawDataPtr<void *>();
auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
cudaMemcpyDeviceToDevice);
// auto inData = op->getInputs(0)->getRawDataPtr<void *>();
// auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
// cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
// cudaMemcpyDeviceToDevice);
// HACK: optimization: reuse the input blob instead of copying
op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
}
};
// reshape/flatten/identity all act as copying from input to output.

239
src/nnet/App/test_models.cc Normal file
View File

@ -0,0 +1,239 @@
#ifdef USE_CUDA
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
namespace infini {
// NHWC format
Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
IT_ASSERT(1 <= nLayers && nLayers <= 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
vector<tuple<int, int, int, int, bool>> cs{
// Channel, kernelSize, pad, stride, isTanh
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
};
Tensor input =
g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
int f = input->getDims()[3]; // n, h, w, f
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (tanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
IT_ASSERT(0 <= layerId && layerId < 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
vector<tuple<int, int, int, int, bool>> cs{
// Channel, kernelSize, pad, stride, isTanh
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
};
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
for (int i = layerId; i < layerId + 1; ++i) {
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
int f = input->getDims()[3]; // n, h, w, f
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (tanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
void printGraph(Graph g) {
g->print();
puts("============ Data ============");
for (auto t : g->getTensors()) {
dbg(t);
t->printData();
}
}
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
vector<Graph> bestGraphs;
SearchEngine searchEngine(runtime, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
runtime->run(g);
dbg("Baseline graph");
printGraph(g);
// dbg(runtime->getPerfTime(g, true));
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
bestGraph->topo_sort();
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
dbg(bestGraph);
dbg(bestGraph->getOutputs());
if (tuning) {
runtime->run(bestGraph, true); // Tune kernels
runtime->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
dbg(go0->equalData(bgo0, 1e-3));
dbg(runtime->getPerfTime(bestGraph, true));
dbg(runtime->timeNonCtcOperators(bestGraph));
// dbg(runtime->timeWithCudaGraph(bestGraph));
}
dbg("Best graph");
printGraph(bestGraph);
return bestGraph;
}
return nullptr;
}
vector<Tensor> runInfoGAN(int nLayers) {
auto cuda = make_ref<CudaRuntimeObj>();
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Graph g = getInfoGAN(1, cuda, nLayers);
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
// // Translate OP to membound without derivation
// mutator->setToNaiveMembound();
vector<Graph> bestGraphs;
SearchEngine searchEngine(cuda, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
// auto gen = RandomGenerator(-5, 5, 0, true);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
cuda->run(g);
dbg("Baseline graph");
printGraph(g);
dbg(cuda->getPerfTime(g, true));
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
bestGraph->topo_sort();
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
dbg(bestGraph);
dbg(bestGraph->getOutputs());
cuda->run(bestGraph, true); // Tune kernels
cuda->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
bgo0->printData();
go0->printData();
dbg(cuda->getPerfTime(bestGraph, true));
dbg("Best graph");
printGraph(bestGraph);
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
}
return {};
}
} // namespace infini
#endif

View File

@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
oss << "==> ROOT\n" << root->toReadable() << "\n";
for (size_t i = 0; i < q.size(); ++i) {
const auto &[name, routine, tensor] = q[i];
oss << "==> " << name << " : ";
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
<< " : ";
if (routine) {
oss << routine->toReadable() << "\n";
if (routine->getExpr()) {

View File

@ -73,17 +73,52 @@ string Serializer::visit_(const Tensor &c) {
return key;
}
bool Serializer::serialize(const Expr &expr, const string &filePath,
const string &msg) {
string Serializer::visit_(const Func &c) {
const string key = std::to_string(id++);
j[key]["type"] = c->getType();
j[key]["funcType"] = c->getFuncType();
j[key]["object"] = dispatch(c->getObject());
return key;
}
std::optional<std::string> Serializer::toString(const Expr &expr,
const string &msg,
vector<Tensor> inputs,
double exec_time, string hint) {
// Metadata
j["Version"] = VERSION;
j["Msg"] = msg;
j["exec_time"] = exec_time;
j["hint"] = hint;
// Expressions and routines
id = 0;
dispatch(expr);
std::ofstream fout(filePath);
fout << std::setw(4) << j << std::endl;
return true;
// Input tensors
vector<string> inputsIndices;
for (const auto &tensor : inputs) {
inputsIndices.emplace_back(std::to_string(id));
dispatch(tensor);
}
j["nnetInputs"] = inputsIndices;
// To string
std::stringstream ss;
ss << std::setw(4) << j << std::endl;
return {ss.str()};
}
bool Serializer::toFile(const Expr &expr, const string &filePath,
const string &msg, vector<Tensor> inputs,
double exec_time, string hint) {
if (auto s = toString(expr, msg, inputs, exec_time, hint)) {
// Write to file
std::ofstream fout(filePath);
fout << *s;
return true;
} else {
return false;
}
}
string Serializer::dispatchRoutine(const Routine &c) {
@ -129,7 +164,15 @@ string Serializer::dispatchRoutine(const Routine &c) {
return key;
}
Expr Serializer::deserialize(const string &filePath) {
Expr Serializer::fromString(const string &text) {
std::stringstream str;
str << text;
str >> j;
assert(j["Version"] == VERSION);
return buildExprTree("0");
}
Expr Serializer::fromFile(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
@ -180,6 +223,10 @@ Expr Serializer::buildExprTree(string key) {
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
j[key]["paddings"], source);
}
case NodeType::FuncNodeType: {
auto object = buildExprTree(j[key]["object"]);
return make_ref<FuncNode>(object, j[key]["funcType"]);
}
default: {
nnet_unimplemented_halt();
break;
@ -242,4 +289,25 @@ Routine Serializer::buildRoutine(string key) {
return nullptr;
}
} // namespace nnet
tuple<Expr, vector<Tensor>, double, string>
Serializer::deserializeAsMemobundOp(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
tuple<Expr, vector<Tensor>, double, string>
Serializer::membundOpFromString(const string &data) {
j = json::parse(data);
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
} // namespace nnet
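A minimal round-trip sketch using the interfaces added above; expr, A, and B stand for any expression and its input TensorNodes, and the message, time and hint values are illustrative:
nnet::Serializer ser;
// Serialize the expression, its inputs, and the metadata to a JSON string ...
if (auto s = ser.toString(expr, "demo", {A, B}, /*exec_time=*/1.0, /*hint=*/"demo")) {
    // ... and rebuild them; a fresh Serializer is used since `j` is member state.
    auto [exprLoaded, inputsLoaded, timeLoaded, hintLoaded] =
        nnet::Serializer().membundOpFromString(*s);
    assert(expr->toReadable() == exprLoaded->toReadable());
}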

View File

@ -499,8 +499,8 @@ void Derivator::printIntermediateStates() {
std::cout << FullPrinterVisitor().print(intermediateStates[i]) << endl;
if (logMode == LogMode::DumpFristCandiate) {
Serializer serializer;
serializer.serialize(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
serializer.toFile(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
}
}
for (size_t i = 0; i < intermediateStates.size(); ++i) {

View File

@ -1,4 +1,5 @@
#include "nnet/expr.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
namespace nnet {
@ -463,4 +464,9 @@ void FuncNode::setObject(Expr e) {
object = e;
}
string RangeOpNode::getFullExpression() {
FullPrinterVisitor printer;
return printer.print(this->shared_from_this());
}
} // namespace nnet

View File

@ -1,13 +1,16 @@
#include "nnet/nmutator.h"
#include "core/graph.h"
#include "ffi/ffi_callback.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
#include "nnet/Visitor/MatchReshapeVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/derivator.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/reshape.h"
#include "operators/unary.h"
namespace infini {
@ -49,9 +52,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
assert(computeOps.size() == 1);
const auto &computeOp = computeOps[0];
auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
auto expr = opToExpression(computeOp);
nnet::Expr expr = opToExpression(computeOp);
auto inputsN = nnet::GetTensorsVisitor().get(expr);
dbg(inputsN, expr);
// dbg(inputsN, expr);
IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
"Which one is the second input tensor?");
vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};
@ -74,17 +77,18 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
OpVec computeOps = in_graph->getComputeOps();
IT_ASSERT(computeOps.size() == 1);
// HACK: remove this
if (auto op = as<ConvTransposed2dNHWCObj>(computeOps[0]); !op)
return;
/* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
// if (infini::Graph g = transformTConv1x1(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
// }
// // Commented for debug, not implemented yet
// // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
// // Graph graph = new Graph(g->getOperators());
// // out_graphs.emplace_back(graph);
// // return;
// // }
if (Graph g = transformConvtransposed1x1(computeOps[0])) {
out_graphs.emplace_back(g);
return;
}
// if (infini::Graph g = transformDialtedConv(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
@ -99,7 +103,6 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
// // out_graphs.emplace_back(graph);
// // return;
// // }
*/
auto expr = opToExpression(computeOps[0]);
if (!expr)
@ -107,18 +110,15 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
nnet::Derivator derivator(maxDepth);
nnet::Formula conv_9x9(expr, 0);
// const std::vector<int> rules{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90};
// ConvTranspose
// const std::vector<int> rules{1, 7, 7, 2, 8, 6, 6}; // G2BMM
if (mode == Mode::Normal) {
derivator.search(conv_9x9, 0);
} else if (mode == Mode::RuleBased) {
dbg(derivationRules);
// dbg(derivationRules);
derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
} else
IT_TODO_HALT_MSG("Unknown NMutator search mode.");
const auto &candidates = derivator.getCandidates();
dbg(candidates.size());
// dbg(candidates.size());
// derivator.print();
for (const auto &candidate : candidates) {
// dbg(nnet::FullPrinterVisitor().print(candidate.root));
@ -233,31 +233,38 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
// }
// }
nnet::Expr NMutator::opToExpression(Operator op) {
// IT_TODO_HALT();
if (auto convOp = as<ConvObj>(op)) {
nnet::Expr NMutator::opToExpression(Operator opT) {
auto [expr, mapNameNToTensorT] = extractOp(opT);
for (auto &[name, tensorT] : mapNameNToTensorT) {
IT_ASSERT(inputsNameNToTensorT.count(name) == 0);
inputsNameNToTensorT[name] = tensorT;
}
return expr;
}
pair<nnet::Expr, NMutator::NameNToTensorT> NMutator::extractOp(Operator opT) {
if (auto convOp = as<ConvObj>(opT)) {
const auto &inputs = convOp->getInputs();
const auto &AT = inputs[0];
const auto &KT = inputs[1];
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
return nullptr;
return {};
assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto A = nnet::makeTensor("A", AT->getDims(),
std::vector<int>{0, 0, ph, pw});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(op)) {
return {nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(opT)) {
const auto &AT = convOp->getInputs()[0];
const auto &KT = convOp->getInputs()[1];
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
IT_ASSERT_TODO(convOp->getNumGroups() == 1);
if (r != 4)
return {};
IT_ASSERT_TODO(r == 4);
IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
@ -269,8 +276,9 @@ nnet::Expr NMutator::opToExpression(Operator op) {
const auto A = nnet::makeTensor(
"A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
return {nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(opT)) {
// const auto &AT = g2bmmOp->getInputs()[0];
// const auto &BT = g2bmmOp->getInputs()[1];
// const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();
@ -280,7 +288,7 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.first->getName()] = AT;
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// return expr;
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(opT)) {
// const auto &AT = gbmmlOp->getInputs()[0];
// const auto &BT = gbmmlOp->getInputs()[1];
// const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();
@ -290,22 +298,30 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, w, k, dilation, expr);
// return expr;
} else if (auto matmulOp = as<MatmulObj>(op)) {
} else if (auto matmulOp = as<MatmulObj>(opT)) {
const auto &AT = matmulOp->getInputs()[0];
const auto &BT = matmulOp->getInputs()[1];
const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
const auto &[expr, inputsN] =
nnet::MatmulPattern::getExpr(transA, transB, b, m, n, k);
inputsNameNToTensorT[inputsN.first->getName()] = AT;
inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, n, k, expr);
return expr;
return {
expr,
{{inputsN.first->getName(), AT}, {inputsN.second->getName(), BT}}};
} else if (auto op = as<MemBoundObj>(opT)) {
NameNToTensorT m;
for (int i = 0; i < op->numInputs(); ++i)
m[op->getNnetInputs()[i]->getName()] = opT->getInputs()[i];
return {op->getNnetExpr(), m};
} else if (opT->getOpType() == OpType::Relu ||
opT->getOpType() == OpType::Tanh) {
return generateUnaryExpr(opT);
}
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(op)) {
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(opT)) {
// // return transposeOpToExpression(transposeOp);
// // }
nnet_unimplemented_continue();
return nullptr;
IT_TODO_HALT_MSG("Cannot convert " + opT->toString() +
" to an NNet expression");
return {};
}
infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
@ -313,7 +329,7 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
nnet::FullPrinterVisitor fullVisitor;
// Get tensors in the reversed topological order
const auto &tensorQueueN = fullVisitor.traverse(expr);
dbg(fullVisitor.print(expr));
// dbg(fullVisitor.print(expr));
// Build a map: name in nnet -> tensors in infini
// Add input tensors to the map
@ -381,8 +397,8 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
cnt += tensor->size();
for (const auto &tensor : outputsPET)
cnt += tensor->size();
dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
memboundTime(cnt));
// dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
// memboundTime(cnt));
g->addOpWithOutputs<MemBoundObj>(inputsPET, outputsPET,
op->getInputs(), op->getExpr(),
memboundTime(cnt));
@ -395,6 +411,9 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
nameNToTensorT.at(BN->getName())};
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
const auto &[b, m, n, k, transa, transb] = op->getArgs();
// // HACK: pruning for debug
if (!((transa == 0) && (transb == 1)))
return nullptr;
g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
outputsPET[0], transa, transb);
}
@ -500,43 +519,86 @@ double NMutator::memboundTime(const Shape &dims) {
// return nullptr;
// }
// Graph NMutator::transformTConv3x3(Operator op) {
// if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
// dbg(tconvOp->getInputs()[1]->getDims());
// if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
// tconvOp->getInputs()[1]->getDims()[0] == 3 &&
// tconvOp->getInputs()[1]->getDims()[1] == 3) {
// auto g = new infini::Graph();
// auto inputDims = tconvOp->getInputs(0)->getDims();
// auto weightDims = tconvOp->getInputs(1)->getDims();
// auto outputDims = tconvOp->getOutput()->getDims();
// // NHWF
// auto newA = g->tensor(
// {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3],
// weightDims[2]});
// auto newO =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
Graph NMutator::transformConvtransposed1x1(Operator _op) {
auto op = as<ConvTransposed2dNHWCObj>(_op);
if (!op)
return nullptr;
const auto &A = op->getInputs()[0];
const auto &W = op->getInputs()[1];
const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const Shape inputDims = op->getInputs(0)->getDims();
const Shape weightDims = op->getInputs(1)->getDims();
const Shape outputDims = op->getOutput()->getDims();
const DataType dtype = A->getDType();
IT_ASSERT_TODO(op->getNumGroups() == 1);
if (h != 1 || w != 1)
return {};
IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
auto g = make_ref<GraphObj>(runtime);
// NHWF
auto newA = g->addTensor(
{inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
// FRSC
// auto newW = g->addTensor(
// {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
// dtype);
// HACK: without transpose
auto newW = g->addTensor(
{weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
op->getOutput()->getDims());
return g;
}
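// Shape walk-through for the 1x1-input case, using getInfoGAN's first layer as an example:
//   A: {n, h, w, f} = {1, 1, 1, 228}    -> newA: {n*h*w, f} = {1, 228}
//   W: {f, r, s, c} = {228, 2, 2, 448}  -> newW: {r*s*c, f} = {1792, 228}  (the "HACK" layout)
//   Matmul(newA, newW, transA=0, transB=1): {1, 228} x {1792, 228}^T = {1, 1792}
//   Reshape: {1, 1792} -> output {n, oh, ow, c} = {1, 2, 2, 448}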
// Graph NMutator::transformConvtransposed(Operator _op) {
// auto op = as<ConvTransposed2dNHWCObj>(_op);
// if (!op)
// return nullptr;
// const auto &AT = op->getInputs()[0];
// const auto &KT = op->getInputs()[1];
// const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
// const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
// IT_ASSERT_TODO(op->getNumGroups() == 1);
// if (r != 4)
// return {};
// IT_ASSERT_TODO(ph == pw);
// IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
// IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
// auto g = make_ref<Graph>();
// // TODO: implement transformation rules
// // How to efficiently write an expression...
// auto inputDims = op->getInputs(0)->getDims();
// auto weightDims = op->getInputs(1)->getDims();
// auto outputDims = op->getOutput()->getDims();
// // NHWF
// auto newA =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
// auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// weightDims[0] * weightDims[1] * weightDims[3]});
// g->reshape(tconvOp->getInputs(0), newA);
// g->reshape(tconvOp->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound(
// {newO}, {tconvOp->getOutput()}, {}, nullptr,
// g->reshape(op->getInputs(0), newA);
// g->reshape(op->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
// memboundTime(newO->size() + tconvOp->getOutput()->size()),
// "TConv3x3 reduce");
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// }
// return nullptr;
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// Graph NMutator::transformTConv1x1(Operator op) {
@ -632,4 +694,86 @@ double NMutator::memboundTime(const Shape &dims) {
// return graph;
// }
Graph NMutator::fuseVertically(const Graph &inputGraph) {
Graph optGraph = make_ref<GraphObj>(runtime);
auto chainOps = inputGraph->getOperators();
IT_ASSERT(!chainOps.empty());
for (auto &op : chainOps) {
IT_ASSERT(op->isMemBoundOp());
IT_ASSERT_TODO(op->getInputs().size() == 1);
IT_ASSERT(op->getOutputs().size() == 1);
}
if (chainOps.size() == 1) {
return make_ref<GraphObj>(runtime, chainOps);
}
std::vector<nnet::Expr> exprs;
for (const auto &op : chainOps) {
auto [expr, _] = extractOp(op);
exprs.emplace_back(expr);
// dbg(op, infini::as<nnet::RangeOpNode>(expr)->getFullExpression());
}
// double maxTime = getMaxPerf(std::make_shared<SubGraph>(chainOps));
// Fuse a MemboundOp chain
auto expr = nnet::MergeMemboundMutator(exprs).merge(true);
auto inputNMap = nnet::GetTensorsVisitor().get(exprs.front());
IT_ASSERT(inputNMap.size() == 1);
vector<nnet::Tensor> inputsN;
for (const auto &[name, t] : inputNMap) {
inputsN.emplace_back(t);
}
optGraph->addOpWithOutputs<MemBoundObj>(chainOps.front()->getInputs(),
chainOps.back()->getOutputs(),
inputsN, expr, 0);
// TODO: set time
return optGraph;
}
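// Illustration (hypothetical expressions, not taken from this repository) of the fusion
// MergeMemboundMutator is expected to perform on a two-op chain:
//   op0:   T1[i] = Relu(T0[i])         for i in [0, n)
//   op1:   T2[i] = Tanh(T1[i])         for i in [0, n)
//   fused: T2[i] = Tanh(Relu(T0[i]))   for i in [0, n)
// The fused MemBoundObj keeps chainOps.front()'s inputs and chainOps.back()'s outputs,
// as wired up above; its exec_time is left at 0 pending the TODO.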
pair<nnet::Expr, NMutator::NameNToTensorT>
NMutator::generateUnaryExpr(const Operator &op) {
using namespace nnet;
const map<OpType, nnet::FuncType> opTToFuncN = {
{OpType::PRelu, nnet::FuncType::PRelu},
{OpType::Relu, nnet::FuncType::Relu},
{OpType::Tanh, nnet::FuncType::Tanh}};
Shape shape = op->getInputs()[0]->getDims();
nnet::FuncType type = opTToFuncN.at(op->getOpType());
auto T = make_ref<TensorNode>("T", shape);
VecExpr indices;
for (size_t i = 0; i < shape.size(); ++i) {
indices.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
}
auto sub = makeSubscript(T, indices);
auto func = nnet::make_ref<FuncNode>(sub, type);
vector<VarRangePair> varRanges;
for (size_t i = 0; i < shape.size(); ++i) {
varRanges.emplace_back(nnet::as<VarNode>(indices[i]),
Range{0, shape[i]});
}
return {makeRangeOperator(varRanges, {}, func),
NameNToTensorT{{"T", op->getInputs()[0]}}};
}
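// For a Relu over a {2, 3} input, generateUnaryExpr builds roughly (readable form is
// illustrative): a RangeOpNode over i0 in [0, 2) and i1 in [0, 3) whose body is a
// FuncNode(Relu) wrapping the subscript {T}[i0, i1], with T bound to the op's input tensor.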
pair<nnet::Expr, vector<nnet::Tensor>> NMutator::generateRevert(Tensor in) {
using namespace nnet;
using infini::make_ref;
const Shape &originalShape = in->getDims();
auto tensor = makeTensor("T", in->getDims());
VecExpr iters;
for (size_t i = 0; i < originalShape.size(); ++i) {
iters.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
}
Shape newShape = originalShape;
std::reverse(newShape.begin(), newShape.end());
auto sub = makeSubscript(tensor, iters);
vector<VarRangePair> loopIters;
for (int i = originalShape.size() - 1; i >= 0; --i) {
loopIters.emplace_back(infini::as<VarNode>(iters[i]),
Range{0, originalShape[i]});
}
auto range = makeRangeOperator(loopIters, {}, sub);
return {range, {tensor}};
}
} // namespace infini

View File

@ -6,7 +6,7 @@
namespace nnet {
int matchExprResult(Derivator &derivator, string fn) {
auto ans = Serializer().deserialize(fn);
auto ans = Serializer().fromFile(fn);
auto hashAns = HashVisitor()(ans);
int match = 0;
for (const auto &candidate : derivator.getCandidates()) {
@ -19,14 +19,14 @@ int matchExprResult(Derivator &derivator, string fn) {
bool checkExprLogSame(string fnPrefix, int start, int end) {
Serializer serializer;
string fn0 = fnPrefix + to_string(start) + ".expr";
Expr expr0 = serializer.deserialize(fn0);
Expr expr0 = serializer.fromFile(fn0);
RangeOp range0 = as<RangeOpNode>(expr0);
Interpreter interpreter(range0);
auto ans0 = interpreter.interpretUniformSample(range0);
dbg(expr0, ans0);
for (int i = start + 1; i < end; ++i) {
string fn1 = fnPrefix + to_string(i) + ".expr";
Expr expr1 = serializer.deserialize(fn1);
Expr expr1 = serializer.fromFile(fn1);
RangeOp range1 = as<RangeOpNode>(expr1);
dbg(fn1, expr1);
auto ans1 = interpreter.interpretUniformSample(range1);
@ -67,4 +67,4 @@ bool checkExprsEquvivalence(VecExpr exprs) {
return true;
}
} // namespace nnet
} // namespace nnet

View File

@ -2,6 +2,7 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/Visitor/Serializer.h"
namespace infini {
@ -9,8 +10,8 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
const TensorVec &output,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time, std::string hint)
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
: OperatorObj(OpType::MemBound, input, output), expr(expr),
nnetInputs(nnetInputs), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
@ -45,7 +46,7 @@ string MemBoundObj::toString() const {
os << "exec_time=" << exec_time << ", ";
os << "NNet Inputs=[";
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
os << "]";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
@ -60,11 +61,18 @@ string MemBoundObj::toString() const {
optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
// inputs have to match nnetInputs exactly
if (inputs.size() != nnetInputs.size())
if (inputs.size() != nnetInputs.size()) {
std::cout << "Num mismatch" << inputs.size() << " "
<< nnetInputs.size();
return {};
}
for (size_t i = 0; i < inputs.size(); ++i)
if (inputs[i]->getDims() != nnetInputs[i]->getShape())
if (inputs[i]->getDims() != nnetInputs[i]->getShape()) {
std::cout << "Shape mismatch " << inputs[i]
<< vecToString(inputs[i]->getDims()) << " "
<< vecToString(nnetInputs[i]->getShape());
return {};
}
return {{nnet::as<nnet::RangeOpNode>(expr)->getOutputShape()}};
}
@ -83,4 +91,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
nnet::as<nnet::RangeOpNode>(expr));
}
string MemBoundObj::toJson() const {
return *nnet::Serializer().toString(expr, "MemBoundObj::toJson", nnetInputs,
exec_time, hint);
}
} // namespace infini

View File

@ -7,9 +7,10 @@ namespace infini {
TEST(Handler, matmul) {
auto runtime = NativeCpuRuntimeObj::getInstance();
auto handler = make_ref<GraphHandlerObj>(runtime);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32);
auto w = handler->tensor({1, 3, 4}, OnnxDType::UINT32);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32, TensorType::Input);
auto w =
handler->tensor({1, 3, 4}, OnnxDType::UINT32, TensorType::Initialized);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32, TensorType::Other);
handler->matmul(i, w, o, false, false, nullptr, ActType::None);
}

View File

@ -0,0 +1,58 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include "nnet/nmutator.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
TEST(TestCudaRuntime, CudaGraph) {
auto runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
g->dataMalloc();
runtime->run(g, true);
runtime->run(g, false);
runtime->getPerfTime(g);
auto time = runtime->timeWithCudaGraph(g);
EXPECT_GE(time, 0.01);
}
TEST(TestCudaRuntime, CudaGraphMembound) {
auto runtime = make_ref<CudaRuntimeObj>();
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32);
Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32);
g->dataMalloc();
i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
NMutator nmutator(NMutator::Mode::ToNaiveMembound);
auto mutations = nmutator.run(g);
ASSERT_EQ(mutations.size(), 2u);
Graph gNew = mutations[1];
gNew->print();
gNew->dataMalloc();
runtime->run(gNew, true); // tune kernels
runtime->run(gNew, false);
runtime->getPerfTime(gNew);
runtime->timeWithCudaGraph(gNew);
}
} // namespace infini
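timeWithCudaGraph presumably relies on CUDA's stream-capture API; for reference, a minimal standalone sketch of that pattern with the plain CUDA runtime (error checking omitted, not InfiniTensor code):
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaGraph_t graph;
cudaGraphExec_t instance;
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
// ... enqueue the kernels to be timed on `stream` ...
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&instance, graph, /*pErrorNode=*/nullptr, /*pLogBuffer=*/nullptr, 0);
cudaGraphLaunch(instance, stream); // replays the whole captured sequence as one graph launch
cudaStreamSynchronize(stream);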

View File

@ -10,8 +10,8 @@ int main(int argc, char *argv[]) {
printf("Usage: %s <log>\n", argv[0]);
return 1;
}
auto expr = Serializer().deserialize(argv[1]);
auto expr = Serializer().fromFile(argv[1]);
cout << FullPrinterVisitor().print(expr);
cout << endl << "Hash = " << HashVisitor().getHash(expr) << endl;
return 0;
}
}

View File

@ -0,0 +1,113 @@
import onnx
import torch
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pyinfinitensor as pit
from pyinfinitensor import backend as ft
from pyinfinitensor.onnx import OnnxStub
def to_pytorch_tensor(tensor) -> torch.Tensor:
data = tensor.copyout_float()
tt = torch.tensor(data)
return tt.reshape(tensor.shape())
def run_InfoGAN_return_tensor(n_layers: int):
if_tensors = ft.runInfoGAN(n_layers)
tensors = [to_pytorch_tensor(t) for t in if_tensors]
return tensors
def read_and_check():
for n_layers in range(1, 6):
ans = torch.load(f'torch_{n_layers}layers_0.pt')
x = torch.load(f'torch_{n_layers}layers_1.pt')
print(f'=== {n_layers} layers ===')
print(x.abs().max())
def run_e2e_InfoGAN():
data = []
for n_layers in range(5, 6):
tensors = run_InfoGAN_return_tensor(n_layers)
for i, t in enumerate(tensors):
torch.save(t, f'torch_{n_layers}layers_{i}.pt')
print(f'============ {n_layers} layers = = =')
ans, x = tensors
print(f'Allclose {torch.allclose(ans, x)}')
# Print error numbers
tot = np.prod(ans.shape)
data.append([])
for i in range(0, 10):
tol = 10**(-i)
clo = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
print(f'0.1^{i} close: {clo}/{tot} = {clo/tot}')
data[-1].append(clo/tot)
rel_err = torch.abs((ans-x)/ans)
print(rel_err, rel_err.max())
print(f'ans = {ans}')
print(f'x = {x}')
# # Plot CDF
# fig, axes = plt.subplots(9,1)
# print(axes)
# for i, ax in enumerate(axes):
# print(i)
# ax:plt.Axes
# ax.hist(torch.flatten(rel_err), density=True, cumulative=True, label='CDF',
# histtype='step', alpha=0.8, color='k')
# ax.set_xlim(0, 10**(-i))
# # ax.set_title('')
# plt.show()
# plt.savefig('a.pdf')
df = pd.DataFrame(data)
print(df.to_string())
df = df.set_axis([f'0.1^{i}' for i in range(0, 10)], axis=1)
print(df.to_string())
df.to_csv('a.csv')
def runSingleConvT():
runtime = ft.cuda_runtime()
g = ft.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)
opt_g = ft.optimizeGraph(g, runtime)
ft.if_onnx.export_onnx(opt_g, 'convtransposed.onnx')
def run_InfoGAN_without_tuning(runtime, tuning: bool):
g = ft.getInfoGAN(1, runtime, 5)
# g = ft.getInfoGAN(1, runtime, 1)
opt_g = ft.optimizeGraph(g, runtime, tuning)
stub = OnnxStub.from_graph(opt_g)
with open("optimized.onnx", "wb") as f:
f.write(stub.to_onnx("optimized").SerializeToString())
return opt_g
def load_onnx(runtime) -> ft.Graph:
stub = OnnxStub.from_onnx(onnx.load("optimized.onnx"), runtime, False)
return stub.handler.getGraph()
def run_and_evaluate(runtime, g):
runtime.run(g, True)
print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}')
print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 1000, 1000)}')
print(f'Cuda graph time = {runtime.timeWithCudaGraph(g)}')
if __name__ == "__main__":
# run_e2e_InfoGAN()
# runSingleConvT()
# read_and_check()
runtime = ft.cuda_runtime()
if True:
g = run_InfoGAN_without_tuning(runtime, False)
else:
g = load_onnx(runtime)
run_and_evaluate(runtime, g)

View File

@ -388,4 +388,4 @@ TEST(Conv2conv, InfoGAN_ConvTranspose_3_OOB_Test) {
dbg(expr);
Derivator derivator;
derivator.checkOOB(as<RangeOpNode>(expr));
}
}

View File

@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
// verifyNaiveMembound True: subgraph after transformation
// verifyNaiveMembound False: subgraph of one single membound (eOP)
Runtime runtime = NativeCpuRuntimeObj::getInstance();
@ -54,9 +55,8 @@ TEST(Mutator, NaiveConvWithInterpreter) {
gg->getOutputs()[0]->getRawDataPtr<void *>());
}
// FIXME: fails due to the implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = true;
TEST(NMutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = false;
Runtime runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
@ -67,8 +67,9 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
// const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
// const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
auto i0 = g->addTensor({n, h, w, f});
auto w0 = g->addTensor({f, r, s, c});
auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);
auto mutator =
@ -91,12 +92,13 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
fuidToInputTensor[t->getFuid()] = t;
}
std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
auto gen = RandomGenerator(0, 1, i);
auto gen = RandomGenerator(0.1, 0.1, i);
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : g->getInputs()) {
@ -120,6 +122,7 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
dbg(runtime->getPerfTime(bestGraph, true));
EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
}
}

View File

@ -1,9 +1,11 @@
#include "core/graph.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/Serializer.h"
#include "nnet/test.h"
#include "operators/membound.h"
#include "gtest/gtest.h"
using namespace nnet;
using namespace std;
#define DEFINE_VAR(name) auto name = make_ref<VarNode>(#name);
//{L<i3:0:2500><i4:0:4><b:0:8><w:0:65>Sum<k:0:512>
//{({A}[b, (i3 + (2500 * i4)), k] * {B<pad=0,128,0>}[b, ((i3 + (2500 * i4)) +
@ -12,11 +14,7 @@ using namespace std;
// ==> B : Input Tensor shape=[8,10000,512] pad=[0,128,0]
Expr buildSimpleExpr() {
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
@ -28,9 +26,7 @@ Expr buildSimpleExpr() {
}
Expr buildNestedExpr() {
DEFINE_VAR(j1);
DEFINE_VAR(j2);
DEFINE_VAR(j3);
DEFINE_VAR(j1, j2, j3);
// Build a Matmul to verify.
const int M = 10000, N = 512, K = 3;
auto C = make_ref<TensorNode>("C", vector<int>({M, K}));
@ -46,11 +42,7 @@ Expr buildNestedExpr() {
auto E = make_ref<TensorNode>("E", shapeE, shapeE, ele2);
auto ele1 = make_ref<ElementWiseNode>(expr, vector{E}, shapeE);
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0}, matmul);
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0}, ele1);
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
@ -63,25 +55,23 @@ Expr buildNestedExpr() {
TEST(Serializer, Serialization) {
auto range = buildSimpleExpr();
auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
EXPECT_TRUE(isSuccessful);
}
TEST(Serializer, CompareTwoExprs) {
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
auto funcA = make_ref<FuncNode>(subA, FuncType::Relu);
auto subB = makeSubscript(B, {b, ((i3 + (2500 * i4)) + w), k});
auto range = makeRangeOperator(
{{i3, {0, 2500}}, {i4, {0, 4}}, {b, {0, 8}}, {w, {0, 65}}},
{{k, {0, 512}}}, subA * subB);
Serializer().serialize(range, "./test_serializer.json");
auto expr = Serializer().deserialize("./test_serializer.json");
{{k, {0, 512}}}, funcA * subB);
Serializer().toFile(range, "./test_serializer.json");
auto expr = Serializer().fromFile("./test_serializer.json");
dbg(expr);
EXPECT_EQ(range->toReadable(), expr->toReadable());
}
@ -90,11 +80,37 @@ TEST(Serializer, Serialization_NestedTensor) {
FullPrinterVisitor printer;
auto range = buildNestedExpr();
auto ans = printer.print(range);
dbg(ans);
auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
EXPECT_TRUE(isSuccessful);
auto exprDeserialized = Serializer().deserialize("./test_serializer.json");
auto exprDeserialized = Serializer().fromFile("./test_serializer.json");
auto output = printer.print(exprDeserialized);
dbg(output);
EXPECT_EQ(output, ans);
}
}
TEST(Serializer, Serialization_memboundOp) {
auto expr = buildSimpleExpr();
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
// using namespace infini;
auto runtime = infini::NativeCpuRuntimeObj::getInstance();
auto g = infini::make_ref<infini::GraphObj>(runtime);
auto AT = g->addTensor({8, 10000, 512});
auto BT = g->addTensor({8, 10000, 512});
auto CT = g->addTensor({2500, 4, 8, 65});
vector<Tensor> nnetInputs{A, B};
double execTime = 1;
string hint = "test";
infini::MemBoundObj memboundOp(nullptr, {AT, BT}, {CT}, nnetInputs, expr,
execTime, hint);
auto str = memboundOp.toJson();
auto [exprLoaded, nnetInputsLoaded, execTimeLoaded, hintLoaded] =
Serializer().membundOpFromString(str);
EXPECT_EQ(expr->toReadable(), exprLoaded->toReadable());
EXPECT_EQ(execTime, execTimeLoaded);
EXPECT_EQ(nnetInputs.size(), nnetInputsLoaded.size());
for (size_t i = 0; i < nnetInputs.size(); ++i)
EXPECT_EQ(nnetInputs[i]->toReadable(),
nnetInputsLoaded[i]->toReadable());
EXPECT_EQ(hint, hintLoaded);
}