Compare commits

...

96 Commits

Author SHA1 Message Date
Liyan Zheng 1ee4a60af0 Add: convert expression to operator 2023-06-28 11:06:17 +08:00
Liyan Zheng 29071ddcac Rename: Expr construction helpers 2023-06-25 20:31:08 +08:00
Liyan Zheng c6c445991a Add: enable mutator search in python 2023-06-25 20:18:18 +08:00
Liyan Zheng d25b606e12 Add: TF32 supports and accurate timing for conv 2023-05-07 13:22:39 +08:00
Liyan Zheng abcfa76fb5 Add: efficient CUDA transpose for last two dims 2023-05-05 15:16:07 +08:00
Liyan Zheng 6a70555892 Add: TensorRT backend 2023-04-30 23:44:10 +08:00
Liyan Zheng f47a411095 Add: export with random weight 2023-04-30 22:25:07 +08:00
Liyan Zheng df2534d209 Fix: fuse Relu to GEMM/Conv 2023-04-30 16:16:16 +08:00
Liyan Zheng a1f02593d3 Add: export Flatten operator to ONNX 2023-04-30 16:15:24 +08:00
Liyan Zheng 65b4b42fa0 Merge remote-tracking branch 'origin/NNET_OpSearch' into NNET_e2e
Fix: update of derivator.h is missing in NNET_OpSearch
2023-04-30 15:48:48 +08:00
Liyan Zheng b068442bfb Add: print time in op evaluation 2023-04-30 00:47:57 +08:00
Liyan Zheng c6e7748786 Add: rule of Any+Relu->Any 2023-04-28 21:46:54 +08:00
Liyan Zheng d0ae48d21d Add: CUDA Matmul selection 2023-04-28 19:13:19 +08:00
Liyan Zheng c875f3cbb8 Add: Matmul Transpose plans 2023-04-28 19:13:01 +08:00
Liyan Zheng 95a8b90fa7 Fix: add virtual method sync in Runtime 2023-04-28 00:44:54 +08:00
Liyan Zheng c58b67f743 Chore: suppress output 2023-04-26 14:11:53 +08:00
Liyan Zheng 75c9226164 Merge branch 'NNET_e2e' into NNET_op_test 2023-04-25 04:32:18 +08:00
Liyan Zheng f877eca517 Add: IT_ASSERT in checkCudaError 2023-04-25 04:25:10 +08:00
Liyan Zheng b13b799fbe All models E2E 2023-04-25 04:24:43 +08:00
Liyan Zheng 350fc01d39 Add: Search Depth 2023-04-25 01:07:21 +08:00
whjthu 71f4f6e9d9 add op test for einnet 2023-04-24 21:48:29 +08:00
Liyan Zheng 1408d308cc Add: FCRS log 2023-04-24 21:20:29 +08:00
Liyan Zheng 11229a2baa Add: Figure 17 2023-04-24 21:07:30 +08:00
Liyan Zheng 2b85ac41ef Fix: CUDA Relu for 2D/1D tensor 2023-04-24 16:14:26 +08:00
Liyan Zheng 1e46750159 Add conv2bgemm and fix mutator::runtime 2023-04-24 13:12:40 +08:00
Liyan Zheng 079985bc8c Add: efficient transpose 2023-04-24 13:08:29 +08:00
Liyan Zheng c1275cddb6 Fix: conv2dreduce_kernel_ offset 2023-04-24 02:30:41 +08:00
Liyan Zheng 51cc042f56 Add: nchw to nhwc conversion
Fix: conv parameter error in to_onnx
2023-04-24 02:29:53 +08:00
Liyan Zheng 18d6ba4022 Merge branch 'NNET_e2e' into NNET_gcn 2023-04-23 23:20:46 +08:00
Liyan Zheng 4211fd1f32 Fix: matmul transpose in convNHWC2gemm rule 2023-04-23 22:54:50 +08:00
xxcclong 8409c1f9d4 tested fsrcnn 2023-04-23 22:19:51 +08:00
xxcclong 830b28913c better transposed convreduce 2023-04-23 21:36:25 +08:00
Liyan Zheng 1ba78d7f89 Add: reduce in Any 2023-04-23 21:36:12 +08:00
xxcclong 777aebafc9 fsrcnn 2023-04-23 20:56:19 +08:00
whjthu 131a679340 gcn optimization 2023-04-23 13:43:41 +08:00
Liyan Zheng 5df2524ff9 Merge branch 'NNET_eliminateOP' into NNET_e2e 2023-04-23 13:35:29 +08:00
Liyan Zheng f204866d93 Fix: reduce workspace size 2023-04-23 13:34:07 +08:00
Liyan Zheng b9819e65c1 Fix: allow eliminate and fusion failure in search 2023-04-23 13:15:34 +08:00
Liyan Zheng 7277356744 Add: Reshape/Transpose elimination 2023-04-23 02:10:05 +08:00
whjthu f820117acd fix unused code 2023-04-23 00:18:26 +08:00
whjthu 1ab2118716 add AnyOp and cuda kernel 2023-04-23 00:16:03 +08:00
huangshuhong ff97c879fb add ConvNHWC and FSRCNN graph 2023-04-23 00:02:22 +08:00
Liyan Zheng acc64fd32c Merge branch 'NNET_transpose' into NNET_e2e
Fix: gridSize and blockSize in Reshape kernel
2023-04-22 21:32:31 +08:00
Liyan Zheng 33ab5dcd3e Fix: gbmm kernel 2023-04-22 21:14:52 +08:00
Liyan Zheng e2f18272c9 Add: no malloc for reshape outputs 2023-04-22 21:13:57 +08:00
Liyan Zheng 40e6db6608 Add: tensor FUID in exported ONNX 2023-04-22 20:28:17 +08:00
Liyan Zheng c451918224 Fix: tensor size overflow 2023-04-22 20:28:00 +08:00
whjthu 34ed298725 fix format 2023-04-22 17:00:52 +08:00
whjthu 664f0dbe02 support cuda transpose 2023-04-22 16:57:27 +08:00
Liyan Zheng a732b6f176 Fix: ignore transpose in CudaGraph since no kernel 2023-04-22 16:08:40 +08:00
Liyan Zheng 0865f8d823 Chore: move TensorObj::clone to .cc 2023-04-22 16:03:16 +08:00
Liyan Zheng 84f9d6731a Add: Longformer models 2023-04-22 16:00:29 +08:00
Liyan Zheng 4f02eeb08c Add: G2BMM kernels generated by tvm 0.10 2023-04-22 15:40:59 +08:00
whjthu 225a42f22d add rule for dilated conv 2023-04-21 23:40:45 +08:00
Liyan Zheng 4e9ece76f4 Chore: remove out-of-date code 2023-04-21 23:22:40 +08:00
Liyan Zheng 16a8c5dce5 Add: Conv1x1 rule 2023-04-21 23:21:04 +08:00
Liyan Zheng d051460c23 Chore: suppress output 2023-04-21 22:58:18 +08:00
Liyan Zheng d8a133684e Add: remove independent tensors in graph 2023-04-21 22:57:23 +08:00
Liyan Zheng 9ce21200c4 Add: NMutator mode in python 2023-04-21 21:31:22 +08:00
Liyan Zheng b943658713 Finish: GAN 2023-04-21 21:25:43 +08:00
Liyan Zheng 2cd75bd79b Merge branch 'NNET_e2e_fix' into NNET_e2e
Support CUDA Graph for TVM kernels
2023-04-21 13:18:44 +08:00
Liyan Zheng f0fcbe825f Add: python verification 2023-04-21 13:18:24 +08:00
huangshuhong 8c91faa948 remove expect 2023-04-21 00:17:04 +08:00
huangshuhong c0ae03a2d7 fix tvm stream 2023-04-21 00:09:47 +08:00
Liyan Zheng 0cb8729bc1 Add: different ONNX names for inputs and weights 2023-04-20 21:51:47 +08:00
YdrMaster 8bc2d3e48d fix: test graph handler
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
YdrMaster 28b123753e feat: import the Tensor type
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
Liyan Zheng 94730d93b5 Add: hash match for membound kernels 2023-04-20 17:16:01 +08:00
Liyan Zheng 6d17c4caa2 Add: getPerfTime in run_models_nnet 2023-04-20 10:54:49 +08:00
Liyan Zheng 15d0eb79cd Add: import ONNX with membound Op 2023-04-20 10:45:28 +08:00
Liyan Zheng 2a343e240e Add: shape of intermediate tensor in exported ONNX 2023-04-20 10:45:28 +08:00
Liyan Zheng 34ca6bf149 Fix: skip check when Graph is exported to ONNX 2023-04-20 10:45:28 +08:00
YdrMaster a6019e79e3 feat(py): support creating an OnnxStub directly from a Graph
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:45:28 +08:00
YdrMaster 4e1cc8d3e4 refactor(py): create OnnxStub via factory methods
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 725f9260cf feat: support exporting membound
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 0edd138919 feat: split serialization/deserialization into to-string and to-file variants
fix: set the `USE_CUDA` cfg correctly

todo: test_search does not pass

Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
Liyan Zheng 0b23a065ca Add: debug hacks for InfoGAN 2023-04-20 10:42:56 +08:00
Liyan Zheng e86e993ed4 Add: CUDA graph stream capture (MemboundOp fails) 2023-04-19 16:32:16 +08:00
Liyan Zheng e4c20a9ae2 Add: warmup and repeat args in timeNonCtcOperators 2023-04-19 16:22:59 +08:00
Liyan Zheng 537b3b4ea4 Add: Membound operator serialization 2023-04-18 21:53:48 +08:00
Liyan Zheng 2812900ea2 Fix: OpType and print device tensors 2023-04-18 20:28:08 +08:00
Liyan Zheng 01fc19795d Add: time non-compile-time-computable operators 2023-04-18 17:21:16 +08:00
Liyan Zheng afc4123328 Chore: remove deprecated function 2023-04-18 17:21:16 +08:00
Liyan Zheng b981951a47 Add: NMutator::memboundToJson to export memboundOp 2023-04-18 17:21:16 +08:00
Liyan Zheng 99b5c95455 Add: nnet::Serializer supports FuncNode 2023-04-18 17:21:16 +08:00
Liyan Zheng 9d50b30af8 Chore: disable nnet_unimplemented_continue output 2023-04-18 17:21:16 +08:00
Liyan Zheng bc31219bde Add: exclude compile-time computable operator time 2023-04-18 17:21:16 +08:00
Liyan Zheng edf4e33353 Add: C++ callback to export ONNX 2023-04-18 17:19:05 +08:00
Liyan Zheng 872f3504a9 Add: RangeOpNode::getFullExpression() 2023-04-18 17:19:05 +08:00
Liyan Zheng da49e91ab0 Add: fuse membound operators 2023-04-18 17:19:05 +08:00
Liyan Zheng a6b8f344d4 Chore: simplify type names 2023-04-18 17:19:05 +08:00
Liyan Zheng 09293730ea Add: export to ONNX with custom operators 2023-04-18 17:19:05 +08:00
Liyan Zheng 307614d95d Add: infogan python interface 2023-04-18 17:16:25 +08:00
Liyan Zheng f14edcd52f Fix: avoid reload library 2023-04-18 17:16:25 +08:00
Liyan Zheng d2d49c5d4f Add: invoke TVM through pipe 2023-04-18 17:16:25 +08:00
Liyan Zheng e72fe79168 Add: search engine uses estimated time 2023-04-18 17:16:25 +08:00
138 changed files with 10206 additions and 1522 deletions

@ -1 +1 @@
Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294
Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283

@ -1 +1 @@
Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1
Subproject commit e2239ee6043f73722e7aa812a459f54a28552929

@ -1 +1 @@
Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9
Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756

3rd-party/pybind11 (vendored)

@ -1 +1 @@
Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406
Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae


@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
endif()
# Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS})
target_link_libraries(backend PRIVATE InfiniTensor)
@ -168,6 +168,7 @@ endif()
if(USE_CUDA)
add_compile_definitions(USE_CUDA=1)
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
set(CMAKE_CUDA_HOST_COMPILER
${CMAKE_CXX_COMPILER}


@ -81,7 +81,7 @@ import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend
stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
```
[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) is the loading function provided by onnx; it reads an onnx file into an onnx model held in memory.
@ -201,7 +201,7 @@ def infer(model: ModelProto, input) -> dict:
model0 = onnx.load(sys.argv[1])
model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")
input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
input = numpy.random.random(input_shape).astype(numpy.float32)


@ -36,7 +36,7 @@ class BangRuntimeObj : public RuntimeObj {
bool profiling = false) const;
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
void sync() const override;
BangPtr alloc(size_t size) override {
void *ptr;
checkBangError(cnrtMalloc(&ptr, size));


@ -75,7 +75,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
double timeit(
const std::function<void()> &func,
const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
int timingRounds = 200);
// HACK: set timeit rounds to 10 for fast debug
const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
int timingRounds = 100);
} // namespace infini
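
For reference, a minimal sketch of calling `timeit` with the new defaults; the header is assumed to be `core/common.h` (it holds `vecToString` above), and the lambda bodies plus the CUDA sync call are illustrative, not part of this diff:

```cpp
#include "core/common.h" // assumed location of infini::timeit
#include <cuda_runtime.h>
#include <functional>

double timeSomething(const std::function<void()> &work) {
    // timeit(func, sync, warmupRounds, timingRounds), per the declaration above.
    return infini::timeit(
        work,
        []() { cudaDeviceSynchronize(); }, // sync between rounds (CUDA assumed)
        10,                                // warmupRounds: the new debug default
        100);                              // timingRounds
}
```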


@ -16,7 +16,8 @@ class GraphObj : public Object {
string toString() const override;
Runtime getRuntime() const { return runtime; }
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
TensorType tensorType = TensorType::Other);
Tensor addTensor(const Tensor &tensor);
TensorVec addTensor(const TensorVec &tensors);
/**
@ -47,6 +48,22 @@ class GraphObj : public Object {
return opClone;
}
Operator cloneOpAndCreateOutputs(Operator op, TensorVec inputs) {
auto shapes = *op->inferShape(inputs);
vector<Tensor> outputs;
for (auto shape : shapes)
outputs.emplace_back(addTensor(shape));
return cloneOperator(op, inputs, outputs);
}
Operator cloneOpAndCreateInputsOutputs(Operator op) {
vector<Tensor> inputs;
for (auto t : op->getInputs()) {
inputs.emplace_back(cloneTensor(t));
}
return cloneOpAndCreateOutputs(op, inputs);
}
const TensorVec &getTensors() const { return tensors; }
const OpVec &getOperators() const { return ops; }
OpVec getComputeOps() const;
@ -62,6 +79,7 @@ class GraphObj : public Object {
void optimize();
void dataMalloc();
void dataFree();
/**
* @brief Add an operator and create its outputs. Output tensor arguments
@ -107,6 +125,11 @@ class GraphObj : public Object {
bool checkValid() const;
/// @brief If a tensor has no source and no target, it is independent and
/// is removed from the graph.
/// @return The number of removed tensors.
int removeIndependentTensors();
private:
/**
* @brief Add reverse connections and Op relationship in ctor.
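
The new `cloneOpAndCreateOutputs` / `cloneOpAndCreateInputsOutputs` helpers let a transformation copy an operator into another graph while shape inference creates the output tensors. A rough sketch of the intended call pattern (the function name and surrounding variables are illustrative):

```cpp
#include "core/graph.h"

infini::Graph rebuildInto(infini::Graph oldGraph, infini::Runtime runtime) {
    using namespace infini;
    Graph g = make_ref<GraphObj>(runtime);
    for (const Operator &op : oldGraph->getOperators()) {
        // Clones the inputs into g, infers output shapes, adds fresh outputs,
        // then clones the operator itself.
        g->cloneOpAndCreateInputsOutputs(op);
    }
    return g;
}
```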


@ -35,20 +35,33 @@ class GraphHandlerObj {
Graph g;
public:
GraphHandlerObj(Runtime runtime)
explicit GraphHandlerObj(Runtime runtime)
: g(make_ref<GraphObj>(std::move(runtime))) {}
Tensor tensor(Shape dims, int dtype);
explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}
//------ tensors
vector<Tensor> inputs() { return g->getInputs(); }
vector<Tensor> outputs() { return g->getOutputs(); }
Tensor tensor(Shape dims, int dtype, TensorType ttype);
//------ operators
inline OpVec operators() { return g->getOperators(); }
OpVec operators() { return g->getOperators(); }
Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw);
Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
int pw, int sh, int sw, int dh, int dw, int oph,
int opw);
Tensor convNHWC(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw);
Tensor convTransposed2dNHWC(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw,
int oph, int opw);
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
Tensor bias, ActType act);
Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
@ -90,18 +103,23 @@ class GraphHandlerObj {
const optional<vector<int>> &steps);
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
/// @brief Import memBound operator from a json
TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
const string &jsonString);
//------ modifiers
inline bool topo_sort() { return g->topo_sort(); }
bool topo_sort() { return g->topo_sort(); }
inline void optimize() { g->optimize(); }
void optimize() { g->optimize(); }
//------ runtime
inline void data_malloc() { g->dataMalloc(); }
void data_malloc() { g->dataMalloc(); }
inline void run() { g->getRuntime()->run(g); }
void run() { g->getRuntime()->run(g); }
Graph getGraph() const;
};
} // namespace infini
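
GraphHandlerObj can now wrap an existing Graph and expose its inputs/outputs, which is what the new `OnnxStub.from_graph` path relies on. A small C++-side sketch (shape and dtype values are illustrative; dtype 1 corresponds to ONNX float):

```cpp
#include "core/graph_handler.h"

void inspect(infini::Graph g) {
    using namespace infini;
    GraphHandlerObj handler(g);          // wrap an already-built graph
    auto graphInputs  = handler.inputs();
    auto graphOutputs = handler.outputs();
    // tensor() now takes the new TensorType as its third argument.
    Tensor t = handler.tensor({1, 3, 224, 224}, /*dtype=*/1, TensorType::Input);
}
```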


@ -16,6 +16,7 @@ class Mutator {
Runtime runtime = NativeCpuRuntimeObj::getInstance())
: candidatesLimit(candidatesLimit), runtime(runtime){};
virtual ~Mutator(){};
bool hasTunedKernel = false;
virtual vector<Graph> run(const Graph &in_graph) = 0;
/**
@ -30,6 +31,14 @@ class Mutator {
virtual bool isMultiBranchMergable(const Graph &in_graph) {
IT_TODO_HALT();
}
/// @brief Fuse memory bound operators.
/// @return The graph after fusion. Return `nullptr` if fails.
virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
/// @brief Eliminate transpose and reshape.
/// @return The graph after elimination. Return `nullptr` if fails.
virtual Graph eliminateVertically(const Graph &in_graph) { IT_TODO_HALT(); }
};
} // namespace infini
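
Both new hooks return the transformed graph, or `nullptr` when the transformation does not apply, so callers are expected to check the result before swapping it in. A small sketch under that assumption:

```cpp
#include "core/mutator.h"

infini::Graph applyVerticalPasses(infini::Ref<infini::Mutator> mutator,
                                  infini::Graph g) {
    using namespace infini;
    if (Graph fused = mutator->fuseVertically(g))         // fuse memory-bound ops
        g = fused;                                        // nullptr means "not applicable"
    if (Graph slimmed = mutator->eliminateVertically(g))  // remove transpose/reshape pairs
        g = slimmed;
    return g;
}
```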


@ -11,6 +11,7 @@ enum class OpType {
Matmul,
ConvTrans,
ConvTransNHWC,
ConvNHWC,
G2BMM,
GBMM,
Pad,
@ -102,6 +103,10 @@ enum class OpType {
Dropout,
//
MemBound = 300,
//
Conv2dReduce = 400,
Conv2dReduceTranspose,
Any
};
using KernelAttrs = std::tuple<Device, OpType, DataType>;
@ -121,6 +126,8 @@ class OpRegistry {
FOP(ConvBackwardData);
FOP(Matmul);
FOP(ConvTrans);
FOP(ConvTransNHWC);
FOP(ConvNHWC);
FOP(G2BMM);
FOP(GBMM);
FOP(Pad);
@ -141,6 +148,7 @@ class OpRegistry {
FOP(Reshape);
FOP(Identity);
FOP(Shape);
FOP(Flatten);
// element wise
FOP(BatchNorm);
FOP(Softmax);
@ -208,8 +216,13 @@ class OpRegistry {
FOP(BitRightShift);
//
FOP(MemBound);
//
FOP(Conv2dReduce);
FOP(Conv2dReduceTranspose);
FOP(Any);
default:
IT_ASSERT(false);
IT_ASSERT(false, "Unknown OpType " +
std::to_string(enum_to_underlying(opType)));
break;
}
#undef FOP


@ -1,5 +1,6 @@
#pragma once
#include "core/common.h"
#include "core/object.h"
#include "core/ref.h"
#include <memory>
@ -59,10 +60,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* execution happens.
*
* @param graph
* @param profiling Whether to print breakdown of time
* @param printProfiling Whether to print breakdown of time
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool profiling = false) const;
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false,
bool ignoreMemboundOp = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::INTELCPU;
@ -76,11 +79,19 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
virtual void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const = 0;
virtual string toString() const = 0;
virtual void sync() const {}
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
int repeat = 1000) const;
protected:
void printProfilingData(double totTime,
void printProfilingData(double totalTime,
const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt) const;
const std::map<OpType, int> &opCnt,
const std::map<OpType, int> &opNonCtcCnt) const;
virtual void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const = 0;
};
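
`getPerfTime` gained flags for estimation and for skipping membound operators, and `timeNonCtcOperators` measures only operators that are not compile-time computable. A sketch of combining them (the flag values and print format are illustrative):

```cpp
#include "core/runtime.h"
#include <cstdio>

void profile(infini::Runtime runtime, infini::Graph graph) {
    // Sum of per-op perf times; allow estimation for ops without measured kernels.
    double estMs = runtime->getPerfTime(graph, /*printProfiling=*/true,
                                        /*allowEstimation=*/true,
                                        /*ignoreMemboundOp=*/false);
    // Wall-clock time of only the non-compile-time-computable operators.
    double runMs = runtime->timeNonCtcOperators(graph, /*warmup=*/1000, /*repeat=*/1000);
    printf("estimated %.3f ms, measured (non-CTC) %.3f ms\n", estMs, runMs);
}
```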


@ -4,44 +4,35 @@
#include "graph.h"
#include "mutator.h"
#include <unordered_map>
namespace infini {
class SearchEngine {
private:
Runtime runtimeExec;
Ref<Mutator> mutator;
std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
public:
SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
runtimeExec = _runtime;
mutator = _mutator;
}
SearchEngine(Runtime runtime, Ref<Mutator> mutator);
~SearchEngine() {}
int searchFilter = 0;
bool chooseBestMutation = true;
private: // Configurations
size_t partitionThreshold =
3; // cut nodes whose #in + #out >= partitionThreshold
size_t GRAPH_SIZE = 16; // num of best graphs.
private: // Composed objects
std::shared_ptr<Mutator> mutationEngine;
public:
std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
struct GroupEdge {
int v, next;
GroupEdge() = delete;
};
struct Candidate { // a graph with perf
std::shared_ptr<Graph> graph;
double perf = INFINITY;
};
class MetaGraph { // a graph of subgraphs, for searching.
public:
MetaGraph() {}
~MetaGraph() {}
// struct Candidate { // a graph with perf
// Graph graph;
// double perf = INFINITY;
// };
struct MetaGraphObj { // a graph of subgraphs, for searching.
struct Node {
Graph graph;
std::vector<int> suc;
@ -50,31 +41,33 @@ class SearchEngine {
};
std::vector<Node> nodes;
};
using MetaGraph = Ref<MetaGraphObj>;
Graph run(const Graph graph); // entrance of search engine.
Graph run(const Graph graph); // entrance to search engine.
std::vector<Graph> search(const Graph &graph); // search for a partition.
private:
std::vector<Graph> partitionGraph(const Graph graph);
std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
std::shared_ptr<MetaGraph>
buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
const std::vector<int> &plan);
MetaGraph buildMetaGraphWithGraph(const Graph graph);
MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan);
// search horizontal merges
std::vector<std::shared_ptr<MetaGraph>>
searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan, std::vector<int> &frontier,
std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet);
std::vector<Graph>
searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
std::vector<Graph> searchMutation(const MetaGraph &metaGraph);
void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
void printMetaGraph(MetaGraph metaGraph);
/**
* @brief Check whether a multi-branch graph can be merged into a single
* branch.
*/
bool isMultiBranchMergable(const Graph graph);
Graph fuseVertically(const Graph &graph);
double getEstimatedGraphPerf(Graph graph);
};
} // namespace infini
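
The search engine is now constructed from a runtime and a mutator and returns the optimized graph from `run()`. A hedged usage sketch; the NMutator header path and the rule list are assumptions, not taken from this diff:

```cpp
#include "core/search_engine.h"
#include "nnet/nmutator.h"   // assumed header path for NMutator

infini::Graph searchBest(infini::Graph graph, infini::Runtime runtime) {
    using namespace infini;
    // The rule sequence is only an example of a derivation-rule list.
    auto mutator = make_ref<NMutator>(NMutator::Mode::RuleBased,
                                      std::vector<int>{3, 2, 2, 5, 8, 8, 6, 90},
                                      runtime);
    SearchEngine searchEngine(runtime, mutator);
    return searchEngine.run(graph);      // entrance to the search engine
}
```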


@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
class TensorObj : public TensorBaseObj {
private:
Shape shape;
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
TensorObj(Shape shape, DataType dtype, Runtime runtime,
TensorType tensorType = TensorType::Other);
virtual ~TensorObj() {}
string toString() const override;
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
TensorType getTensorType() const { return tensorType; }
void load(std::string file_path);
void save(std::string file_path);
@ -74,25 +77,9 @@ class TensorObj : public TensorBaseObj {
// Thus the internal state of generator cannot be updated.
void setData(
std::function<void(void *, size_t, DataType)> const &generator) const;
Tensor clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
obj->targets.clear();
obj->source.reset();
return obj;
}
Tensor clone(Runtime runtime) const {
auto obj = make_ref<TensorObj>(*this);
obj->runtime = runtime;
obj->freeData();
obj->targets.clear();
obj->source.reset();
if (hasData()) {
obj->dataMalloc();
obj->copyData(this);
}
return obj;
}
void setData(const Blob &_blob) { data = _blob; }
Tensor clone() const;
Tensor clone(Runtime runtime) const;
void printData() const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
@ -106,13 +93,13 @@ class TensorObj : public TensorBaseObj {
size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
private:
template <class T> string dataToString() const {
template <class T> string dataToString(void *rawPtr) const {
std::stringstream builder;
builder << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = vector<int>(numDims, 1);
auto ptr = data->getPtr<T *>();
T *ptr = (T *)rawPtr;
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)
@ -123,6 +110,12 @@ class TensorObj : public TensorBaseObj {
if (i % dimSzVec[j] == 0)
builder << "[";
if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
printf("... , ");
i = iEnd - 20;
continue;
}
builder << ptr[i];
for (size_t j = 0; j < numDims; ++j)
if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)

include/cuda/cuda_any.h (new file)

@ -0,0 +1,10 @@
#pragma once
#include "operators/any.h"
namespace infini {
void any_kernel_mapping(vector<float *> input, vector<float *> output,
const string &kernel_name, const vector<int> &attr);
} // namespace infini


@ -13,6 +13,7 @@
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
IT_ASSERT(false); \
exit(EXIT_FAILURE); \
} \
}
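
With the added `IT_ASSERT(false)`, a CUDA failure now surfaces as an InfiniTensor assertion in addition to the stderr message. Typical call sites look like this (a small sketch, not part of the diff):

```cpp
#include "cuda/cuda_common.h"   // assumed header defining checkCudaError
#include <cstddef>

void *allocZeroed(size_t bytes) {
    void *buf = nullptr;
    checkCudaError(cudaMalloc(&buf, bytes));   // prints file/line and asserts on failure
    checkCudaError(cudaMemset(buf, 0, bytes));
    return buf;
}
```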


@ -0,0 +1,31 @@
#pragma once
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw);
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw);
void reduceConvRxSToNCHW(float *input, float *bias, float *output, int act,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw);
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw);
void conv5x5ToConv3x3Reduce(int n, int f, int h, int w, float *input,
float *output, float *bias);
void conv3x3ToReduce(int n, int h, int w, int f, float *input, float *output,
float *bias);
} // namespace infini


@ -6,44 +6,52 @@ namespace infini {
class CudaRuntimeObj : public RuntimeObj {
private:
cudaStream_t stream;
cudnnHandle_t cudnn;
cublasHandle_t cublas;
CudaPtr workspace;
size_t workspaceSize;
public:
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
// Memory information
size_t allocatedGPUMemorySize = 0;
map<void *, size_t> allocationMap;
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
virtual ~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
// CUDA device properties
cudaDeviceProp deviceProperties;
bool enableTF32 = false;
public:
CudaRuntimeObj();
virtual ~CudaRuntimeObj();
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
// double runEvaluation(const Graph &graph, int nWarmups,
// int nEvaluations) const;
void sync() const;
void sync() const override;
CudaPtr alloc(size_t size) override {
void *ptr;
// printf("Try to cudaMalloc: %lu bytes\n", size);
checkCudaError(cudaMalloc(&ptr, size));
// printf("cuda malloc: %p %lu bytes\n", ptr, size);
allocatedGPUMemorySize += size;
allocationMap[ptr] = size;
// printf("cuda malloc: %p %lu bytes, total %lu bytes (%.2lf GB)\n",
// ptr,
// size, allocatedGPUMemorySize,
// double(allocatedGPUMemorySize) / 1024 / 1024 / 1024);
return ptr;
}
void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
void dealloc(void *ptr) override {
checkCudaError(cudaFree(ptr));
allocatedGPUMemorySize -= allocationMap.at(ptr);
allocationMap.erase(ptr);
// printf("cuda dealloc: %p %lu bytes, total %lu\n", ptr,
// allocationMap.at(ptr), allocatedGPUMemorySize);
}
cudnnHandle_t cudnnHandle() const { return cudnn; }
cublasHandle_t cublasHandle() const { return cublas; }
size_t getWorkspaceSize() const { return workspaceSize; }
@ -51,6 +59,10 @@ class CudaRuntimeObj : public RuntimeObj {
IT_ASSERT(size <= workspaceSize);
return workspace;
}
pair<int, int> getComputeCapacitiy() const {
return {deviceProperties.major, deviceProperties.minor};
}
int getNumSMs() const { return deviceProperties.multiProcessorCount; }
void copyBlobFromCPU(void *dst, const void *src,
size_t bytes) const override {
@ -69,7 +81,19 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
bool isInCudaGraph() const { return cudaGraphStatus; }
cudaStream_t getStream() const { return stream; }
double timeWithCudaGraph(Graph graph, int rounds = 50);
double timeWithCudaGraph(vector<std::function<void(void)>> funcs,
int rounds = 50);
void setEnableTF32(bool state);
bool getEnableTF32() const { return enableTF32; }
private:
void tune(const Graph &graph, bool profiling) const;
void beginCudaGraphStreamCapture();
tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
};
} // namespace infini
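
A sketch of the new runtime knobs: the TF32 toggle, the compute-capability/SM queries, and CUDA-graph-based timing. The graph is assumed to have been built on this runtime; the header path is an assumption:

```cpp
#include "cuda/cuda_runtime.h"   // assumed header for CudaRuntimeObj

double timeOnCuda(infini::Graph graph) {
    using namespace infini;
    auto cuda = make_ref<CudaRuntimeObj>();
    cuda->setEnableTF32(true);                          // opt into TF32 paths
    auto [major, minor] = cuda->getComputeCapacitiy();  // spelling as in the header
    int numSMs = cuda->getNumSMs();
    (void)major; (void)minor; (void)numSMs;
    // Capture the whole graph once and replay it through a CUDA graph.
    return cuda->timeWithCudaGraph(graph, /*rounds=*/50);
}
```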


@ -0,0 +1,16 @@
#pragma once
#include "operators/transpose.h"
#include "utils/small_array.h"
namespace infini {
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray strides, SmallArray outputShape,
vector<int> _dims_in, vector<int> _dims_out,
vector<int> _perms);
void invoke_transpose_last_two_dim(float *ptrA, float *ptrB, int dim0, int dim1,
int dim2, int numSMs);
} // namespace infini

File diff suppressed because it is too large


@ -0,0 +1,9 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
namespace infini {
namespace callback {
void exportONNX(const Graph &graph, const string &path);
}
} // namespace infini
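
This header declares the C++-to-Python callback used to dump a Graph as ONNX. A call site would be as simple as the sketch below; the header path and output file name are assumptions:

```cpp
#include "ffi/ffi_callback.h"   // assumed header (built from src/ffi/ffi_callback.cc above)

void dump(const infini::Graph &g) {
    // Hands the graph to the Python side, which writes it out via OnnxStub.
    infini::callback::exportONNX(g, "searched_graph.onnx"); // illustrative path
}
```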


@ -29,7 +29,7 @@ class MklRuntimeObj : public CpuRuntimeObj {
string toString() const override { return "INTELCPU Runtime"; };
dnnl::engine getEngine() const { return dnnl::engine(engine, true); }
dnnl::stream getStream() const { return dnnl::stream(stream, true); }
void sync() const;
void sync() const override;
};
} // namespace infini


@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
string visit_(const Subscript &c) override;
string visit_(const Var &c) override;
string visit_(const Tensor &c) override;
string visit_(const Func &c) override;
string dispatchRoutine(const Routine &c);
Expr buildExprTree(string key);
@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
Serializer(int _verobse = 0);
virtual ~Serializer();
/**
* @brief Serialize the given expression to string
*
* @param expr The expression to be serialized
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return bool Whether the serialization succeeded
*/
std::optional<std::string> toString(Expr const &expr,
const string &msg = "",
vector<Tensor> inputs = {},
double exec_time = -1e9,
string hint = "");
/**
* @brief Serialize the given expression to json file
*
* @param expr The expression to be serialized
* @param filePath The path of json file to be output
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return bool Whether the serialization succeeded
*/
bool serialize(const Expr &expr, const string &filePath,
const string &msg = "");
bool toFile(const Expr &expr, const string &filePath,
const string &msg = "", vector<Tensor> inputs = {},
double exec_time = -1e9, string hint = "");
/**
* @brief Deserialize the given string to an expression
*
* @param text The text of the expr to be deserialized
* @return Expression deserialized from the given string
*/
Expr fromString(const string &text);
/**
* @brief Deserialize the given json file to expression
@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
* @param filePath The path to file to be deserialized
* @return Expression deserialized from the given json file
*/
Expr deserialize(const string &filePath);
Expr fromFile(const string &filePath);
tuple<Expr, vector<Tensor>, double, string>
deserializeAsMemobundOp(const string &filePath);
// FIXME: the order of elements in tuple is not consistent with memboundObj
// constructor
tuple<Expr, vector<Tensor>, double, string>
membundOpFromString(const string &data);
};
} // namespace nnet
} // namespace nnet
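
The serializer now round-trips an expression plus membound metadata through either a string or a file. A hedged sketch of the new entry points (header path, file path, and metadata values are placeholders):

```cpp
#include "nnet/Serializer.h"   // assumed header path
#include <vector>

void roundTrip(const nnet::Expr &expr, const std::vector<nnet::Tensor> &nnetInputs,
               double execTime) {
    nnet::Serializer serializer;
    // To string; returns std::nullopt on failure.
    auto s = serializer.toString(expr, "derivation msg", nnetInputs, execTime, "hint");
    if (!s)
        return;
    // To file with the same metadata.
    serializer.toFile(expr, "/tmp/membound.json", "derivation msg",
                      nnetInputs, execTime, "hint");
    // Back again; note the FIXME above about the tuple element order.
    nnet::Expr e = serializer.fromString(*s);
    auto [expr2, inputs2, time2, hint2] = serializer.membundOpFromString(*s);
    (void)e; (void)expr2; (void)inputs2; (void)time2; (void)hint2;
}
```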


@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
{ IT_TODO_HALT(); }
#define nnet_unimplemented_continue() \
{ dbg("Unimplemented"); }
{}
// { dbg("Unimplemented"); }
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))


@ -67,11 +67,13 @@ class Derivator {
vector<string> ruleStates, ruleMsgs;
int cntStates = 0; // the number of intermediate states
int searchState = 0; // search state in guided search
bool printAndExit;
void printDerivationRules();
public:
Derivator(int maxDepth = 8, bool enableHashPruning = true,
LogMode mode = LogMode::NoLog,
PassMode passMode = PassMode::Debug);
PassMode passMode = PassMode::Debug, bool printAndExit = false);
void search(Formula &origin, int depth);
void ruleBasedDFS(Formula &origin, int depth, vector<int> _rules,
map<int, vector<Var>> _substituteRules = {},


@ -104,7 +104,7 @@ enum class NodeType {
FuncNodeType
};
enum class FuncType { Relu, Tanh, PRelu };
enum class FuncType { Relu = 1000, Tanh, PRelu };
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
NodeType getType() const override { return NodeType::CLASS##Type; } \
@ -206,7 +206,8 @@ struct IterationType {
enum { Loop, Sum };
constexpr static int NumIterationType = 2;
};
class RangeOpNode : public OperatorNode {
class RangeOpNode : public OperatorNode,
public std::enable_shared_from_this<RangeOpNode> {
public:
enum { Summand, END_POS };
constexpr static int Loop = IterationType::Loop;
@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
return 0;
};
string toReadable() const override;
string getFullExpression();
const Expr &getSummand() const { return subExprs[Summand]; }
const vector<VarRangePair> &getVarRanges(int _index) const {
return vars[_index];
@ -384,13 +386,16 @@ class FuncNode : public ExprNode {
};
// Wrappers for type deduction
Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts);
RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
const vector<VarRangePair> &_sumIters, Expr _summand,
const vector<int> &paddings = {});
Tensor makeTensor(const string &name, const vector<int> &shape,
const vector<int> &paddings = {},
const Routine &source = nullptr);
// make a subscript operator
Subscript mSub(const Expr &tensor, const VecExpr &subscripts);
// make a range operator
RangeOp mL(const vector<VarRangePair> &_loopIters,
const vector<VarRangePair> &_sumIters, Expr _summand,
const vector<int> &paddings = {});
// make a tensor
Tensor mT(const string &name, const vector<int> &shape,
const vector<int> &paddings = {}, const Routine &source = nullptr);
// Pretty output for dbg with shared_ptr
template <typename T, typename std::enable_if_t<std::is_base_of_v<ExprNode, T>>
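
The long-form helpers (makeSubscript/makeRangeOperator/makeTensor) now have the short aliases mSub/mL/mT. A rough sketch of building C[i] = Σ_k A[i, k]·B[k] with them; the VarNode construction and the arithmetic operator overloads on Expr are assumptions based on the rest of expr.h, not shown in this hunk:

```cpp
#include "nnet/expr.h"   // assumed header for the builders above

nnet::Expr buildMatVec() {
    using namespace nnet;
    auto A = mT("A", {8, 16});            // tensor A of shape [8, 16]
    auto B = mT("B", {16});               // tensor B of shape [16]
    auto i = make_ref<VarNode>("i");      // assumed way to create loop variables
    auto k = make_ref<VarNode>("k");
    // C[i] = sum_k A[i, k] * B[k]
    return mL({{i, {0, 8}}},              // loop iterators with ranges
              {{k, {0, 16}}},             // sum iterators
              mSub(A, {i, k}) * mSub(B, {k}));  // summand
}
```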


@ -7,32 +7,53 @@ namespace infini {
class NMutator : public Mutator {
public:
enum class Mode { Normal, ToNaiveMembound, RuleBased };
using NameNToTensorT = map<string, Tensor>;
private:
// Suffix -N: NNet objects.
// Suffix -T: tpm objects.
// Map: NNet tensors -> tpm tensor.
std::map<std::string, Tensor> inputsNameNToTensorT;
NameNToTensorT inputsNameNToTensorT;
Mode mode;
const double bandwidth = double(200) * 1024 * 1024 * 1024;
// If in RuleBased mode, use derivationRules in derivator
const std::vector<int> derivationRules;
bool searchFilter = false;
bool enableRules = false; // Enable operator-level transformation rules
public:
NMutator(Mode mode = Mode::Normal);
NMutator(Mode mode, const std::vector<int> &derivationRules);
NMutator(Mode mode = Mode::Normal,
Runtime runtime = NativeCpuRuntimeObj::getInstance(),
bool enableRules = false);
NMutator(Mode mode, const std::vector<int> &derivationRules,
Runtime runtime = NativeCpuRuntimeObj::getInstance(),
bool enableRules = false);
~NMutator();
vector<Graph> run(const Graph &in_graph) override;
void setToNaiveMembound();
Graph fuseVertically(const Graph &in_graph) override;
Graph eliminateVertically(const Graph &in_graph) override;
bool isMultiBranchMergable(const Graph &in_graph) override;
void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
void setToNaiveMembound();
void setMaxDepth(int _maxDepth) {
maxDepth = _maxDepth;
searchFilter = true;
}
long long cntStates = 0;
long long cntCandidates = 0;
private:
int maxDepth = 8;
nnet::Expr opToExpression(Operator op);
/// @brief Extract the NNet expression and the NNet-name-to-tensor mapping from an operator.
/// @param op The operator to convert.
/// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
static pair<nnet::Expr, NMutator::NameNToTensorT>
generateUnaryExpr(const Operator &op);
static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
/**
@ -47,12 +68,32 @@ class NMutator : public Mutator {
double memboundTime(const Shape &dims);
// TODO: recover these rules
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
// Graph transformTConv1x1(Operator op);
// Graph transformTConv3x3(Operator op);
// Graph transformDialtedConv(Operator op);
// Graph transformConv1x1(Operator op);
Graph transformConvtransposed1x1(Operator _op);
// Graph transformConvtransposed(Operator op);
vector<Graph> transformConv1x1(Operator op);
vector<Graph> transformConv3x3ONNX(Operator op);
Graph transformG2bmm(Operator op);
Graph transformGbmm(Operator op);
Graph transformDialtedConv(Operator _op);
vector<Graph> transformConv1xk(Operator op);
// Graph transformConv1xk(Operator op);
Graph transformConvToGEMMReduce(Operator _op);
Graph transformConvTranposeToGEMMReduce(Operator _op);
Tensor splitTransposeMerge(Graph g, Tensor A, int dim, int chunkSize,
Tensor output = nullptr);
/// @brief Construct a new graph from a chain of operators, using the output
/// of each operator as the input of the next. The input and output tensors
/// of inputGraph are reused as the boundary tensors of the new graph.
/// @param ops The operator chain. It may have incorrect input/output shapes.
/// @return The constructed graph.
Graph constructGraphByOperatorChain(vector<Operator> ops, Graph inputGraph);
// Convert an nnet::Expr to an infini::Graph containing corresponding
// tensors and operators
Graph constructGraphFromExpression(Runtime runtime, nnet::Expr expr);
};
} // namespace infini
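
Beyond the rule-based construction shown in the SearchEngine sketch earlier, the mutator exposes candidate/state counters and a depth limit that now also switches on searchFilter. Illustrative only (header path and depth value are assumptions):

```cpp
#include "nnet/nmutator.h"   // assumed header path
#include <cstdio>
#include <vector>

std::vector<infini::Graph> deriveCandidates(infini::Graph graph) {
    using namespace infini;
    auto mutator = make_ref<NMutator>(NMutator::Mode::Normal,
                                      NativeCpuRuntimeObj::getInstance());
    mutator->setMaxDepth(4);              // also enables searchFilter, per the header
    auto candidates = mutator->run(graph);
    printf("states=%lld candidates=%lld\n", mutator->cntStates, mutator->cntCandidates);
    return candidates;
}
```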


@ -0,0 +1,23 @@
#ifdef USE_CUDA
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
namespace infini {
Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId);
Graph getFSRCNNGraph(int batch, Runtime runtime);
Graph getLongformer(Runtime runtime, int bs);
vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
vector<int> rules);
void initializeGraphTensors(Graph g, double l, double r, bool useInt);
Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG);
Graph optimizeWithDepthConstraint(Graph g, Runtime _runtime, int maxDepth);
Graph optimizeModel(Graph g, Runtime _runtime, string name);
Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules);
} // namespace infini
#endif
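
These free functions build the end-to-end models (GAN, FSRCNN, Longformer) used in the experiments. A sketch of driving one of them; batch size, layer count, data range, and the rule list are illustrative, and the CUDA runtime header path is an assumption:

```cpp
#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"   // assumed header for CudaRuntimeObj

infini::Graph buildAndOptimizeGAN() {
    using namespace infini;
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gan = getGANGraph(/*batch=*/1, cuda, /*nLayers=*/5, /*modelId=*/0);
    initializeGraphTensors(gan, /*l=*/-0.1, /*r=*/0.1, /*useInt=*/false);
    return optimizeModelWithRules(gan, cuda, {3, 2, 2, 5, 8, 8, 6, 90}); // example rules
}
#endif
```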


@ -49,7 +49,7 @@ template <typename R, typename... Args> class Functor<R(Args...)> {
virtual R visit_(const Tensor &c, Args... args) FUNCTOR_DEFAULT;
virtual R visit_(const Func &c, Args... args) FUNCTOR_DEFAULT;
virtual R visitDefault(const Expr &c, [[maybe_unused]] Args... args) {
dbg(*c);
dbg(*c, c->getType());
nnet_assert(0, "Reach unimplemented visit function.");
return R();
};

include/operators/any.h (new file)

@ -0,0 +1,30 @@
#pragma once
#include "core/operator.h"
namespace infini {
class AnyObj : public OperatorObj {
private:
string kernelName;
vector<int> attr;
public:
AnyObj(GraphObj *graph, const TensorVec &inputs, const TensorVec &outputs,
const string &kernelName, const vector<int> &attr);
OP_CLONE(AnyObj);
string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return outputs.size(); }
const string getKernelName() const;
void setAttr(int i, int v) { attr[i] = v; }
vector<int> getOpAttrVector() const override;
vector<int> getWorkloadVector() const override;
};
} // namespace infini
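
AnyObj routes a set of tensors to a hand-written kernel selected by name plus an integer attribute vector. A hedged sketch of inserting one into a graph; `addOpWithOutputs` is the usual GraphObj helper for pre-created outputs, but that API is an assumption here and not part of this diff:

```cpp
#include "core/graph.h"
#include "operators/any.h"
#include <vector>

void addCustomKernel(infini::Graph g, infini::Tensor in, infini::Tensor out) {
    using namespace infini;
    g->addOpWithOutputs<AnyObj>(TensorVec{in}, TensorVec{out},
                                /*kernelName=*/"my_custom_kernel",   // illustrative name
                                /*attr=*/std::vector<int>{1, 2, 3}); // illustrative attributes
}
```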


@ -98,7 +98,7 @@ class ConvBaseObj : public OperatorObj {
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
Tensor getBias() const { return inputs[2]; }
Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
PaddingMode getPaddingMode() const { return padding; }
pair<int, int> inferPaddingSize() const;
@ -111,7 +111,7 @@ class ConvBaseObj : public OperatorObj {
auto getNCHWFRS() const { return tuple(n, c, h, w, f, r, s); }
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
int getChannelPerGroup() const {
if (type == OpType::ConvTransNHWC) {
if (type == OpType::ConvTransNHWC || type == OpType::ConvNHWC) {
return inputs[1]->getDims()[3];
} else {
return inputs[1]->getDims()[1];
@ -149,6 +149,25 @@ class ConvObj : public ConvBaseObj {
void setAuxilaryAttributes(PaddingMode mode) override;
};
class ConvNHWCObj : public ConvBaseObj {
public:
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr, ActType act = ActType::None);
// Constructors for setting padding mode
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode = PaddingMode::Same, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, Tensor bias = nullptr,
ActType act = ActType::None);
OP_CLONE(ConvNHWCObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return c / getChannelPerGroup(); }
private:
void setAuxilaryAttributes(PaddingMode mode) override;
};
class ConvBackwardFilterObj : public ConvBaseObj {
private:
ActType act;
@ -220,6 +239,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
int getNumGroups() const override { return group; }
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
private:
void setAuxilaryAttributes(PaddingMode mode) override;


@ -0,0 +1,62 @@
#pragma once
#include "core/operator.h"
namespace infini {
class Conv2dReduceBase : public OperatorObj {
protected:
Tensor bias;
int ph, pw;
int sh, sw;
int dh, dw;
int n, h, w, f, r, s; // c has been reduced
bool PReLU;
float paramReLU;
public:
Conv2dReduceBase(OpType opType, Tensor input, Tensor bias, Tensor output,
bool PReLU_, float paramReLU_, int ph_, int pw_,
int sh_ = 1, int sw_ = 1, int dh_ = 1, int dw_ = 1);
std::string toString() const override;
int numInputs() const override { return 2; }
int numOutputs() const override { return 1; }
int getDh() const { return dh; }
int getDw() const { return dw; }
int getPh() const { return ph; }
int getPw() const { return pw; }
int getSh() const { return sh; }
int getSw() const { return sw; }
bool getPReLU() const { return PReLU; }
float getParamReLU() const { return paramReLU; }
Tensor getBias() const { return bias; }
// optional<vector<Shape>> inferShape(const TensorVec &inputs) const
// override;
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
class Conv2dReduce : public Conv2dReduceBase {
public:
Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias, Tensor output,
bool PReLU_, float paramReLU_, int ph_, int pw_, int sh_ = 1,
int sw_ = 1, int dh_ = 1, int dw_ = 1);
OP_CLONE(Conv2dReduce);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
};
class Conv2dReduceTranspose : public Conv2dReduceBase {
public:
Conv2dReduceTranspose(GraphObj *graph, Tensor input, Tensor bias,
Tensor output, bool PReLU_, float paramReLU_, int ph_,
int pw_, int sh_ = 1, int sw_ = 1, int dh_ = 1,
int dw_ = 1);
OP_CLONE(Conv2dReduceTranspose);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
};
} // namespace infini


@ -6,12 +6,17 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr, simplifiedExpr;
nnet::Expr expr;
std::vector<nnet::Tensor>
nnetInputs; // The order of inputs in nnetInputs should be consistent
// with inputs in infinitensor
double exec_time;
std::string hint;
HashType hash, simplifiedHash;
int n, f, h, w;
// Generated attributes
HashType hash;
nnet::Expr simplifiedExpr;
HashType simplifiedHash;
public:
MemBoundObj(GraphObj *graph, const TensorVec &input,
@ -27,9 +32,12 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
HashType getHash() const { return hash; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {expr, hash};
}
double getEstimatedTime() const { return exec_time; }
string toJson() const;
private:
vector<int> getWorkloadVector() const override;


@ -19,7 +19,7 @@ class ReshapeObj : public OperatorObj {
* @param output The output tensor.
* @param dims The shape of the output tensor.
*/
ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims = {});
OP_CLONE(ReshapeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
@ -60,6 +60,7 @@ class FlattenObj : public OperatorObj {
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
int getAxis() const { return axis; }
private:
vector<int> getWorkloadVector() const override;


@ -7,7 +7,9 @@ namespace infini {
*
*/
class SliceObj : public OperatorObj {
template <class T> struct range_t { T start, end, step; };
template <class T> struct range_t {
T start, end, step;
};
vector<range_t<int>> axes;
public:


@ -3,6 +3,8 @@
namespace infini {
class TransposeObj : public OperatorObj {
vector<int> transposePermute;
public:
TransposeObj(GraphObj *graph, Tensor input, Tensor output,
vector<int> permute);
@ -15,7 +17,6 @@ class TransposeObj : public OperatorObj {
std::vector<int> getPermute() const { return transposePermute; }
private:
vector<int> transposePermute = {1, 1, 1, 1};
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};


@ -46,10 +46,13 @@ class RandomGenerator : public DataGenerator {
std::mt19937 e;
std::uniform_int_distribution<int> di;
std::uniform_real_distribution<float> dr;
bool generateInteger;
public:
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0)
: l(l), r(r), e(seed), di(l, r), dr(l, r) {}
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
bool generateInteger = false)
: l(l), r(r), e(seed), di(l, r), dr(l, r),
generateInteger(generateInteger) {}
virtual ~RandomGenerator() {}
private:
@ -60,7 +63,7 @@ class RandomGenerator : public DataGenerator {
}
void fill(float *data, size_t size) override {
for (size_t i = 0; i < size; i++) {
data[i] = dr(e);
data[i] = (generateInteger) ? di(e) : dr(e);
}
}
};
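
With the new flag, RandomGenerator can emit integer-valued floats, which is handy for exact-equality checks in tests. A small sketch; passing a generator object to `setData()` follows the pattern used elsewhere in the codebase and is an assumption here:

```cpp
#include "utils/data_generator.h"   // assumed header for RandomGenerator

void fillWithSmallInts(infini::Tensor tensor) {
    // Integer-valued floats drawn uniformly from [0, 5].
    tensor->setData(infini::RandomGenerator(0, 5, /*seed=*/0,
                                            /*generateInteger=*/true));
}
```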


@ -0,0 +1,8 @@
namespace infini {
#define SMALL_ARRAY_SIZE 8
struct SmallArray {
int data[SMALL_ARRAY_SIZE];
};
} // namespace infini


@ -25,6 +25,7 @@ from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
import numpy as np
class OnnxStub:
@ -37,29 +38,48 @@ class OnnxStub:
outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
handler: backend.GraphHandler
disable_check: bool
def __init__(self, model: ModelProto, runtime):
model = infer_shapes(model)
self.handler = backend.GraphHandler(runtime)
@classmethod
def from_onnx(cls, model: ModelProto, runtime, enable_onnx_shape_infernce=True):
if enable_onnx_shape_infernce:
model = infer_shapes(model)
ans = OnnxStub()
ans.handler = backend.GraphHandler(runtime)
tensors: Dict[str, backend.Tensor] = dict()
data: Dict[str, TensorProto] = dict()
cnt_infini_inputs = 0
for input in model.graph.input:
dims = _take_shape_dim(input.type.tensor_type.shape)
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
if input.name.startswith('input'):
tensor_type = backend.TensorType.Input
cnt_infini_inputs += 1
else:
tensor_type = backend.TensorType.Initialized
tensors[input.name] = ans.handler.tensor(
dims,
input.type.tensor_type.elem_type,
tensor_type,
)
assert cnt_infini_inputs == 1, f'{cnt_infini_inputs} tensor names start with "input" found.'
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
dims, output.type.tensor_type.elem_type
tensors[output.name] = ans.handler.tensor(
dims,
output.type.tensor_type.elem_type,
backend.TensorType.Other,
)
for initializer in model.graph.initializer:
dims = [d for d in initializer.dims]
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
tensors[initializer.name] = ans.handler.tensor(
dims,
initializer.data_type,
backend.TensorType.Initialized,
)
data[initializer.name] = initializer
for node in model.graph.node:
@ -77,17 +97,18 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors[node.input[0]], None, p, [-2, -1]
)
p = [0, 0, 0, 0]
else:
adapt = node.input[0]
if len(node.input) > 2:
# HACK: ignore bias
if len(node.input) > 3:
bias = "{}-bias".format(node.output[0])
reshape = "{}-reshape".format(node.output[0])
tensors[bias] = self.handler.conv(
tensors[bias] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
None,
@ -98,7 +119,7 @@ class OnnxStub:
d[0],
d[1],
)
tensors[reshape] = self.handler.reshape(
tensors[reshape] = ans.handler.reshape(
tensors[node.input[2]],
None,
[
@ -111,13 +132,13 @@ class OnnxStub:
1,
],
)
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[bias],
tensors[reshape],
tensors.get(node.output[0]),
)
else:
tensors[node.output[0]] = self.handler.conv(
tensors[node.output[0]] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -142,7 +163,7 @@ class OnnxStub:
attributes[name]
for name in ["dilations", "pads", "strides", "output_padding"]
)
tensors[node.output[0]] = self.handler.convTransposed2d(
tensors[node.output[0]] = ans.handler.convTransposed2d(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -156,7 +177,7 @@ class OnnxStub:
op[1],
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -175,7 +196,7 @@ class OnnxStub:
# FIXME unsupport attributes: `alpha` `beta`
assert alpha == 1.0
assert beta == 1.0
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -196,7 +217,7 @@ class OnnxStub:
attributes[name]
for name in ["momentum", "epsilon", "training_mode"]
)
tensors[node.output[0]] = self.handler.batchNorm(
tensors[node.output[0]] = ans.handler.batchNorm(
input, output, mean, var, scale, bias, momentum, eps, training != 0
)
elif node.op_type == "MaxPool":
@ -215,10 +236,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -231,7 +252,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -257,10 +278,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -273,7 +294,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -287,7 +308,7 @@ class OnnxStub:
)
elif node.op_type == "GlobalAveragePool":
[_, _, h, w] = _search_shape(model, node.input[0])
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
h,
@ -300,52 +321,52 @@ class OnnxStub:
1,
)
elif node.op_type == "Add":
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sub":
tensors[node.output[0]] = self.handler.sub(
tensors[node.output[0]] = ans.handler.sub(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Mul":
tensors[node.output[0]] = self.handler.mul(
tensors[node.output[0]] = ans.handler.mul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Div":
tensors[node.output[0]] = self.handler.div(
tensors[node.output[0]] = ans.handler.div(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Pow":
tensors[node.output[0]] = self.handler.pow(
tensors[node.output[0]] = ans.handler.pow(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Relu":
tensors[node.output[0]] = self.handler.relu(
tensors[node.output[0]] = ans.handler.relu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sigmoid":
tensors[node.output[0]] = self.handler.sigmoid(
tensors[node.output[0]] = ans.handler.sigmoid(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Tanh":
tensors[node.output[0]] = self.handler.tanh(
tensors[node.output[0]] = ans.handler.tanh(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Softmax":
tensors[node.output[0]] = self.handler.softmax(
tensors[node.output[0]] = ans.handler.softmax(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(
@ -353,34 +374,39 @@ class OnnxStub:
),
)
elif node.op_type == "Abs":
tensors[node.output[0]] = self.handler.abs(
tensors[node.output[0]] = ans.handler.abs(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Shape":
tensors[node.output[0]] = self.handler.shape(
tensors[node.output[0]] = ans.handler.shape(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Identity":
tensors[node.output[0]] = self.handler.identity(
tensors[node.output[0]] = ans.handler.identity(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Flatten":
tensors[node.output[0]] = self.handler.flatten(
tensors[node.output[0]] = ans.handler.flatten(
tensors[node.input[0]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "PRelu":
tensors[node.output[0]] = self.handler.pRelu(
# HACK: replace PRelu with Relu
tensors[node.output[0]] = ans.handler.relu(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
# tensors[node.output[0]] = ans.handler.pRelu(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
elif node.op_type == "Clip":
tensors[node.output[0]] = self.handler.clip(
tensors[node.output[0]] = ans.handler.clip(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(_parse_data(data[node.input[1]]).__iter__(), None)
@ -394,7 +420,7 @@ class OnnxStub:
perm = next(
(attr.ints for attr in node.attribute if attr.name == "perm"), None
)
tensors[node.output[0]] = self.handler.transpose(
tensors[node.output[0]] = ans.handler.transpose(
tensors[node.input[0]],
tensors.get(node.output[0]),
perm,
@ -409,7 +435,7 @@ class OnnxStub:
temp = reduce(lambda acc, x: acc * x, input_shape, 1)
if temp < 0:
input_shape[input_shape.index(-1)] = size // -temp
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
@ -426,7 +452,7 @@ class OnnxStub:
for i, x in enumerate(input_shape):
if i not in axes:
output_shape.append(x)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
output_shape,
@ -440,13 +466,13 @@ class OnnxStub:
)
for i in axes:
input_shape.insert(i, 1)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
)
elif node.op_type == "Concat":
tensors[node.output[0]] = self.handler.concat(
tensors[node.output[0]] = ans.handler.concat(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
@ -454,7 +480,7 @@ class OnnxStub:
elif node.op_type == "Split":
for name, tensor in zip(
node.output,
self.handler.split(
ans.handler.split(
tensors[node.input[0]],
None,
next(
@ -466,14 +492,14 @@ class OnnxStub:
):
tensors[name] = tensor
elif node.op_type == "Gather":
tensors[node.output[0]] = self.handler.gather(
tensors[node.output[0]] = ans.handler.gather(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "ReduceMean":
tensors[node.output[0]] = self.handler.reduce_mean(
tensors[node.output[0]] = ans.handler.reduce_mean(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors[node.input[1]] if len(node.input) > 1 else None,
@ -481,7 +507,7 @@ class OnnxStub:
!= 0,
)
elif node.op_type == "Slice":
tensors[node.output[0]] = self.handler.slice(
tensors[node.output[0]] = ans.handler.slice(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -490,7 +516,7 @@ class OnnxStub:
_parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
)
elif node.op_type == "Pad":
tensors[node.output[0]] = self.handler.pad(
tensors[node.output[0]] = ans.handler.pad(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -499,7 +525,7 @@ class OnnxStub:
elif node.op_type == "Dropout":
for name, tensor in zip(
node.output,
self.handler.dropout(
ans.handler.dropout(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors.get(node.output[1]) if len(node.output) > 1 else None,
@ -512,18 +538,35 @@ class OnnxStub:
),
):
tensors[name] = tensor
elif node.op_type == "MemBound":
attributes = _parse_attribute(node, {"expr": None})
expr: str = attributes["expr"]
assert expr is not None
assert (
len(node.output) == 1
), """MemBound with multiple
outputs requires rewrite the logic of tensor creation"""
outputs = ans.handler.memBound(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
expr,
)
for name, tensor in zip(node.output, outputs):
tensors[name] = tensor
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
self.handler.data_malloc()
# FIXME: do not load data for speed
return ans
ans.handler.data_malloc()
for name, obj in tensors.items():
tensor = data.get(name)
if tensor == None:
if any(input.name == name for input in model.graph.input):
self.inputs[name] = obj
ans.inputs[name] = obj
else:
self.initializer[obj.fuid()] = tensor
ans.initializer[obj.fuid()] = tensor
if tensor.data_type == TensorProto.INT32:
obj.copyin_int32(_parse_data(tensor))
elif tensor.data_type == TensorProto.INT64:
@ -533,8 +576,19 @@ class OnnxStub:
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
for output in model.graph.output:
self.outputs[output.name] = tensors[output.name]
return ans
@classmethod
def from_graph(cls, g: backend.Graph):
ans = OnnxStub()
handler = backend.GraphHandler(g)
for i, tensor in enumerate(handler.inputs()):
ans.inputs["input{}".format(i)] = tensor
for i, tensor in enumerate(handler.outputs()):
ans.inputs["output{}".format(i)] = tensor
ans.handler = handler
ans.disable_check = True
return ans
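A minimal usage sketch for the two constructors above, mirroring the updated unit test; "model.onnx" and opt_g (an in-memory backend.Graph) are placeholders:
import onnx, backend
stub = OnnxStub.from_onnx(onnx.load("model.onnx"), backend.cpu_runtime())
model_proto = stub.to_onnx("new")
stub2 = OnnxStub.from_graph(opt_g)           # disable_check=True here, so the
model_proto2 = stub2.to_onnx("optimized")    # ONNX checker passes are skipped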
def to_onnx(self, name: str) -> ModelProto:
class Context:
@ -552,6 +606,13 @@ class OnnxStub:
outputs: List[ValueInfoProto] = []
# saves global input tensors
initializers: List[TensorProto] = []
# saves value info of local (intermediate) tensors
value_info: List[ValueInfoProto] = []
enable_check = False
def __init__(self, enable_check):
self.enable_check = enable_check
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
@ -562,12 +623,15 @@ class OnnxStub:
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
if not tensor.has_target(): # if this output is a global output
self.outputs.append(value_info)
else: # if this output is a local output
self.value_info.append(value_info)
return name
def push_input(
@ -577,7 +641,15 @@ class OnnxStub:
# means that this input is a global input
if name is None:
self.count_in += 1
name = "input{}".format(self.count_in)
if tensor.getTensorType() == backend.TensorType.Input:
name = f"input{self.count_in}_{tensor.guid()}"
else:
name = f"weight{self.count_in}_{tensor.guid()}"
shape = tensor.shape()
data = np.random.randn(*shape)
self.initializers.append(
make_tensor(name, TensorProto.FLOAT, shape, data)
)
self.names[tensor] = name
if init != None:
init.name = name
@ -605,17 +677,25 @@ class OnnxStub:
return name
def push_node(self, node: NodeProto) -> None:
check_node(node)
if self.enable_check:
check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
self.nodes,
name,
self.inputs,
self.outputs,
self.initializers,
value_info=self.value_info,
)
check_graph(graph)
if self.enable_check:
check_graph(graph)
model = make_model(graph)
check_model(model)
if self.enable_check:
check_model(model)
return model
@ -625,7 +705,7 @@ class OnnxStub:
ops = self.handler.operators()  # all operators (nodes) in the graph
ctx = Context()
ctx = Context(not self.disable_check)
for op in ops:
ty, name = ctx.name_op(op)
@ -634,11 +714,11 @@ class OnnxStub:
for it in op.inputs()
]
outputs = [
ctx.push_output("{}_{}".format(name, i), it)
ctx.push_output(f"{name}_{i}_{it.guid()}", it)
for (i, it) in enumerate(op.outputs())
]
if ty == backend.OpType.Conv:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
if ty == backend.OpType.Conv or ty == backend.OpType.ConvNHWC:
ph, pw, sh, sw, dh, dw = backend.conv_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
@ -651,7 +731,7 @@ class OnnxStub:
group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1],
)
)
elif ty == backend.OpType.ConvTrans:
elif ty == backend.OpType.ConvTrans or ty == backend.OpType.ConvTransNHWC:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
@ -729,7 +809,8 @@ class OnnxStub:
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Flatten:
raise Exception("TODO")
ctx.push_node(make_node(ty.name, inputs,
outputs, axis=backend.flatten_axis_of(op)))
elif ty == backend.OpType.Transpose:
perm = backend.transpose_permute_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, perm=perm))
@ -744,7 +825,8 @@ class OnnxStub:
shape,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
ctx.push_node(make_node(ty.name, inputs,
outputs, name, allowzero=0))
elif ty == backend.OpType.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
@ -812,6 +894,62 @@ class OnnxStub:
ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Any:
kernel_name = backend.any_kernelName_of(op)
normal_op = kernel_name != 'Reduce3x3Offset_hint'
ctx.push_node(
make_node(
ty.name if normal_op else 'Reduce3x3OffsetPlugin',
inputs,
outputs,
name,
kernelName=kernel_name,
domain="nnet" if normal_op else None,
)
)
elif ty in [backend.OpType.ConvTransNHWC, backend.OpType.GBMM,
backend.OpType.G2BMM]:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
)
)
elif ty == backend.OpType.Conv2dReduce:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
)
)
elif ty == backend.OpType.Conv2dReduceTranspose:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
)
)
elif ty == backend.OpType.MemBound:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
expr=backend.membound_expr_of(op),
hash=str(backend.membound_hash_of(op)),
)
)
else:
raise Exception("Unsupported OpType", ty)
@ -828,7 +966,7 @@ class OnnxStub:
def from_onnx(model: ModelProto, runtime):
stub = OnnxStub(model, runtime)
stub = OnnxStub.from_onnx(model, runtime)
return stub.inputs, stub.outputs, stub.handler
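For callers that relied on the old constructor-style entry point, the module-level from_onnx helper keeps the same return shape; a brief sketch (the file name is a placeholder):
import onnx, backend
inputs, outputs, handler = from_onnx(onnx.load("model.onnx"), backend.cpu_runtime())
# inputs/outputs map ONNX tensor names to backend.Tensor objects; handler is the GraphHandler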
@ -889,3 +1027,9 @@ def _parse_data(tensor: TensorProto) -> List[Any]:
def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
def save_onnx(opt_g, filename: str):
stub = OnnxStub.from_graph(opt_g)
with open(filename, "wb") as f:
f.write(stub.to_onnx("optimized").SerializeToString())

View File

@ -0,0 +1,18 @@
import subprocess
import re
import os
from .onnx import save_onnx
def get_trt_time(g):
onnx_filename = '/tmp/tmp.onnx'
save_onnx(g, onnx_filename)
plugin_path = os.environ['TRT_PLUGIN']
# LD_LIBRARY_PATH=$TRT_PLUGIN:$LD_LIBRARY_PATH trtexec --noTF32 --onnx=/home/zly/InfiniTensor_merge/build/opt_resnet.bs16.onnx --plugins=$TRT_PLUGIN/libnvinfer_plugin.so.8.2.0
res = subprocess.run(
f'trtexec --noTF32 --onnx={onnx_filename} --plugins={plugin_path}/libnvinfer_plugin.so.8.2.0'.split(' '), capture_output=True)
p = re.compile('GPU Compute Time.*mean = ([0-9.]+) ms')
output = res.stdout.decode('utf-8')
# err = res.stderr.decode('utf-8')
# print(output, '\n'*5, err)
return float(p.search(output).group(1))
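A hedged usage sketch for the helper above: it assumes trtexec is on PATH, that the TRT_PLUGIN environment variable points at a directory containing libnvinfer_plugin.so.8.2.0, and that opt_g is an optimized backend.Graph (all placeholders here):
import os
os.environ.setdefault('TRT_PLUGIN', '/opt/tensorrt/plugins')  # assumed plugin directory
mean_ms = get_trt_time(opt_g)
print(f'TensorRT mean GPU compute time: {mean_ms:.3f} ms')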

View File

@ -40,9 +40,9 @@ class TestStringMethods(unittest.TestCase):
file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
)
)
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
"new"
)
model = OnnxStub.from_onnx(
onnx.load(model_file), backend.cpu_runtime()
).to_onnx("new")
model = infer_shapes(model)
def test_tensor(self):
@ -304,16 +304,16 @@ class TestStringMethods(unittest.TestCase):
def test_frontend(self):
handler = backend.GraphHandler(backend.cpu_runtime())
a = handler.tensor([1, 2, 3], 12)
b = handler.tensor([1, 2, 3], 12)
c = handler.tensor([1, 2, 3], 12)
d = handler.tensor([1, 2, 3], 12)
e = handler.tensor([1, 2, 3], 12)
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
c = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
d = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
e = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
x = handler.add(
handler.add(handler.add(handler.add(a, b, None), c, None), d, None), e, None
)
y = handler.tensor([3, 2, 1], 12)
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
handler.reshape(x, y, [3, 2, 1])

View File

@ -1,106 +1,131 @@
import re
import os
import sys
import json
from contextlib import redirect_stdout
import time
import logging
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
import os
import json
import logging
USE_CACHE = True
logging.basicConfig()
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
tvm_code, func_name, nnet_expression: str,
nnet_simplified_expression: str, hash_code=None):
nnet_simplified_expression: str, hash_code: str = None):
assert len(input_tensors) == len(input_dtypes)
logging.debug(f'Work on hash {hash_code}')
logger.debug(f'Work on hash {hash_code}')
dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
if not os.path.exists(dir_name):
os.makedirs(dir_name)
so_fn = os.path.join(dir_name, f"{func_name}.so")
config_fn = os.path.join(dir_name, "config_so.json")
print("Generating Ansor op: ")
print(tvm_code)
print("Input shape: ")
print(input_tensors)
print("Output shape: ")
print(output_tensor)
desc_fn = os.path.join(dir_name, "desc.txt")
log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
out_fn = os.path.join(dir_name, "out.txt")
logger.debug(f"Generating Ansor op: {tvm_code}")
logger.debug(f"Input shape: {input_tensors}")
logger.debug(f"Output shape: {output_tensor}")
if USE_CACHE and hash_code is not None:
if os.path.exists(dir_name) and \
os.path.exists(so_fn) and \
os.path.exists(config_fn):
os.path.exists(so_fn) and \
os.path.exists(config_fn):
print(f"Use cache in {dir_name}")
with open(config_fn, "r") as config_fin:
config = json.loads(config_fin.read().strip())
conv_time = config["conv_time"]
logger.debug(f'Find tuning log for {hash_code}')
logger.info(f'Find tuning log for {hash_code} in {so_fn}')
return so_fn, conv_time
logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
time_start = time.perf_counter()
# Print descriptions of the task
if USE_CACHE and hash_code is not None:
with redirect_stdout(open(desc_fn, "w")):
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====TVM compute====")
print(tvm_code+"\n")
print("Input shape: ", input_tensors)
print("Output shape: ", output_tensor)
@auto_scheduler.register_workload(func_name)
def compute():
_locals = locals()
exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
exec(tvm_code, {'tvm': tvm, 'te': te,
'tir': tir, 'topi': topi}, _locals)
return _locals['ret']
target = tvm.target.Target("cuda")
task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
with redirect_stdout(open(out_fn, 'w')):
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
log_file = f"ansor_{func_name}_log.json"
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
verbose=2,
)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
verbose=2,
)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_fn)
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
time_end = time.perf_counter()
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====Time====")
print(conv_time)
if USE_CACHE and hash_code is not None:
with open(config_fn, "w") as config_fout:
config_fout.write(json.dumps({
"conv_time": conv_time,
"tuning_time": time_end - time_start,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
}, ensure_ascii=False, indent=2))
return so_fn, conv_time
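When USE_CACHE is enabled and a hash_code is supplied, the tuned kernel and its metadata land in .cache/generated_kernels/<hash_code>/. A small sketch of reading the cached record back (hash_code is whatever string was passed to gen_ansor_so):
import json, os
cache_dir = os.path.join('.cache', 'generated_kernels', str(hash_code))
with open(os.path.join(cache_dir, 'config_so.json')) as f:
    cfg = json.load(f)
print(cfg['conv_time'], cfg['tuning_time'], cfg['timestamp'])  # kernel time (ms), tuning time (s), UTC timestamp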
# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.
def pipe_gen(fd: int):
args = json.load(sys.stdin) # read from pipe
# print(args, f'fd={fd}')
ret = gen_ansor_so(**args)
with os.fdopen(fd, 'w') as f:
print(ret[0], file=f, end='') # write to pipe
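pipe_gen defines a small JSON-over-pipe protocol: the parent writes the keyword arguments of gen_ansor_so as one JSON object to this process's stdin, passes a writable file descriptor, and reads back the path of the generated .so. A sketch of the request and reply shapes (all field values are placeholders; the real caller is the C++ runtime):
# Request written to stdin (keys match gen_ansor_so's parameters):
request = {
    "input_tensors": [[16, 32, 32]], "input_dtypes": ["float32"],
    "output_tensor": [16, 32, 32], "output_dtype": "float32",
    "tvm_code": "...",                       # TVM compute definition (placeholder)
    "func_name": "membound_kernel",          # placeholder
    "nnet_expression": "...", "nnet_simplified_expression": "...",
    "hash_code": "0",
}
# Reply written to the passed fd: the generated shared-library path, e.g.
# ".cache/generated_kernels/0/membound_kernel.so"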

View File

@ -0,0 +1,7 @@
import backend
from backend import *
import sys
sys.path.extend(__path__)
print("import backend: {}".format(backend))

View File

@ -0,0 +1,941 @@
import backend
import onnx
from onnx import (
ModelProto,
TensorProto,
NodeProto,
AttributeProto,
TensorShapeProto,
ValueInfoProto,
)
from onnx.helper import (
make_node,
make_tensor_value_info,
make_tensor,
make_graph,
make_model,
)
from onnx.checker import (
check_graph,
check_model,
check_node,
check_value_info,
check_tensor,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
class OnnxStub:
"""
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
# inputs: Dict[str, backend.Tensor] = {}
# outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
# handler: backend.GraphHandler
# def __init__(self, model: ModelProto, runtime):
# model = infer_shapes(model)
# self.handler = backend.GraphHandler(runtime)
# tensors: Dict[str, backend.Tensor] = dict()
# data: Dict[str, TensorProto] = dict()
# for input in model.graph.input:
# dims = _take_shape_dim(input.type.tensor_type.shape)
# tensors[input.name] = self.handler.tensor(
# dims, input.type.tensor_type.elem_type
# )
# for output in model.graph.output:
# dims = _take_shape_dim(output.type.tensor_type.shape)
# tensors[output.name] = self.handler.tensor(
# dims, output.type.tensor_type.elem_type
# )
# for initializer in model.graph.initializer:
# dims = [d for d in initializer.dims]
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
# data[initializer.name] = initializer
# for node in model.graph.node:
# if node.op_type == "Conv":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (d, p, s) = (
# attributes[name] for name in ["dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors[node.input[0]], None, p, [-2, -1]
# )
# p = [0, 0, 0, 0]
# else:
# adapt = node.input[0]
# if len(node.input) > 2:
# bias = "{}-bias".format(node.output[0])
# reshape = "{}-reshape".format(node.output[0])
# tensors[bias] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# None,
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# tensors[reshape] = self.handler.reshape(
# tensors[node.input[2]],
# None,
# [
# 1,
# reduce(
# lambda acc, x: acc * x,
# _search_shape(model, node.input[2]),
# ),
# 1,
# 1,
# ],
# )
# tensors[node.output[0]] = self.handler.add(
# tensors[bias],
# tensors[reshape],
# tensors.get(node.output[0]),
# )
# else:
# tensors[node.output[0]] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# elif node.op_type == "ConvTranspose":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0],
# "strides": [1, 1],
# "output_padding": [0, 0],
# },
# )
# (d, p, s, op) = (
# attributes[name]
# for name in ["dilations", "pads", "strides", "output_padding"]
# )
# tensors[node.output[0]] = self.handler.convTransposed2d(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# op[0],
# op[1],
# )
# elif node.op_type == "MatMul":
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# False,
# False,
# None,
# backend.ActType.Linear,
# )
# elif node.op_type == "Gemm":
# attributes = _parse_attribute(
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
# )
# (alpha, beta, transA, transB) = (
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
# )
# # FIXME unsupport attributes: `alpha` `beta`
# assert alpha == 1.0
# assert beta == 1.0
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# transA == 1,
# transB == 1,
# tensors[node.input[2]] if len(node.input) > 2 else None,
# backend.ActType.Linear,
# )
# elif node.op_type == "BatchNormalization":
# (input, mean, var, scale, bias) = (
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
# )
# output = tensors.get(node.output[0])
# attributes = _parse_attribute(
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
# )
# (momentum, eps, training) = (
# attributes[name]
# for name in ["momentum", "epsilon", "training_mode"]
# )
# tensors[node.output[0]] = self.handler.batchNorm(
# input, output, mean, var, scale, bias, momentum, eps, training != 0
# )
# elif node.op_type == "MaxPool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, d, p, s) = (
# attributes[name]
# for name in ["kernel_shape", "dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "AveragePool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, p, s) = (
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "GlobalAveragePool":
# [_, _, h, w] = _search_shape(model, node.input[0])
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# h,
# w,
# 1,
# 1,
# 0,
# 0,
# 1,
# 1,
# )
# elif node.op_type == "Add":
# tensors[node.output[0]] = self.handler.add(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sub":
# tensors[node.output[0]] = self.handler.sub(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Mul":
# tensors[node.output[0]] = self.handler.mul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Div":
# tensors[node.output[0]] = self.handler.div(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Pow":
# tensors[node.output[0]] = self.handler.pow(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Relu":
# tensors[node.output[0]] = self.handler.relu(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sigmoid":
# tensors[node.output[0]] = self.handler.sigmoid(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Tanh":
# tensors[node.output[0]] = self.handler.tanh(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Softmax":
# tensors[node.output[0]] = self.handler.softmax(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Abs":
# tensors[node.output[0]] = self.handler.abs(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Shape":
# tensors[node.output[0]] = self.handler.shape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Identity":
# tensors[node.output[0]] = self.handler.identity(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Flatten":
# # FIXME axis must be 1
# axis = next(
# (attr.i for attr in node.attribute if attr.name == "axis"), None
# )
# assert axis == None or axis == 1
# tensors[node.output[0]] = self.handler.flatten(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "PRelu":
# tensors[node.output[0]] = self.handler.pRelu(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Clip":
# tensors[node.output[0]] = self.handler.clip(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(_parse_data(data[node.input[1]]).__iter__(), None)
# if len(node.input) > 1
# else None,
# next(_parse_data(data[node.input[2]]).__iter__(), None)
# if len(node.input) > 2
# else None,
# )
# elif node.op_type == "Transpose":
# perm = next(
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
# )
# tensors[node.output[0]] = self.handler.transpose(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# perm,
# )
# elif node.op_type == "Reshape":
# dims = _search_shape(model, node.input[0])
# size = reduce(lambda acc, x: acc * x, dims)
# input_shape = _parse_data(data[node.input[1]])
# for i, x in enumerate(input_shape):
# if x == 0:
# input_shape[i] = dims[i]
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
# if temp < 0:
# input_shape[input_shape.index(-1)] = size // -temp
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Squeeze":
# input_shape = _search_shape(model, node.input[0])
# axes = set(
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# assert all(input_shape[d] == 1 for d in axes)
# output_shape = []
# for i, x in enumerate(input_shape):
# if i not in axes:
# output_shape.append(x)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# output_shape,
# )
# elif node.op_type == "Unsqueeze":
# input_shape = _search_shape(model, node.input[0])
# axes = (
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# for i in axes:
# input_shape.insert(i, 1)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Concat":
# tensors[node.output[0]] = self.handler.concat(
# [tensors[name] for name in node.input],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "Split":
# for name, tensor in zip(
# node.output,
# self.handler.split(
# tensors[node.input[0]],
# None,
# next(
# (attr.i for attr in node.attribute if attr.name == "axis"),
# 0,
# ),
# len(node.output),
# ),
# ):
# tensors[name] = tensor
# elif node.op_type == "Gather":
# tensors[node.output[0]] = self.handler.gather(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "ReduceMean":
# tensors[node.output[0]] = self.handler.reduce_mean(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(
# (attr.ints for attr in node.attribute if attr.name == "axes"),
# None,
# ),
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
# != 0,
# )
# elif node.op_type == "Slice":
# tensors[node.output[0]] = self.handler.slice(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[2]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
# )
# elif node.op_type == "Pad":
# tensors[node.output[0]] = self.handler.pad(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# )
# elif node.op_type == "Dropout":
# for name, tensor in zip(
# node.output,
# self.handler.dropout(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
# _parse_data(data[node.input[1]])[0]
# if len(node.input) > 1
# else 0.5,
# _parse_data(data[node.input[2]])[0]
# if len(node.input) > 2
# else False,
# ),
# ):
# tensors[name] = tensor
# else:
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
# self.handler.data_malloc()
# for name, obj in tensors.items():
# tensor = data.get(name)
# if tensor == None:
# if any(input.name == name for input in model.graph.input):
# self.inputs[name] = obj
# else:
# self.initializer[obj.fuid()] = tensor
# if tensor.data_type == TensorProto.INT32:
# obj.copyin_int32(_parse_data(tensor))
# elif tensor.data_type == TensorProto.INT64:
# obj.copyin_int64(_parse_data(tensor))
# elif tensor.data_type == TensorProto.FLOAT:
# obj.copyin_float(_parse_data(tensor))
# else:
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
# for output in model.graph.output:
# self.outputs[output.name] = tensors[output.name]
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
class Context:
# saves object names, including tensors and operators
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
# counts the occurrence times of each operator for naming
count_op: Dict[backend.OpType, int] = dict()
# counts input and output tensors for naming
count_in, count_out = 0, 0
# saves nodes (operators)
nodes: List[NodeProto] = []
# saves global input tensors
inputs: List[ValueInfoProto] = []
# saves global output tensors
outputs: List[ValueInfoProto] = []
# saves global input tensors
initializers: List[TensorProto] = []
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
name = "{}_{}".format(ty.name, op.guid())
self.names[op] = name
self.count_op[ty] = self.count_op.get(ty, 0) + 1
return ty, name
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.outputs.append(value_info)
return name
def push_input(
self, tensor: backend.Tensor, init: Optional[TensorProto]
) -> str:
name = self.names.get(tensor)
# means that this input is a global input
if name is None:
self.count_in += 1
name = "input_{}".format(tensor.guid())
self.names[tensor] = name
if init != None:
init.name = name
self.initializers.append(init)
else:
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.inputs.append(value_info)
return name
def push_data_input(
self,
node_name: str,
attr_name: str,
elem_type: int,
shape: Sequence[int],
vals: Any,
) -> str:
name = "{}_{}".format(node_name, attr_name)
tensor = make_tensor(name, elem_type, shape, vals)
check_tensor(tensor)
self.initializers.append(tensor)
return name
def push_node(self, node: NodeProto) -> None:
# check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
)
# check_graph(graph)
model = make_model(graph)
# check_model(model)
return model
# Topological sort
if not g.topo_sort():
raise Exception("Sorting fails")
ops = g.operators()  # all operators (nodes) in the graph
ctx = Context()
for op in ops:
ty, name = ctx.name_op(op)
inputs = [
ctx.push_input(it, self.initializer.get(it.fuid()))
for it in op.inputs()
]
outputs = [
ctx.push_output("{}_{}_{}".format(
name, i, tensor.guid()), tensor)
for (i, tensor) in enumerate(op.outputs())
]
if ty == backend.OpType.Conv:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
pads=[ph, pw, ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
group=op.inputs()[0].shape()[
1] // op.inputs()[1].shape()[1],
)
)
elif ty == backend.OpType.ConvTrans:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
pads=[ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
output_padding=[oph, opw],
)
)
elif ty == backend.OpType.ConvTransNHWC:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.MemBound:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"Membound",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.Matmul:
# transA, transB = backend.matmul_attrs_of(op)
# HACK: recover this
transA, transB = False, False
ctx.push_node(
make_node(
"Gemm", inputs, outputs, name, transA=transA, transB=transB
)
)
elif ty == backend.OpType.BatchNorm:
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
momentum, eps, training = backend.batch_norm_attrs_of(op)
ctx.push_node(
make_node(
"BatchNormalization",
inputs,
outputs,
name,
epsilon=eps,
momentum=momentum,
training_mode=training,
)
)
elif ty == backend.OpType.MaxPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
dilations=[dh, dw],
strides=[sh, sw],
)
)
elif ty == backend.OpType.AvgPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
"AveragePool",
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
strides=[sh, sw],
)
)
elif ty in [
backend.OpType.Add,
backend.OpType.Sub,
backend.OpType.Mul,
backend.OpType.Div,
backend.OpType.Pow,
backend.OpType.Relu,
backend.OpType.Sigmoid,
backend.OpType.Tanh,
backend.OpType.Softmax,
backend.OpType.Abs,
backend.OpType.Identity,
backend.OpType.PRelu,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Flatten:
raise Exception("TODO")
elif ty == backend.OpType.Transpose:
perm = backend.transpose_permute_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, perm=perm))
elif ty == backend.OpType.Reshape:
shape = backend.reshape_shape_of(op)
inputs.append(
ctx.push_data_input(
name,
"shape",
TensorProto.INT64,
[len(shape)],
shape,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.Split:
axis = backend.split_axis_of(op)
num_outputs = len(outputs)
split = op.inputs()[0].shape()[axis] // num_outputs
inputs.append(
ctx.push_data_input(
name,
"split",
TensorProto.INT64,
[len(outputs)],
[split for _ in range(0, num_outputs)],
)
)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
axis=axis,
)
)
elif ty == backend.OpType.Gather:
axis = backend.gather_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.ReduceMean:
axes, keepdims = backend.reduce_mean_attrs_of(op)
inputs.append(
ctx.push_data_input(
name, "axes", TensorProto.INT64, [len(axes)], axes
)
)
ctx.push_node(
make_node(ty.name, inputs, outputs,
name, keepdims=keepdims)
)
elif ty == backend.OpType.Slice:
raise Exception("TODO")
elif ty == backend.OpType.Pad:
pads = backend.pad_pads_of(op)
inputs.append(
ctx.push_data_input(
name, "pads", TensorProto.INT64, [len(pads)], pads
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
# elif ty == backend.OpType.Clip:
# min, max = backend.clip_attrs_of(op)
# if min != None:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
# )
# if max != None:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
# )
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
else:
raise Exception("Unsupported OpType", ty)
model = ctx.build(name)
onnx.save(model, path)
return model
# def init(self) -> None:
# self.handler.data_malloc()
# def optimize(self) -> None:
# self.handler.optimize()
# def run(self) -> None:
# self.handler.run()
# def from_onnx(model: ModelProto, runtime):
# stub = OnnxStub(model, runtime)
# return stub.inputs, stub.outputs, stub.handler
# def _search_shape(model: ModelProto, name: str) -> List[int]:
# ans = (
# next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.value_info
# if tensor.name == name
# ),
# None,
# )
# or next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.input
# if tensor.name == name
# ),
# None,
# )
# or next(
# [int(d) for d in tensor.dims]
# for tensor in model.graph.initializer
# if tensor.name == name
# )
# )
# return ans
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
# for attr in node.attribute:
# if attr.name in attrs:
# if attr.type == AttributeProto.INT:
# attrs[attr.name] = attr.i
# elif attr.type == AttributeProto.INTS:
# attrs[attr.name] = attr.ints
# elif attr.type == AttributeProto.FLOAT:
# attrs[attr.name] = attr.f
# elif attr.type == AttributeProto.STRING:
# attrs[attr.name] = attr.s
# elif attr.type == AttributeProto.TENSOR:
# attrs[attr.name] = attr.t
# else:
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
# return attrs
# def _parse_data(tensor: TensorProto) -> List[Any]:
# return to_array(tensor).flatten().tolist()
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
def export_onnx(g: backend.Graph, path: str) -> None:
stub = OnnxStub()
stub.to_onnx(g, path)
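A brief usage sketch for this file's export helper (opt_g and the output path are placeholders); note that the checker calls in this variant of to_onnx are commented out, so no ONNX validation happens on export:
export_onnx(opt_g, "/tmp/exported.onnx")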

View File

@ -29,6 +29,7 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
perfEngine.setPerfData(perfKey, record);
} else
record = perfData;
std::cout << 5 << std::endl;
double t = record->time;
totalTime += t;

View File

@ -125,12 +125,30 @@ void GraphObj::optimize() {
void GraphObj::dataMalloc() {
for (auto &tensor : tensors) {
tensor->dataMalloc();
if (tensor->getSource() && tensor->getTargets().size() > 0 &&
tensor->getSource()->getOpType() == OpType::Reshape) {
continue;
} else
tensor->dataMalloc();
}
// Fill reshape output for avoiding nullptr
for (auto &tensor : tensors) {
if (tensor->getSource() &&
tensor->getSource()->getOpType() == OpType::Reshape) {
tensor->setData(tensor->getSource()->getInputs(0)->getDataBlob());
}
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
void GraphObj::dataFree() {
for (auto &tensor : tensors) {
tensor->freeData();
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}
Tensor GraphObj::addTensor(const Tensor &tensor) {
@ -228,4 +246,14 @@ bool GraphObj::checkValid() const {
return true;
}
int GraphObj::removeIndependentTensors() {
TensorVec newTensors;
for (const auto &t : tensors)
if (!t->getTargets().empty() || t->getSource())
newTensors.emplace_back(t);
auto ret = tensors.size() - newTensors.size();
tensors = newTensors;
return ret;
}
} // namespace infini

View File

@ -1,10 +1,12 @@
#include "core/graph_handler.h"
#include "nnet/Visitor/Serializer.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -19,8 +21,8 @@ namespace infini {
static DataType dtype_repr_convert(int);
Tensor GraphHandlerObj::tensor(Shape dims, int dtype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype));
Tensor GraphHandlerObj::tensor(Shape dims, int dtype, TensorType ttype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype), ttype);
}
Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,
@ -55,6 +57,39 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
}
}
Tensor GraphHandlerObj::convNHWC(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh,
int dw) {
if (output) {
g->addOpWithOutputs<ConvNHWCObj>(std::move(input), std::move(weight),
output, ph, pw, sh, sw, dh, dw);
return output;
} else {
return g
->addOp<ConvNHWCObj>(std::move(input), std::move(weight), output,
ph, pw, sh, sw, dh, dw)
->getOutput();
}
}
Tensor GraphHandlerObj::convTransposed2dNHWC(Tensor input, Tensor weight,
Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw,
int oph, int opw) {
if (output) {
g->addOpWithOutputs<ConvTransposed2dNHWCObj>(
std::move(input), std::move(weight), output, ph, pw, sh, sw, dh, dw,
oph, opw);
return output;
} else {
return g
->addOp<ConvTransposed2dNHWCObj>(std::move(input),
std::move(weight), output, ph, pw,
sh, sw, dh, dw, oph, opw)
->getOutput();
}
}
Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act) {
if (y) {
@ -291,6 +326,22 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
}
}
TensorVec GraphHandlerObj::memBound(const TensorVec &inputs,
const Tensor &output,
const string &jsonString) {
const auto &[expr, nnetInputs, execTime, hint] =
nnet::Serializer().membundOpFromString(jsonString);
if (output) {
g->addOpWithOutputs<MemBoundObj>(std::move(inputs), TensorVec{output},
nnetInputs, expr, execTime, hint);
return {output};
} else
return g
->addOp<MemBoundObj>(std::move(inputs), TensorVec{nullptr},
nnetInputs, expr, execTime, hint)
->getOutputs();
}
static DataType dtype_repr_convert(int dtype) {
switch ((OnnxDType)dtype) {
case OnnxDType::FLOAT:
@ -314,4 +365,12 @@ static DataType dtype_repr_convert(int dtype) {
}
}
Graph GraphHandlerObj::getGraph() const {
int nRemoved = g->removeIndependentTensors();
if (nRemoved > 0)
std::cout << "Removed " << nRemoved << " independent tensors"
<< std::endl;
return g;
}
} // namespace infini

View File

@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"
namespace infini {
@ -25,7 +26,8 @@ bool OperatorObj::isConcatOp() const { return type == OpType::Concat; }
bool OperatorObj::isComputeOp() const {
return type == OpType::Conv || type == OpType::Matmul ||
type == OpType::ConvTrans || type == OpType::ConvTransNHWC ||
type == OpType::G2BMM || type == OpType::GBMM;
type == OpType::G2BMM || type == OpType::GBMM ||
type == OpType::ConvNHWC;
}
bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }
@ -33,8 +35,12 @@ bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }
bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }
bool OperatorObj::isMemBoundOp() const {
return type == OpType::MemBound || type == OpType::Activation ||
type == OpType::Transpose;
if (type == OpType::Any)
return true; // TODO: check operator attributes
return type == OpType::MemBound || type == OpType::Reshape ||
type == OpType::Activation || type == OpType::Transpose ||
type == OpType::Relu || type == OpType::Tanh ||
type == OpType::Softmax;
}
void OperatorObj::removePredecessors(const Operator &op) {
@ -83,22 +89,31 @@ HashType OperatorObj::hash() const {
bool OperatorObj::checkValid(GraphObj *graph) {
auto optShapes = inferShape();
IT_ASSERT(optShapes);
if (!optShapes) // shape inference failed
return false;
const vector<Shape> &shapes = *optShapes;
IT_ASSERT(shapes.size() == outputs.size());
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
outputs[i] =
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
}
} else { // if outputs have been created, check their shapes
for (size_t i = 0; i < shapes.size(); ++i) {
if (shapes[i] != outputs[i]->getDims())
IT_ASSERT(shapes[i] == outputs[i]->getDims(),
(vecToString(shapes[i]) +
" != " + vecToString(outputs[i]->getDims())));
if (shapes[i] != outputs[i]->getDims()) {
dbg(shapes[i], outputs[i]->getDims());
return false;
}
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
}
}
return true;

View File

@ -2,9 +2,15 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>
#ifdef USE_CUDA
#include "cuda_profiler_api.h"
#endif
namespace infini {
void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
if (!tune && profiling)
@ -52,17 +58,40 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
opCnt[op->getOpType()]++;
}
}
if (profiling)
printProfilingData(totalTime, opTime, opCnt);
// if (profiling)
// printProfilingData(totalTime, opTime, opCnt);
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
map<UidBaseType, bool> ctcMap; // compile-time computable
// Skip static computation
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
bool compileTimeComputable = true;
for (auto input : op->getInputs()) {
// FIXME: propagate the tensor type. Currently only the operators
// immediately after weights are compile-time computable.
if (input->getTensorType() != TensorType::Initialized)
compileTimeComputable = false;
}
ctcMap[op->getGuid()] = compileTimeComputable;
}
return ctcMap;
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
std::map<OpType, int> opCnt, opNonCtcCnt;
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@ -70,11 +99,19 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
PerfRecord record;
// Tune the kernel if there is no record
if (!perfData) {
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Use the existing performance record
time = perfData->time;
} else {
// TODO: should tensors automatically allocate when accessing data?
// allocate memory for empty tensors and release it after profiling
// allocate memory for empty tensors and release it after
// profiling
TensorVec allocatedTensors;
for (auto t : op->getInputs())
if (!t->hasData())
@ -88,37 +125,47 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
}
// Profile operators and record the results
record = kernel->tune(op, this);
PerfRecord record = kernel->tune(op, this);
time = record->time;
perfEngine.setPerfData(perfKey, record);
// Free allocated memory
for (auto t : allocatedTensors)
t->freeData();
} else
record = perfData;
}
double t = record->time;
totalTime += t;
// FIXME: ignore transpose when necessary
// op->getOpType() != OpType::Transpose &&
// op->getOpType() != OpType::ReduceMean
if (op->getOpType() != OpType::Reshape)
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", t);
opTime[op->getOpType()] += t;
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
if (!ctcMap[op->getGuid()])
opNonCtcCnt[op->getOpType()]++;
else
opNonCtcCnt[op->getOpType()]; // Create a new entry
}
}
if (profiling)
printProfilingData(totalTime, opTime, opCnt);
printProfilingData(totalTime, opTime, opCnt, opNonCtcCnt);
return totalTime;
}
void RuntimeObj::printProfilingData(double totalTime,
const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt) const {
printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
void RuntimeObj::printProfilingData(
double totalTime, const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt,
const std::map<OpType, int> &opNonCtcCnt) const {
printf("%11s %3s %5s %7s %7s %7s\n", "Op", "Cnt", "#NCtc", "T_tot",
"Percent", "T_mean");
for (const auto &[type, t] : opTime) {
printf("%11s %3d %7.3f %7.1f %7.3f\n",
OpRegistry::getOpName(type).data(), opCnt.at(type), t,
t / totalTime * 100, t / opCnt.at(type));
printf("%11s %3d %5d %7.3f %7.1f %7.3f\n",
OpRegistry::getOpName(type).data(), opCnt.at(type),
opNonCtcCnt.at(type), t, t / totalTime * 100,
t / opCnt.at(type));
}
}
@ -160,4 +207,44 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
},
[&]() { this->sync(); }, warmup, repeat);
return ret;
}
} // namespace infini

View File

@ -1,6 +1,9 @@
#include "core/search_engine.h"
#include "core/hash.h"
#include "core/runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/dbg.h"
#include "operators/reshape.h"
#include <algorithm>
#include <iostream>
@ -8,7 +11,17 @@
namespace infini {
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
using MetaGraph = SearchEngine::MetaGraph;
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
: runtimeExec(runtime), mutator(mutator) {
// Compare graph with estimated time
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
};
}
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
auto &node = metaGraph->nodes[i];
std::cout << "id: " << i << std::endl;
@ -32,8 +45,7 @@ Graph SearchEngine::run(const Graph graph) {
IT_ASSERT(runtimeExec == graph->getRuntime());
std::cout << "[INFO] original graph: " << std::endl;
std::cout << graph->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
<< std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
std::vector<Graph> partitions = partitionGraph(graph);
@ -45,7 +57,6 @@ Graph SearchEngine::run(const Graph graph) {
std::vector<Graph> candidates = search(subGraph);
std::cout << "[INFO] size: " << candidates.size() << std::endl;
IT_ASSERT(candidates.size() > 0);
std::cout << subGraph->toString() << std::endl;
std::vector<Graph> nextGraphs;
for (auto lastGraph : bestGraphs) {
for (auto thisGraph : candidates) {
@ -61,13 +72,12 @@ Graph SearchEngine::run(const Graph graph) {
}
}
auto tmp = make_ref<GraphObj>(runtimeExec, ops);
tmp->dataMalloc();
nextGraphs.emplace_back(tmp);
}
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -81,10 +91,30 @@ Graph SearchEngine::run(const Graph graph) {
for (size_t i = 0; i < bestGraphs.size(); i++) {
std::cout << "bestGraph " << i << ":" << std::endl;
std::cout << bestGraphs[i]->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
<< std::endl;
}
// Fuse vertically and sort according to performance
for (size_t i = 0; i < bestGraphs.size(); ++i) {
bestGraphs[i] = fuseVertically(bestGraphs[i]);
}
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
// Check optimized graphs are legal
for (auto g : bestGraphs) {
g->checkValid();
IT_ASSERT(graph->getInputs().size() == g->getInputs().size(),
graph->toString() + string("\n") + g->toString());
IT_ASSERT(graph->getOutputs().size() == g->getOutputs().size(),
graph->toString() + string("\n") + g->toString());
}
std::cout << "[INFO] best fused graph: " << std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
<< std::endl;
std::cout << bestGraphs[0] << std::endl;
return bestGraphs[0];
}
@ -102,9 +132,9 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
}
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
}); // compare with perf time
// compare with perf time
dbg("===Num" + std::to_string(results.size()));
std::sort(results.begin(), results.end(), graphTimeComparer);
if (results.size() > GRAPH_SIZE) {
results.resize(GRAPH_SIZE);
}
@ -112,9 +142,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
// Build metagraph with a graph, each operator is a node.
std::shared_ptr<SearchEngine::MetaGraph>
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = std::make_shared<MetaGraph>();
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = make_ref<MetaGraphObj>();
int numOps = graph->getOperators().size();
std::vector<int> cnt(numOps, 0);
@ -123,7 +152,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
std::vector<int> q(0);
for (size_t i = 0; i < graph->getOperators().size(); i++) {
auto &op = graph->getOperators()[i];
MetaGraph::Node node;
MetaGraphObj::Node node;
std::vector<Operator> ops;
ops.emplace_back(op);
node.graph = make_ref<GraphObj>(runtimeExec, ops);
@ -157,9 +186,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
// Build a metagraph with graph and a plan, a plan is which ops should be a
// node.
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
const std::vector<int> &plan) {
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan) {
int numGroups = 0;
for (auto i : plan) {
if (i > numGroups) {
@ -172,12 +200,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
groups[plan[i]].emplace_back(i);
}
auto resultMetaGraph = make_ref<MetaGraph>();
auto resultMetaGraph = make_ref<MetaGraphObj>();
for (auto &group : groups) {
std::vector<Operator> ops;
std::unordered_set<int> preSet, sucSet;
for (auto id : group) {
MetaGraph::Node node;
MetaGraphObj::Node node;
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
ops.emplace_back(op);
}
@ -204,9 +232,10 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
}
// Search how to merge multiple ops.
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
IT_ASSERT(metaGraph != nullptr);
// HACK: disable multiple op search
return {metaGraph};
std::vector<int> plan(metaGraph->nodes.size());
for (size_t i = 0; i < plan.size(); i++) {
plan[i] = i;
@ -222,7 +251,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::unordered_set<HashType> planSet;
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
vector<MetaGraph> metaGraphs;
for (auto &curPlan : plans) {
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
}
@ -230,8 +259,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
}
// DFS impl for search merge.
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan,
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet) {
@ -320,14 +348,40 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
}
// Search mutation for each compute op.
std::vector<Graph> SearchEngine::searchMutation(
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
std::vector<Graph> graphs = {nullptr};
// Append a node to all existing candidates
for (auto &node : metaGraph->nodes) {
std::vector<Graph> nextGraphs;
if (node.type == 1) { // If it has computing OPs
auto mutatedGraphs = mutator->run(node.graph);
if (mutator->hasTunedKernel)
chooseBestMutation = false;
std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
graphTimeComparer);
if (mutatedGraphs.size() >= 10)
mutatedGraphs.resize(10);
mutatedGraphs = {mutatedGraphs[0]};
// if (searchFilter == 1) {
// std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
// graphTimeComparer);
// if (mutatedGraphs.size() >= 10)
// mutatedGraphs.resize(10);
// mutatedGraphs = {mutatedGraphs[0]};
// } else if (chooseBestMutation && mutatedGraphs.size() >= 2) {
// std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
// graphTimeComparer);
// if (mutatedGraphs.size() >= 10)
// mutatedGraphs.resize(10);
// mutatedGraphs = {mutatedGraphs[0]};
// } else { // avoid repeated kernel generation
// if (mutatedGraphs.size() >= 2) // INFOGAN
// mutatedGraphs = {mutatedGraphs[1]};
// // if (mutatedGraphs.size() > 2) {
// // mutatedGraphs.resize(2);
// // }
// }
for (auto graph : graphs) {
for (auto mutatedGraph : mutatedGraphs) {
std::vector<Operator> ops;
@ -357,12 +411,8 @@ std::vector<Graph> SearchEngine::searchMutation(
nextGraphs.emplace_back(make_ref<GraphObj>(runtimeExec, ops));
}
}
for (auto g : nextGraphs) {
g->dataMalloc();
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -372,7 +422,7 @@ std::vector<Graph> SearchEngine::searchMutation(
}
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
return mutationEngine->isMultiBranchMergable(graph);
return mutator->isMultiBranchMergable(graph);
}
// Split a graph into multiple independent graphs. Search engine will search for
@ -423,7 +473,6 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
std::cout << op->toString() << std::endl;
}
auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
tmp->dataMalloc();
partitions.emplace_back(tmp);
headOps.clear();
}
@ -431,11 +480,100 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
}
if (!headOps.empty()) {
auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
tmp->dataMalloc();
partitions.emplace_back(tmp);
}
std::reverse(partitions.begin(), partitions.end());
return partitions;
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
// dbg(graph);
// // hkz
// callback::exportONNX(graph, "a.onnx");
return runtimeExec->getPerfTime(graph, false, true, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {
std::unordered_map<UidBaseType, int> visitTime;
std::vector<Operator> ops;
graph->topo_sort();
int cnt = 0;
for (auto op : graph->getOperators()) {
// Skip visited OP
if (visitTime.find(op->getGuid()) != visitTime.end()) {
continue;
}
// Skip compute OP and multi-input/output OP
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
op->getSuccessors().size() != 1)) {
visitTime.emplace(op->getGuid(), ++cnt);
ops.emplace_back(op);
continue;
}
// FIXME: fuse and modify attributes of computing operators
if (op->getOpType() == OpType::Relu ||
op->getOpType() == OpType::PRelu) {
if (auto p = op->getInputs()[0])
if (auto sop = p->getSource())
if (sop->getOpType() == OpType::Conv ||
sop->getOpType() == OpType::Matmul) {
visitTime.emplace(op->getGuid(), ++cnt);
ops.emplace_back(make_ref<ReshapeObj>(
nullptr, op->getInputs()[0], op->getOutputs()[0]));
continue;
}
}
vector<Operator> chainOps;
visitTime.emplace(op->getGuid(), ++cnt);
vector<Operator> tmp;
auto cur = op;
while (cur->getPredecessors().size() == 1 &&
cur->getPredecessors()[0]->isMemBoundOp()) {
cur = cur->getPredecessors()[0];
if (visitTime.count(cur->getGuid()))
break;
tmp.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
for (int i = tmp.size() - 1; i >= 0; i--) {
chainOps.emplace_back(tmp[i]);
}
chainOps.emplace_back(op);
cur = op;
while (cur->getSuccessors().size() == 1 &&
cur->getSuccessors()[0]->isMemBoundOp()) {
cur = cur->getSuccessors()[0];
if (visitTime.count(cur->getGuid()))
break;
chainOps.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
make_ref<GraphObj>(runtimeExec, chainOps)->print();
auto bestGraph = make_ref<GraphObj>(runtimeExec, chainOps);
// Eliminate transpose and reshape operators
if (auto eliminatedGraph = mutator->eliminateVertically(
make_ref<GraphObj>(runtimeExec, chainOps)))
bestGraph = eliminatedGraph;
// Fuse membound operators
if (auto optGraph = mutator->fuseVertically(bestGraph))
bestGraph = optGraph;
for (auto op : bestGraph->getOperators()) {
ops.emplace_back(op);
}
}
if (ops.empty()) {
IT_TODO_HALT();
IT_ASSERT(graph->getOutputs().size() == 1);
IT_ASSERT(graph->getInputs().size() == 1);
// auto g = make_ref<GraphObj>(runtime);
// TODO: add identity
ops.emplace_back(make_ref<ReshapeObj>(nullptr, graph->getInputs()[0],
graph->getOutputs()[0]));
}
return make_ref<GraphObj>(runtimeExec, ops);
}
} // namespace infini
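
Editorial note: graphTimeComparer, used by the std::sort calls above, is defined outside the shown hunks. A minimal sketch of such a comparer, assuming it simply ranks graphs by estimated execution time (illustrative only, not the committed definition):

// Illustrative sketch; the actual comparer is defined elsewhere in this file.
auto graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
    return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
};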


@ -8,12 +8,14 @@
namespace infini {
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
: std::accumulate(shape.begin(), shape.end(), 1lu,
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}
string TensorObj::toString() const {
// Convert data pointer to string
@ -24,8 +26,8 @@ string TensorObj::toString() const {
ss << "nullptr data";
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString() + ", " + runtime->toString() +
", " + ss.str() + "\n";
", dtype " + dtype.toString() + ", tensorType " +
std::to_string(enum_to_underlying(tensorType));
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());
@ -34,6 +36,7 @@ string TensorObj::toString() const {
else
ret += ", source None";
ret += ", targets " + vecToString(targetGuids);
ret += ", " + runtime->toString() + ", " + ss.str();
return ret;
}
@ -64,12 +67,19 @@ vector<size_t> TensorObj::getStride() const {
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
void *ptr = nullptr;
Blob buffer;
if (!runtime->isCpu()) { // copy data to main memory
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
getRawDataPtr<void *>(), getBytes());
ptr = buffer->getPtr<void *>();
} else
ptr = data->getPtr<float *>();
#define TRY_PRINT(N) \
if (dtype == DataType(N)) \
std::cout << dataToString<DT<N>::t>() << std::endl;
std::cout << dataToString<DT<N>::t>(ptr) << std::endl;
TRY_PRINT(0) // fmt: new line
else TRY_PRINT(1) //
@ -112,8 +122,9 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
}
void TensorObj::dataMalloc() {
if (!data)
if (!data) {
data = runtime->allocBlob(getBytes());
}
}
void TensorObj::copyData(const TensorObj *src) {
@ -172,4 +183,27 @@ size_t TensorObj::getOffsetByBroadcastOffset(size_t bcOffset,
return getOffsetByPos(pos, shape);
}
Tensor TensorObj::clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
obj->targets.clear();
obj->source.reset();
return obj;
}
Tensor TensorObj::clone(Runtime runtime) const {
auto obj = make_ref<TensorObj>(*this);
obj->runtime = runtime;
obj->freeData();
obj->targets.clear();
obj->source.reset();
// FIXME
// if (hasData()) {
// obj->dataMalloc();
// obj->copyData(this);
// }
return obj;
}
}; // namespace infini
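
Editorial usage sketch: per the FIXME above, cross-runtime clone() copies only metadata, so the payload must be allocated (and copied) explicitly on the target runtime. cudaRuntime below is an assumed, already-constructed Runtime handle:

Runtime cpu = NativeCpuRuntimeObj::getInstance();
Tensor t = make_ref<TensorObj>(Shape{2, 3}, DataType::Float32, cpu, TensorType::Other);
t->dataMalloc();                        // allocate on the source runtime
Tensor tOnCuda = t->clone(cudaRuntime); // same shape and dtype, no data yet
tOnCuda->dataMalloc();                  // allocate on the target runtime explicitly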


@ -2,10 +2,58 @@
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "cuda_profiler_api.h"
#include "nnet/dbg.h"
#include "operators/any.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef INFINI_USE_TVM
#include "tvm/runtime/device_api.h"
#endif
namespace infini {
CudaRuntimeObj::CudaRuntimeObj()
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
cudaGraphStatus(false) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
checkCudnnError(cudnnSetStream(cudnn, stream));
checkCublasError(cublasSetStream(cublas, stream));
workspaceSize = 2ll << 30; // 2 GB
workspace = alloc(workspaceSize);
// Get CUDA device properties
checkCudaError(cudaGetDeviceProperties(&deviceProperties, 0));
}
CudaRuntimeObj::~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
enum cudaStreamCaptureStatus pCaptureStatus;
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
IT_ASSERT(pCaptureStatus == cudaStreamCaptureStatusNone);
cudaGraphStatus = true;
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}
tuple<cudaGraphExec_t, size_t> CudaRuntimeObj::endCudaGraphStreamCapture() {
cudaGraph_t cudaGraph;
cudaGraphExec_t instance;
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
cudaGraphStatus = false;
size_t numCudaGraphNodes;
checkCudaError(cudaGraphGetNodes(cudaGraph, nullptr, &numCudaGraphNodes));
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
return {instance, numCudaGraphNodes};
}
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -75,4 +123,74 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
bool isFakeOp = (as<AnyObj>(op) &&
as<AnyObj>(op)->getKernelName() == string("FakeOp"));
if (as<AnyObj>(op))
dbg(op, as<AnyObj>(op)->getKernelName() == string("FakeOp"));
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape &&
op->getOpType() != OpType::Flatten && !isFakeOp)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
vector<std::function<void(void)>> funcs;
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
funcs.push_back([&]() { kernel->compute(op, perfData, this); });
else
funcs.push_back([&]() { kernel->compute(op, this); });
}
return timeWithCudaGraph(funcs, rounds);
}
double
CudaRuntimeObj::timeWithCudaGraph(std::vector<std::function<void(void)>> funcs,
int rounds) {
// TODO: move this to kernel source?
// Init tvm stream
#ifdef INFINI_USE_TVM
DLDevice tvm_device_id = {kDLCUDA, 0};
auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
tvm_device->SetStream(tvm_device_id, getStream());
#endif
beginCudaGraphStreamCapture();
for (auto &f : funcs)
f();
auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
// Since one TVM packed function may contain more than one CUDA kernel, the
// number of captured kernels may exceed the number of operators.
IT_ASSERT(numCudaGraphNodes >= funcs.size(),
std::to_string(numCudaGraphNodes) +
" != " + std::to_string(funcs.size()));
return timeit(
[&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
},
[&, stream = getStream()]() { cudaStreamSynchronize(stream); },
std::min(50, rounds), rounds);
}
void CudaRuntimeObj::setEnableTF32(bool state) { enableTF32 = state; }
} // namespace infini
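
Editorial usage sketch: callers wrap each launch in a std::function so the runtime can capture and replay them as a single CUDA graph; the conv tuner further below uses the same pattern. launchMyKernel is a hypothetical launcher:

auto cuda = make_ref<CudaRuntimeObj>();
std::vector<std::function<void(void)>> funcs;
funcs.push_back([&]() { launchMyKernel(cuda->getStream()); }); // hypothetical kernel launch
double msPerRun = cuda->timeWithCudaGraph(funcs, /*rounds=*/100);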

src/ffi/ffi_callback.cc (new file)

@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>
namespace py = pybind11;
namespace infini {
namespace callback {
using namespace py::literals;
static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
static auto exportONNXImpl =
py::module_::import("pyinfinitensor.onnx").attr("save_onnx");
exportONNXImpl(graph, path);
}
} // namespace callback
} // namespace infini
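
Editorial usage sketch: the callback requires a running Python interpreter because it dispatches to pyinfinitensor.onnx.save_onnx; the commented-out call in SearchEngine::getEstimatedGraphPerf uses it the same way. bestGraph and the output path below are assumptions:

infini::callback::exportONNX(bestGraph, "best_graph.onnx");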


@ -1,9 +1,15 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "nnet/test_models.h"
#include "operators/any.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -63,6 +69,8 @@ void export_values(py::module &m) {
.VALUE(OpType, Conv)
.VALUE(OpType, Matmul)
.VALUE(OpType, ConvTrans)
.VALUE(OpType, ConvTransNHWC)
.VALUE(OpType, ConvNHWC)
.VALUE(OpType, G2BMM)
.VALUE(OpType, GBMM)
.VALUE(OpType, Pad)
@ -94,8 +102,16 @@ void export_values(py::module &m) {
.VALUE(OpType, Abs)
.VALUE(OpType, Resize)
.VALUE(OpType, Dropout)
.VALUE(OpType, Conv2dReduce)
.VALUE(OpType, Conv2dReduceTranspose)
.VALUE(OpType, MemBound)
.VALUE(OpType, Any)
.export_values();
py::enum_<TensorType>(m, "TensorType")
.VALUE(TensorType, Input)
.VALUE(TensorType, Initialized)
.VALUE(TensorType, Other);
#undef VALUE
}
@ -132,19 +148,34 @@ static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
#endif
static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Conv);
auto conv = dynamic_cast<const ConvObj *>(op.get());
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
conv->getDw(), conv->getSh(), conv->getSw());
IT_ASSERT(op->getOpType() == OpType::Conv ||
op->getOpType() == OpType::ConvNHWC);
auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getSh(),
conv->getSw(), conv->getDh(), conv->getDw());
}
static std::tuple<int, int, int, int, int, int, int, int>
conv_trans_attrs_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::ConvTrans);
auto conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
auto [oph, opw] = conv->getOutputPadding();
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
conv->getDw(), conv->getSh(), conv->getSw(), oph,
IT_ASSERT(op->getOpType() == OpType::ConvTrans ||
op->getOpType() == OpType::ConvTransNHWC);
auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
int oph, opw;
if (op->getOpType() == OpType::ConvTrans) {
auto _conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
auto output_pad = _conv->getOutputPadding();
oph = output_pad.first;
opw = output_pad.second;
} else {
auto _conv = dynamic_cast<const ConvTransposed2dNHWCObj *>(op.get());
auto output_pad = _conv->getOutputPadding();
oph = output_pad.first;
opw = output_pad.second;
}
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getSh(),
conv->getSw(), conv->getDh(), conv->getDw(), oph,
opw);
}
@ -210,6 +241,11 @@ static vector<int64_t> reshape_shape_of(Operator op) {
return ans;
}
static int flatten_axis_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Flatten);
return as<FlattenObj>(op)->getAxis();
}
static vector<int64_t> pad_pads_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Pad);
auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
@ -219,11 +255,20 @@ static vector<int64_t> pad_pads_of(Operator op) {
return ans;
}
static string any_kernelName_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Any);
return as<AnyObj>(op)->getKernelName();
}
static vector<int> transpose_permute_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Transpose);
return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
}
static string membound_expr_of(Operator op) {
return as<MemBoundObj>(op)->toJson();
}
void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
@ -248,29 +293,45 @@ void export_functions(py::module &m) {
.FUNCTION(reduce_mean_attrs_of)
.FUNCTION(tensor_dtype)
.FUNCTION(reshape_shape_of)
.FUNCTION(flatten_axis_of)
.FUNCTION(pad_pads_of)
.FUNCTION(transpose_permute_of)
.FUNCTION(concat_axis_of)
.FUNCTION(split_axis_of)
.FUNCTION(gather_axis_of);
.FUNCTION(gather_axis_of)
.FUNCTION(membound_expr_of)
.FUNCTION(any_kernelName_of)
.def("membound_hash_of",
[](Operator op) { return as<MemBoundObj>(op)->getHash(); });
#undef FUNCTION
}
void init_graph_builder(py::module &m) {
using Handler = GraphHandlerObj;
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
py::class_<Object, Ref<Object>>(m, "_Object")
.def("__str__", &Object::toString)
.def("guid", &Object::getGuid);
py::class_<RuntimeObj, Ref<RuntimeObj>>(m, "Runtime")
.def("run", &RuntimeObj::run, "graph"_a, "tune"_a = false,
"profiling"_a = false)
.def("getPerfTime", &RuntimeObj::getPerfTime, "graph"_a, "profiling"_a,
"allowEstimation"_a, "ignoreMemboundOp"_a)
.def("timeNonCtcOperators", &RuntimeObj::timeNonCtcOperators);
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
RuntimeObj>(m, "CpuRuntime");
#ifdef USE_CUDA
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
m, "CudaRuntime");
py::class_<CudaRuntimeObj, Ref<CudaRuntimeObj>, RuntimeObj>(m,
"CudaRuntime")
.def("timeWithCudaGraph",
py::overload_cast<Graph, int>(&CudaRuntimeObj::timeWithCudaGraph))
.def("setEnableTF32", &CudaRuntimeObj::setEnableTF32);
#endif
#ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
@ -281,8 +342,10 @@ void init_graph_builder(py::module &m) {
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("print_data", &TensorObj::printData)
.def("data_malloc", &TensorObj::dataMalloc)
.def("getTensorType", &TensorObj::getTensorType);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
policy::reference)
@ -291,9 +354,16 @@ void init_graph_builder(py::module &m) {
policy::reference);
py::class_<Handler>(m, "GraphHandler")
.def(py::init<Runtime>())
.def("tensor", &Handler::tensor, policy::move)
.def(py::init<Graph>())
.def("inputs", &Handler::inputs, policy::move)
.def("outputs", &Handler::outputs, policy::move)
.def("tensor", &Handler::tensor, policy::move, "shape"_a, "dtype"_a = 1,
"tensor_type"_a = TensorType::Other)
.def("conv", &Handler::conv, policy::move)
.def("convTransposed2d", &Handler::convTransposed2d, policy::move)
.def("convNHWC", &Handler::convNHWC, policy::move)
.def("convtransposed2dNHWC", &Handler::convTransposed2dNHWC,
policy::move)
.def("matmul", &Handler::matmul, policy::move)
.def("batchNorm", &Handler::batchNorm, policy::move)
.def("maxPool", &Handler::maxPool, policy::move)
@ -321,11 +391,50 @@ void init_graph_builder(py::module &m) {
.def("reduce_mean", &Handler::reduceMean, policy::move)
.def("slice", &Handler::slice, policy::move)
.def("pad", &Handler::pad, policy::move)
.def("memBound", &Handler::memBound, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, policy::automatic)
.def("run", &Handler::run, policy::automatic);
.def("run", &Handler::run, policy::automatic)
.def("getGraph", &Handler::getGraph);
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
py::enum_<NMutator::Mode>(m, "NMutatorMode")
.value("Normal", NMutator::Mode::Normal)
.value("RuleBased", NMutator::Mode::RuleBased);
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
.def(py::init<NMutator::Mode>())
.def(py::init<NMutator::Mode, vector<int>>())
.def("run", &NMutator::run);
py::class_<SearchEngine>(m, "SearchEngine")
.def(py::init<Runtime, Ref<Mutator>>())
.def("run", &SearchEngine::run);
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
.def("tensors", &GraphObj::getTensors)
.def("operators", &GraphObj::getOperators)
.def("inputs", &GraphObj::getInputs)
.def("outputs", &GraphObj::getOutputs)
.def("print", &GraphObj::print)
.def("topo_sort", &GraphObj::topo_sort);
}
void export_test_model(py::module &m) {
#ifdef USE_CUDA
m.def("runInfoGAN", &runInfoGAN)
.def("getGANGraph", &getGANGraph)
.def("getFSRCNNGraph", &getFSRCNNGraph)
.def("getLongformer", &getLongformer)
.def("getConvtransposedNHWC", &getConvtransposedNHWC)
.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
"tuning"_a = false, "mode"_a = NMutator::Mode::Normal,
"rules"_a = vector<int>{})
.def("initializeGraphTensors", &initializeGraphTensors, "g"_a,
"l"_a = -0.1, "r"_a = 0.1, "useInt"_a = false)
.def("convertNCHWtoNHWCModel", &convertNCHWtoNHWCModel)
.def("optimizeWithDepthConstraint", &optimizeWithDepthConstraint)
.def("optimizeModel", &optimizeModel)
.def("optimizeModelWithRules", &optimizeModelWithRules);
#endif
}
} // namespace infini
@ -335,4 +444,5 @@ PYBIND11_MODULE(backend, m) {
infini::export_values(m);
infini::export_functions(m);
infini::init_graph_builder(m);
infini::export_test_model(m);
}


@ -33,7 +33,7 @@ class G2BMMCudnn : public CudaKernelWithoutConfig {
auto record =
make_ref<PerfRecordObj>(std::numeric_limits<double>::max());
const auto [warmupRounds, timingRounds] =
op->getB() > 100 ? tuple{1, 3} : tuple{5, 15};
op->getB() > 100 ? tuple{1, 1} : tuple{1, 2};
double tmp =
timeit([&]() { g2bmmKernel(op, context); },
[&]() { context->sync(); }, warmupRounds, timingRounds);


@ -34,7 +34,7 @@ class GBMMCudnn : public CudaKernelWithoutConfig {
auto record =
make_ref<PerfRecordObj>(std::numeric_limits<double>::max());
const auto [warmupRounds, timingRounds] =
op->getB() > 100 ? tuple{1, 3} : tuple{5, 15};
op->getB() > 100 ? tuple{1, 1} : tuple{1, 3};
double tmp =
timeit([&]() { gbmmKernel(op, context); },
[&]() { context->sync(); }, warmupRounds, timingRounds);

src/kernels/cuda/any.cc (new file)

@ -0,0 +1,88 @@
#include "operators/any.h"
#include "cuda/cuda_any.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class AnyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AnyObj>(_op);
auto inputs = op->getInputs();
auto outputs = op->getOutputs();
vector<float *> inputsRawPtr;
for (auto &input : inputs) {
inputsRawPtr.emplace_back(input->getRawDataPtr<float *>());
}
vector<float *> outputsRawPtr;
for (auto &output : outputs) {
outputsRawPtr.emplace_back(output->getRawDataPtr<float *>());
}
any_kernel_mapping(inputsRawPtr, outputsRawPtr, op->getKernelName(),
op->getOpAttrVector());
}
};
void any_kernel_mapping(vector<float *> inputs, vector<float *> outputs,
const string &kernelName, const vector<int> &attr) {
if (kernelName == "conv2dreduce_kernel") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
conv2dreduce_kernel(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
outputs[0], attr[0] != 0, attr[1], attr[2], attr[3],
attr[4], attr[5], attr[6], attr[7], attr[8],
attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "reduceConvRxSToNCHW") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
// float *input, float *bias, float *output, int act,
// int n, int h, int w, int f, int r, int s,
// int oh, int ow, int ph, int pw, int sh, int
// sw, int dh, int dw
reduceConvRxSToNCHW(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
outputs[0], attr[0], attr[1], attr[2], attr[3],
attr[4], attr[5], attr[6], attr[7], attr[8],
attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "convTranspose2dreduce_kernel") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
convTranspose2dreduce_kernel(
inputs[0], inputs.size() > 1 ? inputs[1] : nullptr, outputs[0],
attr[0] != 0, attr[1], attr[2], attr[3], attr[4], attr[5], attr[6],
attr[7], attr[8], attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "conv5x5ToConv3x3Reduce") {
IT_ASSERT(attr.size() == 4);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
conv5x5ToConv3x3Reduce(attr[0], attr[1], attr[2], attr[3], inputs[0],
outputs[0],
inputs.size() > 1 ? inputs[1] : nullptr);
} else if (kernelName == "conv3x3ToReduce") {
IT_ASSERT(attr.size() == 4);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
conv3x3ToReduce(attr[0], attr[1], attr[2], attr[3], inputs[0],
outputs[0], inputs.size() > 1 ? inputs[1] : nullptr);
} else if (kernelName == "FakeOp" || kernelName == "Reduce3x3Offset_hint") {
} else {
std::cout << "Unimplemented AnyOp cuda kernel: " << kernelName
<< std::endl;
IT_TODO_HALT();
}
}
REGISTER_KERNEL(Device::CUDA, OpType::Any, DataType::Float32, AnyCuda,
"Any_CUDA_Float32");
} // namespace infini
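
Editorial usage sketch: for the conv2dreduce_kernel entry, the 15 attributes are consumed in the order {act, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw}, matching the dispatch above. dIn and dOut are assumed device pointers:

std::vector<float *> ins = {dIn}, outs = {dOut};
infini::any_kernel_mapping(ins, outs, "conv2dreduce_kernel",
                           {/*act*/ 0, /*n*/ 1, /*h*/ 32, /*w*/ 32, /*f*/ 64,
                            /*r*/ 3, /*s*/ 3, /*oh*/ 32, /*ow*/ 32, /*ph*/ 1,
                            /*pw*/ 1, /*sh*/ 1, /*sw*/ 1, /*dh*/ 1, /*dw*/ 1});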


@ -24,4 +24,4 @@ class ClipCuda : public CudaKernelWithoutConfig {
REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
"Clip_CUDA_Float32");
}; // namespace infini
} // namespace infini


@ -1,9 +1,7 @@
#include "core/common.h"
#include "core/constants.h"
#include "cuda/cuda_common.h"
#include <math.h>
using infini::E_CONSTANT;
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
@ -29,4 +27,4 @@ void clip_kernel(float *input, float *output, int num, float minValue,
maxValue);
}
}; // namespace infini
} // namespace infini


@ -1,4 +1,5 @@
#include "operators/conv.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include <chrono>
@ -52,7 +53,7 @@ class convCudnn : public Kernel {
cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
cudnnTensorDescriptor_t>
createCuDNNDescriptor(const Ref<ConvObj> &op,
createCuDNNDescriptor(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
@ -68,15 +69,23 @@ class convCudnn : public Kernel {
int channelsPerGrp = cpg, channels = c;
// set input format
cudnnTensorFormat_t tensorFormat = (op->getOpType() == OpType::ConvNHWC)
? CUDNN_TENSOR_NHWC
: CUDNN_TENSOR_NCHW;
// get inputs
cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
inDesc, tensorFormat, CUDNN_DATA_FLOAT, n, channels, h, w));
// get kernels
cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
// FIXME: the filter data layout does not follow the input data layout,
// since FCRS shows better performance for NHWC inputs in some cases.
// This should be tunable.
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, f,
channelsPerGrp, r, s));
@ -84,7 +93,7 @@ class convCudnn : public Kernel {
cudnnTensorDescriptor_t biasDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
biasDesc, tensorFormat, CUDNN_DATA_FLOAT, 1, f, 1, 1));
// get convolution descriptor
cudnnConvolutionDescriptor_t convDesc;
@ -125,18 +134,25 @@ class convCudnn : public Kernel {
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outn, outc,
outh, outw));
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
checkCudnnError(cudnnSetTensor4dDescriptor(
outDesc, tensorFormat, CUDNN_DATA_FLOAT, outn, outc, outh, outw));
if (op->getOpType() == OpType::ConvNHWC) {
IT_ASSERT((vector{outn, outh, outw, outc}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
} else {
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
}
return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc);
}
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
bool cuDNNUnfused(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;
@ -219,12 +235,14 @@ class convCudnn : public Kernel {
const RuntimeObj *_context) const override {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op);
auto context = const_cast<CudaRuntimeObj *>(
dynamic_cast<const CudaRuntimeObj *>(_context));
auto op = as<ConvBaseObj>(_op);
int try_algo = op->getOpType() == OpType::ConvNHWC ? 2 : N_ALGO;
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) {
for (int algo = 0; algo < try_algo; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef;
record.mode = mode;
@ -251,16 +269,15 @@ class convCudnn : public Kernel {
record.workspaceSize, &beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
record.time = timeit(
[&]() {
cudnnConvolutionForward(context->cudnnHandle(), &alpha,
inDesc, inData, knDesc, knData,
convDesc, ALGOS[record.algo],
wsData, record.workspaceSize,
&beta, outDesc, outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
// Time the kernel with CUDA Graph to get a precise time
std::function<void(void)> func = [&]() {
cudnnConvolutionForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData);
};
record.time = context->timeWithCudaGraph({func}, 100);
// printf("mode:%d algo:%d :%.4lf\n", mode, algo, record.time);
// Update the tune result
if (ret.time > record.time)
@ -283,7 +300,7 @@ class convCudnn : public Kernel {
void compute(const Operator &_op, const PerfRecord &_record,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto op = as<ConvBaseObj>(_op);
auto record = as<ConvCuDnnPerfRecordObj>(_record);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
bool success = cuDNNUnfused(op, record, context);
@ -294,5 +311,8 @@ class convCudnn : public Kernel {
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
"Conv_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::ConvNHWC, DataType::Float32, convCudnn,
"ConvNHWC_cuDNN_CUDA_Float32");
REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json);
} // namespace infini


@ -0,0 +1,44 @@
#include "operators/conv2dreduce.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class Conv2dReduceCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op, const RuntimeObj *_context) const {
auto op = as<Conv2dReduceBase>(_op);
float *const input = (op->getInputs(0)->getRawDataPtr<float *>());
float *const bias =
op->getBias() ? (op->getBias()->getRawDataPtr<float *>()) : nullptr;
float *const output = (op->getOutput()->getRawDataPtr<float *>());
auto dim = op->getInputs(0)->getDims();
int n = dim[0], h = dim[1], w = dim[2], f = dim[3], r = dim[4],
s = dim[5];
int dh = op->getDh(), dw = op->getDw();
int sh = op->getSh(), sw = op->getSw();
int ph = op->getPh(), pw = op->getPw();
auto odim = op->getOutput()->getDims();
int oh = odim[1], ow = odim[2];
bool PReLU = op->getPReLU();
// float paramReLU = op->getParamReLU();
auto opType = op->getOpType();
if (opType == OpType::Conv2dReduce) {
conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
oh, ow, ph, pw, sh, sw, dh, dw);
} else {
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduce, DataType::Float32,
Conv2dReduceCuda, "Conv2dReduce_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduceTranspose, DataType::Float32,
Conv2dReduceCuda, "Conv2dReduceTranspose_CUDA_Float32");
} // namespace infini


@ -0,0 +1,239 @@
#include "cuda/cuda_common.h"
#include "nnet/dbg.h"
using dtype = float;
__global__ void conv2dreduce_kernel_(float *__restrict__ input,
float *__restrict__ bias,
float *__restrict__ output,
const bool PReLU, const int n, const int f,
const int h, const int w, const int oh,
const int ow, const int r, const int s,
const int ph, const int pw, const int dh,
const int dw, const int sh, const int sw) {
// output shape: (n, oh, ow, f)
// input shape: (n, h, w, f, r, s)
int nid = blockIdx.x, fid = blockIdx.y;
int hid = threadIdx.x, wid = threadIdx.y;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = h * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid * sh - ph;
int iwst = wid * sw - pw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + ri * dh;
int iwid = iwst + si * dw;
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
__global__ void convTranspose2dreduce_kernel2_(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw) {
int warp_id = (blockDim.x / 32) * blockIdx.x + threadIdx.x / 32;
int lane = threadIdx.x % 32;
int nid = warp_id / (f * oh * ow);
int fid = (warp_id - nid * (f * oh * ow)) / (oh * ow);
int hid = (warp_id - nid * (f * oh * ow) - fid * (oh * ow)) / ow;
int wid = warp_id % ow;
if (hid >= oh || wid >= ow || nid > n || fid > f)
return;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = h * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
// View the transposed conv as a forward conv: effective padding and input extent
int tph = r - ph - 1, tpw = s - pw - 1;
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
float imm = 0.0;
int ihst = hid - tph;
int iwst = wid - tpw;
for (int idx = lane; idx < r * s; idx += 32) {
int ri = idx / s;
int si = idx % s;
int ihid = ihst + r - ri - 1;
int iwid = iwst + s - si - 1;
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
(ihid % sh == 0) && (iwid % sw == 0)) {
imm += *(nfinput + (ihid / sh) * hchunk + (iwid / sw) * wchunk +
ri * s + si);
}
}
for (int k = 16; k > 0; k >>= 1) {
imm += __shfl_down_sync(0xffffffff, imm, k); // sum
}
if (lane == 0) {
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
__global__ void convTranspose2dreduce_kernel_(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw, const int block_x_num, const int block_y_num) {
// assert dh = dw = 1
int nid = blockIdx.x / block_x_num, fid = blockIdx.y / block_y_num;
int hid = (blockIdx.x % block_x_num) * blockDim.x + threadIdx.x,
wid = (blockIdx.y % block_y_num) * blockDim.y + threadIdx.y;
if (hid >= oh || wid >= ow)
return;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = h * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
// View the transposed conv as a forward conv: effective padding and input extent
int tph = r - ph - 1, tpw = s - pw - 1;
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid - tph;
int iwst = wid - tpw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + r - ri - 1;
int iwid = iwst + s - si - 1;
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
(ihid % sh == 0) && (iwid % sw == 0)) {
imm += *(nfinput + (ihid / sh) * hchunk +
(iwid / sw) * wchunk + ri * s + si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
// nhwrsc -> nhwc
__global__ void reduce_4x4(dtype *in, dtype *out, int act, const int N,
const int F, const int H, const int W, const int IH,
const int IW) {
// #define in_index(n, h, w, r, s, f) \
// ((((((n)*IH + h) * IW + w) * R + r) * S + s) * F + f)
#define in_index(n, h, w, f, r, s) \
((((((n)*IH + h) * IW + w) * F + f) * R + r) * S + s)
#define out_index(n, h, w, f) (((((n)*H) + (h)) * W + (w)) * F + (f))
const int R = 4, S = 4;
const int n_tasks = N * F * H * W;
int start = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = start; i < n_tasks; i += stride) {
int t = i, n, f, h, w;
f = t % F;
t /= F;
w = t % W;
t /= W;
h = t % H;
t /= H;
n = t;
// unroll this 2-iter loop
float sum = 0;
int x, y;
for (int r = (h + 1) & 1; r < R; r += 2) {
x = (h + 1 - r) / 2;
if (x >= 0 && x < IH) {
for (int s = (w + 1) & 1; s < S; s += 2) {
y = (w + 1 - s) / 2;
if (y >= 0 && y < IW) {
sum += in[in_index(n, x, y, f, r, s)];
// if (i==0)
// printf("TTT nhwf= %d,%d,%d,%d x=%d y=%d, v=%f,
// index=%d, rsf %d %d %d\n", n, h, w,
// f, x, y, in[in_index(n, x, y, r, s, f)],
// in_index(n, x, y, r, s, f), r,s,f);
}
}
}
}
if (act == 0) {
out[out_index(n, h, w, f)] = sum;
} else if (act == 1) { // Relu
out[out_index(n, h, w, f)] = sum > 0 ? sum : 0;
} else if (act == 2) {
out[out_index(n, h, w, f)] = tanhf(sum);
}
}
#undef in_index
#undef out_index
}
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
conv2dreduce_kernel_<<<grid, block, 0>>>(input, bias, output, PReLU, n, f,
h, w, oh, ow, r, s, ph, pw, dh, dw,
sh, sw);
}
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
// puts("convTranspose2dreduce_kernel is executed");
if (r == 4 && s == 4 && sh == 2 && sw == 2) {
const int M = r * s * f, N = n * h * w;
reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
ow, h, w);
} else {
// puts("why use this conv2dreduce");
// block.x = 32;
// block.y = 32;
// int block_x_num = (oh + block.x - 1) / block.x;
// int block_y_num = (ow + block.y - 1) / block.y;
// grid.x = n * (block_x_num);
// grid.y = f * (block_y_num);
// convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
// input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
// dh, dw, sh, sw, block_x_num, block_y_num);
block.x = 128;
block.y = 1;
grid.x = (n * f * ow * oh + block.x / 32 - 1) / (block.x / 32);
grid.y = 1;
convTranspose2dreduce_kernel2_<<<grid, block, 0>>>(
input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
dh, dw, sh, sw);
}
}
} // namespace infini
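
Editorial reference sketch (host side, illustrative): the computation the kernels above implement is a windowed sum over an (n, h, w, f, r, s) intermediate into an (n, oh, ow, f) output, with optional bias and ReLU. A plain C++ version of the non-transposed case, mirroring the index math of conv2dreduce_kernel_:

void conv2dreduceRef(const float *in, const float *bias, float *out, bool relu,
                     int n, int h, int w, int f, int r, int s, int oh, int ow,
                     int ph, int pw, int sh, int sw, int dh, int dw) {
    for (int nn = 0; nn < n; ++nn)
        for (int ho = 0; ho < oh; ++ho)
            for (int wo = 0; wo < ow; ++wo)
                for (int ff = 0; ff < f; ++ff) {
                    float acc = 0.f;
                    for (int ri = 0; ri < r; ++ri)
                        for (int si = 0; si < s; ++si) {
                            int hi = ho * sh - ph + ri * dh;
                            int wi = wo * sw - pw + si * dw;
                            if (hi >= 0 && hi < h && wi >= 0 && wi < w)
                                acc += in[((((nn * h + hi) * w + wi) * f + ff) * r + ri) * s + si];
                        }
                    if (bias)
                        acc += bias[ff];
                    if (relu)
                        acc = acc > 0.f ? acc : 0.f;
                    out[((nn * oh + ho) * ow + wo) * f + ff] = acc;
                }
}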


@ -6,6 +6,8 @@ namespace infini {
struct MatmulCublasPerfRecordObj : public PerfRecordObj {
int algo = CUBLAS_GEMM_DEFAULT;
/// @brief 0 for cublasGemmStridedBatchedEx, 1 for cublasGemmEx
int apiId = 0;
void to_json(json &j) override {
j["type"] = 2;
j["data"] = std::make_pair(algo, time);
@ -19,8 +21,7 @@ struct MatmulCublasPerfRecordObj : public PerfRecordObj {
}
};
constexpr int N_ALGO = 24;
constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = {
const vector<cublasGemmAlgo_t> Algos = {
CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2,
CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5,
CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7, CUBLAS_GEMM_ALGO8,
@ -30,6 +31,17 @@ constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = {
CUBLAS_GEMM_ALGO18, CUBLAS_GEMM_ALGO19, CUBLAS_GEMM_ALGO20,
CUBLAS_GEMM_ALGO21, CUBLAS_GEMM_ALGO22, CUBLAS_GEMM_ALGO23,
};
const vector<cublasGemmAlgo_t> AlgosTensorOp = {
CUBLAS_GEMM_DFALT_TENSOR_OP, CUBLAS_GEMM_ALGO0_TENSOR_OP,
CUBLAS_GEMM_ALGO1_TENSOR_OP, CUBLAS_GEMM_ALGO2_TENSOR_OP,
CUBLAS_GEMM_ALGO3_TENSOR_OP, CUBLAS_GEMM_ALGO4_TENSOR_OP,
CUBLAS_GEMM_ALGO5_TENSOR_OP, CUBLAS_GEMM_ALGO6_TENSOR_OP,
CUBLAS_GEMM_ALGO7_TENSOR_OP, CUBLAS_GEMM_ALGO8_TENSOR_OP,
CUBLAS_GEMM_ALGO9_TENSOR_OP, CUBLAS_GEMM_ALGO10_TENSOR_OP,
CUBLAS_GEMM_ALGO11_TENSOR_OP, CUBLAS_GEMM_ALGO12_TENSOR_OP,
CUBLAS_GEMM_ALGO13_TENSOR_OP, CUBLAS_GEMM_ALGO14_TENSOR_OP,
CUBLAS_GEMM_ALGO15_TENSOR_OP};
class matmulCublas : public Kernel {
bool do_compute(const Operator &_op, const PerfRecord &_record,
const RuntimeObj *_context) const {
@ -47,9 +59,12 @@ class matmulCublas : public Kernel {
const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n,
ldc = n;
const float alpha = 1.f, beta = 0.f;
// TODO:use compute type
cublasStatus_t stat;
if (b > 1) {
// Set the compute type to TF32 if enabled
cublasComputeType_t computeType = context->getEnableTF32()
? CUBLAS_COMPUTE_32F_FAST_TF32
: CUBLAS_COMPUTE_32F;
if (record->apiId == 0) {
// Support batch broadcast with zero stride
int dimA = op->getInputs(0)->getDims().size();
int dimB = op->getInputs(1)->getDims().size();
@ -63,17 +78,23 @@ class matmulCublas : public Kernel {
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: n * k;
// printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f,
// B "
// "%d %lld, A %d %lld, C %d %d, b %d %d\n",
// opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
// m * n, b, record->algo);
stat = cublasGemmStridedBatchedEx(
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
&beta, outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
&beta, outData, CUDA_R_32F, ldc, m * n, b, computeType,
(cublasGemmAlgo_t)record->algo);
} else {
} else if (record->apiId == 1) {
stat = cublasGemmEx(
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
CUDA_R_32F, ldb, inAData, CUDA_R_32F, lda, &beta, outData,
CUDA_R_32F, ldc, CUDA_R_32F, (cublasGemmAlgo_t)record->algo);
}
CUDA_R_32F, ldc, computeType, (cublasGemmAlgo_t)record->algo);
} else
IT_ASSERT(false);
// if (stat != CUBLAS_STATUS_SUCCESS)
// cout << cublasGetErrorString(stat);
return (stat == CUBLAS_STATUS_SUCCESS);
@ -98,15 +119,29 @@ class matmulCublas : public Kernel {
IT_ASSERT(op);
auto ret = make_ref<MatmulCublasPerfRecordObj>();
ret->time = std::numeric_limits<double>::max();
for (int i = 0; i < N_ALGO; i++) {
auto rcd = make_ref<MatmulCublasPerfRecordObj>();
rcd->algo = ALGOS[i];
if (!do_compute(_op, rcd, _context))
continue;
rcd->time = timeit([&]() { do_compute(_op, rcd, _context); },
[&]() { context->sync(); });
if (rcd->time < ret->time)
ret = rcd;
vector<int> apis{0};
if (op->getB() == 1)
apis.emplace_back(1);
// Set the possible algorithm range
auto algos = Algos;
if (context->getEnableTF32()) {
algos.insert(algos.end(), AlgosTensorOp.begin(),
AlgosTensorOp.end());
}
for (int api : apis) {
for (size_t i = 0; i < algos.size(); i++) {
auto rcd = make_ref<MatmulCublasPerfRecordObj>();
rcd->apiId = api;
rcd->algo = algos[i];
if (!do_compute(_op, rcd, _context))
continue;
rcd->time = timeit([&]() { do_compute(_op, rcd, _context); },
[&]() { context->sync(); });
if (rcd->time < ret->time)
ret = rcd;
}
}
IT_ASSERT(ret->time < std::numeric_limits<double>::max(),
"No valid algorithm found for " + op->toString());


@ -1,5 +1,6 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
@ -8,6 +9,13 @@
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
#include <nlohmann/json.hpp>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
using json = nlohmann::json;
namespace py = pybind11;
@ -22,6 +30,8 @@ class TVMRecordObj : public PerfRecordObj {
std::string dllPath;
std::string funcName;
std::vector<int> inputIdx;
tvm::runtime::PackedFunc packedFunc;
bool useExistingKernel = false;
};
using TVMRecord = Ref<TVMRecordObj>;
@ -33,9 +43,15 @@ class MemboundTVMPackedFunction : public Kernel {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc =
getPackedFunction(tvmRecord->dllPath, tvmRecord->funcName);
IT_ASSERT(packedFunc != nullptr);
// Use user-defined kernels
if (tvmRecord->useExistingKernel) {
bool success = useExistingKernels(op);
IT_ASSERT(success);
return;
}
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;
@ -63,10 +79,18 @@ class MemboundTVMPackedFunction : public Kernel {
// Premise: op is idempotent since it is called multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// If hash matches, use user-defined kernels
if (useExistingKernels(op)) {
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { useExistingKernels(op); },
[&]() { context->sync(); });
ret->useExistingKernel = true;
return ret;
}
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
@ -93,6 +117,7 @@ class MemboundTVMPackedFunction : public Kernel {
if (inputName == op->getNnetInputs()[j]->getName())
break;
}
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
inputIdx.emplace_back(j);
}
@ -114,29 +139,41 @@ class MemboundTVMPackedFunction : public Kernel {
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
preArgs.first.size());
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
[&]() { context->sync(); });
ret->kernelName = kernelName;
ret->dllPath = dllPath;
ret->funcName = func;
ret->inputIdx = inputIdx;
ret->packedFunc = packedFunc;
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
return ret;
}
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
const std::string &outDType,
const std::string &lambda,
const std::string &funcName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
json j;
// Consistent with the Python API interface
j["input_tensors"] = inDims;
j["input_dtypes"] = inDTypes;
j["output_tensor"] = outDims;
j["output_dtype"] = outDType;
j["tvm_code"] = lambda;
j["func_name"] = funcName;
j["nnet_expression"] = nnetExprString;
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
j["hash_code"] = std::to_string(hashCode);
return j.dump();
}
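// Editorial note (illustrative): the payload written to the child process thus
// looks roughly like the following; values are examples only and the field
// order may differ:
//   {"input_tensors": [[1, 32, 32, 64]], "input_dtypes": ["float32"],
//    "output_tensor": [1, 32, 32, 64], "output_dtype": "float32",
//    "tvm_code": "<generated lambda source>", "func_name": "membound_kernel",
//    "nnet_expression": "<expr>", "nnet_simplified_expression": "<expr>",
//    "hash_code": "123456789"}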
/// @brief
/// @param inDims
/// @param inDTypes
/// @param outDims
/// @param outDType
/// @param lambda
/// @param funcName Generated function name
/// @param nnetExpressionString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
@ -146,29 +183,60 @@ class MemboundTVMPackedFunction : public Kernel {
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string dllPath;
try {
start_interpreter();
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_so");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString,
std::to_string(hashCode));
dllPath = py::str(code[0]);
} catch (py::error_already_set &e) {
if (e.matches(PyExc_ImportError)) {
std::cerr << "Import Error. Don't forget to set environment "
"variable PYTHONPATH to contain "
"<repo-root>/python"
<< std::endl;
}
throw;
int fdP2C[2], fdC2P[2];
for (auto fd : {fdP2C, fdC2P}) {
int status = pipe(fd);
IT_ASSERT(status == 0, "pipe failed");
}
pid_t pid = fork();
IT_ASSERT(pid >= 0, "fork failed");
if (pid == 0) { // Child process
close(fdP2C[1]);
close(fdC2P[0]);
return dllPath;
dup2(fdP2C[0], STDIN_FILENO);
close(fdP2C[0]);
string cmd =
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
std::to_string(fdC2P[1]) + ")";
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
execvp("python3", const_cast<char *const *>(argv));
} else { // Parent process
close(fdP2C[0]);
close(fdC2P[1]);
// Write to pipe
string serializedArgs = serializeTVMArgs(
inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString, hashCode);
int status = -1;
status =
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
IT_ASSERT((size_t)status == serializedArgs.size(),
"Failed to write to pipe");
close(fdP2C[1]);
// Wait for TVM
waitpid(pid, &status, 0);
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
const int es = WEXITSTATUS(status);
IT_ASSERT(es == 0,
"TVM process exit with code " + std::to_string(es));
// Read from pipe
FILE *stream;
stream = fdopen(fdC2P[0], "r");
char buf_read[257] = {0};
status = std::fscanf(stream, "%256c", buf_read);
IT_ASSERT(status == 1, "Failed to read from pipe");
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
fclose(stream);
close(fdC2P[0]);
return buf_read;
}
IT_ASSERT(false, "Should not reach here");
return "";
}
tvm::runtime::PackedFunc getPackedFunction(string path,
@ -214,6 +282,35 @@ class MemboundTVMPackedFunction : public Kernel {
return {values, type_codes};
}
bool useExistingKernels(Ref<MemBoundObj> op) const {
return false;
const map<HashType, tuple<int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int>>
hashMap = {
// clang-format off
{18446744073661354550ULL, {1, 1, 2, 2, 256, 4, 4, 4, 4, 1, 1, 2, 2, 1, 1}},
{124145340ULL, {1, 1, 4, 4, 128, 4, 4, 8, 8, 1, 1, 2, 2, 1, 1}},
{18446744073695718019ULL, {1, 1, 8, 8, 64, 4, 4, 16, 16, 1, 1, 2, 2, 1, 1}},
{515085072ULL, {2, 1, 16, 16, 3, 4, 4, 32, 32, 1, 1, 2, 2, 1, 1}}
}; // clang-format on
float *input = op->getInputs(0)->getRawDataPtr<float *>();
float *bias = nullptr;
float *output = op->getOutput()->getRawDataPtr<float *>();
if (auto it = hashMap.find(op->getHash()); it != hashMap.end()) {
auto &[PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw] =
it->second;
IT_ASSERT(op->getInputs(0)->size() ==
size_t(n) * h * w * f * r * s);
IT_ASSERT(op->getOutput()->size() == size_t(n) * oh * ow * f);
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
return true;
}
// conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
// oh, ow, ph, pw, sh, sw, dh, dw);
return false;
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,


@ -0,0 +1,287 @@
#include "core/common.h"
#include <vector>
using namespace std;
template <class T>
__global__ void reduce_merge_conv_3x3_1x1(
T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
const int N, const int H, const int W, const int F, const int N_offset,
const int H_offset, const int W_offset, const int F_offset,
const int out_N_offset, const int out_F_offset, const int out_H_offset,
const int out_W_offset, const int num) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num) {
int tmptid = tid;
const int n = (tmptid / out_N_offset);
tmptid -= n * out_N_offset;
const int f = tmptid / out_F_offset;
tmptid -= f * out_F_offset;
const int h = tmptid / out_H_offset;
tmptid -= h * out_H_offset;
const int w = tmptid / out_W_offset;
const int noff = n * N_offset;
const int hoff = h * H_offset;
const int woff = w * W_offset;
const int foff = f * F_offset;
input += noff + foff + woff + hoff;
T res = 0;
res += input[4];
res += input[9];
if (h < H - 1) {
res += input[H_offset + 7];
if (w < W - 1)
res += input[H_offset + W_offset + 8];
if (w > 0)
res += input[H_offset - W_offset + 6];
}
if (h > 0) {
res += input[1 - H_offset];
if (w < W - 1)
res += input[W_offset - H_offset + 2];
if (w > 0)
res += input[-1 * H_offset - W_offset];
}
if (w < W - 1)
res += input[5 + W_offset];
if (w > 0)
res += input[3 - W_offset];
output[tid] = max(res + bias[f], 0.f);
}
}
template <class T>
__global__ void reduce_merge_conv_3x3(
T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
const int N, const int H, const int W, const int F, const int N_offset,
const int H_offset, const int W_offset, const int F_offset,
const int out_N_offset, const int out_F_offset, const int out_H_offset,
const int out_W_offset, const int num, const int act) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num) {
int tmptid = tid;
const int n = (tmptid / out_N_offset);
tmptid -= n * out_N_offset;
const int f = tmptid / out_F_offset;
tmptid -= f * out_F_offset;
const int h = tmptid / out_H_offset;
tmptid -= h * out_H_offset;
const int w = tmptid / out_W_offset;
const int noff = n * N_offset;
const int hoff = h * H_offset;
const int woff = w * W_offset;
const int foff = f * F_offset;
input += noff + foff + woff + hoff;
T res = 0;
res += input[4];
if (h < H - 1) {
res += input[H_offset + 7];
if (w < W - 1)
res += input[H_offset + W_offset + 8];
if (w > 0)
res += input[H_offset - W_offset + 6];
}
if (h > 0) {
res += input[1 - H_offset];
if (w < W - 1)
res += input[W_offset - H_offset + 2];
if (w > 0)
res += input[-1 * H_offset - W_offset];
}
if (w < W - 1)
res += input[5 + W_offset];
if (w > 0)
res += input[3 - W_offset];
if (act) {
// output[tid] = max(res + bias[f], 0.f);
// HACK: temporarily remove bias
output[tid] = max(res, 0.f);
} else {
// output[tid] = res + bias[f];
// HACK: temporarily remove bias
output[tid] = res;
}
}
}
template <class T>
__global__ void
reduce_2(T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
const int N, const int F, const int H, const int W, const int N_offset,
const int F_offset, const int H_offset, const int W_offset,
const int out_N_offset, const int out_F_offset, const int out_H_offset,
const int out_W_offset, const int num) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num) {
int tmptid = tid;
const int n = tmptid / out_N_offset;
tmptid -= n * out_N_offset;
const int f = tmptid / out_F_offset;
tmptid -= f * out_F_offset;
const int h = tmptid / out_H_offset;
tmptid -= h * out_H_offset;
const int w = tmptid / out_W_offset;
const int noff = n * N_offset;
const int foff = f * F_offset * 4;
const int hoff = h * H_offset;
const int woff = w * W_offset;
input += noff + foff + woff + hoff;
T res = input[0];
if (w != W - 1)
res += input[F_offset * 2 + 3];
if (h != H - 1) {
res += input[F_offset + 3 * H_offset];
if (w != W - 1)
res += input[F_offset * 3 + 3 * H_offset + 3];
}
// output[tid] = max(res + bias[f], 0.f);
// HACK: temporarily remove bias
output[tid] = max(res, 0.f);
}
}
__global__ void reduceConvRxSToNCHWKernel(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const int act, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw) {
// input shape: (n, h, w, f, r, s)
// output shape: (n, f, oh, ow)
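// Each output element (n, f, oh, ow) sums the r*s partial products produced by
// the preceding GEMM stage; the window is centered at (r/2, s/2) and
// out-of-range taps are skipped, which realizes the implicit padding.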
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int out_N_offset = f * oh * ow, out_F_offset = oh * ow,
out_H_offset = ow, out_W_offset = 1;
const int num = out_N_offset * n;
if (tid < num) {
// output index
int tmptid = tid;
const int nid = (tmptid / out_N_offset);
tmptid -= nid * out_N_offset;
const int fid = tmptid / out_F_offset;
tmptid -= fid * out_F_offset;
const int hid = tmptid / out_H_offset;
tmptid -= hid * out_H_offset;
const int wid = tmptid / out_W_offset;
// Input index
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = h * hchunk;
float *__restrict__ nfinput = input + nid * nchunck + fid * fchunck;
float imm = 0.0;
const int ihst = hid, iwst = wid;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + (ri - r / 2) * dh;
int iwid = iwst + (si - s / 2) * dw;
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (act) {
imm = imm > 0.0 ? imm : 0;
}
output[tid] = imm;
}
}
namespace infini {
void hetConvToMMReduce(int n, int h, int w, int f, float *input, float *output,
float *bias) {
const int kBlockSize = 128;
vector<int> in_params = {n, h, w, f}; // NHWF
vector<int> out_params = {n, f, h, w};
int in_base = 10;
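// The innermost input axis has length 10: indices 0-8 hold the 3x3 kernel taps
// and index 9 the fused 1x1 tap (see reduce_merge_conv_3x3_1x1), so the input
// strides start from 10.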
int out_base = 1;
vector<int> in_offsets;
vector<int> out_offsets;
for (int i = 0; i < 4; ++i) {
in_offsets.push_back(in_base);
in_base *= in_params[3 - i];
out_offsets.push_back(out_base);
out_base *= out_params[3 - i];
}
reduce_merge_conv_3x3_1x1<float>
<<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
input, output, bias, in_params[0], in_params[1], in_params[2],
in_params[3], in_offsets[3], in_offsets[2], in_offsets[1],
in_offsets[0], out_offsets[3], out_offsets[2], out_offsets[1],
out_offsets[0], out_base);
}
void conv5x5ToConv3x3Reduce(int n, int f, int h, int w, float *input,
float *output, float *bias) {
const int kBlockSize = 128;
vector<int> params{n, f, h, w}; // NFHW
vector<int> ranges(4);
ranges[3] = params[3] + 2;
ranges[2] = params[2] + 2;
ranges[1] = params[1] * 4;
ranges[0] = params[0];
int in_base = 1;
int out_base = 1;
vector<int> in_offsets;
vector<int> out_offsets;
for (int i = 0; i < 4; ++i) {
in_offsets.push_back(in_base);
in_base *= ranges[3 - i];
out_offsets.push_back(out_base);
out_base *= params[3 - i];
}
reduce_2<float><<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
input, output, bias, params[0], params[1], params[2], params[3],
in_offsets[3], in_offsets[2], in_offsets[1], in_offsets[0],
out_offsets[3], out_offsets[2], out_offsets[1], out_offsets[0],
out_base);
}
// [NHW,FRS] -> [NFHW]
void conv3x3ToReduce(int n, int h, int w, int f, float *input, float *output,
float *bias) {
const int kBlockSize = 128;
vector<int> in_params = {n, h, w, f}; // NHWF
vector<int> out_params = {n, f, h, w};
int in_base = 9;
int out_base = 1;
vector<int> in_offsets;
vector<int> out_offsets;
for (int i = 0; i < 4; ++i) {
in_offsets.push_back(in_base);
in_base *= in_params[3 - i];
out_offsets.push_back(out_base);
out_base *= out_params[3 - i];
}
reduce_merge_conv_3x3<float>
<<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
input, output, bias, in_params[0], in_params[1], in_params[2],
in_params[3], in_offsets[3], in_offsets[2], in_offsets[1],
in_offsets[0], out_offsets[3], out_offsets[2], out_offsets[1],
out_offsets[0], out_base, 0);
}
void reduceConvRxSToNCHW(float *input, float *bias, float *output, int act,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw) {
IT_ASSERT(sh == 1 && sw == 1,
"reduceConvRxSToNCHWKernel_kernel only support sh=sw=1");
IT_ASSERT(dh == 1 && dw == 1,
"reduceConvRxSToNCHWKernel_kernel only support dh=dw=1");
const int blocksize = 512;
const int gridsize = (n * f * oh * ow + blocksize - 1) / blocksize;
cudaStream_t stream(cudaStreamPerThread);
reduceConvRxSToNCHWKernel<<<gridsize, blocksize, 0, stream>>>(
input, bias, output, act, n, f, h, w, oh, ow, r, s, ph, pw, dh, dw);
}
} // namespace infini

View File

@ -4,10 +4,13 @@ namespace infini {
class CopyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
auto inData = op->getInputs(0)->getRawDataPtr<void *>();
auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
cudaMemcpyDeviceToDevice);
// auto inData = op->getInputs(0)->getRawDataPtr<void *>();
// auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
// cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
// cudaMemcpyDeviceToDevice);
// HACK: optimization
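// Instead of a device-to-device copy, the output tensor shares the input's
// data blob. This is safe here because reshape/flatten/identity never modify
// the underlying data.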
op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
}
};
// reshape/flatten/identity all act as copying from input to output.

View File

@ -0,0 +1,70 @@
#include "operators/transpose.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_transpose.h"
namespace infini {
class TransposeCuda : public CudaKernelWithoutConfig {
void generic_transpose(const Ref<TransposeObj> &op,
const RuntimeObj *context) const {
auto input = op->getInputs(0);
auto output = op->getOutput();
void *const inputData = input->getRawDataPtr<void *>();
void *const outputData = output->getRawDataPtr<void *>();
const auto &inputShape = input->getDims();
const auto &outputShape = output->getDims();
const auto &perm = op->getPermute();
int size = input->size();
int nDims = input->getDims().size();
// Compute strides
SmallArray strides, buffer;
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
int curStride = 1;
for (int i = nDims - 1; i >= 0; --i) {
buffer.data[i] = curStride;
curStride *= inputShape[i];
}
for (int i = 0; i < nDims; ++i) {
strides.data[i] = buffer.data[perm[i]];
}
SmallArray outputDims;
for (int i = 0; i < nDims; ++i) {
outputDims.data[i] = outputShape[i];
}
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
strides, outputDims, input->getDims(),
output->getDims(), perm);
}
void fast_transpose_last_dim(const Ref<TransposeObj> &op,
const RuntimeObj *context) const {
// Perm 0 2 3 1
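// NCHW -> NHWC: view the input as [N, C, H*W] and swap the last two
// dimensions, yielding [N, H*W, C], i.e. the desired [N, H, W, C] layout.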
auto cuda = dynamic_cast<const CudaRuntimeObj *>(context);
auto shape = op->getOutput()->getDims();
invoke_transpose_last_two_dim(
op->getInputs(0)->getRawDataPtr<float *>(),
op->getOutput()->getRawDataPtr<float *>(), shape[0],
shape[1] * shape[2], shape[3], cuda->getNumSMs());
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
const auto &perm = op->getPermute();
if (perm == vector{0, 2, 3, 1}) {
fast_transpose_last_dim(op, _context);
} else {
generic_transpose(op, _context);
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
TransposeCuda, "Transpose_CUDA_Float32");
} // namespace infini

View File

@ -0,0 +1,231 @@
#include "core/common.h"
#include "cuda/cuda_common.h"
#include "utils/small_array.h"
#include <cassert>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <limits>
#define CUDA_HOST_DEVICE __forceinline__ __device__ __host__
// https://github.com/462630221/SampleCode
template <typename T> struct QuotientMod {
T quotient;
T mod;
__host__ __device__ QuotientMod(T q, T m) : quotient(q), mod(m) {}
};
template <typename T> struct FastIntDivider {
FastIntDivider() {}
FastIntDivider(T d) { divisor_ = d; };
__forceinline__ __device__ __host__ T div(T n) { return n / divisor_; }
__forceinline__ __device__ __host__ T mod(T n) { return n % divisor_; }
__forceinline__ __device__ __host__ QuotientMod<T> divmod(T n) {
return QuotientMod<T>(n / divisor_, n % divisor_);
}
T divisor_;
};
template <> struct FastIntDivider<uint32_t> {
FastIntDivider(){};
FastIntDivider(uint32_t d) {
assert(d >= 1);
divisor_ = d;
// If 0 is passed to __builtin_clz, the result is undefined.
if (d == 1) {
rshift_ = 0;
} else {
rshift_ = 32 - __builtin_clz(d - 1);
}
uint64_t magic_t = ((1lu << (32 + rshift_)) + d - 1) / d;
magic_ = uint32_t(magic_t);
};
__forceinline__ __device__ __host__ uint32_t div(uint32_t n) {
#if defined(__CUDA_ARCH__)
uint32_t q = __umulhi(n, magic_);
#else
uint32_t q = (uint64_t(n) * magic_) >> 32;
#endif
// return (((n - q) >> 1) + q) >> (rshift_ - 1);
return (n + q) >> rshift_;
}
__forceinline__ __device__ __host__ QuotientMod<uint32_t>
divmod(uint32_t n) {
uint32_t q = div(n);
return QuotientMod<uint32_t>(q, n - divisor_ * q);
}
uint32_t magic_;
uint32_t rshift_;
uint32_t divisor_;
};
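// The specialization above divides by a runtime-constant divisor d via a
// 32-bit "magic number": magic_t = ceil(2^(32+rshift_) / d) fits in 33 bits,
// only its low 32 bits are stored, and the implicit high bit is restored by
// adding n, so div() computes (n + umulhi(n, magic_)) >> rshift_.
// Worked example (d = 3): rshift_ = 2, magic_t = ceil(2^34 / 3) = 5726623062,
// magic_ = 1431655766; for n = 7, umulhi(7, magic_) = 2 and (7 + 2) >> 2 = 2.
// Note that for very large n the 32-bit addition (n + q) can wrap; the
// commented-out formula in div() avoids this at the cost of requiring
// rshift_ >= 1.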
void test_fast_u32() {
uint32_t d = 1;
FastIntDivider<uint32_t> diver(d);
std::cout << "7/" << d << "= " << uint32_t(7) / uint32_t(d) << " "
<< diver.div(7) << std::endl;
}
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _transpose_kernel(float *input, float *output, int nDims,
int size, infini::SmallArray strides,
infini::SmallArray outputShape) {
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (outputIdx < size) {
int inputIdx = 0;
int v = outputIdx;
for (int i = nDims - 1; i >= 0; --i) {
inputIdx += v % outputShape.data[i] * strides.data[i];
v /= outputShape.data[i];
}
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
output[outputIdx] = __ldg(input + inputIdx);
#else
output[outputIdx] = input[inputIdx];
#endif
}
}
template <typename T, int NUM> struct Array {
CUDA_HOST_DEVICE T &operator[](unsigned int index) { return data[index]; }
CUDA_HOST_DEVICE const T &operator[](unsigned int index) const {
return data[index];
}
CUDA_HOST_DEVICE constexpr int size() const { return NUM; }
CUDA_HOST_DEVICE Array() {
#ifndef __CUDA_ARCH__
for (int i = 0; i < NUM; i++) {
data[i] = T();
}
#endif
}
T data[NUM];
};
/**
* @brief Optimize : Reorganize
*
*/
template <int NUM_AXES, int UNROLL, int BLOCK_SIZE, typename T>
__global__ void
transpose_kernel_v3(const T *data_in, T *data_out,
const Array<uint32_t, NUM_AXES> perm_strides,
Array<FastIntDivider<uint32_t>, NUM_AXES> out_strides,
const size_t all_cnt) {
uint32_t out_offset_reg[UNROLL];
uint32_t in_offset_reg[UNROLL];
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
out_offset_reg[i] =
blockIdx.x * BLOCK_SIZE * UNROLL + threadIdx.x + i * BLOCK_SIZE;
in_offset_reg[i] = 0;
}
#pragma unroll
for (int j = 0; j < NUM_AXES; ++j) {
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
QuotientMod<uint32_t> d = out_strides[j].divmod(out_offset_reg[i]);
in_offset_reg[i] += d.quotient * perm_strides[j];
out_offset_reg[i] = d.mod;
}
}
T ld_reg[UNROLL];
uint32_t offset = blockIdx.x * BLOCK_SIZE * UNROLL + threadIdx.x;
if (offset + BLOCK_SIZE * UNROLL <= all_cnt) {
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
ld_reg[i] = data_in[in_offset_reg[i]];
}
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
data_out[offset + i * BLOCK_SIZE] = ld_reg[i];
}
} else {
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
if (offset + i * BLOCK_SIZE < all_cnt) {
ld_reg[i] = data_in[in_offset_reg[i]];
}
}
#pragma unroll
for (int i = 0; i < UNROLL; ++i) {
if (offset + i * BLOCK_SIZE < all_cnt) {
data_out[offset + i * BLOCK_SIZE] = ld_reg[i];
}
}
}
}
template <typename T> T AccMul(std::vector<T> vec) {
return std::accumulate(vec.begin(), vec.end(), T(1), std::multiplies<T>());
}
namespace infini {
// void transpose_kernel(float *input, float *output, int nDims, int size,
// SmallArray strides, SmallArray outputShape) {
// int blocksize = block_work_size();
// int gridsize = (size + block_work_size() - 1) / block_work_size();
// _transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
// strides, outputShape);
// }
std::vector<uint32_t> GetStrides(std::vector<uint32_t> dims) {
std::vector<uint32_t> strides(dims.size(), 1);
for (int i = dims.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray _strides, SmallArray _outputShape,
vector<int> _dims_in, vector<int> _dims_out,
vector<int> _perms) {
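// NOTE: _strides and _outputShape are kept for interface compatibility but are
// not used; strides are recomputed below from _dims_in / _dims_out.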
constexpr int NUM_AXES = 4;
IT_ASSERT(nDims <= NUM_AXES);
constexpr int UNROLL = 8 / sizeof(float);
constexpr int BLOCK_SIZE = 128;
vector<uint32_t> dims_in, dims_out, perms;
for (auto v : _dims_in)
dims_in.push_back(v);
for (auto v : _dims_out)
dims_out.push_back(v);
for (auto v : _perms)
perms.push_back(v);
size_t all_cnt = AccMul(dims_in);
auto strides_in = GetStrides(dims_in);
auto strides_out = GetStrides(dims_out);
const int grid =
(all_cnt + BLOCK_SIZE * UNROLL - 1) / (BLOCK_SIZE * UNROLL);
Array<uint32_t, NUM_AXES> perm_strides;
Array<FastIntDivider<uint32_t>, NUM_AXES> out_strides_fast;
for (int i = 0; i < NUM_AXES; ++i) {
out_strides_fast[i] = FastIntDivider<uint32_t>(strides_out[i]);
perm_strides[i] = strides_in[perms[i]];
}
transpose_kernel_v3<NUM_AXES, UNROLL, BLOCK_SIZE, float>
<<<grid, BLOCK_SIZE, 0>>>(
input, output, perm_strides, out_strides_fast, all_cnt);
}
} // namespace infini

View File

@ -0,0 +1,194 @@
#include "cuda/cuda_common.h"
#include <assert.h>
#include <vector>
template <int numSM, int numWarp>
__global__ void kernel_transpose_last(float *ptrA, float *ptrB, int dim0,
int dim1, int dim2) {
int laneId = threadIdx.x % 32;
int warpId = blockIdx.x * numWarp + threadIdx.x / 32;
int n1 = (dim1 + 31) / 32;
int n2 = (dim2 + 31) / 32;
float bufA[32];
for (int i = warpId; i < dim0 * n1 * n2; i += numSM * numWarp) {
// clock_t ck0 = clock();
int i0 = i / (n1 * n2);
int i1 = (i % (n1 * n2)) / n2;
int i2 = (i % (n1 * n2)) % n2;
int offsetA = i0 * dim1 * dim2 + i2 * 32 * dim1 + i1 * 32;
int offsetB = i0 * dim1 * dim2 + i1 * 32 * dim2 + i2 * 32;
int ld1 = min(32, dim1 - i1 * 32);
int ld2 = min(32, dim2 - i2 * 32);
// if (i == 4 && laneId == 0)
// printf("%d %d\n", ld1, ld2);
if (ld2 == 32) {
#pragma unroll
for (int i = 0; i < 32; i++) {
if ((laneId + i) % 32 < ld1) {
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
}
}
} else if (ld2 == 17) {
#pragma unroll
for (int i = 0; i < 17; i++) {
if ((laneId + i) % 32 < ld1) {
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
}
}
} else if (ld2 == 4) {
#pragma unroll
for (int i = 0; i < 4; i++) {
if ((laneId + i) % 32 < ld1) {
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
}
}
} else {
for (int i = 0; i < ld2; i++) {
if ((laneId + i) % 32 < ld1) {
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
}
}
};
if (ld1 == 32) {
#pragma unroll
for (int i = 0; i < 32; i++) {
if ((i + 32 - laneId) % 32 < ld2) {
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
bufA[(i + 32 - laneId) % 32];
}
}
} else if (ld1 == 17) {
#pragma unroll
for (int i = 0; i < 17; i++) {
if ((i + 32 - laneId) % 32 < ld2) {
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
bufA[(i + 32 - laneId) % 32];
}
}
} else if (ld1 == 4) {
#pragma unroll
for (int i = 0; i < 4; i++) {
if ((i + 32 - laneId) % 32 < ld2) {
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
bufA[(i + 32 - laneId) % 32];
}
}
} else {
for (int i = 0; i < ld1; i++) {
if ((i + 32 - laneId) % 32 < ld2) {
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
bufA[(i + 32 - laneId) % 32];
}
}
};
}
}
namespace infini {
/// @brief Transpose the last two dimensions of a 3-D tensor.
/// @param ptrA Input tensor of shape [dim0, dim2, dim1]
/// @param ptrB Output tensor of shape [dim0, dim1, dim2]
/// @param dim0 Outermost (batch) dimension
/// @param dim1 Middle dimension of the output
/// @param dim2 Innermost dimension of the output
void invoke_transpose_last_two_dim(float *ptrA, float *ptrB, int dim0, int dim1,
int dim2, int numSMs) {
constexpr int numWarps = 4;
dim3 gridDim(numSMs, 1);
dim3 blockDim(numWarps * 32, 1);
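// Persistent-kernel launch: one block per SM with numWarps warps each, so the
// grid-stride loop inside kernel_transpose_last (stride numSM * numWarp) walks
// all 32x32 tiles of the [dim1, dim2] planes.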
if (numSMs == 80) { // V100
kernel_transpose_last<80, numWarps>
<<<gridDim, blockDim>>>(ptrA, ptrB, dim0, dim1, dim2);
} else if (numSMs == 108) { // A100
kernel_transpose_last<108, numWarps>
<<<gridDim, blockDim>>>(ptrA, ptrB, dim0, dim1, dim2);
} else {
IT_TODO_HALT_MSG(std::string("transpose_last_two_dim with ") +
std::to_string(numSMs) + " SMs is not implemented");
}
// cudaCheckError();
}
} // namespace infini
// constexpr int numWarm = 128, numEval = 128;
//
// void eval_transpose_last(const std::vector<int> &shape) {
// assert(shape.size() == 3);
// int size = shape[0] * shape[1] * shape[2];
// float *dataA, *dataB;
// dataA = (float *)malloc(size * sizeof(float));
// dataB = (float *)malloc(size * sizeof(float));
// for (int i0 = 0; i0 < shape[0]; i0++) {
// for (int i2 = 0; i2 < shape[2]; i2++) {
// for (int i1 = 0; i1 < shape[1]; i1++) {
// dataA[i0 * shape[1] * shape[2] + i2 * shape[1] + i1] =
// i0 * shape[1] * shape[2] + i2 * shape[1] + i1;
// }
// }
// }
// float *ptrA, *ptrB;
// checkCudaError(cudaMalloc(&ptrA, size * sizeof(float)));
// checkCudaError(cudaMalloc(&ptrB, size * sizeof(float)));
// checkCudaError(
// cudaMemcpy(ptrA, dataA, size * sizeof(float),
// cudaMemcpyHostToDevice));
// invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1], shape[2]);
// checkCudaError(
// cudaMemcpy(dataB, ptrB, size * sizeof(float),
// cudaMemcpyDeviceToHost));
// for (int i0 = 0; i0 < shape[0]; i0++) {
// for (int i1 = 0; i1 < shape[1]; i1++) {
// for (int i2 = 0; i2 < shape[2]; i2++) {
// if (dataA[i0 * shape[1] * shape[2] + i1 + i2 * shape[1]] !=
// dataB[i0 * shape[1] * shape[2] + i1 * shape[2] + i2]) {
// std::cout
// << i0 << " " << i1 << " " << i2 << " "
// << dataA[i0 * shape[1] * shape[2] + i1 + i2 *
// shape[1]]
// << " "
// << dataB[i0 * shape[1] * shape[2] + i1 * shape[2] +
// i2]
// << std::endl;
// exit(-1);
// }
// }
// }
// }
// cudaEvent_t st, ed;
// checkCudaError(cudaEventCreate(&st));
// checkCudaError(cudaEventCreate(&ed));
// for (int i = 0; i < numWarm; i++) {
// invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1],
// shape[2]);
// }
// checkCudaError(cudaEventRecord(st));
// for (int i = 0; i < numEval; i++) {
// invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1],
// shape[2]);
// }
// checkCudaError(cudaEventRecord(ed));
// checkCudaError(cudaEventSynchronize(st));
// checkCudaError(cudaEventSynchronize(ed));
// float time;
// checkCudaError(cudaEventElapsedTime(&time, st, ed));
// float bandwidth = size * 2 * sizeof(float) * numEval / time / 1e6;
// std::cout << "transpose_last: " << shape[0] << " " << shape[1] << " "
// << shape[2] << " time: " << time / numEval
// << " ms. bandwidth: " << bandwidth << " GB/s" << std::endl;
// }
// Performance evaluation
// int main() {
// eval_transpose_last({16, 1024, 256});
// eval_transpose_last({16, 14 * 14, 1024});
// eval_transpose_last({16, 7 * 7, 2048});
// eval_transpose_last({16, 7 * 7, 128});
// eval_transpose_last({1, 14 * 14, 1024});
// eval_transpose_last({1, 7 * 7, 2048});
// eval_transpose_last({1, 7 * 7, 128});
// }

View File

@ -24,9 +24,11 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
cudnnTensorDescriptor_t inputDesc, outputDesc;
auto dim = op->getInputs(0)->getDims();
if (dim.size() != 4)
IT_TODO_HALT();
auto _dim = op->getInputs(0)->getDims();
IT_ASSERT_TODO(_dim.size() <= 4);
vector<int> dim(4, 1);
for (int i = 0; i < (int)_dim.size(); i++) // Unsqueeze to 4D
dim[i + 4 - _dim.size()] = _dim[i];
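// e.g. a 2-D input of shape {N, C} becomes {1, 1, N, C}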
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
// get inputs

560
src/nnet/App/test_models.cc Normal file
View File

@ -0,0 +1,560 @@
#ifdef USE_CUDA
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/nmutator.h"
#include "operators/G2BMM.h"
#include "operators/GBMM.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/matmul.h"
#include "operators/pooling.h"
#include "operators/reshape.h"
#include "operators/softmax.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
namespace infini {
// Channel, kernelSize, pad, stride, isTanh
using GANConfigs = vector<tuple<int, int, int, int, bool>>;
using DetailedConfigs =
vector<tuple<int, int, int, int, int, int, int, int, int, int, bool>>;
static const vector<int> metaRules = {3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90};
DetailedConfigs getGANConfigs(int id, int batch) {
// The first conv can be transformed into gemm without reduction
// n, f, h, w, c, r, s, stride,
// pad, dilation
GANConfigs ret;
const DetailedConfigs infoConfigs = {
{batch, 228, 1, 1, 448, 2, 2, 1, 0, 1, false},
{batch, 448, 2, 2, 256, 4, 4, 2, 1, 1, false},
{batch, 256, 4, 4, 128, 4, 4, 2, 1, 1, false},
{batch, 128, 8, 8, 64, 4, 4, 2, 1, 1, false},
{batch, 64, 16, 16, 3, 4, 4, 2, 1, 1, true}};
const DetailedConfigs dcganConfigs = {
{batch, 100, 1, 1, 512, 4, 4, 1, 0, 1, false},
{batch, 512, 4, 4, 256, 4, 4, 2, 1, 1, false},
{batch, 256, 8, 8, 128, 4, 4, 2, 1, 1, false},
{batch, 128, 16, 16, 64, 4, 4, 2, 1, 1, false},
{batch, 64, 32, 32, 3, 4, 4, 2, 1, 1, true}};
DetailedConfigs details;
if (id == 0) { // InfoGAN
dbg("Use InfoGAN configs");
details = infoConfigs;
} else if (id == 1) { // DCGAN
dbg("Use DCGAN configs");
details = dcganConfigs;
} else
IT_ASSERT(false);
return details;
}
// NHWC format
Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId) {
IT_ASSERT(1 <= nLayers && nLayers <= 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
auto configs = getGANConfigs(modelId, batch);
Tensor input;
{
auto &[n, f, h, w, c, r, s, stride, pad, dilation, isTanh] = configs[0];
input = g->addTensor({batch, 1, 1, f}, DataType::Float32,
TensorType::Input);
}
for (int i = 0; i < (int)configs.size() && i < nLayers; ++i) {
// auto [channel, kernelSize, pad, stride, tanh] = configs[i];
auto &[n, f, h, w, c, r, s, stride, pad, dilation, isTanh] = configs[i];
IT_ASSERT(input->getDims()[3] == f);
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (isTanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
// NHWC
Graph getFSRCNNGraph(int batch, Runtime runtime) {
// n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU
const DetailedConfigs fsrcnn_config = {
{batch, 1, 32, 32, 56, 5, 5, 1, 2, 1, true},
{batch, 56, 32, 32, 12, 1, 1, 1, 0, 1, true},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, true},
{batch, 12, 32, 32, 56, 1, 1, 1, 0, 1, true},
{batch, 56, 32, 32, 1, 9, 9, 1, 3, 4, false} // ConvTransNHWC
// n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU
};
Graph g = make_ref<GraphObj>(runtime);
Tensor input;
{
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[0];
input = g->addTensor({batch, h, w, c}, DataType::Float32,
TensorType::Input);
}
for (int i = 0; i < (int)fsrcnn_config.size() - 1; ++i) {
// auto [channel, kernelSize, pad, stride, tanh] = configs[i];
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[i];
IT_ASSERT(input->getDims()[3] == c);
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvNHWCObj>(input, weight, nullptr, pad, pad, stride,
stride, 1, 1)
->getOutput();
if (has_pReLU) {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
// last operator is a ConvTransNHWC
{
auto &[n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU] =
fsrcnn_config[fsrcnn_config.size() - 1];
IT_ASSERT(input->getDims()[3] == f);
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
}
return g;
}
Graph getLongformer(Runtime runtime, int bs) {
const int seqlen = 10000, w = 1000, featlen = 512, heads = 8, d = 4;
const int hidden = featlen, hiddenPerHead = hidden / heads;
assert(hidden % heads == 0);
Graph g = make_ref<GraphObj>(runtime);
auto i0 = g->addTensor({bs, seqlen, featlen}, DataType::Float32,
TensorType::Input);
auto w0 = g->addTensor({featlen, hidden}, DataType::Float32,
TensorType::Initialized);
auto w1 =
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
auto w2 =
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
// Feed forward
auto w3 =
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
auto bias3 =
g->addTensor({512}, DataType::Float32, TensorType::Initialized);
auto w4 =
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
auto bias4 =
g->addTensor({512}, DataType::Float32, TensorType::Initialized);
auto q0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto k0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto v0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto q1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto k1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto v1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto q2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto k2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto v2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto q3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto k3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto v3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto prob = g->addTensor({bs * heads, seqlen, 2 * w + 1}, DataType::Float32,
TensorType::Other);
auto probSoftmax = g->addTensor({bs * heads, seqlen, 2 * w + 1},
DataType::Float32, TensorType::Other);
auto attn = g->addTensor({bs * heads, seqlen, hiddenPerHead},
DataType::Float32, TensorType::Other);
auto t00 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto t01 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto t02 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
// auto t10 = g->addTensor({bs, seqlen, hidden});
auto t11 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto t12 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
TensorType::Other);
auto output = g->addTensor({bs, seqlen, featlen}, DataType::Float32,
TensorType::Other);
g->addOpWithOutputs<MatmulObj>(i0, w0, q0, false, true);
g->addOpWithOutputs<MatmulObj>(i0, w1, k0, false, true);
g->addOpWithOutputs<MatmulObj>(i0, w2, v0, false, true);
g->addOpWithOutputs<ReshapeObj>(q0, q1);
g->addOpWithOutputs<ReshapeObj>(k0, k1);
g->addOpWithOutputs<ReshapeObj>(v0, v1);
// For example, when perm=(1, 0, 2), given an input tensor of shape (1,
// 2, 3), the output shape will be (2, 1, 3).
g->addOpWithOutputs<TransposeObj>(q1, q2, vector{0, 2, 1, 3});
g->addOpWithOutputs<TransposeObj>(k1, k2, vector{0, 2, 1, 3});
g->addOpWithOutputs<TransposeObj>(v1, v2, vector{0, 2, 1, 3});
g->addOpWithOutputs<ReshapeObj>(q2, q3);
g->addOpWithOutputs<ReshapeObj>(k2, k3);
g->addOpWithOutputs<ReshapeObj>(v2, v3);
// Attention
g->addOpWithOutputs<G2BMMObj>(q3, k3, prob, w, d);
g->addOpWithOutputs<SoftmaxObj>(prob, probSoftmax, 2);
g->addOpWithOutputs<GBMMObj>(probSoftmax, v3, attn, d);
auto attn2 = g->addOp<ReshapeObj>(attn, nullptr,
vector{bs, heads, seqlen, hiddenPerHead})
->getOutput();
auto t000 =
g->addOp<TransposeObj>(attn2, nullptr, vector{0, 2, 1, 3})->getOutput();
g->addOpWithOutputs<ReshapeObj>(t000, t00);
// Feed forward
g->addOpWithOutputs<MatmulObj>(t00, w3, t01, false, true, bias3);
g->addOpWithOutputs<ReluObj>(t01, t02);
g->addOpWithOutputs<MatmulObj>(t02, w4, t11, false, true, bias4);
g->addOpWithOutputs<ReluObj>(t11, t12);
g->addOpWithOutputs<AddObj>(t12, i0, output);
return g;
}
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
IT_ASSERT(0 <= layerId && layerId < 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
vector<tuple<int, int, int, int, bool>> cs{
// Channel, kernelSize, pad, stride, isTanh
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
};
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
for (int i = layerId; i < layerId + 1; ++i) {
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
int f = input->getDims()[3]; // n, h, w, f
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (tanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
void printGraph(Graph g) {
g->print();
puts("============ Data ============");
for (auto t : g->getTensors()) {
dbg(t);
t->printData();
}
}
void initializeGraphTensors(Graph g, double l, double r, bool useInt) {
g->dataMalloc();
auto gen = RandomGenerator(-0.1, 0.1, 0, useInt);
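// NOTE: the l and r arguments are currently ignored; inputs are always drawn
// from [-0.1, 0.1].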
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
}
Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG) {
// Construct new graph
// IT_ASSERT(inG->getInputs().size() == 1);
IT_ASSERT(inG->getOutputs().size() == 1);
bool status = inG->topo_sort();
IT_ASSERT(status);
auto g = make_ref<GraphObj>(runtime);
map<UidBaseType, Tensor> tensors;
for (const auto &t : inG->getTensors())
if (t->getDims().size() != 4)
return nullptr;
auto getTensor = [&g, &tensors](const Tensor &inTensor) {
auto uid = inTensor->getGuid();
if (auto it = tensors.find(uid); it == tensors.end()) {
Shape s = inTensor->getDims();
s = vector{s[0], s[2], s[3], s[1]};
tensors[uid] = g->addTensor(s, inTensor->getDType(),
inTensor->getTensorType());
}
return tensors[uid];
};
for (auto op : inG->getOperators()) {
TensorVec inputs, outputs;
for (auto &t : op->getInputs())
inputs.emplace_back(getTensor(t));
for (auto &t : op->getOutputs())
outputs.emplace_back(getTensor(t));
if (auto cOp = as<ConvObj>(op)) {
const auto &[ph, pw, sh, sw, dh, dw] = cOp->getPadStrideDilation();
auto bias =
cOp->getBias() ? g->cloneTensor(cOp->getBias()) : nullptr;
g->addOpWithOutputs<ConvNHWCObj>(inputs[0], inputs[1], outputs[0],
ph, pw, sh, sw, dh, dw, bias,
cOp->getAct());
} else if (const auto &cOp = as<ConvTransposed2dObj>(op)) {
const auto &[ph, pw, sh, sw, dh, dw] = cOp->getPadStrideDilation();
const auto &[oph, opw] = cOp->getOutputPadding();
auto group = cOp->getNumGroups();
auto bias =
cOp->getBias() ? g->cloneTensor(cOp->getBias()) : nullptr;
g->addOpWithOutputs<ConvTransposed2dNHWCObj>(
inputs[0], inputs[1], outputs[0], ph, pw, sh, sw, dh, dw, oph,
opw, group, bias, cOp->getAct());
} else if (const auto &cOp = as<MaxPoolObj>(op)) {
auto t = g->addOp<ReshapeObj>(inputs[0], nullptr,
cOp->getInputs(0)->getDims())
->getOutput();
auto tt = g->addTensor(cOp->getOutput()->getDims(),
cOp->getOutput()->getDType());
g->cloneOperator(op, {t}, {tt});
g->addOpWithOutputs<ReshapeObj>(tt, outputs[0]);
} else {
dbg(op);
g->cloneOperator(op, inputs, outputs);
}
}
return g;
}
Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules) {
auto runtime = as<CudaRuntimeObj>(_runtime);
// make_ref<NMutator>(NMutator::Mode::RuleBased, metaRules, runtime);
Ref<NMutator> mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased, rules, runtime);
vector<Graph> bestGraphs;
SearchEngine searchEngine(runtime, mutator);
g->dataFree();
return searchEngine.run(g);
}
Graph optimizeModel(Graph g, Runtime _runtime, string name) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Ref<NMutator> mutator = make_ref<NMutator>(NMutator::Mode::Normal, runtime);
vector<Graph> bestGraphs;
SearchEngine searchEngine(runtime, mutator);
g->dataFree();
return searchEngine.run(g);
}
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
vector<int> rules) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
// vector<int>{3, 2, 2, 5, 8, 8, 6, 90}); // Conv2gemm
// vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90}); // TConv
Ref<NMutator> mutator;
if (mode == NMutator::Mode::Normal) {
dbg(mode);
mutator = make_ref<NMutator>(mode, runtime);
} else if (mode == NMutator::Mode::RuleBased) {
dbg(mode, rules);
IT_ASSERT_TODO(rules.size() > 0);
mutator = make_ref<NMutator>(mode, rules, runtime);
} else
IT_TODO_HALT();
vector<Graph> bestGraphs;
SearchEngine searchEngine(runtime, mutator);
g->dataFree();
return searchEngine.run(g);
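// NOTE: everything below is unreachable because of the return above; it is
// kept for reference (the original tuning/verification flow).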
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
runtime->run(g);
// dbg("Baseline graph");
// printGraph(g);
// dbg(runtime->getPerfTime(g, true));
g->dataFree();
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
bestGraph->topo_sort();
// bestGraph->dataMalloc();
// // Initialize inputs with random data
// for (auto t : bestGraph->getInputs()) {
// t->copyData(fuidToInputTensor[t->getFuid()]);
// }
// // Initialize outputs with zeros
// for (auto t : bestGraph->getOutputs()) {
// t->setData(ZeroGenerator());
// }
// dbg(bestGraph);
// dbg(bestGraph->getOutputs());
// if (tuning) {
// runtime->run(bestGraph, true); // Tune kernels
// runtime->run(bestGraph, false); // Execute transformed graph
// // FIXME: g is freed
// auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
// auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
// dbg(go0->equalData(bgo0, 1e-3));
// dbg(runtime->getPerfTime(bestGraph, true));
// dbg(runtime->timeNonCtcOperators(bestGraph));
// // dbg(runtime->timeWithCudaGraph(bestGraph));
// }
// dbg("Best graph");
// printGraph(bestGraph);
return bestGraph;
}
return nullptr;
}
Graph optimizeWithDepthConstraint(Graph g, Runtime _runtime, int maxDepth) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Ref<NMutator> mutator = make_ref<NMutator>(NMutator::Mode::Normal, runtime);
mutator->setMaxDepth(maxDepth);
g->dataFree();
SearchEngine searchEngine(runtime, mutator);
searchEngine.searchFilter = 1;
return searchEngine.run(g);
}
vector<Tensor> runInfoGAN(int nLayers) {
auto cuda = make_ref<CudaRuntimeObj>();
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Graph g = getGANGraph(1, cuda, nLayers, 0);
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
// // Translate OP to membound without derivation
// mutator->setToNaiveMembound();
vector<Graph> bestGraphs;
SearchEngine searchEngine(cuda, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
// auto gen = RandomGenerator(-5, 5, 0, true);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
cuda->run(g);
dbg("Baseline graph");
printGraph(g);
dbg(cuda->getPerfTime(g, true));
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
bestGraph->topo_sort();
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
dbg(bestGraph);
dbg(bestGraph->getOutputs());
cuda->run(bestGraph, true); // Tune kernels
cuda->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
bgo0->printData();
go0->printData();
dbg(cuda->getPerfTime(bestGraph, true));
dbg("Best graph");
printGraph(bestGraph);
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
}
return {};
}
} // namespace infini
#endif

View File

@ -12,12 +12,11 @@ void MatchMemBoundKernel::transform(Formula &origin, int depth, Expr &rCur) {
const auto &inputs = InputVisitor().getInputs(rangeOp);
auto source =
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
auto tensor =
makeTensor(newTensorName(), rangeOp->getOutputShape(), {}, source);
auto tensor = mT(newTensorName(), rangeOp->getOutputShape(), {}, source);
// The original code directly appends the candidate, but it seems this should
// be done by the search.
// appendCanddiate(as<TensorNode>(tensor), depth);
nextStep(origin, depth, rCur, tensor);
}
} // namespace nnet
} // namespace nnet

View File

@ -38,11 +38,10 @@ void Rule3StageSplit::transform(Formula &origin, int depth, Expr &rCur) {
// if no sum iterator, the stage is redundant
assert(!innerSumVars.empty());
auto inner =
makeRangeOperator(innerLoopVars, innerSumVars, cur->getSummand());
auto inner = mL(innerLoopVars, innerSumVars, cur->getSummand());
auto subscriptedInner = make_ref<SubscriptNode>(inner, indexForInner);
auto outer = makeRangeOperator(cur->getLoopVarRanges(), outerSumVars,
subscriptedInner);
auto outer =
mL(cur->getLoopVarRanges(), outerSumVars, subscriptedInner);
outer->setPaddings(cur->getPaddings());
// next searching step
@ -79,4 +78,4 @@ Rule3StageSplit::getSplitSummationIters(RangeOp rangeOp) {
return ret;
}
} // namespace nnet
} // namespace nnet

View File

@ -25,8 +25,8 @@ void Rule6KenerlMatching::transform(Formula &origin, int depth, Expr &rCur) {
}
{ // Match element-wise OP
auto replaces = matchElementWise(cur);
if (!replaces.empty())
dbg(rCur);
// if (!replaces.empty())
// dbg(rCur);
for (auto newCur : replaces)
nextStep(origin, depth, rCur, newCur);
}
@ -50,8 +50,8 @@ VecExpr Rule6KenerlMatching::matchElementWise(const RangeOp &rangeOp) {
const auto &inputs = InputVisitor().getInputs(rangeOp);
auto source =
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
auto newTensor = makeTensor(newTensorName(), newShape, {}, source);
auto newTensor = mT(newTensorName(), newShape, {}, source);
return {newTensor};
}
} // namespace nnet
} // namespace nnet

View File

@ -265,10 +265,9 @@ Expr Rule8GuidedDLT::guidedDLTMoreVar2(const RangeOp &cur,
const auto sourceRoutine = make_ref<ElementWiseNode>(
sourceExpr, vector<Tensor>{originalTensor}, newShape);
// build stage connections
const auto newTensor =
makeTensor(newTensorName(), newShape, {}, sourceRoutine);
const auto &newSub = makeSubscript(
newTensor, VecExpr(tensorDimAxes.begin(), tensorDimAxes.end()));
const auto newTensor = mT(newTensorName(), newShape, {}, sourceRoutine);
const auto &newSub =
mSub(newTensor, VecExpr(tensorDimAxes.begin(), tensorDimAxes.end()));
// TODO [1124]: get variable mapping and reorder L according to it
// dbg(cur, originalSub, newSub, newVarRanges, replace.toReadable(),
// tensorDimAxes, newShape);
@ -311,7 +310,7 @@ Expr Rule8GuidedDLT::buildGuidedDLTSource(const Subscript &originalSub,
vector<VarRangePair> loopVarRangePairs;
for (size_t i = 0; i < tensorDimAxes.size(); ++i)
loopVarRangePairs.emplace_back(tensorDimAxes[i], pair(0, newShape[i]));
return makeRangeOperator(loopVarRangePairs, {}, newSub);
return mL(loopVarRangePairs, {}, newSub);
}
} // namespace nnet
} // namespace nnet

View File

@ -47,8 +47,8 @@ Rule90TwoStageElementWise::matchTwoStageElementWise(const RangeOp &rangeOp) {
const auto &inputs = InputVisitor().getInputs(rangeOp);
auto source =
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
auto newTensor = makeTensor(newTensorName(), newShape, {}, source);
auto newTensor = mT(newTensorName(), newShape, {}, source);
return {newTensor};
}
} // namespace nnet
} // namespace nnet

View File

@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
oss << "==> ROOT\n" << root->toReadable() << "\n";
for (size_t i = 0; i < q.size(); ++i) {
const auto &[name, routine, tensor] = q[i];
oss << "==> " << name << " : ";
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
<< " : ";
if (routine) {
oss << routine->toReadable() << "\n";
if (routine->getExpr()) {

View File

@ -45,27 +45,26 @@ VecExpr MatmulTransposeMutator::transpose(const Tensor &tensor) {
auto _va = make_ref<VarNode>("transA");
auto _vb = make_ref<VarNode>("transB");
auto _vc = make_ref<VarNode>("swapAB");
auto fakeSub = makeSubscript(matmul->getExpr(), {_va, _vb});
auto fakeRangeWrapperForHackHash =
makeRangeOperator({{_va, {0, Atrans + 100}},
{_vb, {0, Btrans + 100}},
{_vc, {0, ABswap + 100}}},
{}, fakeSub);
auto fakeSub = mSub(matmul->getExpr(), {_va, _vb});
auto fakeRangeWrapperForHackHash = mL({{_va, {0, Atrans + 100}},
{_vb, {0, Btrans + 100}},
{_vc, {0, ABswap + 100}}},
{}, fakeSub);
Matmul newMatmul =
make_ref<MatmulNode>(fakeRangeWrapperForHackHash, inputs[0],
inputs[1], b, m, n, k, transa, transb);
auto newTensor = makeTensor(derivator.newTensorName(), newShape,
newPaddings, newMatmul);
auto newTensor =
mT(derivator.newTensorName(), newShape, newPaddings, newMatmul);
// build output transpose
if (ABswap) {
vector<Var> vars{derivator.getNewVar(), derivator.getNewVar()};
auto sub = makeSubscript(newTensor, {vars[1], vars[0]});
auto sub = mSub(newTensor, {vars[1], vars[0]});
vector<VarRangePair> loopVRs;
// Since the inputs array may be swapped, use the original tensor shape
for (int i = 0; i < 2; ++i) {
loopVRs.emplace_back(vars[i], Range(0, tensor->getShape(i)));
}
auto rangeOp = makeRangeOperator(loopVRs, {}, sub);
auto rangeOp = mL(loopVRs, {}, sub);
ret.emplace_back(rangeOp);
} else
ret.emplace_back(newTensor);
@ -85,8 +84,8 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
assert(!rangeOp->hasPaddings());
// auto paddings = rangeOp->getPaddings();
// std::swap(paddings[0], paddings[1]);
auto sub = makeSubscript(rangeOp, {loopVRs[1].first, loopVRs[0].first});
auto newRangeOp = makeRangeOperator(loopVRs, {}, sub);
auto sub = mSub(rangeOp, {loopVRs[1].first, loopVRs[0].first});
auto newRangeOp = mL(loopVRs, {}, sub);
// ElementWise newElementWise = make_ref<ElementWiseNode>(*ew);
auto outputShape = ew->getOutputShape();
std::swap(outputShape[0], outputShape[1]);
@ -97,8 +96,8 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
auto tensorPaddings = tensor->getPaddings();
std::swap(tensorShape[0], tensorShape[1]);
std::swap(tensorPaddings[0], tensorPaddings[1]);
ret = makeTensor(derivator.newTensorName(), tensorShape, tensorPaddings,
newElementWise);
ret = mT(derivator.newTensorName(), tensorShape, tensorPaddings,
newElementWise);
// } else if (!tensor->getSource()) {
} else {
nnet_unimplemented_continue();
@ -107,4 +106,4 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
return ret;
}
} // namespace nnet
} // namespace nnet

View File

@ -45,6 +45,8 @@ Expr MergeMemboundMutator::merge(bool allowEmptyMembound, bool allowFailure) {
curExpr = sub->getObjectPtr();
else
break;
} else if (auto funcOp = as<BinaryOpNode>(summand)) {
break;
} else {
if (allowFailure)
return nullptr;
@ -143,4 +145,4 @@ Expr MergeMemboundMutator::rule4StageMerging(Expr &rCur,
return merged;
}
} // namespace nnet
} // namespace nnet

View File

@ -32,8 +32,7 @@ RangeOp PatternMatcher::getOffsetCur() {
}
auto newSummand = ReplaceKit::replaceMultipleExprs(
originalCur->getSummand(), itersFromNonZero, psis);
return makeRangeOperator(newLoopVarRanges, originalCur->getSumVarRanges(),
newSummand);
return mL(newLoopVarRanges, originalCur->getSumVarRanges(), newSummand);
}
VecExpr PatternMatcher::matchKernel(const Pattern &pattern,
@ -106,9 +105,9 @@ VecExpr PatternMatcher::applyWrapper(const VecExpr &exprs) {
}
}
for (auto &expr : exprs) {
auto newSub = makeSubscript(expr, indexes);
ret.emplace_back(makeRangeOperator(originalCur->getLoopVarRanges(), {},
newSub, originalCur->getPaddings()));
auto newSub = mSub(expr, indexes);
ret.emplace_back(mL(originalCur->getLoopVarRanges(), {}, newSub,
originalCur->getPaddings()));
}
return ret;
}

View File

@ -73,17 +73,52 @@ string Serializer::visit_(const Tensor &c) {
return key;
}
bool Serializer::serialize(const Expr &expr, const string &filePath,
const string &msg) {
string Serializer::visit_(const Func &c) {
const string key = std::to_string(id++);
j[key]["type"] = c->getType();
j[key]["funcType"] = c->getFuncType();
j[key]["object"] = dispatch(c->getObject());
return key;
}
std::optional<std::string> Serializer::toString(const Expr &expr,
const string &msg,
vector<Tensor> inputs,
double exec_time, string hint) {
// Metadata
j["Version"] = VERSION;
j["Msg"] = msg;
j["exec_time"] = exec_time;
j["hint"] = hint;
// Expressions and routines
id = 0;
dispatch(expr);
std::ofstream fout(filePath);
fout << std::setw(4) << j << std::endl;
return true;
// Input tensors
vector<string> inputsIndices;
for (const auto &tensor : inputs) {
inputsIndices.emplace_back(std::to_string(id));
dispatch(tensor);
}
j["nnetInputs"] = inputsIndices;
// To string
std::stringstream ss;
ss << std::setw(4) << j << std::endl;
return {ss.str()};
}
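// A minimal round-trip sketch (assumed usage, not part of this change):
//   Serializer se;
//   auto text = se.toString(expr, "msg", inputs, /*exec_time=*/0.0, /*hint=*/"");
//   Expr restored = text ? Serializer().fromString(*text) : nullptr;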
bool Serializer::toFile(const Expr &expr, const string &filePath,
const string &msg, vector<Tensor> inputs,
double exec_time, string hint) {
if (auto s = toString(expr, msg, inputs, exec_time, hint)) {
// Write to file
std::ofstream fout(filePath);
fout << *s;
return true;
} else {
return false;
}
}
string Serializer::dispatchRoutine(const Routine &c) {
@ -129,7 +164,15 @@ string Serializer::dispatchRoutine(const Routine &c) {
return key;
}
Expr Serializer::deserialize(const string &filePath) {
Expr Serializer::fromString(const string &text) {
std::stringstream str;
str << text;
str >> j;
assert(j["Version"] == VERSION);
return buildExprTree("0");
}
Expr Serializer::fromFile(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
@ -160,7 +203,7 @@ Expr Serializer::buildExprTree(string key) {
}
auto summand = buildExprTree(j[key]["summand"]);
auto paddings = j[key]["paddings"].get<std::vector<int>>();
auto rangeOp = makeRangeOperator(loopIters, sumIters, summand);
auto rangeOp = mL(loopIters, sumIters, summand);
rangeOp->setPaddings(paddings);
return rangeOp;
}
@ -180,6 +223,10 @@ Expr Serializer::buildExprTree(string key) {
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
j[key]["paddings"], source);
}
case NodeType::FuncNodeType: {
auto object = buildExprTree(j[key]["object"]);
return make_ref<FuncNode>(object, j[key]["funcType"]);
}
default: {
nnet_unimplemented_halt();
break;
@ -242,4 +289,25 @@ Routine Serializer::buildRoutine(string key) {
return nullptr;
}
} // namespace nnet
tuple<Expr, vector<Tensor>, double, string>
Serializer::deserializeAsMemobundOp(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
tuple<Expr, vector<Tensor>, double, string>
Serializer::membundOpFromString(const string &data) {
j = json::parse(data);
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
} // namespace nnet

View File

@ -136,9 +136,10 @@ void Derivator::dfs(Formula &origin, int depth) {
}
Derivator::Derivator(int maxDepth, bool enableHashPruning, LogMode logMode,
PassMode passMode)
PassMode passMode, bool printAndExit)
: maxDepth(maxDepth), logMode(logMode), passMode(passMode),
enableHashPruning(enableHashPruning), cntAppliedRules(12) {}
enableHashPruning(enableHashPruning), cntAppliedRules(12),
printAndExit(printAndExit) {}
int Derivator::getNumIntermediateStates() { return cntStates; }
@ -405,6 +406,8 @@ Expr Derivator::mergeMemboundStages(VecExpr stages) {
void Derivator::appendCanddiate(const Tensor &tensor, int depth) {
// if (!CountRoutineVisitor().match(tensor, 1, 0, 3))
// return;
if (intermediateStates.size() > 1 && printAndExit)
printDerivationRules();
candidates.emplace_back(tensor, depth);
// dbg("!!!!!!!!!!!!!!!Success!!!!!!!!!!!!!!!");
@ -478,6 +481,7 @@ void Derivator::printStatistics() {
printf("#Hashed intermediate states = %lu\n", visited.size());
printf("#Iteratos = %d\n", nIteratorNames);
printf("#Tensors = %d\n", nTensorNames);
printf("#Print and Exit mode = %d\n", printAndExit);
}
void Derivator::setDumpFirstSuccess(const string &_logFnPrefix) {
@ -490,6 +494,9 @@ void Derivator::printIntermediateStates() {
// Skip in NoLog mode
if (logMode == LogMode::NoLog)
return;
if (intermediateStates.size() > 1 && printAndExit)
printDerivationRules();
assert(intermediateStates.size() == ruleStates.size());
assert(intermediateStates.size() == ruleMsgs.size());
for (size_t i = 0; i < intermediateStates.size(); ++i) {
@ -499,16 +506,17 @@ void Derivator::printIntermediateStates() {
std::cout << FullPrinterVisitor().print(intermediateStates[i]) << endl;
if (logMode == LogMode::DumpFristCandiate) {
Serializer serializer;
serializer.serialize(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
serializer.toFile(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
}
}
for (size_t i = 0; i < intermediateStates.size(); ++i) {
if (auto cur = as<RangeOpNode>(intermediateStates[i]))
if (CheckOOBVisitor().checkRangeOp(cur)) {
printf("OOB detected depth=%lu\n", i);
}
}
// FIXME
// for (size_t i = 0; i < intermediateStates.size(); ++i) {
// if (auto cur = as<RangeOpNode>(intermediateStates[i]))
// if (CheckOOBVisitor().checkRangeOp(cur)) {
// printf("OOB detected depth=%lu\n", i);
// }
// }
if (logMode == LogMode::DumpFristCandiate) {
puts("Serializaiton finished.");
exit(0);
@ -528,4 +536,23 @@ Derivator::PassMode Derivator::getPassMode() { return passMode; }
Derivator::LogMode Derivator::getLogMode() { return logMode; }
void Derivator::printDerivationRules() {
int cntRules = 0, cntNonGuideRules = 0;
bool startGuided = false;
std::cout << ruleStates.size() << "rules" << std::endl;
for (size_t i = 1; i < ruleStates.size(); ++i) {
int ruleId = ruleStates[i][4] - '0';
if (ruleId != 4)
++cntRules;
if (ruleId == 8)
startGuided = true;
if (!startGuided && ruleId != 4)
++cntNonGuideRules;
}
printf("#Steps w/o converging derivation %d, #Steps w/ converging "
"derivation %d\n",
cntRules, cntNonGuideRules);
exit(0);
}
} // namespace nnet

View File

@ -60,7 +60,7 @@ optional<Expr> DLT::apply(const RangeOp &rangeOp, const Subscript &subscript,
// Maybe there are bugs...
// assert(index != nullptr);
if (index == nullptr) {
std::cout << "Warning empty" << std::endl;
// std::cout << "Warning empty" << std::endl;
return {};
}
}
@ -83,12 +83,11 @@ optional<Expr> DLT::apply(const RangeOp &rangeOp, const Subscript &subscript,
// HACK [important] fix this fake tensor.
auto elementRoutine = make_ref<ElementWiseNode>(
// FIXME: implement transpose
// makeTensor(newTensorName + "_DLT", {}), vector<Tensor>{tensor},
// mT(newTensorName + "_DLT", {}), vector<Tensor>{tensor},
// shape0);
makeTensor("__DLT", {}), vector<Tensor>{tensor}, shape0);
auto dltedTensor =
makeTensor(newTensorName, shape0, dltedPaddings, elementRoutine);
auto dltedSubscript = makeSubscript(dltedTensor, index0);
mT("__DLT", {}), vector<Tensor>{tensor}, shape0);
auto dltedTensor = mT(newTensorName, shape0, dltedPaddings, elementRoutine);
auto dltedSubscript = mSub(dltedTensor, index0);
return optional<Expr>(std::in_place, dltedSubscript);
}

View File

@ -1,4 +1,5 @@
#include "nnet/expr.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
namespace nnet {
@ -367,19 +368,19 @@ Expr operator/(const Expr &lhs, const int rhs) {
}
// Wrappers for type deduction
Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts) {
Subscript mSub(const Expr &tensor, const VecExpr &subscripts) {
return make_ref<SubscriptNode>(tensor, subscripts);
}
RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
const vector<VarRangePair> &_sumIters, Expr _summand,
const vector<int> &paddings) {
RangeOp mL(const vector<VarRangePair> &_loopIters,
const vector<VarRangePair> &_sumIters, Expr _summand,
const vector<int> &paddings) {
return make_ref<RangeOpNode>(_loopIters, _sumIters, _summand, paddings);
}
// Wrappers for type deduction
Tensor makeTensor(const string &name, const vector<int> &shape,
const vector<int> &paddings, const Routine &source) {
Tensor mT(const string &name, const vector<int> &shape,
const vector<int> &paddings, const Routine &source) {
if (paddings.size() == 0)
return make_ref<TensorNode>(name, shape,
vector<int>((int)shape.size(), 0), source);
@ -463,4 +464,9 @@ void FuncNode::setObject(Expr e) {
object = e;
}
string RangeOpNode::getFullExpression() {
FullPrinterVisitor printer;
return printer.print(this->shared_from_this());
}
} // namespace nnet
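The renamed helpers mT, mSub, and mL are thin type-deduction wrappers around TensorNode, SubscriptNode, and RangeOpNode. A minimal sketch of how they compose into a loop-and-sum expression, following the signatures above and the call sites in the pattern-matching code (names and sizes are illustrative; default arguments for paddings and routine are assumed to be declared in expr.h, as the two-argument makeTensor/mT calls in this diff suggest):

#include "nnet/expr.h"
using namespace nnet;

// C[m, n] = sum_k A[m, k] * B[k, n], written as a RangeOp over subscripted tensors.
Expr buildToyMatmul(int M, int N, int K) {
    auto m = make_ref<VarNode>("m");
    auto n = make_ref<VarNode>("n");
    auto k = make_ref<VarNode>("k");
    auto A = mT("A", {M, K}); // plain tensor, no paddings, no source routine
    auto B = mT("B", {K, N});
    auto subA = mSub(A, {m, k}); // A[m, k]
    auto subB = mSub(B, {k, n}); // B[k, n]
    return mL({{m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}}, subA * subB);
}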


@ -296,10 +296,9 @@ const Pattern &MatmulPattern::getMatmulPattern() {
auto k = make_ref<VarNode>("_Matmul_k");
auto A = make_ref<TensorNode>("_Matmul_A", vector<int>({M, K}));
auto B = make_ref<TensorNode>("_Matmul_B", vector<int>({N, K}));
auto subA = makeSubscript(A, {m, k});
auto subB = makeSubscript(B, {n, k});
auto range = makeRangeOperator({{m, {0, M}}, {n, {0, N}}},
{{k, {0, K}}}, subA * subB);
auto subA = mSub(A, {m, k});
auto subB = mSub(B, {n, k});
auto range = mL({{m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}}, subA * subB);
auto success = exprIT.analyzeExpr(range);
assert(success);
exprIT.buildTable({0, 1});
@ -317,11 +316,10 @@ const Pattern &ConvPattern::getPattern() {
// auto n = make_ref<VarNode>("_Matmul_n");
auto A = make_ref<TensorNode>("_Conv_A", vector<int>({N, C, H, W}));
auto B = make_ref<TensorNode>("_Conv_K", vector<int>({F, C, R, S}));
auto subA = makeSubscript(A, {n, c, h + r, w + s});
auto subB = makeSubscript(B, {f, c, r, s});
auto range = makeRangeOperator(
{{n, {0, 0}}, {f, {0, 0}}, {h, {0, 0}}, {w, {0, 0}}},
{{c, {0, 0}}, {r, {0, 0}}, {s, {0, 0}}}, subA * subB);
auto subA = mSub(A, {n, c, h + r, w + s});
auto subB = mSub(B, {f, c, r, s});
auto range = mL({{n, {0, 0}}, {f, {0, 0}}, {h, {0, 0}}, {w, {0, 0}}},
{{c, {0, 0}}, {r, {0, 0}}, {s, {0, 0}}}, subA * subB);
auto success = exprIT.analyzeExpr(range);
assert(success);
exprIT.buildTable({0, 1});
@ -350,7 +348,7 @@ Expr ConvPattern::buildExpr(
auto shape = conv->getShape();
auto rangeOpShape = as<RangeOpNode>(expr)->getOutputShape();
assert(shape.size() == rangeOpShape.size());
dbg(shape, rangeOpShape);
// dbg(shape, rangeOpShape);
for (size_t i = 0; i < shape.size(); ++i) {
if (shape[i] != rangeOpShape[i]) {
dbg("Warning: unmatched Conv output", shape, rangeOpShape);
@ -404,11 +402,10 @@ const Pattern &Sg2bmmPattern::getPattern() {
// auto n = make_ref<VarNode>("_Matmul_n");
auto A = make_ref<TensorNode>("_Sg2bmm_A", vector<int>{Batch, M, K});
auto B = make_ref<TensorNode>("_Sg2bmm_B", vector<int>{Batch, M, K});
auto subA = makeSubscript(A, {b, m, k});
auto subB = makeSubscript(B, {b, m + w, k});
auto range =
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {w, {-W, W + 1}}},
{{k, {0, K}}}, subA * subB);
auto subA = mSub(A, {b, m, k});
auto subB = mSub(B, {b, m + w, k});
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {w, {-W, W + 1}}},
{{k, {0, K}}}, subA * subB);
auto success = exprIT.analyzeExpr(range);
assert(success);
exprIT.buildTableWithDefaultMap();
@ -458,11 +455,10 @@ const Pattern &LongformerGBMMPattern::getPattern() {
auto A =
make_ref<TensorNode>("_lo_A", vector<int>{Batch, M, 2 * W + 1});
auto B = make_ref<TensorNode>("_lo_B", vector<int>{Batch, M, N});
auto subA = makeSubscript(A, {b, m, w});
auto subB = makeSubscript(B, {b, m + w, n});
auto range =
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, M}}},
{{w, {-W, W + 1}}}, subA * subB);
auto subA = mSub(A, {b, m, w});
auto subB = mSub(B, {b, m + w, n});
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, M}}},
{{w, {-W, W + 1}}}, subA * subB);
auto success = exprIT.analyzeExpr(range);
assert(success);
exprIT.buildTableWithDefaultMap();
@ -536,11 +532,10 @@ Expr ConvPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W, int F,
DEFINE_VAR(f);
DEFINE_VAR(r);
DEFINE_VAR(s);
auto subA = makeSubscript(A, {n, c, h + r - R / 2, w + s - S / 2});
auto subB = makeSubscript(K, {f, c, r, s});
auto range =
makeRangeOperator({{n, {0, N}}, {f, {0, F}}, {h, {0, H}}, {w, {0, W}}},
{{c, {0, C}}, {r, {0, R}}, {s, {0, S}}}, subA * subB);
auto subA = mSub(A, {n, c, h + r - R / 2, w + s - S / 2});
auto subB = mSub(K, {f, c, r, s});
auto range = mL({{n, {0, N}}, {f, {0, F}}, {h, {0, H}}, {w, {0, W}}},
{{c, {0, C}}, {r, {0, R}}, {s, {0, S}}}, subA * subB);
return range;
}
@ -572,13 +567,13 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
// vector<int>{0, padding, padding, 0});
// auto K = make_ref<TensorNode>("K", vector<int>({R, S, F, C}));
auto subA = makeSubscript(A, {n, x1 + r - 1, y1 + s - 1, f});
auto subA = mSub(A, {n, x1 + r - 1, y1 + s - 1, f});
auto subK =
// makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
makeSubscript(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
// mSub(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
mSub(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
// x1=(h+1)//2, x2=(h+1)%2, y1=(w+1)//2
auto range1 = makeRangeOperator(
auto range1 = mL(
{
{n, {0, N}},
{c, {0, C}},
@ -588,10 +583,10 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
{y2, {0, 2}},
},
{{f, {0, F}}, {r, {0, R / 2}}, {s, {0, S / 2}}}, subA * subK);
auto sub0 = makeSubscript(
auto sub0 = mSub(
range1, {n, c, (h + 1) / 2, (h + 1) % 2, (w + 1) / 2, (w + 1) % 2});
auto range0 = makeRangeOperator(
{{n, {0, N}}, {h, {0, OH}}, {w, {0, OW}}, {c, {0, C}}}, {}, sub0);
auto range0 =
mL({{n, {0, N}}, {h, {0, OH}}, {w, {0, OW}}, {c, {0, C}}}, {}, sub0);
return range0;
}
@ -606,11 +601,10 @@ pair<Expr, pair<Tensor, Tensor>> Sg2bmmPattern::getExpr(int Batch, int M, int K,
auto B = make_ref<TensorNode>("B", vector<int>({Batch, M, K}),
vector<int>{0, D * W, 0});
auto subA = makeSubscript(A, {b, m, k});
auto subB = makeSubscript(B, {b, m + D * (w - W), k});
auto range =
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {w, {0, 2 * W + 1}}},
{{k, {0, K}}}, subA * subB);
auto subA = mSub(A, {b, m, k});
auto subB = mSub(B, {b, m + D * (w - W), k});
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {w, {0, 2 * W + 1}}},
{{k, {0, K}}}, subA * subB);
return {range, {A, B}};
}
@ -624,10 +618,10 @@ LongformerGBMMPattern::getExpr(int Batch, int M, int W, int K, int dilation) {
vector<int>{0, 0, 0});
auto B = make_ref<TensorNode>("B", vector<int>({Batch, M, K}),
vector<int>{0, dilation * W, 0});
auto subA = makeSubscript(A, {b, m, w});
auto subB = makeSubscript(B, {b, m + dilation * w - dilation * W, n});
auto range = makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, K}}},
{{w, {0, 2 * W + 1}}}, subA * subB);
auto subA = mSub(A, {b, m, w});
auto subB = mSub(B, {b, m + dilation * w - dilation * W, n});
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, K}}},
{{w, {0, 2 * W + 1}}}, subA * subB);
return {range, {A, B}};
}
@ -642,10 +636,10 @@ pair<Expr, pair<Tensor, Tensor>> MatmulPattern::getExpr(bool transA,
vector<int>{0, 0, 0});
auto B = make_ref<TensorNode>("B", vector<int>({Batch, K, N}),
vector<int>{0, 0, 0});
auto subA = makeSubscript(A, {b, m, k});
auto subB = makeSubscript(B, {b, k, n});
auto range = makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, N}}},
{{k, {0, K}}}, subA * subB);
auto subA = mSub(A, {b, m, k});
auto subB = mSub(B, {b, k, n});
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}},
subA * subB);
return {range, {A, B}};
}
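For reference, the RangeOp returned by MatmulPattern::getExpr above is the batched matmul written as a loop-and-sum expression; in index notation it reads

C[b, m, n] = sum over k in [0, K) of A[b, m, k] * B[b, k, n],   for b in [0, Batch), m in [0, M), n in [0, N),

which is exactly what the {b, m, n} loop iterators and the single {k} sum iterator encode.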

File diff suppressed because it is too large


@ -17,8 +17,8 @@ RangeOp ReplaceKit::replaceRangeOpIterator(const RangeOp &rangeOp,
replace.oldIters.size() +
replace.newIters.size());
// Check the number of loop iterators
return makeRangeOperator(newVarRangePairs, rangeOp->getSumVarRanges(),
replacedSummand);
return mL(newVarRangePairs, rangeOp->getSumVarRanges(),
replacedSummand);
} else if (replace.iteratorType == IterationType::Sum) {
for (const auto &[var, range] : rangeOp->getSumVarRanges()) {
if (!replace.isReplaced(var))
@ -27,8 +27,8 @@ RangeOp ReplaceKit::replaceRangeOpIterator(const RangeOp &rangeOp,
assert(newVarRangePairs.size() == rangeOp->getSumVarRanges().size() -
replace.oldIters.size() +
replace.newIters.size());
return makeRangeOperator(rangeOp->getLoopVarRanges(), newVarRangePairs,
replacedSummand, rangeOp->getPaddings());
return mL(rangeOp->getLoopVarRanges(), newVarRangePairs,
replacedSummand, rangeOp->getPaddings());
}
assert(false);
return nullptr;
@ -55,7 +55,7 @@ Subscript ReplaceKit::buildSubscirptForLoopVarReplace(const RangeOp &inner,
// } else
// subs.emplace_back(inner->getLoopVar(i));
// }
return makeSubscript(inner, subs);
return mSub(inner, subs);
}
RangeOp
@ -89,4 +89,4 @@ Expr ReplaceKit::replaceExpr(const Expr &cur, const Expr &pattern,
return ret;
}
} // namespace nnet
} // namespace nnet


@ -6,7 +6,7 @@
namespace nnet {
int matchExprResult(Derivator &derivator, string fn) {
auto ans = Serializer().deserialize(fn);
auto ans = Serializer().fromFile(fn);
auto hashAns = HashVisitor()(ans);
int match = 0;
for (const auto &candidate : derivator.getCandidates()) {
@ -19,14 +19,14 @@ int matchExprResult(Derivator &derivator, string fn) {
bool checkExprLogSame(string fnPrefix, int start, int end) {
Serializer serializer;
string fn0 = fnPrefix + to_string(start) + ".expr";
Expr expr0 = serializer.deserialize(fn0);
Expr expr0 = serializer.fromFile(fn0);
RangeOp range0 = as<RangeOpNode>(expr0);
Interpreter interpreter(range0);
auto ans0 = interpreter.interpretUniformSample(range0);
dbg(expr0, ans0);
for (int i = start + 1; i < end; ++i) {
string fn1 = fnPrefix + to_string(i) + ".expr";
Expr expr1 = serializer.deserialize(fn1);
Expr expr1 = serializer.fromFile(fn1);
RangeOp range1 = as<RangeOpNode>(expr1);
dbg(fn1, expr1);
auto ans1 = interpreter.interpretUniformSample(range1);
@ -67,4 +67,4 @@ bool checkExprsEquvivalence(VecExpr exprs) {
return true;
}
} // namespace nnet
} // namespace nnet

src/operators/any.cc (new file, 74 lines)

@ -0,0 +1,74 @@
#include "operators/any.h"
namespace infini {
AnyObj::AnyObj(GraphObj *graph, const TensorVec &inputs,
const TensorVec &outputs, const string &kernelName,
const vector<int> &attr)
: OperatorObj(OpType::Any, inputs, outputs), kernelName(kernelName),
attr(attr) {
IT_ASSERT(checkValid(graph));
// Outputs must be assigned when constructing AnyObj
IT_ASSERT(!outputs.empty());
for (auto &output : outputs)
IT_ASSERT(output != nullptr && output->size() > 0);
}
string AnyObj::toString() const {
std::ostringstream os;
os << "Any[" << getGuid() << "](";
for (size_t i = 0; i < inputs.size(); ++i) {
os << "i" << i << "=" << inputs[i]->getGuid();
if (i != inputs.size() - 1)
os << " ";
}
os << ", ";
for (size_t i = 0; i < outputs.size(); ++i) {
os << "o" << i << "=" << outputs[i]->getGuid();
if (i != outputs.size() - 1)
os << " ";
}
os << ", ";
os << "kernel name: " << kernelName << ", ";
os << "attr = [";
for (size_t i = 0; i < attr.size(); ++i) {
os << attr[i];
if (i != attr.size() - 1)
os << ", ";
}
os << "])\n";
return os.str();
}
optional<vector<Shape>> AnyObj::inferShape(const TensorVec &inputs) const {
vector<Shape> ret;
for (auto output : outputs) {
ret.emplace_back(output->getDims());
}
return ret;
}
const string AnyObj::getKernelName() const { return kernelName; }
vector<int> AnyObj::getOpAttrVector() const { return attr; };
vector<int> AnyObj::getWorkloadVector() const {
vector<int> ret = {};
for (auto &input : inputs) {
auto inputDims = input->getDims();
ret.insert(ret.end(), inputDims.begin(), inputDims.end());
}
for (auto &output : outputs) {
auto outputDims = output->getDims();
ret.insert(ret.end(), outputDims.begin(), outputDims.end());
}
for (auto c : kernelName) {
ret.emplace_back(c);
}
for (auto at : attr) {
ret.emplace_back(at);
}
return ret;
}
} // namespace infini
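Because AnyObj's inferShape only echoes the pre-assigned output dims, the output tensors must exist before the operator is constructed, so AnyObj is added with addOpWithOutputs rather than addOp. A minimal usage sketch, mirroring the CUDA test later in this diff (the kernel name and attribute layout are whatever the registered kernel expects; the graph and tensors are assumed to already exist):

#include "core/graph.h"
#include "operators/any.h"

namespace infini {
// Sketch only: g, in0, in1 and out are a valid graph and pre-created tensors;
// attr carries whatever integers the named kernel expects.
void addConv2dReduceAsAny(Graph g, Tensor in0, Tensor in1, Tensor out,
                          const vector<int> &attr) {
    auto anyOp = g->addOpWithOutputs<AnyObj>(
        TensorVec{in0, in1}, TensorVec{out}, "conv2dreduce_kernel", attr);
    anyOp->print();
}
} // namespace infini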


@ -21,9 +21,8 @@ string ConvBaseObj::toString() const {
std::ostringstream os;
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
os << "(";
if (inputs.size() == 2) {
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
for (auto &input : inputs) {
os << vecToString(input->getDims()) << ",";
}
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
@ -114,6 +113,75 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
return {{{on, oc, oh, ow}}};
}
void ConvNHWCObj::setAuxilaryAttributes(PaddingMode mode) {
const Tensor &input = inputs[0];
const Tensor &weight = inputs[1];
n = input->getDims()[0], c = input->getDims()[3], h = input->getDims()[1],
w = input->getDims()[2], f = weight->getDims()[0], r = weight->getDims()[1],
s = weight->getDims()[2];
if (mode == PaddingMode::Same) {
int oh = h / sh;
int ow = w / sw;
ph = (h - oh * sh + (r - sh) * dh) / 2;
pw = (w - ow * sw + (s - sw) * dw) / 2;
} else if (mode == PaddingMode::Valid) {
ph = pw = 0;
}
}
ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight,
Tensor output, int ph, int pw, int sh, int sw, int dh,
int dw, Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, ph, pw, sh, sw, dh,
dw, input, weight, act) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph));
}
ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight,
Tensor output, PaddingMode mode, int sh, int sw,
int dh, int dw, Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, mode, sh, sw, dh,
dw, input, weight, act) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> ConvNHWCObj::inferShape(const TensorVec &inputs) const {
const auto &input = inputs[0], &weight = inputs[1];
auto n = input->getDims()[0];
auto h = input->getDims()[1];
auto w = input->getDims()[2];
auto f = weight->getDims()[0];
auto r = weight->getDims()[1];
auto s = weight->getDims()[2];
int on = n, oc = f;
int oh = 0, ow = 0;
// For NHWC+FRSC layout, C of input must be divisible by C of weight
if (input->getDims()[3] % weight->getDims()[3] != 0)
return {};
// Set padding size
if (padding == PaddingMode::Other) {
oh = (h - (r - sh) * dh + ph * 2) / sh;
ow = (w - (s - sw) * dw + pw * 2) / sw;
} else if (padding == PaddingMode::Same) {
oh = h / sh;
ow = w / sw;
// ph = (h - oh * sh + (r - sh) * dh) / 2;
// pw = (w - ow * sw + (s - sw) * dw) / 2;
} else if (padding == PaddingMode::Valid) {
int ph = 0;
int pw = 0;
oh = (h - (r - sh) * dh + ph * 2) / sh;
ow = (w - (s - sw) * dw + pw * 2) / sw;
}
return {{{on, oh, ow, oc}}};
}
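// For illustration: with the NHWC test inputs used later in this diff
// (input {1, 4, 4, 3}, weight {2, 3, 3, 3}, ph = pw = 1, sh = 2, sw = 1,
// dh = 1, dw = 2, i.e. PaddingMode::Other), the arithmetic above gives
//   oh = (4 - (3 - 2) * 1 + 1 * 2) / 2 = 2
//   ow = (4 - (3 - 1) * 2 + 1 * 2) / 1 = 2
// so the inferred output shape is {1, 2, 2, 2} in NHWC order (8 elements),
// matching the 8-value expected vector in TEST(cuDNN_Conv, runNHWC).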
ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
Tensor weight, Tensor output, int ph,
int pw, int sh, int sw, int dh, int dw,


@ -0,0 +1,98 @@
#include "operators/conv2dreduce.h"
namespace infini {
Conv2dReduceBase::Conv2dReduceBase(OpType opType, Tensor input, Tensor bias_,
Tensor output, bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_, int dh_,
int dw_)
: OperatorObj(opType, {input}, {output}), bias(bias_), ph(ph_), pw(pw_),
sh(sh_), sw(sw_), dh(dh_), dw(dw_), PReLU(PReLU_), paramReLU(paramReLU_) {
// Expected input shape: (n, h, w, f, r, s)
auto inputShape = input->getDims();
IT_ASSERT(inputShape.size() == 6);
n = inputShape[0];
h = inputShape[1];
w = inputShape[2];
f = inputShape[3];
r = inputShape[4];
s = inputShape[5];
if (bias) {
auto biasShape = bias->getDims();
IT_ASSERT(biasShape.size() == 1);
IT_ASSERT(biasShape[0] == f);
}
}
std::string Conv2dReduceBase::toString() const {
std::ostringstream os;
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
os << "(";
if (inputs.size() == 2) {
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
} else {
os << vecToString(inputs[0]->getDims()) << ",";
}
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
os << "d=[" << dh << "," << dw << "],";
os << "PReLU=" << (PReLU ? "true" : "false") << ",";
// os << "act=" << enum_to_underlying(act) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
if (bias != nullptr) {
os << "bias=" << bias->getGuid() << ",";
}
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
std::vector<int> Conv2dReduceBase::getWorkloadVector() const {
return {enum_to_underlying(type), n, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
}
std::vector<int> Conv2dReduceBase::getOpAttrVector() const {
return {enum_to_underlying(type), ph, pw, sh, sw, dh, dw};
}
Conv2dReduce::Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias,
Tensor output, bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_, int dh_, int dw_)
: Conv2dReduceBase(OpType::Conv2dReduce, input, bias, output, PReLU_,
paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
Conv2dReduce::inferShape(const TensorVec &inputs) const {
// const auto &input = inputs[0], &bias = inputs[1];
int on = n, of = f;
int oh = (h + ph * 2 - dh * (r - 1) - 1) / sh + 1;
int ow = (w + pw * 2 - dw * (s - 1) - 1) / sw + 1;
return {{{on, oh, ow, of}}};
}
Conv2dReduceTranspose::Conv2dReduceTranspose(GraphObj *graph, Tensor input,
Tensor bias, Tensor output,
bool PReLU_, float paramReLU_,
int ph_, int pw_, int sh_, int sw_,
int dh_, int dw_)
: Conv2dReduceBase(OpType::Conv2dReduceTranspose, input, bias, output,
PReLU_, paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
IT_ASSERT(dh_ == 1);
IT_ASSERT(dw_ == 1);
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
Conv2dReduceTranspose::inferShape(const TensorVec &inputs) const {
// const auto &input = inputs[0], &bias = inputs[1];
int on = n, of = f;
int oh = (h - 1) * sh - 2 * ph + dh * (r - 1) + 1;
int ow = (w - 1) * sw - 2 * pw + dw * (s - 1) + 1;
return {{{on, oh, ow, of}}};
}
} // namespace infini
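As a quick check of the Conv2dReduce shape arithmetic above against the conv2dreduce case in the CUDA Any test later in this diff (h = w = 4, r = s = 3, ph = pw = 1, sh = sw = 1, dh = dw = 1):

oh = (4 + 1 * 2 - 1 * (3 - 1) - 1) / 1 + 1 = 4
ow = (4 + 1 * 2 - 1 * (3 - 1) - 1) / 1 + 1 = 4

which matches the oh = 4, ow = 4 values packed into that test's attribute vector.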


@ -2,6 +2,7 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/Visitor/Serializer.h"
namespace infini {
@ -9,8 +10,8 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
const TensorVec &output,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time, std::string hint)
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
: OperatorObj(OpType::MemBound, input, output), expr(expr),
nnetInputs(nnetInputs), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
@ -45,7 +46,7 @@ string MemBoundObj::toString() const {
os << "exec_time=" << exec_time << ", ";
os << "NNet Inputs=[";
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
os << "]";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
@ -60,11 +61,18 @@ string MemBoundObj::toString() const {
optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
// inputs have to match nnetInputs exactly
if (inputs.size() != nnetInputs.size())
if (inputs.size() != nnetInputs.size()) {
std::cout << "Num mismatch" << inputs.size() << " "
<< nnetInputs.size();
return {};
}
for (size_t i = 0; i < inputs.size(); ++i)
if (inputs[i]->getDims() != nnetInputs[i]->getShape())
if (inputs[i]->getDims() != nnetInputs[i]->getShape()) {
std::cout << "Shape mismatch " << inputs[i]
<< vecToString(inputs[i]->getDims()) << " "
<< vecToString(nnetInputs[i]->getShape());
return {};
}
return {{nnet::as<nnet::RangeOpNode>(expr)->getOutputShape()}};
}
@ -83,4 +91,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
nnet::as<nnet::RangeOpNode>(expr));
}
string MemBoundObj::toJson() const {
return *nnet::Serializer().toString(expr, "MemBoundObj::toJson", nnetInputs,
exec_time, hint);
}
} // namespace infini


@ -2,7 +2,8 @@
namespace infini {
ReshapeObj::ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims)
: OperatorObj(OpType::Reshape, {input}, {output}), dims(std::move(dims)) {
: OperatorObj(OpType::Reshape, {input}, {output}),
dims(dims.size() == 0 ? output->getDims() : dims) {
IT_ASSERT(checkValid(graph));
}
@ -19,9 +20,9 @@ optional<vector<Shape>> ReshapeObj::inferShape(const TensorVec &inputs) const {
std::string ReshapeObj::toString() const {
std::ostringstream os;
os << "Reshape[" << getGuid() << "]";
os << "(";
os << "(input dim=";
os << vecToString(inputs[0]->getDims()) << ",";
os << "dims=" << vecToString(dims) << ",";
os << "output dims=" << vecToString(dims) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();


@ -4,13 +4,7 @@ namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
vector<int> permute)
: OperatorObj(OpType::Transpose, {input}, {output}) {
if (permute.size() != 4) {
IT_TODO_HALT();
}
transposePermute[0] = permute[0];
transposePermute[1] = permute[1];
transposePermute[2] = permute[2];
transposePermute[3] = permute[3];
transposePermute = permute;
IT_ASSERT(checkValid(graph));
}
@ -20,7 +14,8 @@ TransposeObj::inferShape(const TensorVec &inputs) const {
auto input = A->getDims();
auto output = input;
for (int i = 0; i < 4; ++i) {
auto nDims = input.size();
for (size_t i = 0; i < nDims; ++i) {
output[i] = input[transposePermute[i]];
}
return {{output}};
@ -32,7 +27,8 @@ std::string TransposeObj::toString() const {
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
os << "output=" << outputs[0]->getGuid() << ",";
os << "perm=" << vecToString(transposePermute) << ")";
return os.str();
}
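With the permutation now stored verbatim, inferShape works for any rank: output[i] = input[permute[i]] for every axis. For example (shapes illustrative), an input of {1, 3, 4, 4} with permute {0, 2, 3, 1} yields {1, 4, 4, 3}, i.e. an NCHW-to-NHWC layout change.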


@ -7,9 +7,10 @@ namespace infini {
TEST(Handler, matmul) {
auto runtime = NativeCpuRuntimeObj::getInstance();
auto handler = make_ref<GraphHandlerObj>(runtime);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32);
auto w = handler->tensor({1, 3, 4}, OnnxDType::UINT32);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32, TensorType::Input);
auto w =
handler->tensor({1, 3, 4}, OnnxDType::UINT32, TensorType::Initialized);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32, TensorType::Other);
handler->matmul(i, w, o, false, false, nullptr, ActType::None);
}


@ -48,7 +48,7 @@ TEST(SubGraphRewriter, subGraphMatch1) {
SubGraphRewriter v(g);
vector<MatchGraph> subgs = v.findMatch(subG);
EXPECT_TRUE(subgs.size() == 2);
EXPECT_TRUE(subgs.size() == 2u);
}
TEST(MatchGraph, single_input) {
@ -116,12 +116,12 @@ TEST(MatchGraph, single_input) {
auto o4 = v.addSubGraph(subG, TensorVec{add1->getOutput(0)});
EXPECT_EQ(g->getOperators().size(), 52);
EXPECT_EQ(g->getOperators().size(), 52u);
vector<MatchGraph> subgs = v.findMatch(subG);
EXPECT_TRUE(subgs.size() == 5);
EXPECT_TRUE(subgs.size() == 5u);
vector<MatchGraph> subgs1 = v.findMatch(subG1);
EXPECT_TRUE(subgs1.size() == 4);
EXPECT_TRUE(subgs1.size() == 4u);
// test replace
Tensor sii0 =
@ -135,7 +135,7 @@ TEST(MatchGraph, single_input) {
}
v.replaceSubGraph(subG, subG2);
EXPECT_EQ(g->getOperators().size(), 37);
EXPECT_EQ(g->getOperators().size(), 37u);
}
TEST(MatchGraph, multi_input) {
@ -186,17 +186,17 @@ TEST(MatchGraph, multi_input) {
nullptr);
auto matches = v.findMatch(subG);
EXPECT_EQ(2, matches.size());
EXPECT_EQ(2u, matches.size());
auto div0 = g->addOp<DivObj>(reduce1->getOutput(0), i2, nullptr);
auto add1 =
g->addOp<AddObj>(sub0->getOutput(), div0->getOutput(), nullptr);
matches = v.findMatch(subG);
EXPECT_EQ(1, matches.size());
EXPECT_EQ(1u, matches.size());
// two matched subgraphs overlapped, so only one subgraph is replaced
v.replaceSubGraph(subG, replaceG);
EXPECT_EQ(1, v.findMatch(replaceG).size());
EXPECT_EQ(1u, v.findMatch(replaceG).size());
}
}
@ -240,7 +240,7 @@ TEST(MatchGraph, multi_output) {
{
auto input = g->cloneTensor(i);
auto outs = v.addSubGraph(subg0, {input});
EXPECT_EQ(2, outs.size());
EXPECT_EQ(2u, outs.size());
Tensor w0 = g->addTensor(Shape{96, 64, 3, 3}, DataType::UInt32);
auto conv0 = g->addOp<ConvObj>(outs[0], w0, nullptr, 1, 1);
auto relu0 = g->addOp<ReluObj>(conv0->getOutput(0), nullptr);
@ -263,11 +263,11 @@ TEST(MatchGraph, multi_output) {
}
auto matches = v.findMatch(subg0);
EXPECT_EQ(1, matches.size());
EXPECT_EQ(1u, matches.size());
v.replaceSubGraph(subg0, subg1);
auto matches2 = v.findMatch(subg1);
EXPECT_EQ(1, matches2.size());
EXPECT_EQ(1u, matches2.size());
}
// gcn
@ -354,16 +354,16 @@ TEST(MatchGraph, multi_input_output) {
v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
auto out1 =
v.addSubGraph(subg1, {maxPool->getOutput(0), relu->getOutput(0)});
EXPECT_EQ(2, out0.size());
EXPECT_EQ(2, out1.size());
EXPECT_EQ(2u, out0.size());
EXPECT_EQ(2u, out1.size());
auto div = g->addOp<DivObj>(out0[0], out1[1], nullptr);
auto sub = g->addOp<SubObj>(out0[1], out1[0], nullptr);
}
EXPECT_EQ(2, v.findMatch(subg0).size());
EXPECT_EQ(2, v.findMatch(subg1).size());
EXPECT_EQ(2u, v.findMatch(subg0).size());
EXPECT_EQ(2u, v.findMatch(subg1).size());
v.replaceSubGraph(subg0, subg2);
EXPECT_EQ(v.findMatch(subg2).size(), 2);
EXPECT_EQ(v.findMatch(subg2).size(), 2u);
}
/* One Node having two or more successors is not supported yet.


@ -0,0 +1,57 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"
#include "test.h"
namespace infini {
TEST(cuda_Any, anyKernel) {
// conv2dreduce
{
// Construct Runtime and graph for CPU and CUDA
Runtime cpu =
NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
auto generator = IncrementalGenerator();
int PRelu = 0, n = 1, h = 4, w = 4, f = 2, r = 3, s = 3, oh = 4, ow = 4,
ph = 1, pw = 1, sh = 1, sw = 1, dh = 1, dw = 1;
string kernelName = "conv2dreduce_kernel";
vector<int> attr{PRelu, n, h, w, f, r, s, oh,
ow, ph, pw, sh, sw, dh, dw};
// Build input data on CPU
Tensor i0Cpu = gCpu->addTensor({n, 1, h, w}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, 1, r, s}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit
// allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
Tensor o0Cuda = gCuda->addTensor({n, f, oh, ow});
auto anyOp = gCuda->addOpWithOutputs<AnyObj>(
TensorVec{i0Cuda, w0Cuda}, TensorVec{o0Cuda}, kernelName, attr);
anyOp->print();
// allocate CUDA memory
gCuda->dataMalloc();
std::cout << "data malloc success..." << std::endl;
// Execute on CUDA
cuda->run(gCuda);
std::cout << "cuda run success..." << std::endl;
// copy output from CUDA to CPU
auto o0Cpu = gCpu->cloneTensor(anyOp->getOutput());
// check results on CPU
EXPECT_TRUE(1);
// print a tensor/operator/graph by print()
gCuda->print();
}
}
} // namespace infini


@ -43,6 +43,42 @@ void testConvCudnn(
gCuda->print();
}
void testConvNHWCCudnn(
const std::function<void(void *, size_t, DataType)> &generator,
vector<float> ansVec) {
// Construct Runtime and graph for CPU and CUDA
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
// Set input data on CPU in a CPU Graph
Tensor i0Cpu = gCpu->addTensor({1, 4, 4, 3}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(generator);
w0Cpu->setData(generator);
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
// Build CUDA graph
auto conv =
gCuda->addOp<ConvNHWCObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
// allocate CUDA memory
gCuda->dataMalloc();
// Execute on CUDA
cuda->run(gCuda);
// copy output from CUDA to CPU
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
o0Cpu->print();
o0Cpu->printData();
// check results on CPU
EXPECT_TRUE(o0Cpu->equalData(ansVec));
// print a tensor/operator/graph by print()
gCuda->print();
}
TEST(cuDNN_Conv, run) {
testConvCudnn(OneGenerator(),
vector<float>{12, 12, 18, 18, 12, 12, 18, 18});
@ -51,6 +87,14 @@ TEST(cuDNN_Conv, run) {
vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
}
TEST(cuDNN_Conv, runNHWC) {
testConvNHWCCudnn(OneGenerator(),
vector<float>{12., 12., 12., 12., 18., 18., 18., 18.});
testConvNHWCCudnn(
IncrementalGenerator(),
vector<float>{3350, 7562, 2306, 5546, 9480, 24546, 7185, 20793});
}
TEST(cuDNN_Conv, tune) {
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);


@ -68,16 +68,16 @@ TEST(cuBLAS_Matmul, tune) {
const int B = 1, M = 4, N = 4096, K = 448;
const bool transA = true, transB = false;
auto cudaRuntime = make_ref<CudaRuntimeObj>();
cudaRuntime->setEnableTF32(true);
Graph g = make_ref<GraphObj>(cudaRuntime);
auto a = g->addTensor(transA ? Shape{B, K, M} : Shape{B, M, K});
auto b = g->addTensor(transB ? Shape{B, N, K} : Shape{B, K, N});
// allocate CUDA memory
auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
g->dataMalloc();
a->setData(IncrementalGenerator());
b->setData(IncrementalGenerator());
auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
matmul->print();
double time = cudaRuntime->getPerfTime(g);
EXPECT_GT(time, 1e-3);
EXPECT_LT(time, 1);

Some files were not shown because too many files have changed in this diff.