forked from jiuyuan/InfiniTensor
Compare commits
96 Commits
Author | SHA1 | Date |
---|---|---|
Liyan Zheng | 1ee4a60af0 | |
Liyan Zheng | 29071ddcac | |
Liyan Zheng | c6c445991a | |
Liyan Zheng | d25b606e12 | |
Liyan Zheng | abcfa76fb5 | |
Liyan Zheng | 6a70555892 | |
Liyan Zheng | f47a411095 | |
Liyan Zheng | df2534d209 | |
Liyan Zheng | a1f02593d3 | |
Liyan Zheng | 65b4b42fa0 | |
Liyan Zheng | b068442bfb | |
Liyan Zheng | c6e7748786 | |
Liyan Zheng | d0ae48d21d | |
Liyan Zheng | c875f3cbb8 | |
Liyan Zheng | 95a8b90fa7 | |
Liyan Zheng | c58b67f743 | |
Liyan Zheng | 75c9226164 | |
Liyan Zheng | f877eca517 | |
Liyan Zheng | b13b799fbe | |
Liyan Zheng | 350fc01d39 | |
whjthu | 71f4f6e9d9 | |
Liyan Zheng | 1408d308cc | |
Liyan Zheng | 11229a2baa | |
Liyan Zheng | 2b85ac41ef | |
Liyan Zheng | 1e46750159 | |
Liyan Zheng | 079985bc8c | |
Liyan Zheng | c1275cddb6 | |
Liyan Zheng | 51cc042f56 | |
Liyan Zheng | 18d6ba4022 | |
Liyan Zheng | 4211fd1f32 | |
xxcclong | 8409c1f9d4 | |
xxcclong | 830b28913c | |
Liyan Zheng | 1ba78d7f89 | |
xxcclong | 777aebafc9 | |
whjthu | 131a679340 | |
Liyan Zheng | 5df2524ff9 | |
Liyan Zheng | f204866d93 | |
Liyan Zheng | b9819e65c1 | |
Liyan Zheng | 7277356744 | |
whjthu | f820117acd | |
whjthu | 1ab2118716 | |
huangshuhong | ff97c879fb | |
Liyan Zheng | acc64fd32c | |
Liyan Zheng | 33ab5dcd3e | |
Liyan Zheng | e2f18272c9 | |
Liyan Zheng | 40e6db6608 | |
Liyan Zheng | c451918224 | |
whjthu | 34ed298725 | |
whjthu | 664f0dbe02 | |
Liyan Zheng | a732b6f176 | |
Liyan Zheng | 0865f8d823 | |
Liyan Zheng | 84f9d6731a | |
Liyan Zheng | 4f02eeb08c | |
whjthu | 225a42f22d | |
Liyan Zheng | 4e9ece76f4 | |
Liyan Zheng | 16a8c5dce5 | |
Liyan Zheng | d051460c23 | |
Liyan Zheng | d8a133684e | |
Liyan Zheng | 9ce21200c4 | |
Liyan Zheng | b943658713 | |
Liyan Zheng | 2cd75bd79b | |
Liyan Zheng | f0fcbe825f | |
huangshuhong | 8c91faa948 | |
huangshuhong | c0ae03a2d7 | |
Liyan Zheng | 0cb8729bc1 | |
YdrMaster | 8bc2d3e48d | |
YdrMaster | 28b123753e | |
Liyan Zheng | 94730d93b5 | |
Liyan Zheng | 6d17c4caa2 | |
Liyan Zheng | 15d0eb79cd | |
Liyan Zheng | 2a343e240e | |
Liyan Zheng | 34ca6bf149 | |
YdrMaster | a6019e79e3 | |
YdrMaster | 4e1cc8d3e4 | |
YdrMaster | 725f9260cf | |
YdrMaster | 0edd138919 | |
Liyan Zheng | 0b23a065ca | |
Liyan Zheng | e86e993ed4 | |
Liyan Zheng | e4c20a9ae2 | |
Liyan Zheng | 537b3b4ea4 | |
Liyan Zheng | 2812900ea2 | |
Liyan Zheng | 01fc19795d | |
Liyan Zheng | afc4123328 | |
Liyan Zheng | b981951a47 | |
Liyan Zheng | 99b5c95455 | |
Liyan Zheng | 9d50b30af8 | |
Liyan Zheng | bc31219bde | |
Liyan Zheng | edf4e33353 | |
Liyan Zheng | 872f3504a9 | |
Liyan Zheng | da49e91ab0 | |
Liyan Zheng | a6b8f344d4 | |
Liyan Zheng | 09293730ea | |
Liyan Zheng | 307614d95d | |
Liyan Zheng | f14edcd52f | |
Liyan Zheng | d2d49c5d4f | |
Liyan Zheng | e72fe79168 |
|
@ -1 +1 @@
|
|||
Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294
|
||||
Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283
|
|
@ -1 +1 @@
|
|||
Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1
|
||||
Subproject commit e2239ee6043f73722e7aa812a459f54a28552929
|
|
@ -1 +1 @@
|
|||
Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9
|
||||
Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756
|
|
@ -1 +1 @@
|
|||
Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406
|
||||
Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae
|
|
@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
|
|||
endif()
|
||||
|
||||
# Python bindings
|
||||
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
|
||||
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
|
||||
pybind11_add_module(backend MODULE ${FFIS})
|
||||
target_link_libraries(backend PRIVATE InfiniTensor)
|
||||
|
||||
|
@ -168,6 +168,7 @@ endif()
|
|||
|
||||
if(USE_CUDA)
|
||||
add_compile_definitions(USE_CUDA=1)
|
||||
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream caputre
|
||||
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
|
||||
set(CMAKE_CUDA_HOST_COMPILER
|
||||
${CMAKE_CXX_COMPILER}
|
||||
|
|
|
@ -81,7 +81,7 @@ import onnx
|
|||
from pyinfinitensor.onnx import OnnxStub
|
||||
from pyinfinitensor import backend
|
||||
|
||||
stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
|
||||
stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
|
||||
```
|
||||
|
||||
[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) 是 onnx 提供的加载函数,将 onnx 文件读取为保存在内存中的 onnx 模型。
|
||||
|
@ -201,7 +201,7 @@ def infer(model: ModelProto, input) -> dict:
|
|||
|
||||
|
||||
model0 = onnx.load(sys.argv[1])
|
||||
model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
|
||||
model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")
|
||||
|
||||
input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
|
||||
input = numpy.random.random(input_shape).astype(numpy.float32)
|
||||
|
|
|
@ -36,7 +36,7 @@ class BangRuntimeObj : public RuntimeObj {
|
|||
bool profiling = false) const;
|
||||
// double runEvaluation(const Graph &graph, int nWarmups,
|
||||
// int nEvaluations) const;
|
||||
void sync() const;
|
||||
void sync() const override;
|
||||
BangPtr alloc(size_t size) override {
|
||||
void *ptr;
|
||||
checkBangError(cnrtMalloc(&ptr, size));
|
||||
|
|
|
@ -75,7 +75,8 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
|||
|
||||
double timeit(
|
||||
const std::function<void()> &func,
|
||||
const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
|
||||
int timingRounds = 200);
|
||||
// HACK: set timeit rounds to 10 for fast debug
|
||||
const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
|
||||
int timingRounds = 100);
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -16,7 +16,8 @@ class GraphObj : public Object {
|
|||
string toString() const override;
|
||||
Runtime getRuntime() const { return runtime; }
|
||||
|
||||
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
|
||||
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
|
||||
TensorType tensorType = TensorType::Other);
|
||||
Tensor addTensor(const Tensor &tensor);
|
||||
TensorVec addTensor(const TensorVec &tensors);
|
||||
/**
|
||||
|
@ -47,6 +48,22 @@ class GraphObj : public Object {
|
|||
return opClone;
|
||||
}
|
||||
|
||||
Operator cloneOpAndCreateOutputs(Operator op, TensorVec inputs) {
|
||||
auto shapes = *op->inferShape(inputs);
|
||||
vector<Tensor> outputs;
|
||||
for (auto shape : shapes)
|
||||
outputs.emplace_back(addTensor(shape));
|
||||
return cloneOperator(op, inputs, outputs);
|
||||
}
|
||||
|
||||
Operator cloneOpAndCreateInputsOutputs(Operator op) {
|
||||
vector<Tensor> inputs;
|
||||
for (auto t : op->getInputs()) {
|
||||
inputs.emplace_back(cloneTensor(t));
|
||||
}
|
||||
return cloneOpAndCreateOutputs(op, inputs);
|
||||
}
|
||||
|
||||
const TensorVec &getTensors() const { return tensors; }
|
||||
const OpVec &getOperators() const { return ops; }
|
||||
OpVec getComputeOps() const;
|
||||
|
@ -62,6 +79,7 @@ class GraphObj : public Object {
|
|||
void optimize();
|
||||
|
||||
void dataMalloc();
|
||||
void dataFree();
|
||||
|
||||
/**
|
||||
* @brief Add an operator and create its outputs. Output tensor arguments
|
||||
|
@ -107,6 +125,11 @@ class GraphObj : public Object {
|
|||
|
||||
bool checkValid() const;
|
||||
|
||||
/// @brief If a tensor has no source and garget, it is independent and
|
||||
/// removed from the graph.
|
||||
/// @return The number of removed tensors.
|
||||
int removeIndependentTensors();
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Add reverse connections and Op relationship in ctor.
|
||||
|
|
|
@ -35,20 +35,33 @@ class GraphHandlerObj {
|
|||
Graph g;
|
||||
|
||||
public:
|
||||
GraphHandlerObj(Runtime runtime)
|
||||
explicit GraphHandlerObj(Runtime runtime)
|
||||
: g(make_ref<GraphObj>(std::move(runtime))) {}
|
||||
|
||||
Tensor tensor(Shape dims, int dtype);
|
||||
explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}
|
||||
|
||||
//------ tensors
|
||||
|
||||
vector<Tensor> inputs() { return g->getInputs(); }
|
||||
|
||||
vector<Tensor> outputs() { return g->getOutputs(); }
|
||||
|
||||
Tensor tensor(Shape dims, int dtype, TensorType ttype);
|
||||
|
||||
//------ operators
|
||||
|
||||
inline OpVec operators() { return g->getOperators(); }
|
||||
OpVec operators() { return g->getOperators(); }
|
||||
|
||||
Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
|
||||
int sh, int sw, int dh, int dw);
|
||||
Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
|
||||
int pw, int sh, int sw, int dh, int dw, int oph,
|
||||
int opw);
|
||||
Tensor convNHWC(Tensor input, Tensor weight, Tensor output, int ph, int pw,
|
||||
int sh, int sw, int dh, int dw);
|
||||
Tensor convTransposed2dNHWC(Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh, int sw, int dh, int dw,
|
||||
int oph, int opw);
|
||||
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
|
||||
Tensor bias, ActType act);
|
||||
Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
|
||||
|
@ -90,18 +103,23 @@ class GraphHandlerObj {
|
|||
const optional<vector<int>> &steps);
|
||||
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
|
||||
const optional<vector<int>> &axes);
|
||||
/// @brief Import memBound operator from a json
|
||||
TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
|
||||
const string &jsonString);
|
||||
|
||||
//------ modifiers
|
||||
|
||||
inline bool topo_sort() { return g->topo_sort(); }
|
||||
bool topo_sort() { return g->topo_sort(); }
|
||||
|
||||
inline void optimize() { g->optimize(); }
|
||||
void optimize() { g->optimize(); }
|
||||
|
||||
//------ runtime
|
||||
|
||||
inline void data_malloc() { g->dataMalloc(); }
|
||||
void data_malloc() { g->dataMalloc(); }
|
||||
|
||||
inline void run() { g->getRuntime()->run(g); }
|
||||
void run() { g->getRuntime()->run(g); }
|
||||
|
||||
Graph getGraph() const;
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -16,6 +16,7 @@ class Mutator {
|
|||
Runtime runtime = NativeCpuRuntimeObj::getInstance())
|
||||
: candidatesLimit(candidatesLimit), runtime(runtime){};
|
||||
virtual ~Mutator(){};
|
||||
bool hasTunedKernel = false;
|
||||
|
||||
virtual vector<Graph> run(const Graph &in_graph) = 0;
|
||||
/**
|
||||
|
@ -30,6 +31,14 @@ class Mutator {
|
|||
virtual bool isMultiBranchMergable(const Graph &in_graph) {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
/// @brief Fuse memory bound operators.
|
||||
/// @return The graph after fusion. Return `nullptr` if fails.
|
||||
virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
|
||||
|
||||
/// @brief Eliminate transpose and reshape.
|
||||
/// @return The graph after elimination. Return `nullptr` if fails.
|
||||
virtual Graph eliminateVertically(const Graph &in_graph) { IT_TODO_HALT(); }
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -11,6 +11,7 @@ enum class OpType {
|
|||
Matmul,
|
||||
ConvTrans,
|
||||
ConvTransNHWC,
|
||||
ConvNHWC,
|
||||
G2BMM,
|
||||
GBMM,
|
||||
Pad,
|
||||
|
@ -102,6 +103,10 @@ enum class OpType {
|
|||
Dropout,
|
||||
//
|
||||
MemBound = 300,
|
||||
//
|
||||
Conv2dReduce = 400,
|
||||
Conv2dReduceTranspose,
|
||||
Any
|
||||
};
|
||||
|
||||
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
||||
|
@ -121,6 +126,8 @@ class OpRegistry {
|
|||
FOP(ConvBackwardData);
|
||||
FOP(Matmul);
|
||||
FOP(ConvTrans);
|
||||
FOP(ConvTransNHWC);
|
||||
FOP(ConvNHWC);
|
||||
FOP(G2BMM);
|
||||
FOP(GBMM);
|
||||
FOP(Pad);
|
||||
|
@ -141,6 +148,7 @@ class OpRegistry {
|
|||
FOP(Reshape);
|
||||
FOP(Identity);
|
||||
FOP(Shape);
|
||||
FOP(Flatten);
|
||||
// element wise
|
||||
FOP(BatchNorm);
|
||||
FOP(Softmax);
|
||||
|
@ -208,8 +216,13 @@ class OpRegistry {
|
|||
FOP(BitRightShift);
|
||||
//
|
||||
FOP(MemBound);
|
||||
//
|
||||
FOP(Conv2dReduce);
|
||||
FOP(Conv2dReduceTranspose);
|
||||
FOP(Any);
|
||||
default:
|
||||
IT_ASSERT(false);
|
||||
IT_ASSERT(false, "Unknown OpType " +
|
||||
std::to_string(enum_to_underlying(opType)));
|
||||
break;
|
||||
}
|
||||
#undef FOP
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
#include "core/common.h"
|
||||
#include "core/object.h"
|
||||
#include "core/ref.h"
|
||||
#include <memory>
|
||||
|
||||
|
@ -59,10 +60,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
* execution happens.
|
||||
*
|
||||
* @param graph
|
||||
* @param profiling Whether to print breakdown of time
|
||||
* @param printProfiling Whether to print breakdown of time
|
||||
* @return double Return the sum of perf time for each operator
|
||||
*/
|
||||
double getPerfTime(const Graph &graph, bool profiling = false) const;
|
||||
double getPerfTime(const Graph &graph, bool printProfiling = false,
|
||||
bool allowEstimation = false,
|
||||
bool ignoreMemboundOp = false) const;
|
||||
Blob allocBlob(size_t size);
|
||||
bool isCpu() const {
|
||||
return device == Device::CPU || device == Device::INTELCPU;
|
||||
|
@ -76,11 +79,19 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
virtual void copyBlobToCPU(void *dst, const void *src,
|
||||
size_t bytes) const = 0;
|
||||
virtual string toString() const = 0;
|
||||
virtual void sync() const {}
|
||||
|
||||
map<UidBaseType, bool>
|
||||
getCompileTimeComputableAttribute(const Graph &graph) const;
|
||||
|
||||
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
|
||||
int repeat = 1000) const;
|
||||
|
||||
protected:
|
||||
void printProfilingData(double totTime,
|
||||
void printProfilingData(double totalTime,
|
||||
const std::map<OpType, double> &opTime,
|
||||
const std::map<OpType, int> &opCnt) const;
|
||||
const std::map<OpType, int> &opCnt,
|
||||
const std::map<OpType, int> &opNonCtcCnt) const;
|
||||
virtual void copyBlobInsideRuntime(void *dst, const void *src,
|
||||
size_t bytes) const = 0;
|
||||
};
|
||||
|
|
|
@ -4,44 +4,35 @@
|
|||
#include "graph.h"
|
||||
#include "mutator.h"
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
namespace infini {
|
||||
class SearchEngine {
|
||||
private:
|
||||
Runtime runtimeExec;
|
||||
Ref<Mutator> mutator;
|
||||
std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
|
||||
|
||||
public:
|
||||
SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
|
||||
runtimeExec = _runtime;
|
||||
mutator = _mutator;
|
||||
}
|
||||
SearchEngine(Runtime runtime, Ref<Mutator> mutator);
|
||||
~SearchEngine() {}
|
||||
int searchFilter = 0;
|
||||
bool chooseBestMutation = true;
|
||||
|
||||
private: // Configurations
|
||||
size_t partitionThreshold =
|
||||
3; // cut nodes whose #in + #out >= partitionThreshold
|
||||
size_t GRAPH_SIZE = 16; // num of best graphs.
|
||||
|
||||
private: // Composed objects
|
||||
std::shared_ptr<Mutator> mutationEngine;
|
||||
|
||||
public:
|
||||
std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
|
||||
struct GroupEdge {
|
||||
int v, next;
|
||||
GroupEdge() = delete;
|
||||
};
|
||||
|
||||
struct Candidate { // a graph with perf
|
||||
std::shared_ptr<Graph> graph;
|
||||
double perf = INFINITY;
|
||||
};
|
||||
class MetaGraph { // a graph of subgraphs, for searching.
|
||||
public:
|
||||
MetaGraph() {}
|
||||
~MetaGraph() {}
|
||||
// struct Candidate { // a graph with perf
|
||||
// Graph graph;
|
||||
// double perf = INFINITY;
|
||||
// };
|
||||
struct MetaGraphObj { // a graph of subgraphs, for searching.
|
||||
struct Node {
|
||||
Graph graph;
|
||||
std::vector<int> suc;
|
||||
|
@ -50,31 +41,33 @@ class SearchEngine {
|
|||
};
|
||||
std::vector<Node> nodes;
|
||||
};
|
||||
using MetaGraph = Ref<MetaGraphObj>;
|
||||
|
||||
Graph run(const Graph graph); // entrance of search engine.
|
||||
Graph run(const Graph graph); // entrance to search engine.
|
||||
std::vector<Graph> search(const Graph &graph); // search for a partition.
|
||||
|
||||
private:
|
||||
std::vector<Graph> partitionGraph(const Graph graph);
|
||||
std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
|
||||
std::shared_ptr<MetaGraph>
|
||||
buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
|
||||
const std::vector<int> &plan);
|
||||
MetaGraph buildMetaGraphWithGraph(const Graph graph);
|
||||
MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
|
||||
const std::vector<int> &plan);
|
||||
// search horizontal merges
|
||||
std::vector<std::shared_ptr<MetaGraph>>
|
||||
searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
|
||||
void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
|
||||
std::vector<int> &plan, std::vector<int> &frontier,
|
||||
std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
|
||||
void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
|
||||
std::vector<int> &frontier,
|
||||
std::vector<std::vector<int>> &plans,
|
||||
std::unordered_set<uint64_t> &planSet);
|
||||
std::vector<Graph>
|
||||
searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
|
||||
std::vector<Graph> searchMutation(const MetaGraph &metaGraph);
|
||||
|
||||
void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
|
||||
void printMetaGraph(MetaGraph metaGraph);
|
||||
/**
|
||||
* @brief Check whether a multi-brach graph can be merged into a single
|
||||
* branch.
|
||||
*/
|
||||
bool isMultiBranchMergable(const Graph graph);
|
||||
Graph fuseVertically(const Graph &graph);
|
||||
|
||||
double getEstimatedGraphPerf(Graph graph);
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -12,13 +12,14 @@ namespace infini {
|
|||
// TODO: how to deal with this
|
||||
using ShapeElem = int;
|
||||
using Shape = vector<ShapeElem>;
|
||||
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
|
||||
class TensorObj : public TensorBaseObj {
|
||||
private:
|
||||
Shape shape;
|
||||
size_t _size; // Cache of Π(shape).
|
||||
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
|
||||
// scratch have a new id.
|
||||
|
||||
TensorType tensorType;
|
||||
void copyin(const void *ptr, size_t size) {
|
||||
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
|
||||
}
|
||||
|
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
|
|||
}
|
||||
|
||||
public:
|
||||
TensorObj(Shape shape, DataType dtype, Runtime runtime);
|
||||
TensorObj(Shape shape, DataType dtype, Runtime runtime,
|
||||
TensorType tensorType = TensorType::Other);
|
||||
virtual ~TensorObj() {}
|
||||
string toString() const override;
|
||||
|
||||
|
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
|
|||
size_t getOffset(const vector<int> &ds) const;
|
||||
void dataMalloc();
|
||||
UidBaseType getFuid() const { return fuid; }
|
||||
TensorType getTensorType() const { return tensorType; }
|
||||
|
||||
void load(std::string file_path);
|
||||
void save(std::string file_path);
|
||||
|
@ -74,25 +77,9 @@ class TensorObj : public TensorBaseObj {
|
|||
// Thus the internal state of generator cannot be updated.
|
||||
void setData(
|
||||
std::function<void(void *, size_t, DataType)> const &generator) const;
|
||||
Tensor clone() const {
|
||||
auto obj = make_ref<TensorObj>(*this);
|
||||
obj->freeData();
|
||||
obj->targets.clear();
|
||||
obj->source.reset();
|
||||
return obj;
|
||||
}
|
||||
Tensor clone(Runtime runtime) const {
|
||||
auto obj = make_ref<TensorObj>(*this);
|
||||
obj->runtime = runtime;
|
||||
obj->freeData();
|
||||
obj->targets.clear();
|
||||
obj->source.reset();
|
||||
if (hasData()) {
|
||||
obj->dataMalloc();
|
||||
obj->copyData(this);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
void setData(const Blob &_blob) { data = _blob; }
|
||||
Tensor clone() const;
|
||||
Tensor clone(Runtime runtime) const;
|
||||
|
||||
void printData() const;
|
||||
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
|
||||
|
@ -106,13 +93,13 @@ class TensorObj : public TensorBaseObj {
|
|||
size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
|
||||
|
||||
private:
|
||||
template <class T> string dataToString() const {
|
||||
template <class T> string dataToString(void *rawPtr) const {
|
||||
std::stringstream builder;
|
||||
builder << "Tensor: " << guid << std::endl;
|
||||
|
||||
auto numDims = shape.size();
|
||||
auto dimSzVec = vector<int>(numDims, 1);
|
||||
auto ptr = data->getPtr<T *>();
|
||||
T *ptr = (T *)rawPtr;
|
||||
dimSzVec[numDims - 1] = shape[numDims - 1];
|
||||
|
||||
for (int i = numDims - 1; i != 0; --i)
|
||||
|
@ -123,6 +110,12 @@ class TensorObj : public TensorBaseObj {
|
|||
if (i % dimSzVec[j] == 0)
|
||||
builder << "[";
|
||||
|
||||
if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
|
||||
printf("... , ");
|
||||
i = iEnd - 20;
|
||||
continue;
|
||||
}
|
||||
|
||||
builder << ptr[i];
|
||||
for (size_t j = 0; j < numDims; ++j)
|
||||
if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/any.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void any_kernel_mapping(vector<float *> input, vector<float *> output,
|
||||
const string &kernel_name, const vector<int> &attr);
|
||||
|
||||
} // namespace infini
|
|
@ -13,6 +13,7 @@
|
|||
if (cudaSuccess != err) { \
|
||||
fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
|
||||
cudaGetErrorString(err)); \
|
||||
IT_ASSERT(false); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
#pragma once
|
||||
|
||||
namespace infini {
|
||||
|
||||
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
|
||||
int n, int h, int w, int f, int r, int s, int oh,
|
||||
int ow, int ph, int pw, int sh, int sw, int dh,
|
||||
int dw);
|
||||
|
||||
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
|
||||
int act, int n, int h, int w, int f, int r,
|
||||
int s, int oh, int ow, int ph, int pw, int sh,
|
||||
int sw, int dh, int dw);
|
||||
|
||||
void reduceConvRxSToNCHW(float *input, float *bias, float *output, int act,
|
||||
int n, int h, int w, int f, int r, int s, int oh,
|
||||
int ow, int ph, int pw, int sh, int sw, int dh,
|
||||
int dw);
|
||||
|
||||
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
|
||||
int act, int n, int h, int w, int f, int r,
|
||||
int s, int oh, int ow, int ph, int pw, int sh,
|
||||
int sw, int dh, int dw);
|
||||
|
||||
void conv5x5ToConv3x3Reduce(int n, int f, int h, int w, float *input,
|
||||
float *output, float *bias);
|
||||
|
||||
void conv3x3ToReduce(int n, int h, int w, int f, float *input, float *output,
|
||||
float *bias);
|
||||
|
||||
} // namespace infini
|
|
@ -6,44 +6,52 @@ namespace infini {
|
|||
|
||||
class CudaRuntimeObj : public RuntimeObj {
|
||||
private:
|
||||
cudaStream_t stream;
|
||||
cudnnHandle_t cudnn;
|
||||
cublasHandle_t cublas;
|
||||
CudaPtr workspace;
|
||||
size_t workspaceSize;
|
||||
|
||||
public:
|
||||
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
|
||||
// Memory information
|
||||
size_t allocatedGPUMemorySize = 0;
|
||||
map<void *, size_t> allocationMap;
|
||||
|
||||
checkCudnnError(cudnnCreate(&cudnn));
|
||||
checkCublasError(cublasCreate(&cublas));
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 7ll << 30; // 7 GB
|
||||
workspace = alloc(workspaceSize);
|
||||
}
|
||||
virtual ~CudaRuntimeObj() {
|
||||
try {
|
||||
dealloc(workspace);
|
||||
checkCudnnError(cudnnDestroy(cudnn));
|
||||
checkCublasError(cublasDestroy(cublas));
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
|
||||
|
||||
// CUDA device properties
|
||||
cudaDeviceProp deviceProperties;
|
||||
|
||||
bool enableTF32 = false;
|
||||
|
||||
public:
|
||||
CudaRuntimeObj();
|
||||
virtual ~CudaRuntimeObj();
|
||||
string toString() const override;
|
||||
|
||||
void run(const Graph &graph, bool tune = false,
|
||||
bool profiling = false) const;
|
||||
// double runEvaluation(const Graph &graph, int nWarmups,
|
||||
// int nEvaluations) const;
|
||||
void sync() const;
|
||||
void sync() const override;
|
||||
CudaPtr alloc(size_t size) override {
|
||||
void *ptr;
|
||||
// printf("Try to cudaMalloc: %lu bytes\n", size);
|
||||
checkCudaError(cudaMalloc(&ptr, size));
|
||||
// printf("cuda malloc: %p %lu bytes\n", ptr, size);
|
||||
allocatedGPUMemorySize += size;
|
||||
allocationMap[ptr] = size;
|
||||
// printf("cuda malloc: %p %lu bytes, total %lu bytes (%.2lf GB)\n",
|
||||
// ptr,
|
||||
// size, allocatedGPUMemorySize,
|
||||
// double(allocatedGPUMemorySize) / 1024 / 1024 / 1024);
|
||||
return ptr;
|
||||
}
|
||||
void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
|
||||
void dealloc(void *ptr) override {
|
||||
checkCudaError(cudaFree(ptr));
|
||||
allocatedGPUMemorySize -= allocationMap.at(ptr);
|
||||
allocationMap.erase(ptr);
|
||||
// printf("cuda dealloc: %p %lu bytes, total %lu\n", ptr,
|
||||
// allocationMap.at(ptr), allocatedGPUMemorySize);
|
||||
}
|
||||
cudnnHandle_t cudnnHandle() const { return cudnn; }
|
||||
cublasHandle_t cublasHandle() const { return cublas; }
|
||||
size_t getWorkspaceSize() const { return workspaceSize; }
|
||||
|
@ -51,6 +59,10 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
IT_ASSERT(size <= workspaceSize);
|
||||
return workspace;
|
||||
}
|
||||
pair<int, int> getComputeCapacitiy() const {
|
||||
return {deviceProperties.major, deviceProperties.minor};
|
||||
}
|
||||
int getNumSMs() const { return deviceProperties.multiProcessorCount; }
|
||||
|
||||
void copyBlobFromCPU(void *dst, const void *src,
|
||||
size_t bytes) const override {
|
||||
|
@ -69,7 +81,19 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
|
||||
void runWithoutSync(const Graph &graph) const;
|
||||
|
||||
bool isInCudaGraph() const { return cudaGraphStatus; }
|
||||
cudaStream_t getStream() const { return stream; }
|
||||
|
||||
double timeWithCudaGraph(Graph graph, int rounds = 50);
|
||||
double timeWithCudaGraph(vector<std::function<void(void)>> funcs,
|
||||
int rounds = 50);
|
||||
void setEnableTF32(bool state);
|
||||
bool getEnableTF32() const { return enableTF32; }
|
||||
|
||||
private:
|
||||
void tune(const Graph &graph, bool profiling) const;
|
||||
|
||||
void beginCudaGraphStreamCapture();
|
||||
tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
|
||||
};
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include "operators/transpose.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
SmallArray strides, SmallArray outputShape,
|
||||
vector<int> _dims_in, vector<int> _dims_out,
|
||||
vector<int> _perms);
|
||||
|
||||
void invoke_transpose_last_two_dim(float *ptrA, float *ptrB, int dim0, int dim1,
|
||||
int dim2, int numSMs);
|
||||
|
||||
} // namespace infini
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,9 @@
|
|||
#include "core/graph_handler.h"
|
||||
#include "core/mutator.h"
|
||||
#include "core/search_engine.h"
|
||||
|
||||
namespace infini {
|
||||
namespace callback {
|
||||
void exportONNX(const Graph &graph, const string &path);
|
||||
}
|
||||
} // namespace infini
|
|
@ -29,7 +29,7 @@ class MklRuntimeObj : public CpuRuntimeObj {
|
|||
string toString() const override { return "INTELCPU Runtime"; };
|
||||
dnnl::engine getEngine() const { return dnnl::engine(engine, true); }
|
||||
dnnl::stream getStream() const { return dnnl::stream(stream, true); }
|
||||
void sync() const;
|
||||
void sync() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
|
|||
string visit_(const Subscript &c) override;
|
||||
string visit_(const Var &c) override;
|
||||
string visit_(const Tensor &c) override;
|
||||
string visit_(const Func &c) override;
|
||||
string dispatchRoutine(const Routine &c);
|
||||
|
||||
Expr buildExprTree(string key);
|
||||
|
@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
|
|||
Serializer(int _verobse = 0);
|
||||
virtual ~Serializer();
|
||||
|
||||
/**
|
||||
* @brief Serialize the given expression to string
|
||||
*
|
||||
* @param expr The expression to be serialized
|
||||
* @param msg Message of derivation
|
||||
* @param inputs membound operator attributes
|
||||
* @param exec_time membound operator attributes
|
||||
* @param hint membound operator attributes
|
||||
* @return bool Whether the serialization succeed
|
||||
*/
|
||||
std::optional<std::string> toString(Expr const &expr,
|
||||
const string &msg = "",
|
||||
vector<Tensor> inputs = {},
|
||||
double exec_time = -1e9,
|
||||
string hint = "");
|
||||
|
||||
/**
|
||||
* @brief Serialize the given expression to json file
|
||||
*
|
||||
* @param expr The expression to be serialized
|
||||
* @param filePath The path of json file to be output
|
||||
* @param msg Message of derivation
|
||||
* @param inputs membound operator attributes
|
||||
* @param exec_time membound operator attributes
|
||||
* @param hint membound operator attributes
|
||||
* @return bool Whether the serialization succeed
|
||||
*/
|
||||
bool serialize(const Expr &expr, const string &filePath,
|
||||
const string &msg = "");
|
||||
bool toFile(const Expr &expr, const string &filePath,
|
||||
const string &msg = "", vector<Tensor> inputs = {},
|
||||
double exec_time = -1e9, string hint = "");
|
||||
|
||||
/**
|
||||
* @brief Deserialize the given json file to expression
|
||||
*
|
||||
* @param text The text of the expr to be deserialized
|
||||
* @return Expression deserialized from the given json file
|
||||
*/
|
||||
Expr fromString(const string &text);
|
||||
|
||||
/**
|
||||
* @brief Deserialize the given json file to expression
|
||||
|
@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
|
|||
* @param filePath The path to file to be deserialized
|
||||
* @return Expression deserialized from the given json file
|
||||
*/
|
||||
Expr deserialize(const string &filePath);
|
||||
Expr fromFile(const string &filePath);
|
||||
|
||||
tuple<Expr, vector<Tensor>, double, string>
|
||||
deserializeAsMemobundOp(const string &filePath);
|
||||
|
||||
// FIXME: the order of elements in tuple is not consistent with memboundObj
|
||||
// constructor
|
||||
tuple<Expr, vector<Tensor>, double, string>
|
||||
membundOpFromString(const string &data);
|
||||
};
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
|
|||
{ IT_TODO_HALT(); }
|
||||
|
||||
#define nnet_unimplemented_continue() \
|
||||
{ dbg("Unimplemented"); }
|
||||
{}
|
||||
// { dbg("Unimplemented"); }
|
||||
|
||||
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))
|
||||
|
||||
|
|
|
@ -67,11 +67,13 @@ class Derivator {
|
|||
vector<string> ruleStates, ruleMsgs;
|
||||
int cntStates = 0; // the number of intermediate states
|
||||
int searchState = 0; // search state in guided search
|
||||
bool printAndExit;
|
||||
void printDerivationRules();
|
||||
|
||||
public:
|
||||
Derivator(int maxDepth = 8, bool enableHashPruning = true,
|
||||
LogMode mode = LogMode::NoLog,
|
||||
PassMode passMode = PassMode::Debug);
|
||||
PassMode passMode = PassMode::Debug, bool printAndExit = false);
|
||||
void search(Formula &origin, int depth);
|
||||
void ruleBasedDFS(Formula &origin, int depth, vector<int> _rules,
|
||||
map<int, vector<Var>> _substituteRules = {},
|
||||
|
|
|
@ -104,7 +104,7 @@ enum class NodeType {
|
|||
FuncNodeType
|
||||
};
|
||||
|
||||
enum class FuncType { Relu, Tanh, PRelu };
|
||||
enum class FuncType { Relu = 1000, Tanh, PRelu };
|
||||
|
||||
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
|
||||
NodeType getType() const override { return NodeType::CLASS##Type; } \
|
||||
|
@ -206,7 +206,8 @@ struct IterationType {
|
|||
enum { Loop, Sum };
|
||||
constexpr static int NumIterationType = 2;
|
||||
};
|
||||
class RangeOpNode : public OperatorNode {
|
||||
class RangeOpNode : public OperatorNode,
|
||||
public std::enable_shared_from_this<RangeOpNode> {
|
||||
public:
|
||||
enum { Summand, END_POS };
|
||||
constexpr static int Loop = IterationType::Loop;
|
||||
|
@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
|
|||
return 0;
|
||||
};
|
||||
string toReadable() const override;
|
||||
string getFullExpression();
|
||||
const Expr &getSummand() const { return subExprs[Summand]; }
|
||||
const vector<VarRangePair> &getVarRanges(int _index) const {
|
||||
return vars[_index];
|
||||
|
@ -384,13 +386,16 @@ class FuncNode : public ExprNode {
|
|||
};
|
||||
|
||||
// Wrappers for type deduction
|
||||
Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts);
|
||||
RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
|
||||
const vector<VarRangePair> &_sumIters, Expr _summand,
|
||||
const vector<int> &paddings = {});
|
||||
Tensor makeTensor(const string &name, const vector<int> &shape,
|
||||
const vector<int> &paddings = {},
|
||||
const Routine &source = nullptr);
|
||||
|
||||
// make a subscript operator
|
||||
Subscript mSub(const Expr &tensor, const VecExpr &subscripts);
|
||||
// make a range operator
|
||||
RangeOp mL(const vector<VarRangePair> &_loopIters,
|
||||
const vector<VarRangePair> &_sumIters, Expr _summand,
|
||||
const vector<int> &paddings = {});
|
||||
// make a tensor
|
||||
Tensor mT(const string &name, const vector<int> &shape,
|
||||
const vector<int> &paddings = {}, const Routine &source = nullptr);
|
||||
|
||||
// Pretty output for dbg with shared_ptr
|
||||
template <typename T, typename std::enable_if_t<std::is_base_of_v<ExprNode, T>>
|
||||
|
|
|
@ -7,32 +7,53 @@ namespace infini {
|
|||
class NMutator : public Mutator {
|
||||
public:
|
||||
enum class Mode { Normal, ToNaiveMembound, RuleBased };
|
||||
using NameNToTensorT = map<string, Tensor>;
|
||||
|
||||
private:
|
||||
// Suffix -N: NNet objects.
|
||||
// Suffix -T: tpm objects.
|
||||
// Map: NNet tensors -> tpm tensor.
|
||||
std::map<std::string, Tensor> inputsNameNToTensorT;
|
||||
NameNToTensorT inputsNameNToTensorT;
|
||||
Mode mode;
|
||||
const double bandwidth = double(200) * 1024 * 1024 * 1024;
|
||||
// If in RuleBased mode, use derivationRules in derivator
|
||||
const std::vector<int> derivationRules;
|
||||
bool searchFilter = false;
|
||||
bool enableRules = false; // Enable operator-level transformation rules
|
||||
|
||||
public:
|
||||
NMutator(Mode mode = Mode::Normal);
|
||||
NMutator(Mode mode, const std::vector<int> &derivationRules);
|
||||
NMutator(Mode mode = Mode::Normal,
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance(),
|
||||
bool enableRules = false);
|
||||
NMutator(Mode mode, const std::vector<int> &derivationRules,
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance(),
|
||||
bool enableRules = false);
|
||||
~NMutator();
|
||||
|
||||
vector<Graph> run(const Graph &in_graph) override;
|
||||
void setToNaiveMembound();
|
||||
Graph fuseVertically(const Graph &in_graph) override;
|
||||
Graph eliminateVertically(const Graph &in_graph) override;
|
||||
bool isMultiBranchMergable(const Graph &in_graph) override;
|
||||
|
||||
void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
|
||||
void setToNaiveMembound();
|
||||
void setMaxDepth(int _maxDepth) {
|
||||
maxDepth = _maxDepth;
|
||||
searchFilter = true;
|
||||
}
|
||||
long long cntStates = 0;
|
||||
long long cntCandidates = 0;
|
||||
|
||||
private:
|
||||
int maxDepth = 8;
|
||||
nnet::Expr opToExpression(Operator op);
|
||||
/// @brief
|
||||
/// @param op
|
||||
/// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
|
||||
static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
|
||||
static pair<nnet::Expr, NMutator::NameNToTensorT>
|
||||
generateUnaryExpr(const Operator &op);
|
||||
static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
|
||||
|
||||
void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
|
||||
|
||||
/**
|
||||
|
@ -47,12 +68,32 @@ class NMutator : public Mutator {
|
|||
double memboundTime(const Shape &dims);
|
||||
|
||||
// TODO: recover these rules
|
||||
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
|
||||
// Graph transformTConv1x1(Operator op);
|
||||
// Graph transformTConv3x3(Operator op);
|
||||
// Graph transformDialtedConv(Operator op);
|
||||
// Graph transformConv1x1(Operator op);
|
||||
Graph transformConvtransposed1x1(Operator _op);
|
||||
// Graph transformConvtransposed(Operator op);
|
||||
vector<Graph> transformConv1x1(Operator op);
|
||||
vector<Graph> transformConv3x3ONNX(Operator op);
|
||||
Graph transformG2bmm(Operator op);
|
||||
Graph transformGbmm(Operator op);
|
||||
Graph transformDialtedConv(Operator _op);
|
||||
vector<Graph> transformConv1xk(Operator op);
|
||||
// Graph transformConv1xk(Operator op);
|
||||
Graph transformConvToGEMMReduce(Operator _op);
|
||||
Graph transformConvTranposeToGEMMReduce(Operator _op);
|
||||
|
||||
Tensor splitTransposeMerge(Graph g, Tensor A, int dim, int chunkSize,
|
||||
Tensor output = nullptr);
|
||||
|
||||
/// @brief Construct a new graph with a chain of operators. Use the output
|
||||
/// from the previous operator as the input of the next operator. While
|
||||
/// constructing, the input and output tensors from inputGraph are used as
|
||||
/// new constructed graph.
|
||||
/// @param op The operator chain. It can have wrong input/output shapes.
|
||||
/// @return
|
||||
Graph constructGraphByOperatorChain(vector<Operator> ops, Graph inputGraph);
|
||||
|
||||
// Convert an nnet::Expr to an infini::Graph containing corresponding
|
||||
// tensors and operators
|
||||
Graph constructGraphFromExpression(Runtime runtime, nnet::Expr expr);
|
||||
};
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
#ifdef USE_CUDA
|
||||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "core/search_engine.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId);
|
||||
Graph getFSRCNNGraph(int batch, Runtime runtime);
|
||||
Graph getLongformer(Runtime runtime, int bs);
|
||||
vector<Tensor> runInfoGAN(int nLayers);
|
||||
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
|
||||
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
|
||||
vector<int> rules);
|
||||
void initializeGraphTensors(Graph g, double l, double r, bool useInt);
|
||||
Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG);
|
||||
Graph optimizeWithDepthConstraint(Graph g, Runtime _runtime, int maxDepth);
|
||||
Graph optimizeModel(Graph g, Runtime _runtime, string name);
|
||||
Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules);
|
||||
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
|
@ -49,7 +49,7 @@ template <typename R, typename... Args> class Functor<R(Args...)> {
|
|||
virtual R visit_(const Tensor &c, Args... args) FUNCTOR_DEFAULT;
|
||||
virtual R visit_(const Func &c, Args... args) FUNCTOR_DEFAULT;
|
||||
virtual R visitDefault(const Expr &c, [[maybe_unused]] Args... args) {
|
||||
dbg(*c);
|
||||
dbg(*c, c->getType());
|
||||
nnet_assert(0, "Reach unimplemented visit function.");
|
||||
return R();
|
||||
};
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class AnyObj : public OperatorObj {
|
||||
private:
|
||||
string kernelName;
|
||||
vector<int> attr;
|
||||
|
||||
public:
|
||||
AnyObj(GraphObj *graph, const TensorVec &inputs, const TensorVec &outputs,
|
||||
const string &kernelName, const vector<int> &attr);
|
||||
|
||||
OP_CLONE(AnyObj);
|
||||
|
||||
string toString() const override;
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return outputs.size(); }
|
||||
|
||||
const string getKernelName() const;
|
||||
void setAttr(int i, int v) { attr[i] = v; }
|
||||
vector<int> getOpAttrVector() const override;
|
||||
vector<int> getWorkloadVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -98,7 +98,7 @@ class ConvBaseObj : public OperatorObj {
|
|||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
Tensor getBias() const { return inputs[2]; }
|
||||
Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
|
||||
PaddingMode getPaddingMode() const { return padding; }
|
||||
pair<int, int> inferPaddingSize() const;
|
||||
|
||||
|
@ -111,7 +111,7 @@ class ConvBaseObj : public OperatorObj {
|
|||
auto getNCHWFRS() const { return tuple(n, c, h, w, f, r, s); }
|
||||
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
|
||||
int getChannelPerGroup() const {
|
||||
if (type == OpType::ConvTransNHWC) {
|
||||
if (type == OpType::ConvTransNHWC || type == OpType::ConvNHWC) {
|
||||
return inputs[1]->getDims()[3];
|
||||
} else {
|
||||
return inputs[1]->getDims()[1];
|
||||
|
@ -149,6 +149,25 @@ class ConvObj : public ConvBaseObj {
|
|||
void setAuxilaryAttributes(PaddingMode mode) override;
|
||||
};
|
||||
|
||||
class ConvNHWCObj : public ConvBaseObj {
|
||||
public:
|
||||
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
|
||||
Tensor bias = nullptr, ActType act = ActType::None);
|
||||
// Constructors for setting padding mode
|
||||
ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
PaddingMode mode = PaddingMode::Same, int sh = 1, int sw = 1,
|
||||
int dh = 1, int dw = 1, Tensor bias = nullptr,
|
||||
ActType act = ActType::None);
|
||||
OP_CLONE(ConvNHWCObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
int getNumGroups() const override { return c / getChannelPerGroup(); }
|
||||
|
||||
private:
|
||||
void setAuxilaryAttributes(PaddingMode mode) override;
|
||||
};
|
||||
|
||||
class ConvBackwardFilterObj : public ConvBaseObj {
|
||||
private:
|
||||
ActType act;
|
||||
|
@ -220,6 +239,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
|
|||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
int getNumGroups() const override { return group; }
|
||||
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
|
||||
|
||||
private:
|
||||
void setAuxilaryAttributes(PaddingMode mode) override;
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class Conv2dReduceBase : public OperatorObj {
|
||||
protected:
|
||||
Tensor bias;
|
||||
int ph, pw;
|
||||
int sh, sw;
|
||||
int dh, dw;
|
||||
int n, h, w, f, r, s; // c has been reduced
|
||||
bool PReLU;
|
||||
float paramReLU;
|
||||
|
||||
public:
|
||||
Conv2dReduceBase(OpType opType, Tensor input, Tensor bias, Tensor output,
|
||||
bool PReLU_, float paramReLU_, int ph_, int pw_,
|
||||
int sh_ = 1, int sw_ = 1, int dh_ = 1, int dw_ = 1);
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 2; }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
int getDh() const { return dh; }
|
||||
int getDw() const { return dw; }
|
||||
int getPh() const { return ph; }
|
||||
int getPw() const { return pw; }
|
||||
int getSh() const { return sh; }
|
||||
int getSw() const { return sw; }
|
||||
bool getPReLU() const { return PReLU; }
|
||||
float getParamReLU() const { return paramReLU; }
|
||||
|
||||
Tensor getBias() const { return bias; }
|
||||
|
||||
// optional<vector<Shape>> inferShape(const TensorVec &inputs) const
|
||||
// override;
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
class Conv2dReduce : public Conv2dReduceBase {
|
||||
public:
|
||||
Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias, Tensor output,
|
||||
bool PReLU_, float paramReLU_, int ph_, int pw_, int sh_ = 1,
|
||||
int sw_ = 1, int dh_ = 1, int dw_ = 1);
|
||||
OP_CLONE(Conv2dReduce);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
};
|
||||
|
||||
class Conv2dReduceTranspose : public Conv2dReduceBase {
|
||||
public:
|
||||
Conv2dReduceTranspose(GraphObj *graph, Tensor input, Tensor bias,
|
||||
Tensor output, bool PReLU_, float paramReLU_, int ph_,
|
||||
int pw_, int sh_ = 1, int sw_ = 1, int dh_ = 1,
|
||||
int dw_ = 1);
|
||||
OP_CLONE(Conv2dReduceTranspose);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -6,12 +6,17 @@ namespace infini {
|
|||
|
||||
class MemBoundObj : public OperatorObj {
|
||||
private:
|
||||
std::vector<nnet::Tensor> nnetInputs;
|
||||
nnet::Expr expr, simplifiedExpr;
|
||||
nnet::Expr expr;
|
||||
std::vector<nnet::Tensor>
|
||||
nnetInputs; // The order of inputs in nnetInputs should be consistant
|
||||
// with inputs in infinitensor
|
||||
double exec_time;
|
||||
std::string hint;
|
||||
HashType hash, simplifiedHash;
|
||||
int n, f, h, w;
|
||||
|
||||
// Generated attributes
|
||||
HashType hash;
|
||||
nnet::Expr simplifiedExpr;
|
||||
HashType simplifiedHash;
|
||||
|
||||
public:
|
||||
MemBoundObj(GraphObj *graph, const TensorVec &input,
|
||||
|
@ -27,9 +32,12 @@ class MemBoundObj : public OperatorObj {
|
|||
int numOutputs() const override { return outputs.size(); }
|
||||
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
|
||||
const nnet::Expr getNnetExpr() const { return expr; }
|
||||
HashType getHash() const { return hash; }
|
||||
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
|
||||
return {expr, hash};
|
||||
}
|
||||
double getEstimatedTime() const { return exec_time; }
|
||||
string toJson() const;
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
|
|
|
@ -19,7 +19,7 @@ class ReshapeObj : public OperatorObj {
|
|||
* @param output The output tensor.
|
||||
* @param dims The shape of the output tensor.
|
||||
*/
|
||||
ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
|
||||
ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims = {});
|
||||
OP_CLONE(ReshapeObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
@ -60,6 +60,7 @@ class FlattenObj : public OperatorObj {
|
|||
std::string toString() const override;
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return 1; }
|
||||
int getAxis() const { return axis; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
|
|
|
@ -7,7 +7,9 @@ namespace infini {
|
|||
*
|
||||
*/
|
||||
class SliceObj : public OperatorObj {
|
||||
template <class T> struct range_t { T start, end, step; };
|
||||
template <class T> struct range_t {
|
||||
T start, end, step;
|
||||
};
|
||||
vector<range_t<int>> axes;
|
||||
|
||||
public:
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
|
||||
namespace infini {
|
||||
class TransposeObj : public OperatorObj {
|
||||
vector<int> transposePermute;
|
||||
|
||||
public:
|
||||
TransposeObj(GraphObj *graph, Tensor input, Tensor output,
|
||||
vector<int> permute);
|
||||
|
@ -15,7 +17,6 @@ class TransposeObj : public OperatorObj {
|
|||
std::vector<int> getPermute() const { return transposePermute; }
|
||||
|
||||
private:
|
||||
vector<int> transposePermute = {1, 1, 1, 1};
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
|
|
@ -46,10 +46,13 @@ class RandomGenerator : public DataGenerator {
|
|||
std::mt19937 e;
|
||||
std::uniform_int_distribution<int> di;
|
||||
std::uniform_real_distribution<float> dr;
|
||||
bool generateInteger;
|
||||
|
||||
public:
|
||||
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0)
|
||||
: l(l), r(r), e(seed), di(l, r), dr(l, r) {}
|
||||
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
|
||||
bool generateInteger = false)
|
||||
: l(l), r(r), e(seed), di(l, r), dr(l, r),
|
||||
generateInteger(generateInteger) {}
|
||||
virtual ~RandomGenerator() {}
|
||||
|
||||
private:
|
||||
|
@ -60,7 +63,7 @@ class RandomGenerator : public DataGenerator {
|
|||
}
|
||||
void fill(float *data, size_t size) override {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
data[i] = dr(e);
|
||||
data[i] = (generateInteger) ? di(e) : dr(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
namespace infini {
|
||||
|
||||
#define SMALL_ARRAY_SIZE 8
|
||||
struct SmallArray {
|
||||
int data[SMALL_ARRAY_SIZE];
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -25,6 +25,7 @@ from onnx.shape_inference import infer_shapes
|
|||
from onnx.numpy_helper import to_array
|
||||
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
|
||||
from functools import reduce
|
||||
import numpy as np
|
||||
|
||||
|
||||
class OnnxStub:
|
||||
|
@ -37,29 +38,48 @@ class OnnxStub:
|
|||
outputs: Dict[str, backend.Tensor] = {}
|
||||
initializer: Dict[int, TensorProto] = {}
|
||||
handler: backend.GraphHandler
|
||||
disable_check: bool
|
||||
|
||||
def __init__(self, model: ModelProto, runtime):
|
||||
model = infer_shapes(model)
|
||||
self.handler = backend.GraphHandler(runtime)
|
||||
@classmethod
|
||||
def from_onnx(cls, model: ModelProto, runtime, enable_onnx_shape_infernce=True):
|
||||
if enable_onnx_shape_infernce:
|
||||
model = infer_shapes(model)
|
||||
ans = OnnxStub()
|
||||
ans.handler = backend.GraphHandler(runtime)
|
||||
|
||||
tensors: Dict[str, backend.Tensor] = dict()
|
||||
data: Dict[str, TensorProto] = dict()
|
||||
|
||||
cnt_infini_inputs = 0
|
||||
for input in model.graph.input:
|
||||
dims = _take_shape_dim(input.type.tensor_type.shape)
|
||||
tensors[input.name] = self.handler.tensor(
|
||||
dims, input.type.tensor_type.elem_type
|
||||
if input.name.startswith('input'):
|
||||
tensor_type = backend.TensorType.Input
|
||||
cnt_infini_inputs += 1
|
||||
else:
|
||||
tensor_type = backend.TensorType.Initialized
|
||||
tensors[input.name] = ans.handler.tensor(
|
||||
dims,
|
||||
input.type.tensor_type.elem_type,
|
||||
tensor_type,
|
||||
)
|
||||
assert cnt_infini_inputs == 1, f'{cnt_infini_inputs} tensor names start with "input" found.'
|
||||
|
||||
for output in model.graph.output:
|
||||
dims = _take_shape_dim(output.type.tensor_type.shape)
|
||||
tensors[output.name] = self.handler.tensor(
|
||||
dims, output.type.tensor_type.elem_type
|
||||
tensors[output.name] = ans.handler.tensor(
|
||||
dims,
|
||||
output.type.tensor_type.elem_type,
|
||||
backend.TensorType.Other,
|
||||
)
|
||||
|
||||
for initializer in model.graph.initializer:
|
||||
dims = [d for d in initializer.dims]
|
||||
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
|
||||
tensors[initializer.name] = ans.handler.tensor(
|
||||
dims,
|
||||
initializer.data_type,
|
||||
backend.TensorType.Initialized,
|
||||
)
|
||||
data[initializer.name] = initializer
|
||||
|
||||
for node in model.graph.node:
|
||||
|
@ -77,17 +97,18 @@ class OnnxStub:
|
|||
)
|
||||
if p[0] != p[2] or p[1] != p[3]:
|
||||
adapt = "{}-adapt".format(node.output[0])
|
||||
tensors[adapt] = self.handler.pad(
|
||||
tensors[adapt] = ans.handler.pad(
|
||||
tensors[node.input[0]], None, p, [-2, -1]
|
||||
)
|
||||
p = [0, 0, 0, 0]
|
||||
else:
|
||||
adapt = node.input[0]
|
||||
|
||||
if len(node.input) > 2:
|
||||
# HACK: ignore bias
|
||||
if len(node.input) > 3:
|
||||
bias = "{}-bias".format(node.output[0])
|
||||
reshape = "{}-reshape".format(node.output[0])
|
||||
tensors[bias] = self.handler.conv(
|
||||
tensors[bias] = ans.handler.conv(
|
||||
tensors[adapt],
|
||||
tensors[node.input[1]],
|
||||
None,
|
||||
|
@ -98,7 +119,7 @@ class OnnxStub:
|
|||
d[0],
|
||||
d[1],
|
||||
)
|
||||
tensors[reshape] = self.handler.reshape(
|
||||
tensors[reshape] = ans.handler.reshape(
|
||||
tensors[node.input[2]],
|
||||
None,
|
||||
[
|
||||
|
@ -111,13 +132,13 @@ class OnnxStub:
|
|||
1,
|
||||
],
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.add(
|
||||
tensors[node.output[0]] = ans.handler.add(
|
||||
tensors[bias],
|
||||
tensors[reshape],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
else:
|
||||
tensors[node.output[0]] = self.handler.conv(
|
||||
tensors[node.output[0]] = ans.handler.conv(
|
||||
tensors[adapt],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
|
@ -142,7 +163,7 @@ class OnnxStub:
|
|||
attributes[name]
|
||||
for name in ["dilations", "pads", "strides", "output_padding"]
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.convTransposed2d(
|
||||
tensors[node.output[0]] = ans.handler.convTransposed2d(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
|
@ -156,7 +177,7 @@ class OnnxStub:
|
|||
op[1],
|
||||
)
|
||||
elif node.op_type == "MatMul":
|
||||
tensors[node.output[0]] = self.handler.matmul(
|
||||
tensors[node.output[0]] = ans.handler.matmul(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
|
@ -175,7 +196,7 @@ class OnnxStub:
|
|||
# FIXME unsupport attributes: `alpha` `beta`
|
||||
assert alpha == 1.0
|
||||
assert beta == 1.0
|
||||
tensors[node.output[0]] = self.handler.matmul(
|
||||
tensors[node.output[0]] = ans.handler.matmul(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
|
@ -196,7 +217,7 @@ class OnnxStub:
|
|||
attributes[name]
|
||||
for name in ["momentum", "epsilon", "training_mode"]
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.batchNorm(
|
||||
tensors[node.output[0]] = ans.handler.batchNorm(
|
||||
input, output, mean, var, scale, bias, momentum, eps, training != 0
|
||||
)
|
||||
elif node.op_type == "MaxPool":
|
||||
|
@ -215,10 +236,10 @@ class OnnxStub:
|
|||
)
|
||||
if p[0] != p[2] or p[1] != p[3]:
|
||||
adapt = "{}-adapt".format(node.output[0])
|
||||
tensors[adapt] = self.handler.pad(
|
||||
tensors[adapt] = ans.handler.pad(
|
||||
tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.maxPool(
|
||||
tensors[node.output[0]] = ans.handler.maxPool(
|
||||
tensors[adapt],
|
||||
tensors.get(node.output[0]),
|
||||
k[0],
|
||||
|
@ -231,7 +252,7 @@ class OnnxStub:
|
|||
s[1],
|
||||
)
|
||||
else:
|
||||
tensors[node.output[0]] = self.handler.maxPool(
|
||||
tensors[node.output[0]] = ans.handler.maxPool(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
k[0],
|
||||
|
@ -257,10 +278,10 @@ class OnnxStub:
|
|||
)
|
||||
if p[0] != p[2] or p[1] != p[3]:
|
||||
adapt = "{}-adapt".format(node.output[0])
|
||||
tensors[adapt] = self.handler.pad(
|
||||
tensors[adapt] = ans.handler.pad(
|
||||
tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.avgPool(
|
||||
tensors[node.output[0]] = ans.handler.avgPool(
|
||||
tensors[adapt],
|
||||
tensors.get(node.output[0]),
|
||||
k[0],
|
||||
|
@ -273,7 +294,7 @@ class OnnxStub:
|
|||
s[1],
|
||||
)
|
||||
else:
|
||||
tensors[node.output[0]] = self.handler.avgPool(
|
||||
tensors[node.output[0]] = ans.handler.avgPool(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
k[0],
|
||||
|
@ -287,7 +308,7 @@ class OnnxStub:
|
|||
)
|
||||
elif node.op_type == "GlobalAveragePool":
|
||||
[_, _, h, w] = _search_shape(model, node.input[0])
|
||||
tensors[node.output[0]] = self.handler.avgPool(
|
||||
tensors[node.output[0]] = ans.handler.avgPool(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
h,
|
||||
|
@ -300,52 +321,52 @@ class OnnxStub:
|
|||
1,
|
||||
)
|
||||
elif node.op_type == "Add":
|
||||
tensors[node.output[0]] = self.handler.add(
|
||||
tensors[node.output[0]] = ans.handler.add(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Sub":
|
||||
tensors[node.output[0]] = self.handler.sub(
|
||||
tensors[node.output[0]] = ans.handler.sub(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Mul":
|
||||
tensors[node.output[0]] = self.handler.mul(
|
||||
tensors[node.output[0]] = ans.handler.mul(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Div":
|
||||
tensors[node.output[0]] = self.handler.div(
|
||||
tensors[node.output[0]] = ans.handler.div(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Pow":
|
||||
tensors[node.output[0]] = self.handler.pow(
|
||||
tensors[node.output[0]] = ans.handler.pow(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Relu":
|
||||
tensors[node.output[0]] = self.handler.relu(
|
||||
tensors[node.output[0]] = ans.handler.relu(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Sigmoid":
|
||||
tensors[node.output[0]] = self.handler.sigmoid(
|
||||
tensors[node.output[0]] = ans.handler.sigmoid(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Tanh":
|
||||
tensors[node.output[0]] = self.handler.tanh(
|
||||
tensors[node.output[0]] = ans.handler.tanh(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Softmax":
|
||||
tensors[node.output[0]] = self.handler.softmax(
|
||||
tensors[node.output[0]] = ans.handler.softmax(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
next(
|
||||
|
@ -353,34 +374,39 @@ class OnnxStub:
|
|||
),
|
||||
)
|
||||
elif node.op_type == "Abs":
|
||||
tensors[node.output[0]] = self.handler.abs(
|
||||
tensors[node.output[0]] = ans.handler.abs(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Shape":
|
||||
tensors[node.output[0]] = self.handler.shape(
|
||||
tensors[node.output[0]] = ans.handler.shape(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Identity":
|
||||
tensors[node.output[0]] = self.handler.identity(
|
||||
tensors[node.output[0]] = ans.handler.identity(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Flatten":
|
||||
tensors[node.output[0]] = self.handler.flatten(
|
||||
tensors[node.output[0]] = ans.handler.flatten(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
)
|
||||
elif node.op_type == "PRelu":
|
||||
tensors[node.output[0]] = self.handler.pRelu(
|
||||
# HACK: replace PRelu with Relu
|
||||
tensors[node.output[0]] = ans.handler.relu(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
# tensors[node.output[0]] = ans.handler.pRelu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
elif node.op_type == "Clip":
|
||||
tensors[node.output[0]] = self.handler.clip(
|
||||
tensors[node.output[0]] = ans.handler.clip(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
next(_parse_data(data[node.input[1]]).__iter__(), None)
|
||||
|
@ -394,7 +420,7 @@ class OnnxStub:
|
|||
perm = next(
|
||||
(attr.ints for attr in node.attribute if attr.name == "perm"), None
|
||||
)
|
||||
tensors[node.output[0]] = self.handler.transpose(
|
||||
tensors[node.output[0]] = ans.handler.transpose(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
perm,
|
||||
|
@ -409,7 +435,7 @@ class OnnxStub:
|
|||
temp = reduce(lambda acc, x: acc * x, input_shape, 1)
|
||||
if temp < 0:
|
||||
input_shape[input_shape.index(-1)] = size // -temp
|
||||
tensors[node.output[0]] = self.handler.reshape(
|
||||
tensors[node.output[0]] = ans.handler.reshape(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
input_shape,
|
||||
|
@ -426,7 +452,7 @@ class OnnxStub:
|
|||
for i, x in enumerate(input_shape):
|
||||
if i not in axes:
|
||||
output_shape.append(x)
|
||||
tensors[node.output[0]] = self.handler.reshape(
|
||||
tensors[node.output[0]] = ans.handler.reshape(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
output_shape,
|
||||
|
@ -440,13 +466,13 @@ class OnnxStub:
|
|||
)
|
||||
for i in axes:
|
||||
input_shape.insert(i, 1)
|
||||
tensors[node.output[0]] = self.handler.reshape(
|
||||
tensors[node.output[0]] = ans.handler.reshape(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
input_shape,
|
||||
)
|
||||
elif node.op_type == "Concat":
|
||||
tensors[node.output[0]] = self.handler.concat(
|
||||
tensors[node.output[0]] = ans.handler.concat(
|
||||
[tensors[name] for name in node.input],
|
||||
tensors.get(node.output[0]),
|
||||
next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
|
@ -454,7 +480,7 @@ class OnnxStub:
|
|||
elif node.op_type == "Split":
|
||||
for name, tensor in zip(
|
||||
node.output,
|
||||
self.handler.split(
|
||||
ans.handler.split(
|
||||
tensors[node.input[0]],
|
||||
None,
|
||||
next(
|
||||
|
@ -466,14 +492,14 @@ class OnnxStub:
|
|||
):
|
||||
tensors[name] = tensor
|
||||
elif node.op_type == "Gather":
|
||||
tensors[node.output[0]] = self.handler.gather(
|
||||
tensors[node.output[0]] = ans.handler.gather(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
)
|
||||
elif node.op_type == "ReduceMean":
|
||||
tensors[node.output[0]] = self.handler.reduce_mean(
|
||||
tensors[node.output[0]] = ans.handler.reduce_mean(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
tensors[node.input[1]] if len(node.input) > 1 else None,
|
||||
|
@ -481,7 +507,7 @@ class OnnxStub:
|
|||
!= 0,
|
||||
)
|
||||
elif node.op_type == "Slice":
|
||||
tensors[node.output[0]] = self.handler.slice(
|
||||
tensors[node.output[0]] = ans.handler.slice(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
_parse_data(data[node.input[1]]),
|
||||
|
@ -490,7 +516,7 @@ class OnnxStub:
|
|||
_parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
|
||||
)
|
||||
elif node.op_type == "Pad":
|
||||
tensors[node.output[0]] = self.handler.pad(
|
||||
tensors[node.output[0]] = ans.handler.pad(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
_parse_data(data[node.input[1]]),
|
||||
|
@ -499,7 +525,7 @@ class OnnxStub:
|
|||
elif node.op_type == "Dropout":
|
||||
for name, tensor in zip(
|
||||
node.output,
|
||||
self.handler.dropout(
|
||||
ans.handler.dropout(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
tensors.get(node.output[1]) if len(node.output) > 1 else None,
|
||||
|
@ -512,18 +538,35 @@ class OnnxStub:
|
|||
),
|
||||
):
|
||||
tensors[name] = tensor
|
||||
elif node.op_type == "MemBound":
|
||||
attributes = _parse_attribute(node, {"expr": None})
|
||||
expr: str = attributes["expr"]
|
||||
assert expr is not None
|
||||
assert (
|
||||
len(node.output) == 1
|
||||
), """MemBound with multiple
|
||||
outputs requires rewriting the logic of tensor creation"""
|
||||
outputs = ans.handler.memBound(
|
||||
[tensors[name] for name in node.input],
|
||||
tensors.get(node.output[0]),
|
||||
expr,
|
||||
)
|
||||
for name, tensor in zip(node.output, outputs):
|
||||
tensors[name] = tensor
|
||||
else:
|
||||
raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
|
||||
self.handler.data_malloc()
|
||||
# FIXME: do not load data for speed
|
||||
return ans
|
||||
ans.handler.data_malloc()
|
||||
|
||||
for name, obj in tensors.items():
|
||||
tensor = data.get(name)
|
||||
if tensor == None:
|
||||
if any(input.name == name for input in model.graph.input):
|
||||
self.inputs[name] = obj
|
||||
ans.inputs[name] = obj
|
||||
else:
|
||||
self.initializer[obj.fuid()] = tensor
|
||||
ans.initializer[obj.fuid()] = tensor
|
||||
if tensor.data_type == TensorProto.INT32:
|
||||
obj.copyin_int32(_parse_data(tensor))
|
||||
elif tensor.data_type == TensorProto.INT64:
|
||||
|
@ -533,8 +576,19 @@ class OnnxStub:
|
|||
else:
|
||||
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
|
||||
|
||||
for output in model.graph.output:
|
||||
self.outputs[output.name] = tensors[output.name]
|
||||
return ans
|
||||
|
||||
@classmethod
|
||||
def from_graph(cls, g: backend.Graph):
|
||||
ans = OnnxStub()
|
||||
handler = backend.GraphHandler(g)
|
||||
for i, tensor in enumerate(handler.inputs()):
|
||||
ans.inputs["input{}".format(i)] = tensor
|
||||
for i, tensor in enumerate(handler.outputs()):
|
||||
ans.inputs["output{}".format(i)] = tensor
|
||||
ans.handler = handler
|
||||
ans.disable_check = True
|
||||
return ans
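# Usage sketch (hypothetical, not part of this diff): round-tripping an
# optimized backend.Graph back to an ONNX file via this classmethod, mirroring
# the save_onnx helper defined near the end of this file.
#
#     stub = OnnxStub.from_graph(opt_g)  # opt_g: an existing backend.Graph
#     with open("optimized.onnx", "wb") as f:
#         f.write(stub.to_onnx("optimized").SerializeToString())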
|
||||
|
||||
def to_onnx(self, name: str) -> ModelProto:
|
||||
class Context:
|
||||
|
@ -552,6 +606,13 @@ class OnnxStub:
|
|||
outputs: List[ValueInfoProto] = []
|
||||
# saves global input tensors
|
||||
initializers: List[TensorProto] = []
|
||||
# saves global output tensors
|
||||
value_info: List[ValueInfoProto] = []
|
||||
|
||||
enable_check = False
|
||||
|
||||
def __init__(self, enable_check):
|
||||
self.enable_check = enable_check
|
||||
|
||||
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
|
||||
ty = op.op_type()
|
||||
|
@ -562,12 +623,15 @@ class OnnxStub:
|
|||
|
||||
def push_output(self, name: str, tensor: backend.Tensor) -> str:
|
||||
self.names[tensor] = name
|
||||
if not tensor.has_target():
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
if not tensor.has_target(): # if this output is a global output
|
||||
self.outputs.append(value_info)
|
||||
else: # if this output is a local output
|
||||
self.value_info.append(value_info)
|
||||
return name
|
||||
|
||||
def push_input(
|
||||
|
@ -577,7 +641,15 @@ class OnnxStub:
|
|||
# means that this input is a global input
|
||||
if name is None:
|
||||
self.count_in += 1
|
||||
name = "input{}".format(self.count_in)
|
||||
if tensor.getTensorType() == backend.TensorType.Input:
|
||||
name = f"input{self.count_in}_{tensor.guid()}"
|
||||
else:
|
||||
name = f"weight{self.count_in}_{tensor.guid()}"
|
||||
shape = tensor.shape()
|
||||
data = np.random.randn(*shape)
|
||||
self.initializers.append(
|
||||
make_tensor(name, TensorProto.FLOAT, shape, data)
|
||||
)
|
||||
self.names[tensor] = name
|
||||
if init != None:
|
||||
init.name = name
|
||||
|
@ -605,17 +677,25 @@ class OnnxStub:
|
|||
return name
|
||||
|
||||
def push_node(self, node: NodeProto) -> None:
|
||||
check_node(node)
|
||||
if self.enable_check:
|
||||
check_node(node)
|
||||
self.nodes.append(node)
|
||||
|
||||
def build(self, name: str) -> ModelProto:
|
||||
graph = make_graph(
|
||||
self.nodes, name, self.inputs, self.outputs, self.initializers
|
||||
self.nodes,
|
||||
name,
|
||||
self.inputs,
|
||||
self.outputs,
|
||||
self.initializers,
|
||||
value_info=self.value_info,
|
||||
)
|
||||
check_graph(graph)
|
||||
if self.enable_check:
|
||||
check_graph(graph)
|
||||
|
||||
model = make_model(graph)
|
||||
check_model(model)
|
||||
if self.enable_check:
|
||||
check_model(model)
|
||||
|
||||
return model
|
||||
|
||||
|
@ -625,7 +705,7 @@ class OnnxStub:
|
|||
|
||||
ops = self.handler.operators()  # all operators (nodes) in the graph
|
||||
|
||||
ctx = Context()
|
||||
ctx = Context(not self.disable_check)
|
||||
|
||||
for op in ops:
|
||||
ty, name = ctx.name_op(op)
|
||||
|
@ -634,11 +714,11 @@ class OnnxStub:
|
|||
for it in op.inputs()
|
||||
]
|
||||
outputs = [
|
||||
ctx.push_output("{}_{}".format(name, i), it)
|
||||
ctx.push_output(f"{name}_{i}_{it.guid()}", it)
|
||||
for (i, it) in enumerate(op.outputs())
|
||||
]
|
||||
if ty == backend.OpType.Conv:
|
||||
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
|
||||
if ty == backend.OpType.Conv or ty == backend.OpType.ConvNHWC:
|
||||
ph, pw, sh, sw, dh, dw = backend.conv_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
|
@ -651,7 +731,7 @@ class OnnxStub:
|
|||
group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTrans:
|
||||
elif ty == backend.OpType.ConvTrans or ty == backend.OpType.ConvTransNHWC:
|
||||
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
|
@ -729,7 +809,8 @@ class OnnxStub:
|
|||
]:
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Flatten:
|
||||
raise Exception("TODO")
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, axis=backend.flatten_axis_of(op)))
|
||||
elif ty == backend.OpType.Transpose:
|
||||
perm = backend.transpose_permute_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, perm=perm))
|
||||
|
@ -744,7 +825,8 @@ class OnnxStub:
|
|||
shape,
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, allowzero=0))
|
||||
elif ty == backend.OpType.Concat:
|
||||
axis = backend.concat_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
|
||||
|
@ -812,6 +894,62 @@ class OnnxStub:
|
|||
ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Any:
|
||||
kernel_name = backend.any_kernelName_of(op)
|
||||
normal_op = kernel_name != 'Reduce3x3Offset_hint'
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name if normal_op else 'Reduce3x3OffsetPlugin',
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernelName=kernel_name,
|
||||
domain="nnet" if normal_op else None,
|
||||
)
|
||||
)
|
||||
elif ty in [backend.OpType.ConvTransNHWC, backend.OpType.GBMM,
|
||||
backend.OpType.G2BMM]:
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Conv2dReduce:
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Conv2dReduceTranspose:
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MemBound:
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
expr=backend.membound_expr_of(op),
|
||||
hash=str(backend.membound_hash_of(op)),
|
||||
)
|
||||
)
|
||||
else:
|
||||
raise Exception("Unsupported OpType", ty)
|
||||
|
||||
|
@ -828,7 +966,7 @@ class OnnxStub:
|
|||
|
||||
|
||||
def from_onnx(model: ModelProto, runtime):
|
||||
stub = OnnxStub(model, runtime)
|
||||
stub = OnnxStub.from_onnx(model, runtime)
|
||||
return stub.inputs, stub.outputs, stub.handler
|
||||
|
||||
|
||||
|
@ -889,3 +1027,9 @@ def _parse_data(tensor: TensorProto) -> List[Any]:
|
|||
|
||||
def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
|
||||
return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
|
||||
|
||||
|
||||
def save_onnx(opt_g, filename: str):
    stub = OnnxStub.from_graph(opt_g)
    with open(filename, "wb") as f:
        f.write(stub.to_onnx("optimized").SerializeToString())
|
||||
|
|
|
@@ -0,0 +1,18 @@
import subprocess
import re
import os
from .onnx import save_onnx


def get_trt_time(g):
    onnx_filename = '/tmp/tmp.onnx'
    save_onnx(g, onnx_filename)
    plugin_path = os.environ['TRT_PLUGIN']
    # LD_LIBRARY_PATH=$TRT_PLUGIN:$LD_LIBRARY_PATH trtexec --noTF32 --onnx=/home/zly/InfiniTensor_merge/build/opt_resnet.bs16.onnx --plugins=$TRT_PLUGIN/libnvinfer_plugin.so.8.2.0
    res = subprocess.run(
        f'trtexec --noTF32 --onnx={onnx_filename} --plugins={plugin_path}/libnvinfer_plugin.so.8.2.0'.split(' '), capture_output=True)
    p = re.compile('GPU Compute Time.*mean = ([0-9.]+) ms')
    output = res.stdout.decode('utf-8')
    # err = res.stderr.decode('utf-8')
    # print(output, '\n'*5, err)
    return float(p.search(output).group(1))
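A minimal usage sketch for the TensorRT timing helper above (hypothetical driver code, not part of this diff). It assumes trtexec is on PATH, that TRT_PLUGIN points at the directory containing libnvinfer_plugin.so.8.2.0, and that opt_g is an already-built backend.Graph:

import os

os.environ.setdefault('TRT_PLUGIN', '/path/to/trt/plugins')  # placeholder path, an assumption
mean_ms = get_trt_time(opt_g)  # opt_g: an optimized backend.Graph, assumed to exist
print(f'TensorRT mean GPU compute time: {mean_ms:.3f} ms')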
@ -40,9 +40,9 @@ class TestStringMethods(unittest.TestCase):
|
|||
file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
|
||||
)
|
||||
)
|
||||
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
|
||||
"new"
|
||||
)
|
||||
model = OnnxStub.from_onnx(
|
||||
onnx.load(model_file), backend.cpu_runtime()
|
||||
).to_onnx("new")
|
||||
model = infer_shapes(model)
|
||||
|
||||
def test_tensor(self):
|
||||
|
@ -304,16 +304,16 @@ class TestStringMethods(unittest.TestCase):
|
|||
|
||||
def test_frontend(self):
|
||||
handler = backend.GraphHandler(backend.cpu_runtime())
|
||||
a = handler.tensor([1, 2, 3], 12)
|
||||
b = handler.tensor([1, 2, 3], 12)
|
||||
c = handler.tensor([1, 2, 3], 12)
|
||||
d = handler.tensor([1, 2, 3], 12)
|
||||
e = handler.tensor([1, 2, 3], 12)
|
||||
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
|
||||
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
|
||||
c = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
|
||||
d = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
|
||||
e = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
|
||||
|
||||
x = handler.add(
|
||||
handler.add(handler.add(handler.add(a, b, None), c, None), d, None), e, None
|
||||
)
|
||||
y = handler.tensor([3, 2, 1], 12)
|
||||
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
|
||||
handler.reshape(x, y, [3, 2, 1])
|
||||
|
||||
|
||||
|
|
|
@ -1,106 +1,131 @@
|
|||
import re
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from contextlib import redirect_stdout
|
||||
import time
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import tvm
|
||||
from tvm import te, tir, auto_scheduler, topi
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
|
||||
USE_CACHE = True
|
||||
logging.basicConfig()
|
||||
logger = logging.getLogger('InfiniTensor')
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
|
||||
tvm_code, func_name, nnet_expression: str,
|
||||
nnet_simplified_expression: str, hash_code=None):
|
||||
nnet_simplified_expression: str, hash_code: str = None):
|
||||
assert len(input_tensors) == len(input_dtypes)
|
||||
|
||||
logging.debug(f'Work on hash {hash_code}')
|
||||
|
||||
logger.debug(f'Work on hash {hash_code}')
|
||||
dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
|
||||
|
||||
|
||||
if not os.path.exists(dir_name):
|
||||
os.makedirs(dir_name)
|
||||
|
||||
|
||||
so_fn = os.path.join(dir_name, f"{func_name}.so")
|
||||
config_fn = os.path.join(dir_name, "config_so.json")
|
||||
|
||||
print("Generating Ansor op: ")
|
||||
print(tvm_code)
|
||||
|
||||
print("Input shape: ")
|
||||
print(input_tensors)
|
||||
print("Output shape: ")
|
||||
print(output_tensor)
|
||||
|
||||
desc_fn = os.path.join(dir_name, "desc.txt")
|
||||
log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
|
||||
out_fn = os.path.join(dir_name, "out.txt")
|
||||
|
||||
logger.debug(f"Generating Ansor op: {tvm_code}")
|
||||
logger.debug(f"Input shape: {input_tensors}")
|
||||
logger.debug(f"Output shape: {output_tensor}")
|
||||
|
||||
if USE_CACHE and hash_code is not None:
|
||||
if os.path.exists(dir_name) and \
|
||||
os.path.exists(so_fn) and \
|
||||
os.path.exists(config_fn):
|
||||
os.path.exists(so_fn) and \
|
||||
os.path.exists(config_fn):
|
||||
print(f"Use cache in {dir_name}")
|
||||
with open(config_fn, "r") as config_fin:
|
||||
config = json.loads(config_fin.read().strip())
|
||||
conv_time = config["conv_time"]
|
||||
|
||||
logger.debug(f'Find tuning log for {hash_code}')
|
||||
logger.info(f'Found tuning log for {hash_code} in {so_fn}')
|
||||
return so_fn, conv_time
|
||||
|
||||
logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
|
||||
|
||||
time_start = time.perf_counter()
|
||||
# Print descriptions of the task
|
||||
if USE_CACHE and hash_code is not None:
|
||||
with redirect_stdout(open(desc_fn, "w")):
|
||||
print("====NNET tensor expression====")
|
||||
print(nnet_expression+"\n")
|
||||
print("====NNET simplified tensor expression====")
|
||||
print(nnet_simplified_expression+"\n")
|
||||
print("====TVM compute====")
|
||||
print(tvm_code+"\n")
|
||||
print("Input shape: ", input_tensors)
|
||||
print("Output shape: ", output_tensor)
|
||||
|
||||
@auto_scheduler.register_workload(func_name)
|
||||
def compute():
|
||||
_locals = locals()
|
||||
exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
|
||||
exec(tvm_code, {'tvm': tvm, 'te': te,
|
||||
'tir': tir, 'topi': topi}, _locals)
|
||||
return _locals['ret']
|
||||
|
||||
|
||||
target = tvm.target.Target("cuda")
|
||||
|
||||
task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)
|
||||
|
||||
# Inspect the computational graph
|
||||
print("Computational DAG:")
|
||||
print(task.compute_dag)
|
||||
with redirect_stdout(open(out_fn, 'w')):
|
||||
# Inspect the computational graph
|
||||
print("Computational DAG:")
|
||||
print(task.compute_dag)
|
||||
|
||||
log_file = f"ansor_{func_name}_log.json"
|
||||
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
|
||||
tune_option = auto_scheduler.TuningOptions(
|
||||
num_measure_trials=10,
|
||||
runner=measure_ctx.runner,
|
||||
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
|
||||
verbose=2,
|
||||
)
|
||||
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
|
||||
tune_option = auto_scheduler.TuningOptions(
|
||||
num_measure_trials=10,
|
||||
runner=measure_ctx.runner,
|
||||
measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
|
||||
verbose=2,
|
||||
)
|
||||
|
||||
# Run auto-tuning (search)
|
||||
task.tune(tune_option)
|
||||
# Apply the best schedule
|
||||
sch, args = task.apply_best(log_file)
|
||||
# Run auto-tuning (search)
|
||||
task.tune(tune_option)
|
||||
# Apply the best schedule
|
||||
sch, args = task.apply_best(log_fn)
|
||||
|
||||
# Kill the measurement process
|
||||
del measure_ctx
|
||||
|
||||
func = tvm.build(sch, args, target, name=func_name)
|
||||
func.export_library(so_fn)
|
||||
|
||||
ctx = tvm.cuda(0)
|
||||
input_a = []
|
||||
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
|
||||
a_np = np.random.uniform(size=shape).astype(dtype)
|
||||
input_a.append(tvm.nd.array(a_np, ctx))
|
||||
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
|
||||
func(a_out, *input_a)
|
||||
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
|
||||
conv_time = evaluator(a_out, *input_a).mean * 1e3
|
||||
|
||||
time_end = time.perf_counter()
|
||||
|
||||
# Kill the measurement process
|
||||
del measure_ctx
|
||||
|
||||
func = tvm.build(sch, args, target, name=func_name)
|
||||
func.export_library(so_fn)
|
||||
|
||||
ctx = tvm.cuda(0)
|
||||
input_a = []
|
||||
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
|
||||
a_np = np.random.uniform(size=shape).astype(dtype)
|
||||
input_a.append(tvm.nd.array(a_np, ctx))
|
||||
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
|
||||
func(a_out, *input_a)
|
||||
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
|
||||
conv_time = evaluator(a_out, *input_a).mean * 1e3
|
||||
|
||||
print("====NNET tensor expression====")
|
||||
print(nnet_expression+"\n")
|
||||
print("====NNET simplified tensor expression====")
|
||||
print(nnet_simplified_expression+"\n")
|
||||
print("====Time====")
|
||||
print(conv_time)
|
||||
|
||||
if USE_CACHE and hash_code is not None:
|
||||
with open(config_fn, "w") as config_fout:
|
||||
config_fout.write(json.dumps({
|
||||
"conv_time": conv_time,
|
||||
"tuning_time": time_end - time_start,
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
|
||||
}, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
return so_fn, conv_time
|
||||
|
||||
# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.


def pipe_gen(fd: int):
    args = json.load(sys.stdin)  # read from pipe
    # print(args, f'fd={fd}')
    ret = gen_ansor_so(**args)
    with os.fdopen(fd, 'w') as f:
        print(ret[0], file=f, end='')  # write to pipe
|
||||
|
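The kernels tuned by gen_ansor_so above are cached under .cache/generated_kernels/<hash_code>/ together with a config_so.json holding the measured time. A small hypothetical helper for inspecting that cache; only the directory layout and JSON keys used above are assumed:

import json
import os


def list_cached_kernels(cache_root=".cache/generated_kernels"):
    # Print one line per cached kernel: its hash, kernel time, and tuning time.
    for hash_code in sorted(os.listdir(cache_root)):
        config_fn = os.path.join(cache_root, hash_code, "config_so.json")
        if not os.path.exists(config_fn):
            continue
        with open(config_fn) as f:
            config = json.load(f)
        print(hash_code, config["conv_time"], "ms,",
              "tuned in", round(config.get("tuning_time", 0.0), 1), "s")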
|
|
@@ -0,0 +1,7 @@
import backend
from backend import *
import sys

sys.path.extend(__path__)

print("import backend: {}".format(backend))
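An illustration of what the re-export above enables (the package name pyinfinitensor is an assumption; backend.cpu_runtime and backend.GraphHandler appear in the tests earlier in this diff):

from pyinfinitensor import backend  # assumed package name

handler = backend.GraphHandler(backend.cpu_runtime())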
@ -0,0 +1,941 @@
|
|||
import backend
|
||||
import onnx
|
||||
from onnx import (
|
||||
ModelProto,
|
||||
TensorProto,
|
||||
NodeProto,
|
||||
AttributeProto,
|
||||
TensorShapeProto,
|
||||
ValueInfoProto,
|
||||
)
|
||||
from onnx.helper import (
|
||||
make_node,
|
||||
make_tensor_value_info,
|
||||
make_tensor,
|
||||
make_graph,
|
||||
make_model,
|
||||
)
|
||||
from onnx.checker import (
|
||||
check_graph,
|
||||
check_model,
|
||||
check_node,
|
||||
check_value_info,
|
||||
check_tensor,
|
||||
)
|
||||
from onnx.shape_inference import infer_shapes
|
||||
from onnx.numpy_helper import to_array
|
||||
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
|
||||
from functools import reduce
|
||||
|
||||
|
||||
class OnnxStub:
|
||||
"""
|
||||
The Onnx model imported into infinitensor.
|
||||
It can be generated from an Onnx model object.
|
||||
"""
|
||||
|
||||
# inputs: Dict[str, backend.Tensor] = {}
|
||||
# outputs: Dict[str, backend.Tensor] = {}
|
||||
initializer: Dict[int, TensorProto] = {}
|
||||
# handler: backend.GraphHandler
|
||||
|
||||
# def __init__(self, model: ModelProto, runtime):
|
||||
# model = infer_shapes(model)
|
||||
# self.handler = backend.GraphHandler(runtime)
|
||||
|
||||
# tensors: Dict[str, backend.Tensor] = dict()
|
||||
# data: Dict[str, TensorProto] = dict()
|
||||
|
||||
# for input in model.graph.input:
|
||||
# dims = _take_shape_dim(input.type.tensor_type.shape)
|
||||
# tensors[input.name] = self.handler.tensor(
|
||||
# dims, input.type.tensor_type.elem_type
|
||||
# )
|
||||
|
||||
# for output in model.graph.output:
|
||||
# dims = _take_shape_dim(output.type.tensor_type.shape)
|
||||
# tensors[output.name] = self.handler.tensor(
|
||||
# dims, output.type.tensor_type.elem_type
|
||||
# )
|
||||
|
||||
# for initializer in model.graph.initializer:
|
||||
# dims = [d for d in initializer.dims]
|
||||
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
|
||||
# data[initializer.name] = initializer
|
||||
|
||||
# for node in model.graph.node:
|
||||
# if node.op_type == "Conv":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (d, p, s) = (
|
||||
# attributes[name] for name in ["dilations", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors[node.input[0]], None, p, [-2, -1]
|
||||
# )
|
||||
# p = [0, 0, 0, 0]
|
||||
# else:
|
||||
# adapt = node.input[0]
|
||||
|
||||
# if len(node.input) > 2:
|
||||
# bias = "{}-bias".format(node.output[0])
|
||||
# reshape = "{}-reshape".format(node.output[0])
|
||||
# tensors[bias] = self.handler.conv(
|
||||
# tensors[adapt],
|
||||
# tensors[node.input[1]],
|
||||
# None,
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# )
|
||||
# tensors[reshape] = self.handler.reshape(
|
||||
# tensors[node.input[2]],
|
||||
# None,
|
||||
# [
|
||||
# 1,
|
||||
# reduce(
|
||||
# lambda acc, x: acc * x,
|
||||
# _search_shape(model, node.input[2]),
|
||||
# ),
|
||||
# 1,
|
||||
# 1,
|
||||
# ],
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.add(
|
||||
# tensors[bias],
|
||||
# tensors[reshape],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.conv(
|
||||
# tensors[adapt],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# )
|
||||
# elif node.op_type == "ConvTranspose":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0],
|
||||
# "strides": [1, 1],
|
||||
# "output_padding": [0, 0],
|
||||
# },
|
||||
# )
|
||||
# (d, p, s, op) = (
|
||||
# attributes[name]
|
||||
# for name in ["dilations", "pads", "strides", "output_padding"]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.convTransposed2d(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# op[0],
|
||||
# op[1],
|
||||
# )
|
||||
# elif node.op_type == "MatMul":
|
||||
# tensors[node.output[0]] = self.handler.matmul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# False,
|
||||
# False,
|
||||
# None,
|
||||
# backend.ActType.Linear,
|
||||
# )
|
||||
# elif node.op_type == "Gemm":
|
||||
# attributes = _parse_attribute(
|
||||
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
|
||||
# )
|
||||
# (alpha, beta, transA, transB) = (
|
||||
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
|
||||
# )
|
||||
# # FIXME unsupport attributes: `alpha` `beta`
|
||||
# assert alpha == 1.0
|
||||
# assert beta == 1.0
|
||||
# tensors[node.output[0]] = self.handler.matmul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# transA == 1,
|
||||
# transB == 1,
|
||||
# tensors[node.input[2]] if len(node.input) > 2 else None,
|
||||
# backend.ActType.Linear,
|
||||
# )
|
||||
# elif node.op_type == "BatchNormalization":
|
||||
# (input, mean, var, scale, bias) = (
|
||||
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
|
||||
# )
|
||||
# output = tensors.get(node.output[0])
|
||||
# attributes = _parse_attribute(
|
||||
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
|
||||
# )
|
||||
# (momentum, eps, training) = (
|
||||
# attributes[name]
|
||||
# for name in ["momentum", "epsilon", "training_mode"]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.batchNorm(
|
||||
# input, output, mean, var, scale, bias, momentum, eps, training != 0
|
||||
# )
|
||||
# elif node.op_type == "MaxPool":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "kernel_shape": None,
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (k, d, p, s) = (
|
||||
# attributes[name]
|
||||
# for name in ["kernel_shape", "dilations", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.maxPool(
|
||||
# tensors[adapt],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# 0,
|
||||
# 0,
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.maxPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# d[0],
|
||||
# d[1],
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# elif node.op_type == "AveragePool":
|
||||
# attributes = _parse_attribute(
|
||||
# node,
|
||||
# {
|
||||
# "kernel_shape": None,
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1],
|
||||
# },
|
||||
# )
|
||||
# (k, p, s) = (
|
||||
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
|
||||
# )
|
||||
# if p[0] != p[2] or p[1] != p[3]:
|
||||
# adapt = "{}-adapt".format(node.output[0])
|
||||
# tensors[adapt] = self.handler.pad(
|
||||
# tensors.get(node.input[0]), None, p, [-2, -1]
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[adapt],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# 1,
|
||||
# 1,
|
||||
# 0,
|
||||
# 0,
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# else:
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# k[0],
|
||||
# k[1],
|
||||
# 1,
|
||||
# 1,
|
||||
# p[0],
|
||||
# p[1],
|
||||
# s[0],
|
||||
# s[1],
|
||||
# )
|
||||
# elif node.op_type == "GlobalAveragePool":
|
||||
# [_, _, h, w] = _search_shape(model, node.input[0])
|
||||
# tensors[node.output[0]] = self.handler.avgPool(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# h,
|
||||
# w,
|
||||
# 1,
|
||||
# 1,
|
||||
# 0,
|
||||
# 0,
|
||||
# 1,
|
||||
# 1,
|
||||
# )
|
||||
# elif node.op_type == "Add":
|
||||
# tensors[node.output[0]] = self.handler.add(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sub":
|
||||
# tensors[node.output[0]] = self.handler.sub(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Mul":
|
||||
# tensors[node.output[0]] = self.handler.mul(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Div":
|
||||
# tensors[node.output[0]] = self.handler.div(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Pow":
|
||||
# tensors[node.output[0]] = self.handler.pow(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Relu":
|
||||
# tensors[node.output[0]] = self.handler.relu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Sigmoid":
|
||||
# tensors[node.output[0]] = self.handler.sigmoid(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Tanh":
|
||||
# tensors[node.output[0]] = self.handler.tanh(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Softmax":
|
||||
# tensors[node.output[0]] = self.handler.softmax(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Abs":
|
||||
# tensors[node.output[0]] = self.handler.abs(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Shape":
|
||||
# tensors[node.output[0]] = self.handler.shape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Identity":
|
||||
# tensors[node.output[0]] = self.handler.identity(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Flatten":
|
||||
# # FIXME axis must be 1
|
||||
# axis = next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"), None
|
||||
# )
|
||||
# assert axis == None or axis == 1
|
||||
# tensors[node.output[0]] = self.handler.flatten(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "PRelu":
|
||||
# tensors[node.output[0]] = self.handler.pRelu(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# )
|
||||
# elif node.op_type == "Clip":
|
||||
# tensors[node.output[0]] = self.handler.clip(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(_parse_data(data[node.input[1]]).__iter__(), None)
|
||||
# if len(node.input) > 1
|
||||
# else None,
|
||||
# next(_parse_data(data[node.input[2]]).__iter__(), None)
|
||||
# if len(node.input) > 2
|
||||
# else None,
|
||||
# )
|
||||
# elif node.op_type == "Transpose":
|
||||
# perm = next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
|
||||
# )
|
||||
# tensors[node.output[0]] = self.handler.transpose(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# perm,
|
||||
# )
|
||||
# elif node.op_type == "Reshape":
|
||||
# dims = _search_shape(model, node.input[0])
|
||||
# size = reduce(lambda acc, x: acc * x, dims)
|
||||
# input_shape = _parse_data(data[node.input[1]])
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if x == 0:
|
||||
# input_shape[i] = dims[i]
|
||||
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
|
||||
# if temp < 0:
|
||||
# input_shape[input_shape.index(-1)] = size // -temp
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Squeeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = set(
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# assert all(input_shape[d] == 1 for d in axes)
|
||||
# output_shape = []
|
||||
# for i, x in enumerate(input_shape):
|
||||
# if i not in axes:
|
||||
# output_shape.append(x)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# output_shape,
|
||||
# )
|
||||
# elif node.op_type == "Unsqueeze":
|
||||
# input_shape = _search_shape(model, node.input[0])
|
||||
# axes = (
|
||||
# [int(i) for i in data[node.input[1]].int64_data]
|
||||
# if len(node.input) > 1
|
||||
# else _parse_attribute(node, {"axes": None})["axes"]
|
||||
# )
|
||||
# for i in axes:
|
||||
# input_shape.insert(i, 1)
|
||||
# tensors[node.output[0]] = self.handler.reshape(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# input_shape,
|
||||
# )
|
||||
# elif node.op_type == "Concat":
|
||||
# tensors[node.output[0]] = self.handler.concat(
|
||||
# [tensors[name] for name in node.input],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "Split":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.split(
|
||||
# tensors[node.input[0]],
|
||||
# None,
|
||||
# next(
|
||||
# (attr.i for attr in node.attribute if attr.name == "axis"),
|
||||
# 0,
|
||||
# ),
|
||||
# len(node.output),
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# elif node.op_type == "Gather":
|
||||
# tensors[node.output[0]] = self.handler.gather(
|
||||
# tensors[node.input[0]],
|
||||
# tensors[node.input[1]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "axis")),
|
||||
# )
|
||||
# elif node.op_type == "ReduceMean":
|
||||
# tensors[node.output[0]] = self.handler.reduce_mean(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# next(
|
||||
# (attr.ints for attr in node.attribute if attr.name == "axes"),
|
||||
# None,
|
||||
# ),
|
||||
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
|
||||
# != 0,
|
||||
# )
|
||||
# elif node.op_type == "Slice":
|
||||
# tensors[node.output[0]] = self.handler.slice(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[2]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
|
||||
# )
|
||||
# elif node.op_type == "Pad":
|
||||
# tensors[node.output[0]] = self.handler.pad(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# _parse_data(data[node.input[1]]),
|
||||
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
|
||||
# )
|
||||
# elif node.op_type == "Dropout":
|
||||
# for name, tensor in zip(
|
||||
# node.output,
|
||||
# self.handler.dropout(
|
||||
# tensors[node.input[0]],
|
||||
# tensors.get(node.output[0]),
|
||||
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
|
||||
# _parse_data(data[node.input[1]])[0]
|
||||
# if len(node.input) > 1
|
||||
# else 0.5,
|
||||
# _parse_data(data[node.input[2]])[0]
|
||||
# if len(node.input) > 2
|
||||
# else False,
|
||||
# ),
|
||||
# ):
|
||||
# tensors[name] = tensor
|
||||
# else:
|
||||
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# for name, obj in tensors.items():
|
||||
# tensor = data.get(name)
|
||||
# if tensor == None:
|
||||
# if any(input.name == name for input in model.graph.input):
|
||||
# self.inputs[name] = obj
|
||||
# else:
|
||||
# self.initializer[obj.fuid()] = tensor
|
||||
# if tensor.data_type == TensorProto.INT32:
|
||||
# obj.copyin_int32(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.INT64:
|
||||
# obj.copyin_int64(_parse_data(tensor))
|
||||
# elif tensor.data_type == TensorProto.FLOAT:
|
||||
# obj.copyin_float(_parse_data(tensor))
|
||||
# else:
|
||||
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
|
||||
|
||||
# for output in model.graph.output:
|
||||
# self.outputs[output.name] = tensors[output.name]
|
||||
|
||||
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
|
||||
class Context:
|
||||
# saves object names, including tensors and operators
|
||||
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
|
||||
# counts the occurrence times of each operator for naming
|
||||
count_op: Dict[backend.OpType, int] = dict()
|
||||
# counts input and output tensors for naming
|
||||
count_in, count_out = 0, 0
|
||||
# saves nodes (operators)
|
||||
nodes: List[NodeProto] = []
|
||||
# saves global input tensors
|
||||
inputs: List[ValueInfoProto] = []
|
||||
# saves global output tensors
|
||||
outputs: List[ValueInfoProto] = []
|
||||
# saves global input tensors
|
||||
initializers: List[TensorProto] = []
|
||||
|
||||
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
|
||||
ty = op.op_type()
|
||||
name = "{}_{}".format(ty.name, op.guid())
|
||||
self.names[op] = name
|
||||
self.count_op[ty] = self.count_op.get(ty, 0) + 1
|
||||
return ty, name
|
||||
|
||||
def push_output(self, name: str, tensor: backend.Tensor) -> str:
|
||||
self.names[tensor] = name
|
||||
if not tensor.has_target():
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.outputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_input(
|
||||
self, tensor: backend.Tensor, init: Optional[TensorProto]
|
||||
) -> str:
|
||||
name = self.names.get(tensor)
|
||||
# means that this input is a global input
|
||||
if name is None:
|
||||
self.count_in += 1
|
||||
name = "input_{}".format(tensor.guid())
|
||||
self.names[tensor] = name
|
||||
if init != None:
|
||||
init.name = name
|
||||
self.initializers.append(init)
|
||||
else:
|
||||
shape = tensor.shape()
|
||||
dtype = backend.tensor_dtype(tensor)
|
||||
value_info = make_tensor_value_info(name, dtype, shape)
|
||||
check_value_info(value_info)
|
||||
self.inputs.append(value_info)
|
||||
return name
|
||||
|
||||
def push_data_input(
|
||||
self,
|
||||
node_name: str,
|
||||
attr_name: str,
|
||||
elem_type: int,
|
||||
shape: Sequence[int],
|
||||
vals: Any,
|
||||
) -> str:
|
||||
name = "{}_{}".format(node_name, attr_name)
|
||||
tensor = make_tensor(name, elem_type, shape, vals)
|
||||
check_tensor(tensor)
|
||||
self.initializers.append(tensor)
|
||||
return name
|
||||
|
||||
def push_node(self, node: NodeProto) -> None:
|
||||
# check_node(node)
|
||||
self.nodes.append(node)
|
||||
|
||||
def build(self, name: str) -> ModelProto:
|
||||
graph = make_graph(
|
||||
self.nodes, name, self.inputs, self.outputs, self.initializers
|
||||
)
|
||||
# check_graph(graph)
|
||||
|
||||
model = make_model(graph)
|
||||
# check_model(model)
|
||||
|
||||
return model
|
||||
|
||||
# Topological sort
|
||||
if not g.topo_sort():
|
||||
raise Exception("Sorting fails")
|
||||
|
||||
ops = g.operators()  # all operators (nodes) in the graph
|
||||
|
||||
ctx = Context()
|
||||
|
||||
for op in ops:
|
||||
ty, name = ctx.name_op(op)
|
||||
inputs = [
|
||||
ctx.push_input(it, self.initializer.get(it.fuid()))
|
||||
for it in op.inputs()
|
||||
]
|
||||
outputs = [
|
||||
ctx.push_output("{}_{}_{}".format(
|
||||
name, i, tensor.guid()), tensor)
|
||||
for (i, tensor) in enumerate(op.outputs())
|
||||
]
|
||||
if ty == backend.OpType.Conv:
|
||||
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
group=op.inputs()[0].shape()[
|
||||
1] // op.inputs()[1].shape()[1],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTrans:
|
||||
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
|
||||
op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
pads=[ph, pw],
|
||||
strides=[sh, sw],
|
||||
dilations=[dh, dw],
|
||||
output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.ConvTransNHWC:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"ConvTranspose",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MemBound:
|
||||
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Membound",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
domain="nnet",
|
||||
# pads=[ph, pw],
|
||||
# strides=[sh, sw],
|
||||
# dilations=[dh, dw],
|
||||
# output_padding=[oph, opw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Matmul:
|
||||
# transA, transB = backend.matmul_attrs_of(op)
|
||||
# HACK: recover this
|
||||
transA, transB = False, False
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"Gemm", inputs, outputs, name, transA=transA, transB=transB
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.BatchNorm:
|
||||
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
|
||||
momentum, eps, training = backend.batch_norm_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"BatchNormalization",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
epsilon=eps,
|
||||
momentum=momentum,
|
||||
training_mode=training,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.MaxPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
dilations=[dh, dw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.AvgPool:
|
||||
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
"AveragePool",
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
kernel_shape=[kh, kw],
|
||||
pads=[ph, pw, ph, pw],
|
||||
strides=[sh, sw],
|
||||
)
|
||||
)
|
||||
elif ty in [
|
||||
backend.OpType.Add,
|
||||
backend.OpType.Sub,
|
||||
backend.OpType.Mul,
|
||||
backend.OpType.Div,
|
||||
backend.OpType.Pow,
|
||||
backend.OpType.Relu,
|
||||
backend.OpType.Sigmoid,
|
||||
backend.OpType.Tanh,
|
||||
backend.OpType.Softmax,
|
||||
backend.OpType.Abs,
|
||||
backend.OpType.Identity,
|
||||
backend.OpType.PRelu,
|
||||
]:
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Flatten:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Transpose:
|
||||
perm = backend.transpose_permute_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, perm=perm))
|
||||
elif ty == backend.OpType.Reshape:
|
||||
shape = backend.reshape_shape_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"shape",
|
||||
TensorProto.INT64,
|
||||
[len(shape)],
|
||||
shape,
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpType.Concat:
|
||||
axis = backend.concat_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.Split:
|
||||
axis = backend.split_axis_of(op)
|
||||
num_outputs = len(outputs)
|
||||
split = op.inputs()[0].shape()[axis] // num_outputs
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name,
|
||||
"split",
|
||||
TensorProto.INT64,
|
||||
[len(outputs)],
|
||||
[split for _ in range(0, num_outputs)],
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(
|
||||
ty.name,
|
||||
inputs,
|
||||
outputs,
|
||||
name,
|
||||
axis=axis,
|
||||
)
|
||||
)
|
||||
elif ty == backend.OpType.Gather:
|
||||
axis = backend.gather_axis_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs,
|
||||
outputs, name, axis=axis))
|
||||
elif ty == backend.OpType.ReduceMean:
|
||||
axes, keepdims = backend.reduce_mean_attrs_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "axes", TensorProto.INT64, [len(axes)], axes
|
||||
)
|
||||
)
|
||||
ctx.push_node(
|
||||
make_node(ty.name, inputs, outputs,
|
||||
name, keepdims=keepdims)
|
||||
)
|
||||
elif ty == backend.OpType.Slice:
|
||||
raise Exception("TODO")
|
||||
elif ty == backend.OpType.Pad:
|
||||
pads = backend.pad_pads_of(op)
|
||||
inputs.append(
|
||||
ctx.push_data_input(
|
||||
name, "pads", TensorProto.INT64, [len(pads)], pads
|
||||
)
|
||||
)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
# elif ty == backend.OpType.Clip:
|
||||
# min, max = backend.clip_attrs_of(op)
|
||||
# if min != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# if max != None:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
|
||||
# )
|
||||
# else:
|
||||
# inputs.append(
|
||||
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
|
||||
# )
|
||||
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
else:
|
||||
raise Exception("Unsupported OpType", ty)
|
||||
|
||||
model = ctx.build(name)
|
||||
onnx.save(model, path)
|
||||
return model
|
||||
|
||||
# def init(self) -> None:
|
||||
# self.handler.data_malloc()
|
||||
|
||||
# def optimize(self) -> None:
|
||||
# self.handler.optimize()
|
||||
|
||||
# def run(self) -> None:
|
||||
# self.handler.run()
|
||||
|
||||
|
||||
# def from_onnx(model: ModelProto, runtime):
|
||||
# stub = OnnxStub(model, runtime)
|
||||
# return stub.inputs, stub.outputs, stub.handler
|
||||
|
||||
|
||||
# def _search_shape(model: ModelProto, name: str) -> List[int]:
|
||||
# ans = (
|
||||
# next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.value_info
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# (
|
||||
# [
|
||||
# (d.dim_value if d.dim_value > 0 else 1)
|
||||
# for d in tensor.type.tensor_type.shape.dim
|
||||
# ]
|
||||
# for tensor in model.graph.input
|
||||
# if tensor.name == name
|
||||
# ),
|
||||
# None,
|
||||
# )
|
||||
# or next(
|
||||
# [int(d) for d in tensor.dims]
|
||||
# for tensor in model.graph.initializer
|
||||
# if tensor.name == name
|
||||
# )
|
||||
# )
|
||||
# return ans
|
||||
|
||||
|
||||
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
|
||||
# for attr in node.attribute:
|
||||
# if attr.name in attrs:
|
||||
# if attr.type == AttributeProto.INT:
|
||||
# attrs[attr.name] = attr.i
|
||||
# elif attr.type == AttributeProto.INTS:
|
||||
# attrs[attr.name] = attr.ints
|
||||
# elif attr.type == AttributeProto.FLOAT:
|
||||
# attrs[attr.name] = attr.f
|
||||
# elif attr.type == AttributeProto.STRING:
|
||||
# attrs[attr.name] = attr.s
|
||||
# elif attr.type == AttributeProto.TENSOR:
|
||||
# attrs[attr.name] = attr.t
|
||||
# else:
|
||||
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
|
||||
# return attrs
|
||||
|
||||
|
||||
# def _parse_data(tensor: TensorProto) -> List[Any]:
|
||||
# return to_array(tensor).flatten().tolist()
|
||||
|
||||
|
||||
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
|
||||
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
|
||||
|
||||
def export_onnx(g: backend.Graph, path: str) -> None:
|
||||
stub = OnnxStub()
|
||||
stub.to_onnx(g, path)
|
|
@@ -29,6 +29,7 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
            perfEngine.setPerfData(perfKey, record);
        } else
            record = perfData;
        std::cout << 5 << std::endl;

        double t = record->time;
        totalTime += t;
@ -125,12 +125,30 @@ void GraphObj::optimize() {
|
|||
|
||||
void GraphObj::dataMalloc() {
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->dataMalloc();
|
||||
if (tensor->getSource() && tensor->getTargets().size() > 0 &&
|
||||
tensor->getSource()->getOpType() == OpType::Reshape) {
|
||||
continue;
|
||||
} else
|
||||
tensor->dataMalloc();
|
||||
}
|
||||
// Fill reshape outputs to avoid nullptr
|
||||
for (auto &tensor : tensors) {
|
||||
if (tensor->getSource() &&
|
||||
tensor->getSource()->getOpType() == OpType::Reshape) {
|
||||
tensor->setData(tensor->getSource()->getInputs(0)->getDataBlob());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
|
||||
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
|
||||
void GraphObj::dataFree() {
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->freeData();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
|
||||
return tensors.emplace_back(
|
||||
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(const Tensor &tensor) {
|
||||
|
@ -228,4 +246,14 @@ bool GraphObj::checkValid() const {
|
|||
return true;
|
||||
}
|
||||
|
||||
int GraphObj::removeIndependentTensors() {
|
||||
TensorVec newTensors;
|
||||
for (const auto &t : tensors)
|
||||
if (!t->getTargets().empty() || t->getSource())
|
||||
newTensors.emplace_back(t);
|
||||
auto ret = tensors.size() - newTensors.size();
|
||||
tensors = newTensors;
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@@ -1,10 +1,12 @@
#include "core/graph_handler.h"
#include "nnet/Visitor/Serializer.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"

@@ -19,8 +21,8 @@ namespace infini {

static DataType dtype_repr_convert(int);

Tensor GraphHandlerObj::tensor(Shape dims, int dtype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype));
Tensor GraphHandlerObj::tensor(Shape dims, int dtype, TensorType ttype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype), ttype);
}

Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,

@@ -55,6 +57,39 @@ Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight,
}
}

Tensor GraphHandlerObj::convNHWC(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh,
int dw) {
if (output) {
g->addOpWithOutputs<ConvNHWCObj>(std::move(input), std::move(weight),
output, ph, pw, sh, sw, dh, dw);
return output;
} else {
return g
->addOp<ConvNHWCObj>(std::move(input), std::move(weight), output,
ph, pw, sh, sw, dh, dw)
->getOutput();
}
}

Tensor GraphHandlerObj::convTransposed2dNHWC(Tensor input, Tensor weight,
Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw,
int oph, int opw) {
if (output) {
g->addOpWithOutputs<ConvTransposed2dNHWCObj>(
std::move(input), std::move(weight), output, ph, pw, sh, sw, dh, dw,
oph, opw);
return output;
} else {
return g
->addOp<ConvTransposed2dNHWCObj>(std::move(input),
std::move(weight), output, ph, pw,
sh, sw, dh, dw, oph, opw)
->getOutput();
}
}

Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA,
bool transB, Tensor bias, ActType act) {
if (y) {

@@ -291,6 +326,22 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
}
}

TensorVec GraphHandlerObj::memBound(const TensorVec &inputs,
const Tensor &output,
const string &jsonString) {
const auto &[expr, nnetInputs, execTime, hint] =
nnet::Serializer().membundOpFromString(jsonString);
if (output) {
g->addOpWithOutputs<MemBoundObj>(std::move(inputs), TensorVec{output},
nnetInputs, expr, execTime, hint);
return {output};
} else
return g
->addOp<MemBoundObj>(std::move(inputs), TensorVec{nullptr},
nnetInputs, expr, execTime, hint)
->getOutputs();
}

static DataType dtype_repr_convert(int dtype) {
switch ((OnnxDType)dtype) {
case OnnxDType::FLOAT:

@@ -314,4 +365,12 @@ static DataType dtype_repr_convert(int dtype) {
}
}

Graph GraphHandlerObj::getGraph() const {
int nRemoved = g->removeIndependentTensors();
if (nRemoved > 0)
std::cout << "Removed " << nRemoved << " independent tensors"
<< std::endl;
return g;
}

} // namespace infini
@@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"

namespace infini {

@@ -25,7 +26,8 @@ bool OperatorObj::isConcatOp() const { return type == OpType::Concat; }
bool OperatorObj::isComputeOp() const {
return type == OpType::Conv || type == OpType::Matmul ||
type == OpType::ConvTrans || type == OpType::ConvTransNHWC ||
type == OpType::G2BMM || type == OpType::GBMM;
type == OpType::G2BMM || type == OpType::GBMM ||
type == OpType::ConvNHWC;
}

bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }

@@ -33,8 +35,12 @@ bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }
bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }

bool OperatorObj::isMemBoundOp() const {
return type == OpType::MemBound || type == OpType::Activation ||
type == OpType::Transpose;
if (type == OpType::Any)
return true; // TODO: check operator attributes
return type == OpType::MemBound || type == OpType::Reshape ||
type == OpType::Activation || type == OpType::Transpose ||
type == OpType::Relu || type == OpType::Tanh ||
type == OpType::Softmax;
}

void OperatorObj::removePredecessors(const Operator &op) {

@@ -83,22 +89,31 @@ HashType OperatorObj::hash() const {

bool OperatorObj::checkValid(GraphObj *graph) {
auto optShapes = inferShape();
IT_ASSERT(optShapes);
if (!optShapes) // shape inference failed
return false;

const vector<Shape> &shapes = *optShapes;
IT_ASSERT(shapes.size() == outputs.size());
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
outputs[i] =
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
}
} else { // if outputs have been created, check their shapes
for (size_t i = 0; i < shapes.size(); ++i) {
if (shapes[i] != outputs[i]->getDims())
IT_ASSERT(shapes[i] == outputs[i]->getDims(),
(vecToString(shapes[i]) +
" != " + vecToString(outputs[i]->getDims())));
if (shapes[i] != outputs[i]->getDims()) {
dbg(shapes[i], outputs[i]->getDims());
return false;
}
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
}
}
return true;
@@ -2,9 +2,15 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>

#ifdef USE_CUDA
#include "cuda_profiler_api.h"
#endif

namespace infini {
void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
if (!tune && profiling)

@@ -52,17 +58,40 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
opCnt[op->getOpType()]++;
}
}
if (profiling)
printProfilingData(totalTime, opTime, opCnt);
// if (profiling)
//     printProfilingData(totalTime, opTime, opCnt);
}

double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
map<UidBaseType, bool> ctcMap; // compile-time computable
// Skip static computation
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
bool compileTimeComputable = true;
for (auto input : op->getInputs()) {
// FIXME: propagate the tensor type. Currently only the first operator
// after weights is compile-time computable.
if (input->getTensorType() != TensorType::Initialized)
compileTimeComputable = false;
}
ctcMap[op->getGuid()] = compileTimeComputable;
}
return ctcMap;
}

double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
std::map<OpType, int> opCnt, opNonCtcCnt;
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);

for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};

@@ -70,11 +99,19 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);

PerfRecord record;
// Tune the kernel if there is no record
if (!perfData) {
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Tune the kernel if there is no record
time = perfData->time;
} else {
// TODO: should tensors automatically allocate when accessing data?
// allocate memory for empty tensors and release it after profiling
// allocate memory for empty tensors and release it after
// profiling
TensorVec allocatedTensors;
for (auto t : op->getInputs())
if (!t->hasData())

@@ -88,37 +125,47 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
}

// Profile operators and record the results
record = kernel->tune(op, this);
PerfRecord record = kernel->tune(op, this);
time = record->time;
perfEngine.setPerfData(perfKey, record);

// Free allocated memory
for (auto t : allocatedTensors)
t->freeData();
} else
record = perfData;
}

double t = record->time;
totalTime += t;
// FIXME: ignore transpose when necessary
// op->getOpType() != OpType::Transpose &&
// op->getOpType() != OpType::ReduceMean
if (op->getOpType() != OpType::Reshape)
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", t);
opTime[op->getOpType()] += t;
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
if (!ctcMap[op->getGuid()])
opNonCtcCnt[op->getOpType()]++;
else
opNonCtcCnt[op->getOpType()]; // Create a new entry
}
}
if (profiling)
printProfilingData(totalTime, opTime, opCnt);
printProfilingData(totalTime, opTime, opCnt, opNonCtcCnt);
return totalTime;
}

void RuntimeObj::printProfilingData(double totalTime,
const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt) const {
printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
void RuntimeObj::printProfilingData(
double totalTime, const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt,
const std::map<OpType, int> &opNonCtcCnt) const {
printf("%11s %3s %5s %7s %7s %7s\n", "Op", "Cnt", "#NCtc", "T_tot",
"Percent", "T_mean");
for (const auto &[type, t] : opTime) {
printf("%11s %3d %7.3f %7.1f %7.3f\n",
OpRegistry::getOpName(type).data(), opCnt.at(type), t,
t / totalTime * 100, t / opCnt.at(type));
printf("%11s %3d %5d %7.3f %7.1f %7.3f\n",
OpRegistry::getOpName(type).data(), opCnt.at(type),
opNonCtcCnt.at(type), t, t / totalTime * 100,
t / opCnt.at(type));
}
}

@@ -160,4 +207,44 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,

string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }

double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");

for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
},
[&]() { this->sync(); }, warmup, repeat);
return ret;
}

} // namespace infini
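Note on the getPerfTime change above: the two new flags control how membound operators are costed, while compile-time-computable operators (all inputs of type Initialized) always count as zero. A short usage sketch; runtime and graph are assumed to be in scope:

// With allowEstimation, the analytic estimate from MemBoundObj::getEstimatedTime()
// is used for membound operators; with ignoreMemboundOp they count as zero instead.
double est = runtime->getPerfTime(graph, /*profiling=*/false,
                                  /*allowEstimation=*/true,
                                  /*ignoreMemboundOp=*/false);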
@ -1,6 +1,9 @@
|
|||
#include "core/search_engine.h"
|
||||
#include "core/hash.h"
|
||||
#include "core/runtime.h"
|
||||
#include "ffi/ffi_callback.h"
|
||||
#include "nnet/dbg.h"
|
||||
#include "operators/reshape.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
@ -8,7 +11,17 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
|
||||
using MetaGraph = SearchEngine::MetaGraph;
|
||||
|
||||
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
|
||||
: runtimeExec(runtime), mutator(mutator) {
|
||||
// Compare graph with estimated time
|
||||
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
|
||||
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
|
||||
};
|
||||
}
|
||||
|
||||
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
|
||||
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
|
||||
auto &node = metaGraph->nodes[i];
|
||||
std::cout << "id: " << i << std::endl;
|
||||
|
@ -32,8 +45,7 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
IT_ASSERT(runtimeExec == graph->getRuntime());
|
||||
std::cout << "[INFO] original graph: " << std::endl;
|
||||
std::cout << graph->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
|
||||
<< std::endl;
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
|
||||
|
||||
std::vector<Graph> partitions = partitionGraph(graph);
|
||||
|
||||
|
@ -45,7 +57,6 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
std::vector<Graph> candidates = search(subGraph);
|
||||
std::cout << "[INFO] size: " << candidates.size() << std::endl;
|
||||
IT_ASSERT(candidates.size() > 0);
|
||||
std::cout << subGraph->toString() << std::endl;
|
||||
std::vector<Graph> nextGraphs;
|
||||
for (auto lastGraph : bestGraphs) {
|
||||
for (auto thisGraph : candidates) {
|
||||
|
@ -61,13 +72,12 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
}
|
||||
}
|
||||
auto tmp = make_ref<GraphObj>(runtimeExec, ops);
|
||||
tmp->dataMalloc();
|
||||
nextGraphs.emplace_back(tmp);
|
||||
}
|
||||
}
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
});
|
||||
dbg("===Num" + std::to_string(nextGraphs.size()));
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
|
||||
|
||||
if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
nextGraphs.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -81,10 +91,30 @@ Graph SearchEngine::run(const Graph graph) {
|
|||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
std::cout << "bestGraph " << i << ":" << std::endl;
|
||||
std::cout << bestGraphs[i]->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Fuse vertically and sort according to performance
|
||||
for (size_t i = 0; i < bestGraphs.size(); ++i) {
|
||||
bestGraphs[i] = fuseVertically(bestGraphs[i]);
|
||||
}
|
||||
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
|
||||
|
||||
// Check optimized graphs are legal
|
||||
for (auto g : bestGraphs) {
|
||||
g->checkValid();
|
||||
IT_ASSERT(graph->getInputs().size() == g->getInputs().size(),
|
||||
graph->toString() + string("\n") + g->toString());
|
||||
IT_ASSERT(graph->getOutputs().size() == g->getOutputs().size(),
|
||||
graph->toString() + string("\n") + g->toString());
|
||||
}
|
||||
|
||||
std::cout << "[INFO] best fused graph: " << std::endl;
|
||||
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
|
||||
<< std::endl;
|
||||
std::cout << bestGraphs[0] << std::endl;
|
||||
|
||||
return bestGraphs[0];
|
||||
}
|
||||
|
||||
|
@ -102,9 +132,9 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
|
|||
}
|
||||
}
|
||||
|
||||
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
}); // compare with perf time
|
||||
// compare with perf time
|
||||
dbg("===Num" + std::to_string(results.size()));
|
||||
std::sort(results.begin(), results.end(), graphTimeComparer);
|
||||
if (results.size() > GRAPH_SIZE) {
|
||||
results.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -112,9 +142,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
|
|||
}
|
||||
|
||||
// Build metagraph with a graph, each operator is a node.
|
||||
std::shared_ptr<SearchEngine::MetaGraph>
|
||||
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
||||
auto metaGraph = std::make_shared<MetaGraph>();
|
||||
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
||||
auto metaGraph = make_ref<MetaGraphObj>();
|
||||
|
||||
int numOps = graph->getOperators().size();
|
||||
std::vector<int> cnt(numOps, 0);
|
||||
|
@ -123,7 +152,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
|||
std::vector<int> q(0);
|
||||
for (size_t i = 0; i < graph->getOperators().size(); i++) {
|
||||
auto &op = graph->getOperators()[i];
|
||||
MetaGraph::Node node;
|
||||
MetaGraphObj::Node node;
|
||||
std::vector<Operator> ops;
|
||||
ops.emplace_back(op);
|
||||
node.graph = make_ref<GraphObj>(runtimeExec, ops);
|
||||
|
@ -157,9 +186,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
|
|||
|
||||
// Build a metagraph with graph and a plan, a plan is which ops should be a
|
||||
// node.
|
||||
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
||||
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
|
||||
const std::vector<int> &plan) {
|
||||
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
|
||||
const std::vector<int> &plan) {
|
||||
int numGroups = 0;
|
||||
for (auto i : plan) {
|
||||
if (i > numGroups) {
|
||||
|
@ -172,12 +200,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
|||
groups[plan[i]].emplace_back(i);
|
||||
}
|
||||
|
||||
auto resultMetaGraph = make_ref<MetaGraph>();
|
||||
auto resultMetaGraph = make_ref<MetaGraphObj>();
|
||||
for (auto &group : groups) {
|
||||
std::vector<Operator> ops;
|
||||
std::unordered_set<int> preSet, sucSet;
|
||||
for (auto id : group) {
|
||||
MetaGraph::Node node;
|
||||
MetaGraphObj::Node node;
|
||||
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
|
@ -204,9 +232,10 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
|
|||
}
|
||||
|
||||
// Search how to merge multiple ops.
|
||||
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
|
||||
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
||||
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
|
||||
IT_ASSERT(metaGraph != nullptr);
|
||||
// HACK: disable multiple op search
|
||||
return {metaGraph};
|
||||
std::vector<int> plan(metaGraph->nodes.size());
|
||||
for (size_t i = 0; i < plan.size(); i++) {
|
||||
plan[i] = i;
|
||||
|
@ -222,7 +251,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
|||
std::unordered_set<HashType> planSet;
|
||||
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
|
||||
|
||||
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
|
||||
vector<MetaGraph> metaGraphs;
|
||||
for (auto &curPlan : plans) {
|
||||
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
|
||||
}
|
||||
|
@ -230,8 +259,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
|||
}
|
||||
|
||||
// DFS impl for search merge.
|
||||
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
|
||||
std::vector<int> &plan,
|
||||
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
|
||||
std::vector<int> &frontier,
|
||||
std::vector<std::vector<int>> &plans,
|
||||
std::unordered_set<uint64_t> &planSet) {
|
||||
|
@ -320,14 +348,40 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
|
|||
}
|
||||
|
||||
// Search mutation for each compute op.
|
||||
std::vector<Graph> SearchEngine::searchMutation(
|
||||
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
|
||||
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
|
||||
std::vector<Graph> graphs = {nullptr};
|
||||
// Append a node to all existing candidates
|
||||
for (auto &node : metaGraph->nodes) {
|
||||
std::vector<Graph> nextGraphs;
|
||||
if (node.type == 1) { // If it has computing OPs
|
||||
auto mutatedGraphs = mutator->run(node.graph);
|
||||
if (mutator->hasTunedKernel)
|
||||
chooseBestMutation = false;
|
||||
std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
|
||||
graphTimeComparer);
|
||||
if (mutatedGraphs.size() >= 10)
|
||||
mutatedGraphs.resize(10);
|
||||
mutatedGraphs = {mutatedGraphs[0]};
|
||||
// if (searchFilter == 1) {
|
||||
// std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
|
||||
// graphTimeComparer);
|
||||
// if (mutatedGraphs.size() >= 10)
|
||||
// mutatedGraphs.resize(10);
|
||||
// mutatedGraphs = {mutatedGraphs[0]};
|
||||
// } else if (chooseBestMutation && mutatedGraphs.size() >= 2) {
|
||||
// std::sort(mutatedGraphs.begin(), mutatedGraphs.end(),
|
||||
// graphTimeComparer);
|
||||
// if (mutatedGraphs.size() >= 10)
|
||||
// mutatedGraphs.resize(10);
|
||||
// mutatedGraphs = {mutatedGraphs[0]};
|
||||
// } else { // avoid repeated kernel generation
|
||||
// if (mutatedGraphs.size() >= 2) // INFOGAN
|
||||
// mutatedGraphs = {mutatedGraphs[1]};
|
||||
// // if (mutatedGraphs.size() > 2) {
|
||||
// // mutatedGraphs.resize(2);
|
||||
// // }
|
||||
// }
|
||||
|
||||
for (auto graph : graphs) {
|
||||
for (auto mutatedGraph : mutatedGraphs) {
|
||||
std::vector<Operator> ops;
|
||||
|
@ -357,12 +411,8 @@ std::vector<Graph> SearchEngine::searchMutation(
|
|||
nextGraphs.emplace_back(make_ref<GraphObj>(runtimeExec, ops));
|
||||
}
|
||||
}
|
||||
for (auto g : nextGraphs) {
|
||||
g->dataMalloc();
|
||||
}
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
});
|
||||
dbg("===Num" + std::to_string(nextGraphs.size()));
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
|
||||
if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
nextGraphs.resize(GRAPH_SIZE);
|
||||
}
|
||||
|
@ -372,7 +422,7 @@ std::vector<Graph> SearchEngine::searchMutation(
|
|||
}
|
||||
|
||||
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
|
||||
return mutationEngine->isMultiBranchMergable(graph);
|
||||
return mutator->isMultiBranchMergable(graph);
|
||||
}
|
||||
|
||||
// Split a graph into multiple independent graphs. Search engine will search for
|
||||
|
@ -423,7 +473,6 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
|
|||
std::cout << op->toString() << std::endl;
|
||||
}
|
||||
auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
|
||||
tmp->dataMalloc();
|
||||
partitions.emplace_back(tmp);
|
||||
headOps.clear();
|
||||
}
|
||||
|
@ -431,11 +480,100 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
|
|||
}
|
||||
if (!headOps.empty()) {
|
||||
auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
|
||||
tmp->dataMalloc();
|
||||
partitions.emplace_back(tmp);
|
||||
}
|
||||
std::reverse(partitions.begin(), partitions.end());
|
||||
return partitions;
|
||||
}
|
||||
|
||||
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
|
||||
// dbg(graph);
|
||||
// // hkz
|
||||
// callback::exportONNX(graph, "a.onnx");
|
||||
return runtimeExec->getPerfTime(graph, false, true, true);
|
||||
}
|
||||
|
||||
Graph SearchEngine::fuseVertically(const Graph &graph) {
|
||||
std::unordered_map<UidBaseType, int> visitTime;
|
||||
std::vector<Operator> ops;
|
||||
|
||||
graph->topo_sort();
|
||||
int cnt = 0;
|
||||
for (auto op : graph->getOperators()) {
|
||||
// Skip visited OP
|
||||
if (visitTime.find(op->getGuid()) != visitTime.end()) {
|
||||
continue;
|
||||
}
|
||||
// Skip compute OP and multi-input/output OP
|
||||
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
|
||||
op->getSuccessors().size() != 1)) {
|
||||
visitTime.emplace(op->getGuid(), ++cnt);
|
||||
ops.emplace_back(op);
|
||||
continue;
|
||||
}
|
||||
// FIXME: fuse and modify attributes of computing operators
|
||||
if (op->getOpType() == OpType::Relu ||
|
||||
op->getOpType() == OpType::PRelu) {
|
||||
if (auto p = op->getInputs()[0])
|
||||
if (auto sop = p->getSource())
|
||||
if (sop->getOpType() == OpType::Conv ||
|
||||
sop->getOpType() == OpType::Matmul) {
|
||||
visitTime.emplace(op->getGuid(), ++cnt);
|
||||
ops.emplace_back(make_ref<ReshapeObj>(
|
||||
nullptr, op->getInputs()[0], op->getOutputs()[0]));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
vector<Operator> chainOps;
|
||||
visitTime.emplace(op->getGuid(), ++cnt);
|
||||
|
||||
vector<Operator> tmp;
|
||||
auto cur = op;
|
||||
while (cur->getPredecessors().size() == 1 &&
|
||||
cur->getPredecessors()[0]->isMemBoundOp()) {
|
||||
cur = cur->getPredecessors()[0];
|
||||
if (visitTime.count(cur->getGuid()))
|
||||
break;
|
||||
tmp.emplace_back(cur);
|
||||
visitTime.emplace(cur->getGuid(), cnt);
|
||||
}
|
||||
for (int i = tmp.size() - 1; i >= 0; i--) {
|
||||
chainOps.emplace_back(tmp[i]);
|
||||
}
|
||||
chainOps.emplace_back(op);
|
||||
cur = op;
|
||||
while (cur->getSuccessors().size() == 1 &&
|
||||
cur->getSuccessors()[0]->isMemBoundOp()) {
|
||||
cur = cur->getSuccessors()[0];
|
||||
if (visitTime.count(cur->getGuid()))
|
||||
break;
|
||||
chainOps.emplace_back(cur);
|
||||
visitTime.emplace(cur->getGuid(), cnt);
|
||||
}
|
||||
make_ref<GraphObj>(runtimeExec, chainOps)->print();
|
||||
|
||||
auto bestGraph = make_ref<GraphObj>(runtimeExec, chainOps);
|
||||
// Eliminate transpose and reshape operators
|
||||
if (auto eliminatedGraph = mutator->eliminateVertically(
|
||||
make_ref<GraphObj>(runtimeExec, chainOps)))
|
||||
bestGraph = eliminatedGraph;
|
||||
// Fuse membound operators
|
||||
if (auto optGraph = mutator->fuseVertically(bestGraph))
|
||||
bestGraph = optGraph;
|
||||
for (auto op : bestGraph->getOperators()) {
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
}
|
||||
if (ops.empty()) {
|
||||
IT_TODO_HALT();
|
||||
IT_ASSERT(graph->getOutputs().size() == 1);
|
||||
IT_ASSERT(graph->getInputs().size() == 1);
|
||||
// auto g = make_ref<GraphObj>(runtime);
|
||||
// TODO: add identity
|
||||
ops.emplace_back(make_ref<ReshapeObj>(nullptr, graph->getInputs()[0],
|
||||
graph->getOutputs()[0]));
|
||||
}
|
||||
return make_ref<GraphObj>(runtimeExec, ops);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
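Note on fuseVertically above: starting from a seed memory-bound operator, the pass walks single-predecessor and single-successor neighbours that are also memory-bound and collects them into one chain before handing that chain to the mutator. A simplified restatement of the walk; Operator here is a stand-in for the project's smart-pointer type and getGuid() is assumed to return an int:

#include <unordered_set>
#include <vector>

template <class Operator>
std::vector<Operator> collectMemBoundChain(Operator seed,
                                           std::unordered_set<int> &visited) {
    std::vector<Operator> chain;
    // Walk upstream while the unique predecessor is memory-bound.
    std::vector<Operator> ups;
    for (auto cur = seed; cur->getPredecessors().size() == 1 &&
                          cur->getPredecessors()[0]->isMemBoundOp();) {
        cur = cur->getPredecessors()[0];
        if (!visited.insert(cur->getGuid()).second)
            break;                                // already claimed by another chain
        ups.push_back(cur);
    }
    chain.assign(ups.rbegin(), ups.rend());       // predecessors in execution order
    chain.push_back(seed);
    // Walk downstream while the unique successor is memory-bound.
    for (auto cur = seed; cur->getSuccessors().size() == 1 &&
                          cur->getSuccessors()[0]->isMemBoundOp();) {
        cur = cur->getSuccessors()[0];
        if (!visited.insert(cur->getGuid()).second)
            break;
        chain.push_back(cur);
    }
    return chain;
}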
@@ -8,12 +8,14 @@

namespace infini {

TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
: std::accumulate(shape.begin(), shape.end(), 1lu,
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}

string TensorObj::toString() const {
// Convert data pointer to string

@@ -24,8 +26,8 @@ string TensorObj::toString() const {
ss << "nullptr data";
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString() + ", " + runtime->toString() +
", " + ss.str() + "\n";
", dtype " + dtype.toString() + ", tensorType " +
std::to_string(enum_to_underlying(tensorType));
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());

@@ -34,6 +36,7 @@ string TensorObj::toString() const {
else
ret += ", source None";
ret += ", targets " + vecToString(targetGuids);
ret += ", " + runtime->toString() + ", " + ss.str();
return ret;
}

@@ -64,12 +67,19 @@ vector<size_t> TensorObj::getStride() const {

void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
void *ptr = nullptr;
Blob buffer;
if (!runtime->isCpu()) { // copy data to main memory
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
getRawDataPtr<void *>(), getBytes());
ptr = buffer->getPtr<void *>();
} else
ptr = data->getPtr<float *>();

#define TRY_PRINT(N) \
if (dtype == DataType(N)) \
std::cout << dataToString<DT<N>::t>() << std::endl;
std::cout << dataToString<DT<N>::t>(ptr) << std::endl;

TRY_PRINT(0) // fmt: new line
else TRY_PRINT(1) //

@@ -112,8 +122,9 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
}

void TensorObj::dataMalloc() {
if (!data)
if (!data) {
data = runtime->allocBlob(getBytes());
}
}

void TensorObj::copyData(const TensorObj *src) {

@@ -172,4 +183,27 @@ size_t TensorObj::getOffsetByBroadcastOffset(size_t bcOffset,

return getOffsetByPos(pos, shape);
}

Tensor TensorObj::clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
obj->targets.clear();
obj->source.reset();
return obj;
}

Tensor TensorObj::clone(Runtime runtime) const {
auto obj = make_ref<TensorObj>(*this);
obj->runtime = runtime;
obj->freeData();
obj->targets.clear();
obj->source.reset();
// FIXME
// if (hasData()) {
//     obj->dataMalloc();
//     obj->copyData(this);
// }
return obj;
}

}; // namespace infini
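Note on the clone() overloads added above: a cloned tensor keeps its shape, dtype, and tensor type but drops the data blob, source, and targets; the Runtime-taking overload additionally rebinds the copy to another runtime and, per the FIXME, does not yet copy data. An illustrative call, assuming a tensor t and a cudaRuntime handle are in scope:

Tensor detachedCopy = t->clone();       // same metadata, no data, no graph links
Tensor onCuda = t->clone(cudaRuntime);  // same, but bound to the CUDA runtime;
                                        // data must still be copied explicitly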
@@ -2,10 +2,58 @@
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "cuda_profiler_api.h"
#include "nnet/dbg.h"
#include "operators/any.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef INFINI_USE_TVM
#include "tvm/runtime/device_api.h"
#endif
namespace infini {

CudaRuntimeObj::CudaRuntimeObj()
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
cudaGraphStatus(false) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
checkCudnnError(cudnnSetStream(cudnn, stream));
checkCublasError(cublasSetStream(cublas, stream));
workspaceSize = 2ll << 30; // 2 GB
workspace = alloc(workspaceSize);
// Get CUDA device properties
checkCudaError(cudaGetDeviceProperties(&deviceProperties, 0));
}

CudaRuntimeObj::~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}

void CudaRuntimeObj::beginCudaGraphStreamCapture() {
enum cudaStreamCaptureStatus pCaptureStatus;
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
IT_ASSERT(pCaptureStatus == cudaStreamCaptureStatusNone);
cudaGraphStatus = true;
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}

tuple<cudaGraphExec_t, size_t> CudaRuntimeObj::endCudaGraphStreamCapture() {
cudaGraph_t cudaGraph;
cudaGraphExec_t instance;
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
cudaGraphStatus = false;
size_t numCudaGraphNodes;
checkCudaError(cudaGraphGetNodes(cudaGraph, nullptr, &numCudaGraphNodes));
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
return {instance, numCudaGraphNodes};
}

void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();

@@ -75,4 +123,74 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }

string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }

double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");

for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
bool isFakeOp = (as<AnyObj>(op) &&
as<AnyObj>(op)->getKernelName() == string("FakeOp"));
if (as<AnyObj>(op))
dbg(op, as<AnyObj>(op)->getKernelName() == string("FakeOp"));
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape &&
op->getOpType() != OpType::Flatten && !isFakeOp)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
vector<std::function<void(void)>> funcs;
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
funcs.push_back([&]() { kernel->compute(op, perfData, this); });
else
funcs.push_back([&]() { kernel->compute(op, this); });
}
return timeWithCudaGraph(funcs, rounds);
}

double
CudaRuntimeObj::timeWithCudaGraph(std::vector<std::function<void(void)>> funcs,
int rounds) {
// TODO: move this to kernel source?
// Init tvm stream
#ifdef INFINI_USE_TVM
DLDevice tvm_device_id = {kDLCUDA, 0};
auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
tvm_device->SetStream(tvm_device_id, getStream());
#endif
beginCudaGraphStreamCapture();
for (auto &f : funcs)
f();
auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
// Since one TVM packed function may contain more than one CUDA kernel, the
// number of captured kernels may exceed the number of operators.
IT_ASSERT(numCudaGraphNodes >= funcs.size(),
std::to_string(numCudaGraphNodes) +
" != " + std::to_string(funcs.size()));
return timeit(
[&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
},
[&, stream = getStream()]() { cudaStreamSynchronize(stream); },
std::min(50, rounds), rounds);
}

void CudaRuntimeObj::setEnableTF32(bool state) { enableTF32 = state; }

} // namespace infini
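Note on timeWithCudaGraph above: the non-compile-time-computable kernels are captured once into a CUDA graph and then replayed, so per-launch CPU overhead is excluded from the measurement. A minimal standalone sketch of the same capture-and-replay pattern with the plain CUDA runtime API; error checking is omitted and the kernels to be measured would be enqueued where the comment indicates:

#include <cuda_runtime.h>

void replayWithCudaGraph(int rounds) {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    // ... enqueue the kernels to be measured on `stream` here ...
    cudaStreamEndCapture(stream, &graph);
    cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0);

    for (int i = 0; i < rounds; ++i)      // replay without per-launch setup cost
        cudaGraphLaunch(instance, stream);
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(instance);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
}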
@@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>

namespace py = pybind11;

namespace infini {

namespace callback {

using namespace py::literals;

static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
    IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
    static auto exportONNXImpl =
        py::module_::import("pyinfinitensor.onnx").attr("save_onnx");
    exportONNXImpl(graph, path);
}

} // namespace callback

} // namespace infini
@ -1,9 +1,15 @@
|
|||
#include "core/graph_handler.h"
|
||||
#include "core/mutator.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "nnet/test_models.h"
|
||||
#include "operators/any.h"
|
||||
#include "operators/batch_norm.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/membound.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/reduce_mean.h"
|
||||
|
@ -63,6 +69,8 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Conv)
|
||||
.VALUE(OpType, Matmul)
|
||||
.VALUE(OpType, ConvTrans)
|
||||
.VALUE(OpType, ConvTransNHWC)
|
||||
.VALUE(OpType, ConvNHWC)
|
||||
.VALUE(OpType, G2BMM)
|
||||
.VALUE(OpType, GBMM)
|
||||
.VALUE(OpType, Pad)
|
||||
|
@ -94,8 +102,16 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Abs)
|
||||
.VALUE(OpType, Resize)
|
||||
.VALUE(OpType, Dropout)
|
||||
.VALUE(OpType, Conv2dReduce)
|
||||
.VALUE(OpType, Conv2dReduceTranspose)
|
||||
.VALUE(OpType, MemBound)
|
||||
.VALUE(OpType, Any)
|
||||
.export_values();
|
||||
|
||||
py::enum_<TensorType>(m, "TensorType")
|
||||
.VALUE(TensorType, Input)
|
||||
.VALUE(TensorType, Initialized)
|
||||
.VALUE(TensorType, Other);
|
||||
#undef VALUE
|
||||
}
|
||||
|
||||
|
@ -132,19 +148,34 @@ static Ref<RuntimeObj> intelcpu_runtime() { return make_ref<MklRuntimeObj>(); }
|
|||
#endif
|
||||
|
||||
static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Conv);
|
||||
auto conv = dynamic_cast<const ConvObj *>(op.get());
|
||||
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
|
||||
conv->getDw(), conv->getSh(), conv->getSw());
|
||||
IT_ASSERT(op->getOpType() == OpType::Conv ||
|
||||
op->getOpType() == OpType::ConvNHWC);
|
||||
auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
|
||||
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getSh(),
|
||||
conv->getSw(), conv->getDh(), conv->getDw());
|
||||
}
|
||||
|
||||
static std::tuple<int, int, int, int, int, int, int, int>
|
||||
conv_trans_attrs_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::ConvTrans);
|
||||
auto conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
|
||||
auto [oph, opw] = conv->getOutputPadding();
|
||||
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
|
||||
conv->getDw(), conv->getSh(), conv->getSw(), oph,
|
||||
IT_ASSERT(op->getOpType() == OpType::ConvTrans ||
|
||||
op->getOpType() == OpType::ConvTransNHWC);
|
||||
auto conv = dynamic_cast<const ConvBaseObj *>(op.get());
|
||||
int oph, opw;
|
||||
|
||||
if (op->getOpType() == OpType::ConvTrans) {
|
||||
auto _conv = dynamic_cast<const ConvTransposed2dObj *>(op.get());
|
||||
auto output_pad = _conv->getOutputPadding();
|
||||
oph = output_pad.first;
|
||||
opw = output_pad.second;
|
||||
} else {
|
||||
auto _conv = dynamic_cast<const ConvTransposed2dNHWCObj *>(op.get());
|
||||
auto output_pad = _conv->getOutputPadding();
|
||||
oph = output_pad.first;
|
||||
opw = output_pad.second;
|
||||
}
|
||||
|
||||
return std::make_tuple(conv->getPh(), conv->getPw(), conv->getSh(),
|
||||
conv->getSw(), conv->getDh(), conv->getDw(), oph,
|
||||
opw);
|
||||
}
|
||||
|
||||
|
@ -210,6 +241,11 @@ static vector<int64_t> reshape_shape_of(Operator op) {
|
|||
return ans;
|
||||
}
|
||||
|
||||
static int flatten_axis_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Flatten);
|
||||
return as<FlattenObj>(op)->getAxis();
|
||||
}
|
||||
|
||||
static vector<int64_t> pad_pads_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Pad);
|
||||
auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
|
||||
|
@ -219,11 +255,20 @@ static vector<int64_t> pad_pads_of(Operator op) {
|
|||
return ans;
|
||||
}
|
||||
|
||||
static string any_kernelName_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Any);
|
||||
return as<AnyObj>(op)->getKernelName();
|
||||
}
|
||||
|
||||
static vector<int> transpose_permute_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Transpose);
|
||||
return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
|
||||
}
|
||||
|
||||
static string membound_expr_of(Operator op) {
|
||||
return as<MemBoundObj>(op)->toJson();
|
||||
}
|
||||
|
||||
void export_functions(py::module &m) {
|
||||
#define FUNCTION(NAME) def(#NAME, &NAME)
|
||||
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
|
||||
|
@ -248,29 +293,45 @@ void export_functions(py::module &m) {
|
|||
.FUNCTION(reduce_mean_attrs_of)
|
||||
.FUNCTION(tensor_dtype)
|
||||
.FUNCTION(reshape_shape_of)
|
||||
.FUNCTION(flatten_axis_of)
|
||||
.FUNCTION(pad_pads_of)
|
||||
.FUNCTION(transpose_permute_of)
|
||||
.FUNCTION(concat_axis_of)
|
||||
.FUNCTION(split_axis_of)
|
||||
.FUNCTION(gather_axis_of);
|
||||
.FUNCTION(gather_axis_of)
|
||||
.FUNCTION(membound_expr_of)
|
||||
.FUNCTION(any_kernelName_of)
|
||||
.def("membound_hash_of",
|
||||
[](Operator op) { return as<MemBoundObj>(op)->getHash(); });
|
||||
#undef FUNCTION
|
||||
}
|
||||
|
||||
void init_graph_builder(py::module &m) {
|
||||
using Handler = GraphHandlerObj;
|
||||
|
||||
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
|
||||
py::class_<Object, Ref<Object>>(m, "_Object")
|
||||
.def("__str__", &Object::toString)
|
||||
.def("guid", &Object::getGuid);
|
||||
py::class_<RuntimeObj, Ref<RuntimeObj>>(m, "Runtime")
|
||||
.def("run", &RuntimeObj::run, "graph"_a, "tune"_a = false,
|
||||
"profiling"_a = false)
|
||||
.def("getPerfTime", &RuntimeObj::getPerfTime, "graph"_a, "profiling"_a,
|
||||
"allowEstimation"_a, "ignoreMemboundOp"_a)
|
||||
.def("timeNonCtcOperators", &RuntimeObj::timeNonCtcOperators);
|
||||
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
|
||||
RuntimeObj>(m, "CpuRuntime");
|
||||
#ifdef USE_CUDA
|
||||
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
|
||||
m, "CudaRuntime");
|
||||
py::class_<CudaRuntimeObj, Ref<CudaRuntimeObj>, RuntimeObj>(m,
|
||||
"CudaRuntime")
|
||||
.def("timeWithCudaGraph",
|
||||
py::overload_cast<Graph, int>(&CudaRuntimeObj::timeWithCudaGraph))
|
||||
.def("setEnableTF32", &CudaRuntimeObj::setEnableTF32);
|
||||
#endif
|
||||
#ifdef USE_BANG
|
||||
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
|
||||
m, "BangRuntime");
|
||||
#endif
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
|
||||
.def("fuid", &TensorObj::getFuid, policy::automatic)
|
||||
.def("shape", &TensorObj::getDims, policy::move)
|
||||
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
|
||||
|
@ -281,8 +342,10 @@ void init_graph_builder(py::module &m) {
|
|||
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
|
||||
.def("has_target", &TensorObj::hasTarget, policy::automatic)
|
||||
.def("src", &TensorObj::getSource, policy::move)
|
||||
.def("printData", &TensorObj::printData, policy::automatic);
|
||||
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
|
||||
.def("print_data", &TensorObj::printData)
|
||||
.def("data_malloc", &TensorObj::dataMalloc)
|
||||
.def("getTensorType", &TensorObj::getTensorType);
|
||||
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
|
||||
.def("op_type", &OperatorObj::getOpType, policy::automatic)
|
||||
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
|
||||
policy::reference)
|
||||
|
@ -291,9 +354,16 @@ void init_graph_builder(py::module &m) {
|
|||
policy::reference);
|
||||
py::class_<Handler>(m, "GraphHandler")
|
||||
.def(py::init<Runtime>())
|
||||
.def("tensor", &Handler::tensor, policy::move)
|
||||
.def(py::init<Graph>())
|
||||
.def("inputs", &Handler::inputs, policy::move)
|
||||
.def("outputs", &Handler::outputs, policy::move)
|
||||
.def("tensor", &Handler::tensor, policy::move, "shape"_a, "dtype"_a = 1,
|
||||
"tensor_type"_a = TensorType::Other)
|
||||
.def("conv", &Handler::conv, policy::move)
|
||||
.def("convTransposed2d", &Handler::convTransposed2d, policy::move)
|
||||
.def("convNHWC", &Handler::convNHWC, policy::move)
|
||||
.def("convtransposed2dNHWC", &Handler::convTransposed2dNHWC,
|
||||
policy::move)
|
||||
.def("matmul", &Handler::matmul, policy::move)
|
||||
.def("batchNorm", &Handler::batchNorm, policy::move)
|
||||
.def("maxPool", &Handler::maxPool, policy::move)
|
||||
|
@ -321,11 +391,50 @@ void init_graph_builder(py::module &m) {
|
|||
.def("reduce_mean", &Handler::reduceMean, policy::move)
|
||||
.def("slice", &Handler::slice, policy::move)
|
||||
.def("pad", &Handler::pad, policy::move)
|
||||
.def("memBound", &Handler::memBound, policy::move)
|
||||
.def("topo_sort", &Handler::topo_sort, policy::automatic)
|
||||
.def("optimize", &Handler::optimize, policy::automatic)
|
||||
.def("operators", &Handler::operators, policy::move)
|
||||
.def("data_malloc", &Handler::data_malloc, policy::automatic)
|
||||
.def("run", &Handler::run, policy::automatic);
|
||||
.def("run", &Handler::run, policy::automatic)
|
||||
.def("getGraph", &Handler::getGraph);
|
||||
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
|
||||
py::enum_<NMutator::Mode>(m, "NMutatorMode")
|
||||
.value("Normal", NMutator::Mode::Normal)
|
||||
.value("RuleBased", NMutator::Mode::RuleBased);
|
||||
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
|
||||
.def(py::init<NMutator::Mode>())
|
||||
.def(py::init<NMutator::Mode, vector<int>>())
|
||||
.def("run", &NMutator::run);
|
||||
py::class_<SearchEngine>(m, "SearchEngine")
|
||||
.def(py::init<Runtime, Ref<Mutator>>())
|
||||
.def("run", &SearchEngine::run);
|
||||
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
|
||||
.def("tensors", &GraphObj::getTensors)
|
||||
.def("operators", &GraphObj::getOperators)
|
||||
.def("inputs", &GraphObj::getInputs)
|
||||
.def("outputs", &GraphObj::getOutputs)
|
||||
.def("print", &GraphObj::print)
|
||||
.def("topo_sort", &GraphObj::topo_sort);
|
||||
}
|
||||
|
||||
void export_test_model(py::module &m) {
|
||||
#ifdef USE_CUDA
|
||||
m.def("runInfoGAN", &runInfoGAN)
|
||||
.def("getGANGraph", &getGANGraph)
|
||||
.def("getFSRCNNGraph", &getFSRCNNGraph)
|
||||
.def("getLongformer", &getLongformer)
|
||||
.def("getConvtransposedNHWC", &getConvtransposedNHWC)
|
||||
.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
|
||||
"tuning"_a = false, "mode"_a = NMutator::Mode::Normal,
|
||||
"rules"_a = vector<int>{})
|
||||
.def("initializeGraphTensors", &initializeGraphTensors, "g"_a,
|
||||
"l"_a = -0.1, "r"_a = 0.1, "useInt"_a = false)
|
||||
.def("convertNCHWtoNHWCModel", &convertNCHWtoNHWCModel)
|
||||
.def("optimizeWithDepthConstraint", &optimizeWithDepthConstraint)
|
||||
.def("optimizeModel", &optimizeModel)
|
||||
.def("optimizeModelWithRules", &optimizeModelWithRules);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
@ -335,4 +444,5 @@ PYBIND11_MODULE(backend, m) {
|
|||
infini::export_values(m);
|
||||
infini::export_functions(m);
|
||||
infini::init_graph_builder(m);
|
||||
infini::export_test_model(m);
|
||||
}
|
||||
|
|
|
@@ -33,7 +33,7 @@ class G2BMMCudnn : public CudaKernelWithoutConfig {
auto record =
make_ref<PerfRecordObj>(std::numeric_limits<double>::max());
const auto [warmupRounds, timingRounds] =
op->getB() > 100 ? tuple{1, 3} : tuple{5, 15};
op->getB() > 100 ? tuple{1, 1} : tuple{1, 2};
double tmp =
timeit([&]() { g2bmmKernel(op, context); },
[&]() { context->sync(); }, warmupRounds, timingRounds);

@@ -34,7 +34,7 @@ class GBMMCudnn : public CudaKernelWithoutConfig {
auto record =
make_ref<PerfRecordObj>(std::numeric_limits<double>::max());
const auto [warmupRounds, timingRounds] =
op->getB() > 100 ? tuple{1, 3} : tuple{5, 15};
op->getB() > 100 ? tuple{1, 1} : tuple{1, 3};
double tmp =
timeit([&]() { gbmmKernel(op, context); },
[&]() { context->sync(); }, warmupRounds, timingRounds);
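Note: the two hunks above shrink the warmup and timing rounds used when tuning G2BMM and GBMM, trading some timing precision for faster tuning, especially at large batch counts. For reference, a hedged sketch of a warmup-then-average timing helper in the spirit of the timeit(...) calls above; the project's own helper may differ:

#include <chrono>
#include <functional>

double timeAverageMs(const std::function<void()> &run,
                     const std::function<void()> &sync,
                     int warmupRounds, int timingRounds) {
    for (int i = 0; i < warmupRounds; ++i)
        run();                                   // warm caches, clocks, lazy init
    sync();
    auto begin = std::chrono::steady_clock::now();
    for (int i = 0; i < timingRounds; ++i)
        run();
    sync();                                      // wait for async work to finish
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(end - begin).count() /
           timingRounds;
}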
@@ -0,0 +1,88 @@
#include "operators/any.h"
#include "cuda/cuda_any.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {

class AnyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AnyObj>(_op);

auto inputs = op->getInputs();
auto outputs = op->getOutputs();

vector<float *> inputsRawPtr;
for (auto &input : inputs) {
inputsRawPtr.emplace_back(input->getRawDataPtr<float *>());
}
vector<float *> outputsRawPtr;
for (auto &output : outputs) {
outputsRawPtr.emplace_back(output->getRawDataPtr<float *>());
}

any_kernel_mapping(inputsRawPtr, outputsRawPtr, op->getKernelName(),
op->getOpAttrVector());
}
};

void any_kernel_mapping(vector<float *> inputs, vector<float *> outputs,
const string &kernelName, const vector<int> &attr) {
if (kernelName == "conv2dreduce_kernel") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2)
IT_ASSERT(outputs.size() == 1);
conv2dreduce_kernel(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
outputs[0], attr[0] != 0, attr[1], attr[2], attr[3],
attr[4], attr[5], attr[6], attr[7], attr[8],
attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "reduceConvRxSToNCHW") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2)
IT_ASSERT(outputs.size() == 1);
// float *input, float *bias, float *output, int act,
// int n, int h, int w, int f, int r, int s,
// int oh, int ow, int ph, int pw, int sh, int
// sw, int dh, int dw
reduceConvRxSToNCHW(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
outputs[0], attr[0], attr[1], attr[2], attr[3],
attr[4], attr[5], attr[6], attr[7], attr[8],
attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "convTranspose2dreduce_kernel") {
IT_ASSERT(attr.size() == 15);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2)
IT_ASSERT(outputs.size() == 1);
convTranspose2dreduce_kernel(
inputs[0], inputs.size() > 1 ? inputs[1] : nullptr, outputs[0],
attr[0] != 0, attr[1], attr[2], attr[3], attr[4], attr[5], attr[6],
attr[7], attr[8], attr[9], attr[10], attr[11], attr[12], attr[13],
attr[14]);
} else if (kernelName == "conv5x5ToConv3x3Reduce") {
IT_ASSERT(attr.size() == 4);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2)
IT_ASSERT(outputs.size() == 1);
conv5x5ToConv3x3Reduce(attr[0], attr[1], attr[2], attr[3], inputs[0],
outputs[0],
inputs.size() > 1 ? inputs[1] : nullptr);
} else if (kernelName == "conv3x3ToReduce") {
IT_ASSERT(attr.size() == 4);
IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
IT_ASSERT(outputs.size() == 1);
conv3x3ToReduce(attr[0], attr[1], attr[2], attr[3], inputs[0],
outputs[0], inputs.size() > 1 ? inputs[1] : nullptr);
} else if (kernelName == "FakeOp" || kernelName == "Reduce3x3Offset_hint") {
} else {
std::cout << "Unimplemented AnyOp cuda kernel: " << kernelName
<< std::endl;
IT_TODO_HALT();
}
}

REGISTER_KERNEL(Device::CUDA, OpType::Any, DataType::Float32, AnyCuda,
"Any_CUDA_Float32");

} // namespace infini
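Note: any_kernel_mapping above dispatches on the kernel name with an if/else chain. An alternative sketch of the same dispatch as a name-to-callable table, which keeps each kernel's argument checks next to its launch; this is illustrative only, the project uses the explicit chain above:

#include <functional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using KernelFn = std::function<void(const std::vector<float *> & /*inputs*/,
                                    const std::vector<float *> & /*outputs*/,
                                    const std::vector<int> & /*attr*/)>;

// Hypothetical registry: each entry validates its own attributes and launches.
std::unordered_map<std::string, KernelFn> &kernelTable() {
    static std::unordered_map<std::string, KernelFn> table;
    return table;
}

void dispatchAnyKernel(const std::string &name,
                       const std::vector<float *> &inputs,
                       const std::vector<float *> &outputs,
                       const std::vector<int> &attr) {
    auto it = kernelTable().find(name);
    if (it == kernelTable().end())
        throw std::runtime_error("Unimplemented AnyOp cuda kernel: " + name);
    it->second(inputs, outputs, attr);
}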
@@ -24,4 +24,4 @@ class ClipCuda : public CudaKernelWithoutConfig {
REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
"Clip_CUDA_Float32");

}; // namespace infini
} // namespace infini

@@ -1,9 +1,7 @@
#include "core/common.h"
#include "core/constants.h"
#include "cuda/cuda_common.h"
#include <math.h>

using infini::E_CONSTANT;
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

@@ -29,4 +27,4 @@ void clip_kernel(float *input, float *output, int num, float minValue,
maxValue);
}

}; // namespace infini
} // namespace infini
@@ -1,4 +1,5 @@
#include "operators/conv.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include <chrono>

@@ -52,7 +53,7 @@ class convCudnn : public Kernel {
cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
cudnnTensorDescriptor_t>
createCuDNNDescriptor(const Ref<ConvObj> &op,
createCuDNNDescriptor(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());

@@ -68,15 +69,23 @@ class convCudnn : public Kernel {

int channelsPerGrp = cpg, channels = c;

// set input format
cudnnTensorFormat_t tensorFormat = (op->getOpType() == OpType::ConvNHWC)
? CUDNN_TENSOR_NHWC
: CUDNN_TENSOR_NCHW;

// get inputs
cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
inDesc, tensorFormat, CUDNN_DATA_FLOAT, n, channels, h, w));

// get kernels
cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
// FIXME: filter data layout is not changed with input data layout
// since FCRS shows better performance for NHWC inputs in some cases.
// This should be tunable.
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, f,
channelsPerGrp, r, s));

@@ -84,7 +93,7 @@ class convCudnn : public Kernel {
cudnnTensorDescriptor_t biasDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
biasDesc, tensorFormat, CUDNN_DATA_FLOAT, 1, f, 1, 1));

// get convolution descriptor
cudnnConvolutionDescriptor_t convDesc;

@@ -125,18 +134,25 @@ class convCudnn : public Kernel {
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outn, outc,
outh, outw));
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
checkCudnnError(cudnnSetTensor4dDescriptor(
outDesc, tensorFormat, CUDNN_DATA_FLOAT, outn, outc, outh, outw));

if (op->getOpType() == OpType::ConvNHWC) {
IT_ASSERT((vector{outn, outh, outw, outc}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
} else {
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
}

return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc);
}

bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
bool cuDNNUnfused(const Ref<ConvBaseObj> &op,
const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;

@@ -219,12 +235,14 @@ class convCudnn : public Kernel {
const RuntimeObj *_context) const override {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op);
auto context = const_cast<CudaRuntimeObj *>(
dynamic_cast<const CudaRuntimeObj *>(_context));
auto op = as<ConvBaseObj>(_op);
int try_algo = op->getOpType() == OpType::ConvNHWC ? 2 : N_ALGO;
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) {
for (int algo = 0; algo < try_algo; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef;
record.mode = mode;

@@ -251,16 +269,15 @@ class convCudnn : public Kernel {
record.workspaceSize, &beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
record.time = timeit(
[&]() {
cudnnConvolutionForward(context->cudnnHandle(), &alpha,
inDesc, inData, knDesc, knData,
convDesc, ALGOS[record.algo],
wsData, record.workspaceSize,
&beta, outDesc, outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
// Time the kernel with CUDA Graph to get a precise time
std::function<void(void)> func = [&]() {
cudnnConvolutionForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData);
|
||||
};
|
||||
record.time = context->timeWithCudaGraph({func}, 100);
|
||||
// printf("mode:%d algo:%d :%.4lf\n", mode, algo, record.time);
|
||||
|
||||
// Update the tune result
|
||||
if (ret.time > record.time)
|
||||
|
@ -283,7 +300,7 @@ class convCudnn : public Kernel {
|
|||
|
||||
void compute(const Operator &_op, const PerfRecord &_record,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvObj>(_op);
|
||||
auto op = as<ConvBaseObj>(_op);
|
||||
auto record = as<ConvCuDnnPerfRecordObj>(_record);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
bool success = cuDNNUnfused(op, record, context);
|
||||
|
@ -294,5 +311,8 @@ class convCudnn : public Kernel {
|
|||
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
|
||||
"Conv_cuDNN_CUDA_Float32");
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ConvNHWC, DataType::Float32, convCudnn,
|
||||
"ConvNHWC_cuDNN_CUDA_Float32");
|
||||
|
||||
REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json);
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
#include "operators/conv2dreduce.h"
|
||||
#include "cuda/cuda_conv2dreduce.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class Conv2dReduceCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op, const RuntimeObj *_context) const {
|
||||
auto op = as<Conv2dReduceBase>(_op);
|
||||
float *const input = (op->getInputs(0)->getRawDataPtr<float *>());
|
||||
float *const bias =
|
||||
op->getBias() ? (op->getBias()->getRawDataPtr<float *>()) : nullptr;
|
||||
float *const output = (op->getOutput()->getRawDataPtr<float *>());
|
||||
|
||||
auto dim = op->getInputs(0)->getDims();
|
||||
int n = dim[0], h = dim[1], w = dim[2], f = dim[3], r = dim[4],
|
||||
s = dim[5];
|
||||
int dh = op->getDh(), dw = op->getDw();
|
||||
int sh = op->getSh(), sw = op->getSw();
|
||||
int ph = op->getPh(), pw = op->getPw();
|
||||
auto odim = op->getOutput()->getDims();
|
||||
int oh = odim[1], ow = odim[2];
|
||||
bool PReLU = op->getPReLU();
|
||||
// float paramReLU = op->getParamReLU();
|
||||
|
||||
auto opType = op->getOpType();
|
||||
|
||||
if (opType == OpType::Conv2dReduce) {
|
||||
conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
|
||||
oh, ow, ph, pw, sh, sw, dh, dw);
|
||||
} else {
|
||||
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
|
||||
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduce, DataType::Float32,
|
||||
Conv2dReduceCuda, "Conv2dReduce_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduceTranspose, DataType::Float32,
|
||||
Conv2dReduceCuda, "Conv2dReduceTranspose_CUDA_Float32");
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,239 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include "nnet/dbg.h"
|
||||
|
||||
using dtype = float;
|
||||
|
||||
__global__ void conv2dreduce_kernel_(float *__restrict__ input,
|
||||
float *__restrict__ bias,
|
||||
float *__restrict__ output,
|
||||
const bool PReLU, const int n, const int f,
|
||||
const int h, const int w, const int oh,
|
||||
const int ow, const int r, const int s,
|
||||
const int ph, const int pw, const int dh,
|
||||
const int dw, const int sh, const int sw) {
|
||||
// output shape: (n, oh, ow, f)
|
||||
// input shape: (n, h, w, f, r, s)
|
||||
int nid = blockIdx.x, fid = blockIdx.y;
|
||||
int hid = threadIdx.x, wid = threadIdx.y;
|
||||
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
|
||||
nchunck = h * hchunk;
|
||||
float *nfinput = input + nid * nchunck + fid * fchunck;
|
||||
if (nid < n && fid < f && hid < oh && wid < ow) {
|
||||
float imm = 0.0;
|
||||
int ihst = hid * sh - ph;
|
||||
int iwst = wid * sw - pw;
|
||||
for (int ri = 0; ri < r; ++ri) {
|
||||
for (int si = 0; si < s; ++si) {
|
||||
int ihid = ihst + ri * dh;
|
||||
int iwid = iwst + si * dw;
|
||||
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
|
||||
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
|
||||
si);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias) {
|
||||
imm += bias[fid];
|
||||
}
|
||||
if (PReLU) {
|
||||
imm = imm > 0.0 ? imm : 0.0;
|
||||
}
|
||||
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
|
||||
}
|
||||
}
|
||||
__global__ void convTranspose2dreduce_kernel2_(
|
||||
float *__restrict__ input, float *__restrict__ bias,
|
||||
float *__restrict__ output, const bool PReLU, const int n, const int f,
|
||||
const int h, const int w, const int oh, const int ow, const int r,
|
||||
const int s, const int ph, const int pw, const int dh, const int dw,
|
||||
const int sh, const int sw) {
|
||||
int warp_id = (blockDim.x / 32) * blockIdx.x + threadIdx.x / 32;
|
||||
int lane = threadIdx.x % 32;
|
||||
int nid = warp_id / (f * oh * ow);
|
||||
int fid = (warp_id - nid * (f * oh * ow)) / (oh * ow);
|
||||
int hid = (warp_id - nid * (f * oh * ow) - fid * (oh * ow)) / ow;
|
||||
int wid = warp_id % ow;
|
||||
if (hid >= oh || wid >= ow || nid > n || fid > f)
|
||||
return;
|
||||
|
||||
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
|
||||
nchunck = h * hchunk;
|
||||
float *nfinput = input + nid * nchunck + fid * fchunck;
|
||||
// view as conv, the true ph and pw
|
||||
int tph = r - ph - 1, tpw = s - pw - 1;
|
||||
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
|
||||
|
||||
float imm = 0.0;
|
||||
int ihst = hid - tph;
|
||||
int iwst = wid - tpw;
|
||||
for (int idx = lane; idx < r * s; idx += 32) {
|
||||
int ri = idx / s;
|
||||
int si = idx % s;
|
||||
int ihid = ihst + r - ri - 1;
|
||||
int iwid = iwst + s - si - 1;
|
||||
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
|
||||
(ihid % sh == 0) && (iwid % sw == 0)) {
|
||||
imm += *(nfinput + (ihid / sh) * hchunk + (iwid / sw) * wchunk +
|
||||
ri * s + si);
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 16; k > 0; k >>= 1) {
|
||||
imm += __shfl_down_sync(0xffffffff, imm, k); // sum
|
||||
}
|
||||
if (lane == 0) {
|
||||
if (bias) {
|
||||
imm += bias[fid];
|
||||
}
|
||||
if (PReLU) {
|
||||
imm = imm > 0.0 ? imm : 0.0;
|
||||
}
|
||||
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void convTranspose2dreduce_kernel_(
|
||||
float *__restrict__ input, float *__restrict__ bias,
|
||||
float *__restrict__ output, const bool PReLU, const int n, const int f,
|
||||
const int h, const int w, const int oh, const int ow, const int r,
|
||||
const int s, const int ph, const int pw, const int dh, const int dw,
|
||||
const int sh, const int sw, const int block_x_num, const int block_y_num) {
|
||||
// assert dh = dw = 1
|
||||
int nid = blockIdx.x / block_x_num, fid = blockIdx.y / block_y_num;
|
||||
int hid = (blockIdx.x % block_x_num) * blockDim.x + threadIdx.x,
|
||||
wid = (blockIdx.y % block_y_num) * blockDim.y + threadIdx.y;
|
||||
if (hid >= oh || wid >= ow)
|
||||
return;
|
||||
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
|
||||
nchunck = h * hchunk;
|
||||
float *nfinput = input + nid * nchunck + fid * fchunck;
|
||||
// view as conv, the true ph and pw
|
||||
int tph = r - ph - 1, tpw = s - pw - 1;
|
||||
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
|
||||
if (nid < n && fid < f && hid < oh && wid < ow) {
|
||||
float imm = 0.0;
|
||||
int ihst = hid - tph;
|
||||
int iwst = wid - tpw;
|
||||
for (int ri = 0; ri < r; ++ri) {
|
||||
for (int si = 0; si < s; ++si) {
|
||||
int ihid = ihst + r - ri - 1;
|
||||
int iwid = iwst + s - si - 1;
|
||||
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
|
||||
(ihid % sh == 0) && (iwid % sw == 0)) {
|
||||
imm += *(nfinput + (ihid / sh) * hchunk +
|
||||
(iwid / sw) * wchunk + ri * s + si);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias) {
|
||||
imm += bias[fid];
|
||||
}
|
||||
if (PReLU) {
|
||||
imm = imm > 0.0 ? imm : 0.0;
|
||||
}
|
||||
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
|
||||
}
|
||||
}
|
||||
|
||||
// nhwrsc -> nhwc
|
||||
__global__ void reduce_4x4(dtype *in, dtype *out, int act, const int N,
|
||||
const int F, const int H, const int W, const int IH,
|
||||
const int IW) {
|
||||
// #define in_index(n, h, w, r, s, f) \
|
||||
// ((((((n)*IH + h) * IW + w) * R + r) * S + s) * F + f)
|
||||
#define in_index(n, h, w, f, r, s) \
|
||||
((((((n)*IH + h) * IW + w) * F + f) * R + r) * S + s)
|
||||
#define out_index(n, h, w, f) (((((n)*H) + (h)) * W + (w)) * F + (f))
|
||||
const int R = 4, S = 4;
|
||||
const int n_tasks = N * F * H * W;
|
||||
int start = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = start; i < n_tasks; i += stride) {
|
||||
int t = i, n, f, h, w;
|
||||
f = t % F;
|
||||
t /= F;
|
||||
w = t % W;
|
||||
t /= W;
|
||||
h = t % H;
|
||||
t /= H;
|
||||
n = t;
|
||||
|
||||
// unroll this 2-iter loop
|
||||
float sum = 0;
|
||||
int x, y;
|
||||
for (int r = (h + 1) & 1; r < R; r += 2) {
|
||||
x = (h + 1 - r) / 2;
|
||||
if (x >= 0 && x < IH) {
|
||||
for (int s = (w + 1) & 1; s < S; s += 2) {
|
||||
y = (w + 1 - s) / 2;
|
||||
if (y >= 0 && y < IW) {
|
||||
sum += in[in_index(n, x, y, f, r, s)];
|
||||
// if (i==0)
|
||||
// printf("TTT nhwf= %d,%d,%d,%d x=%d y=%d, v=%f,
|
||||
// index=%d, rsf %d %d %d\n", n, h, w,
|
||||
// f, x, y, in[in_index(n, x, y, r, s, f)],
|
||||
// in_index(n, x, y, r, s, f), r,s,f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (act == 0) {
|
||||
out[out_index(n, h, w, f)] = sum;
|
||||
} else if (act == 1) { // Relu
|
||||
out[out_index(n, h, w, f)] = sum > 0 ? sum : 0;
|
||||
} else if (act == 2) {
|
||||
out[out_index(n, h, w, f)] = tanhf(sum);
|
||||
}
|
||||
}
|
||||
#undef in_index
|
||||
#undef out_index
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
|
||||
int n, int h, int w, int f, int r, int s, int oh,
|
||||
int ow, int ph, int pw, int sh, int sw, int dh,
|
||||
int dw) {
|
||||
dim3 grid(n, f);
|
||||
dim3 block(oh, ow);
|
||||
// cudaStream_t stream(cudaStreamPerThread);
|
||||
conv2dreduce_kernel_<<<grid, block, 0>>>(input, bias, output, PReLU, n, f,
|
||||
h, w, oh, ow, r, s, ph, pw, dh, dw,
|
||||
sh, sw);
|
||||
}
|
||||
|
||||
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
|
||||
int act, int n, int h, int w, int f, int r,
|
||||
int s, int oh, int ow, int ph, int pw, int sh,
|
||||
int sw, int dh, int dw) {
|
||||
dim3 grid(n, f);
|
||||
dim3 block(oh, ow);
|
||||
// cudaStream_t stream(cudaStreamPerThread);
|
||||
// puts("convTranspose2dreduce_kernel is executed");
|
||||
if (r == 4 && s == 4 && sh == 2 && sw == 2) {
|
||||
const int M = r * s * f, N = n * h * w;
|
||||
reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
|
||||
ow, h, w);
|
||||
} else {
|
||||
// puts("why use this conv2dreduce");
|
||||
// block.x = 32;
|
||||
// block.y = 32;
|
||||
// int block_x_num = (oh + block.x - 1) / block.x;
|
||||
// int block_y_num = (ow + block.y - 1) / block.y;
|
||||
// grid.x = n * (block_x_num);
|
||||
// grid.y = f * (block_y_num);
|
||||
// convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
|
||||
// input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
|
||||
// dh, dw, sh, sw, block_x_num, block_y_num);
|
||||
|
||||
block.x = 128;
|
||||
block.y = 1;
|
||||
grid.x = (n * f * ow * oh + block.x / 32 - 1) / (block.x / 32);
|
||||
grid.y = 1;
|
||||
convTranspose2dreduce_kernel2_<<<grid, block, 0>>>(
|
||||
input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
|
||||
dh, dw, sh, sw);
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
|
@ -6,6 +6,8 @@ namespace infini {
|
|||
|
||||
struct MatmulCublasPerfRecordObj : public PerfRecordObj {
|
||||
int algo = CUBLAS_GEMM_DEFAULT;
|
||||
/// @brief 0 for cublasGemmStridedBatchedEx, 1 for cublasGemmEx
|
||||
int apiId = 0;
|
||||
void to_json(json &j) override {
|
||||
j["type"] = 2;
|
||||
j["data"] = std::make_pair(algo, time);
|
||||
|
@ -19,8 +21,7 @@ struct MatmulCublasPerfRecordObj : public PerfRecordObj {
|
|||
}
|
||||
};
|
||||
|
||||
constexpr int N_ALGO = 24;
|
||||
constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = {
|
||||
const vector<cublasGemmAlgo_t> Algos = {
|
||||
CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2,
|
||||
CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5,
|
||||
CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7, CUBLAS_GEMM_ALGO8,
|
||||
|
@ -30,6 +31,17 @@ constexpr cublasGemmAlgo_t ALGOS[N_ALGO] = {
|
|||
CUBLAS_GEMM_ALGO18, CUBLAS_GEMM_ALGO19, CUBLAS_GEMM_ALGO20,
|
||||
CUBLAS_GEMM_ALGO21, CUBLAS_GEMM_ALGO22, CUBLAS_GEMM_ALGO23,
|
||||
};
|
||||
const vector<cublasGemmAlgo_t> AlgosTensorOp = {
|
||||
CUBLAS_GEMM_DFALT_TENSOR_OP, CUBLAS_GEMM_ALGO0_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO1_TENSOR_OP, CUBLAS_GEMM_ALGO2_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO3_TENSOR_OP, CUBLAS_GEMM_ALGO4_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO5_TENSOR_OP, CUBLAS_GEMM_ALGO6_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO7_TENSOR_OP, CUBLAS_GEMM_ALGO8_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO9_TENSOR_OP, CUBLAS_GEMM_ALGO10_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO11_TENSOR_OP, CUBLAS_GEMM_ALGO12_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO13_TENSOR_OP, CUBLAS_GEMM_ALGO14_TENSOR_OP,
|
||||
CUBLAS_GEMM_ALGO15_TENSOR_OP};
|
||||
|
||||
class matmulCublas : public Kernel {
|
||||
bool do_compute(const Operator &_op, const PerfRecord &_record,
|
||||
const RuntimeObj *_context) const {
|
||||
|
@ -47,9 +59,12 @@ class matmulCublas : public Kernel {
|
|||
const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n,
|
||||
ldc = n;
|
||||
const float alpha = 1.f, beta = 0.f;
|
||||
// TODO:use compute type
|
||||
cublasStatus_t stat;
|
||||
if (b > 1) {
|
||||
// Set the compute type to TF32 if enabled
|
||||
cublasComputeType_t computeType = context->getEnableTF32()
|
||||
? CUBLAS_COMPUTE_32F_FAST_TF32
|
||||
: CUBLAS_COMPUTE_32F;
|
||||
if (record->apiId == 0) {
|
||||
// Support batch broadcast with zero stride
|
||||
int dimA = op->getInputs(0)->getDims().size();
|
||||
int dimB = op->getInputs(1)->getDims().size();
|
||||
|
@ -63,17 +78,23 @@ class matmulCublas : public Kernel {
|
|||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: n * k;
|
||||
// printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f,
|
||||
// B "
|
||||
// "%d %lld, A %d %lld, C %d %d, b %d %d\n",
|
||||
// opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
|
||||
// m * n, b, record->algo);
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
|
||||
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
|
||||
&beta, outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
|
||||
&beta, outData, CUDA_R_32F, ldc, m * n, b, computeType,
|
||||
(cublasGemmAlgo_t)record->algo);
|
||||
} else {
|
||||
} else if (record->apiId == 1) {
|
||||
stat = cublasGemmEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
|
||||
CUDA_R_32F, ldb, inAData, CUDA_R_32F, lda, &beta, outData,
|
||||
CUDA_R_32F, ldc, CUDA_R_32F, (cublasGemmAlgo_t)record->algo);
|
||||
}
|
||||
CUDA_R_32F, ldc, computeType, (cublasGemmAlgo_t)record->algo);
|
||||
} else
|
||||
IT_ASSERT(false);
|
||||
// if (stat != CUBLAS_STATUS_SUCCESS)
|
||||
// cout << cublasGetErrorString(stat);
|
||||
return (stat == CUBLAS_STATUS_SUCCESS);
|
||||
|
@ -98,15 +119,29 @@ class matmulCublas : public Kernel {
|
|||
IT_ASSERT(op);
|
||||
auto ret = make_ref<MatmulCublasPerfRecordObj>();
|
||||
ret->time = std::numeric_limits<double>::max();
|
||||
for (int i = 0; i < N_ALGO; i++) {
|
||||
auto rcd = make_ref<MatmulCublasPerfRecordObj>();
|
||||
rcd->algo = ALGOS[i];
|
||||
if (!do_compute(_op, rcd, _context))
|
||||
continue;
|
||||
rcd->time = timeit([&]() { do_compute(_op, rcd, _context); },
|
||||
[&]() { context->sync(); });
|
||||
if (rcd->time < ret->time)
|
||||
ret = rcd;
|
||||
vector<int> apis{0};
|
||||
if (op->getB() == 1)
|
||||
apis.emplace_back(1);
|
||||
|
||||
// Set the possible algorithm range
|
||||
auto algos = Algos;
|
||||
if (context->getEnableTF32()) {
|
||||
algos.insert(algos.end(), AlgosTensorOp.begin(),
|
||||
AlgosTensorOp.end());
|
||||
}
|
||||
|
||||
for (int api : apis) {
|
||||
for (size_t i = 0; i < algos.size(); i++) {
|
||||
auto rcd = make_ref<MatmulCublasPerfRecordObj>();
|
||||
rcd->apiId = api;
|
||||
rcd->algo = algos[i];
|
||||
if (!do_compute(_op, rcd, _context))
|
||||
continue;
|
||||
rcd->time = timeit([&]() { do_compute(_op, rcd, _context); },
|
||||
[&]() { context->sync(); });
|
||||
if (rcd->time < ret->time)
|
||||
ret = rcd;
|
||||
}
|
||||
}
|
||||
IT_ASSERT(ret->time < std::numeric_limits<double>::max(),
|
||||
"No valid algorithm found for " + op->toString());
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#ifdef INFINI_USE_TVM
|
||||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_conv2dreduce.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "dlpack/dlpack.h"
|
||||
#include "ffi/ffi_embed.h"
|
||||
|
@ -8,6 +9,13 @@
|
|||
#include "operators/pooling.h"
|
||||
#include "tvm/runtime/module.h"
|
||||
#include "tvm/runtime/packed_func.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
|
@ -22,6 +30,8 @@ class TVMRecordObj : public PerfRecordObj {
|
|||
std::string dllPath;
|
||||
std::string funcName;
|
||||
std::vector<int> inputIdx;
|
||||
tvm::runtime::PackedFunc packedFunc;
|
||||
bool useExistingKernel = false;
|
||||
};
|
||||
|
||||
using TVMRecord = Ref<TVMRecordObj>;
|
||||
|
@ -33,9 +43,15 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
auto op = as<MemBoundObj>(_op);
|
||||
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
|
||||
tvm::runtime::PackedFunc packedFunc =
|
||||
getPackedFunction(tvmRecord->dllPath, tvmRecord->funcName);
|
||||
IT_ASSERT(packedFunc != nullptr);
|
||||
|
||||
// Use user-defined kernels
|
||||
if (tvmRecord->useExistingKernel) {
|
||||
bool success = useExistingKernels(op);
|
||||
IT_ASSERT(success);
|
||||
return;
|
||||
}
|
||||
|
||||
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
|
||||
|
||||
// prepare inputs and outputs
|
||||
vector<DLTensorHolder> inputsHolder;
|
||||
|
@ -63,10 +79,18 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
// Premise: op is idempotent since it is called multiple times.
|
||||
PerfRecord tune(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
TVMRecord ret = std::make_shared<TVMRecordObj>();
|
||||
auto op = as<MemBoundObj>(_op);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
|
||||
// If hash matches, use user-defined kernels
|
||||
if (useExistingKernels(op)) {
|
||||
TVMRecord ret = std::make_shared<TVMRecordObj>();
|
||||
ret->time = timeit([&]() { useExistingKernels(op); },
|
||||
[&]() { context->sync(); });
|
||||
ret->useExistingKernel = true;
|
||||
return ret;
|
||||
}
|
||||
|
||||
// invoke Ansor to tune a membound kernel
|
||||
auto [expr, hash] = op->getSimplifiedNnetExpr();
|
||||
nnet::AsTVMVisitor visitor;
|
||||
|
@ -93,6 +117,7 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
if (inputName == op->getNnetInputs()[j]->getName())
|
||||
break;
|
||||
}
|
||||
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
|
||||
inputIdx.emplace_back(j);
|
||||
}
|
||||
|
||||
|
@ -114,29 +139,41 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
|
||||
preArgs.first.size());
|
||||
|
||||
TVMRecord ret = std::make_shared<TVMRecordObj>();
|
||||
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
|
||||
[&]() { context->sync(); });
|
||||
ret->kernelName = kernelName;
|
||||
ret->dllPath = dllPath;
|
||||
ret->funcName = func;
|
||||
ret->inputIdx = inputIdx;
|
||||
ret->packedFunc = packedFunc;
|
||||
|
||||
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
|
||||
const std::vector<std::string> &inDTypes,
|
||||
const std::vector<int> &outDims,
|
||||
const std::string &outDType,
|
||||
const std::string &lambda,
|
||||
const std::string &funcName,
|
||||
const std::string &nnetExprString,
|
||||
const std::string &nnetSimplifiedExprString,
|
||||
const HashType hashCode) const {
|
||||
json j;
|
||||
// Consistant with python API interface
|
||||
j["input_tensors"] = inDims;
|
||||
j["input_dtypes"] = inDTypes;
|
||||
j["output_tensor"] = outDims;
|
||||
j["output_dtype"] = outDType;
|
||||
j["tvm_code"] = lambda;
|
||||
j["func_name"] = funcName;
|
||||
j["nnet_expression"] = nnetExprString;
|
||||
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
|
||||
j["hash_code"] = std::to_string(hashCode);
|
||||
return j.dump();
|
||||
}
|
||||
|
||||
/// @brief
|
||||
/// @param inDims
|
||||
/// @param inDTypes
|
||||
/// @param outDims
|
||||
/// @param outDType
|
||||
/// @param lambda
|
||||
/// @param funcName Generated function name
|
||||
/// @param nnetExpressionString Save expr in string for logging.
|
||||
/// @param nnetSimplifiedExprString Save simplified expr in string for
|
||||
/// logging.
|
||||
/// @param hashCode (optional) Hash code of the input expression for kernel
|
||||
/// cache.
|
||||
/// @return
|
||||
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
|
||||
const std::vector<std::string> &inDTypes,
|
||||
const std::vector<int> &outDims,
|
||||
|
@ -146,29 +183,60 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
const std::string &nnetExprString,
|
||||
const std::string &nnetSimplifiedExprString,
|
||||
const HashType hashCode) const {
|
||||
std::string dllPath;
|
||||
try {
|
||||
start_interpreter();
|
||||
// Use static to avoid re-importing the module. Re-importing results
|
||||
// in cuBLAS failure, whose root cause is not identified yet.
|
||||
static auto func =
|
||||
py::module::import("cpp_plugin").attr("gen_ansor_so");
|
||||
py::tuple code =
|
||||
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
|
||||
nnetExprString, nnetSimplifiedExprString,
|
||||
std::to_string(hashCode));
|
||||
dllPath = py::str(code[0]);
|
||||
} catch (py::error_already_set &e) {
|
||||
if (e.matches(PyExc_ImportError)) {
|
||||
std::cerr << "Import Error. Don't forget to set environment "
|
||||
"variable PYTHONPATH to contain "
|
||||
"<repo-root>/python"
|
||||
<< std::endl;
|
||||
}
|
||||
throw;
|
||||
int fdP2C[2], fdC2P[2];
|
||||
for (auto fd : {fdP2C, fdC2P}) {
|
||||
int status = pipe(fd);
|
||||
IT_ASSERT(status == 0, "pipe failed");
|
||||
}
|
||||
pid_t pid = fork();
|
||||
IT_ASSERT(pid >= 0, "fork failed");
|
||||
if (pid == 0) { // Child process
|
||||
close(fdP2C[1]);
|
||||
close(fdC2P[0]);
|
||||
|
||||
return dllPath;
|
||||
dup2(fdP2C[0], STDIN_FILENO);
|
||||
close(fdP2C[0]);
|
||||
|
||||
string cmd =
|
||||
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
|
||||
std::to_string(fdC2P[1]) + ")";
|
||||
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
|
||||
execvp("python3", const_cast<char *const *>(argv));
|
||||
} else { // Parent process
|
||||
close(fdP2C[0]);
|
||||
close(fdC2P[1]);
|
||||
|
||||
// Write to pipe
|
||||
string serializedArgs = serializeTVMArgs(
|
||||
inDims, inDTypes, outDims, outDType, lambda, funcName,
|
||||
nnetExprString, nnetSimplifiedExprString, hashCode);
|
||||
int status = -1;
|
||||
status =
|
||||
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
|
||||
IT_ASSERT((size_t)status == serializedArgs.size(),
|
||||
"Failed to write to pipe");
|
||||
close(fdP2C[1]);
|
||||
|
||||
// Wait for TVM
|
||||
waitpid(pid, &status, 0);
|
||||
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
|
||||
const int es = WEXITSTATUS(status);
|
||||
IT_ASSERT(es == 0,
|
||||
"TVM process exit with code " + std::to_string(es));
|
||||
|
||||
// Read from pipe
|
||||
FILE *stream;
|
||||
stream = fdopen(fdC2P[0], "r");
|
||||
char buf_read[257] = {0};
|
||||
status = std::fscanf(stream, "%256c", buf_read);
|
||||
IT_ASSERT(status == 1, "Failed to read from pipe");
|
||||
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
|
||||
fclose(stream);
|
||||
close(fdC2P[0]);
|
||||
return buf_read;
|
||||
}
|
||||
IT_ASSERT(false, "Should not reach here");
|
||||
return "";
|
||||
}
|
||||
|
||||
tvm::runtime::PackedFunc getPackedFunction(string path,
|
||||
|
@ -214,6 +282,35 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
|
||||
return {values, type_codes};
|
||||
}
|
||||
|
||||
bool useExistingKernels(Ref<MemBoundObj> op) const {
|
||||
return false;
|
||||
const map<HashType, tuple<int, int, int, int, int, int, int, int, int,
|
||||
int, int, int, int, int, int>>
|
||||
hashMap = {
|
||||
// clang-format off
|
||||
{18446744073661354550ULL, {1, 1, 2, 2, 256, 4, 4, 4, 4, 1, 1, 2, 2, 1, 1}},
|
||||
{124145340ULL, {1, 1, 4, 4, 128, 4, 4, 8, 8, 1, 1, 2, 2, 1, 1}},
|
||||
{18446744073695718019ULL, {1, 1, 8, 8, 64, 4, 4, 16, 16, 1, 1, 2, 2, 1, 1}},
|
||||
{515085072ULL, {2, 1, 16, 16, 3, 4, 4, 32, 32, 1, 1, 2, 2, 1, 1}}
|
||||
}; // clang-format on
|
||||
float *input = op->getInputs(0)->getRawDataPtr<float *>();
|
||||
float *bias = nullptr;
|
||||
float *output = op->getOutput()->getRawDataPtr<float *>();
|
||||
if (auto it = hashMap.find(op->getHash()); it != hashMap.end()) {
|
||||
auto &[PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw] =
|
||||
it->second;
|
||||
IT_ASSERT(op->getInputs(0)->size() ==
|
||||
size_t(n) * h * w * f * r * s);
|
||||
IT_ASSERT(op->getOutput()->size() == size_t(n) * oh * ow * f);
|
||||
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
|
||||
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
|
||||
return true;
|
||||
}
|
||||
// conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
|
||||
// oh, ow, ph, pw, sh, sw, dh, dw);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
|
||||
|
|
|
@ -0,0 +1,287 @@
|
|||
#include "core/common.h"
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_merge_conv_3x3_1x1(
|
||||
T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
|
||||
const int N, const int H, const int W, const int F, const int N_offset,
|
||||
const int H_offset, const int W_offset, const int F_offset,
|
||||
const int out_N_offset, const int out_F_offset, const int out_H_offset,
|
||||
const int out_W_offset, const int num) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num) {
|
||||
int tmptid = tid;
|
||||
const int n = (tmptid / out_N_offset);
|
||||
tmptid -= n * out_N_offset;
|
||||
const int f = tmptid / out_F_offset;
|
||||
tmptid -= f * out_F_offset;
|
||||
const int h = tmptid / out_H_offset;
|
||||
tmptid -= h * out_H_offset;
|
||||
const int w = tmptid / out_W_offset;
|
||||
|
||||
const int noff = n * N_offset;
|
||||
const int hoff = h * H_offset;
|
||||
const int woff = w * W_offset;
|
||||
const int foff = f * F_offset;
|
||||
input += noff + foff + woff + hoff;
|
||||
T res = 0;
|
||||
res += input[4];
|
||||
res += input[9];
|
||||
|
||||
if (h < H - 1) {
|
||||
res += input[H_offset + 7];
|
||||
if (w < W - 1)
|
||||
res += input[H_offset + W_offset + 8];
|
||||
if (w > 0)
|
||||
res += input[H_offset - W_offset + 6];
|
||||
}
|
||||
if (h > 0) {
|
||||
res += input[1 - H_offset];
|
||||
if (w < W - 1)
|
||||
res += input[W_offset - H_offset + 2];
|
||||
if (w > 0)
|
||||
res += input[-1 * H_offset - W_offset];
|
||||
}
|
||||
if (w < W - 1)
|
||||
res += input[5 + W_offset];
|
||||
if (w > 0)
|
||||
res += input[3 - W_offset];
|
||||
output[tid] = max(res + bias[f], 0.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_merge_conv_3x3(
|
||||
T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
|
||||
const int N, const int H, const int W, const int F, const int N_offset,
|
||||
const int H_offset, const int W_offset, const int F_offset,
|
||||
const int out_N_offset, const int out_F_offset, const int out_H_offset,
|
||||
const int out_W_offset, const int num, const int act) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num) {
|
||||
int tmptid = tid;
|
||||
const int n = (tmptid / out_N_offset);
|
||||
tmptid -= n * out_N_offset;
|
||||
const int f = tmptid / out_F_offset;
|
||||
tmptid -= f * out_F_offset;
|
||||
const int h = tmptid / out_H_offset;
|
||||
tmptid -= h * out_H_offset;
|
||||
const int w = tmptid / out_W_offset;
|
||||
|
||||
const int noff = n * N_offset;
|
||||
const int hoff = h * H_offset;
|
||||
const int woff = w * W_offset;
|
||||
const int foff = f * F_offset;
|
||||
input += noff + foff + woff + hoff;
|
||||
T res = 0;
|
||||
res += input[4];
|
||||
|
||||
if (h < H - 1) {
|
||||
res += input[H_offset + 7];
|
||||
if (w < W - 1)
|
||||
res += input[H_offset + W_offset + 8];
|
||||
if (w > 0)
|
||||
res += input[H_offset - W_offset + 6];
|
||||
}
|
||||
if (h > 0) {
|
||||
res += input[1 - H_offset];
|
||||
if (w < W - 1)
|
||||
res += input[W_offset - H_offset + 2];
|
||||
if (w > 0)
|
||||
res += input[-1 * H_offset - W_offset];
|
||||
}
|
||||
if (w < W - 1)
|
||||
res += input[5 + W_offset];
|
||||
if (w > 0)
|
||||
res += input[3 - W_offset];
|
||||
if (act) {
|
||||
// output[tid] = max(res + bias[f], 0.f);
|
||||
// HACK: temperaly remove bias
|
||||
output[tid] = max(res, 0.f);
|
||||
} else {
|
||||
// output[tid] = res + bias[f];
|
||||
// HACK: temperaly remove bias
|
||||
output[tid] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void
|
||||
reduce_2(T *__restrict__ input, T *__restrict__ output, T *__restrict__ bias,
|
||||
const int N, const int F, const int H, const int W, const int N_offset,
|
||||
const int F_offset, const int H_offset, const int W_offset,
|
||||
const int out_N_offset, const int out_F_offset, const int out_H_offset,
|
||||
const int out_W_offset, const int num) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num) {
|
||||
int tmptid = tid;
|
||||
const int n = tmptid / out_N_offset;
|
||||
tmptid -= n * out_N_offset;
|
||||
const int f = tmptid / out_F_offset;
|
||||
tmptid -= f * out_F_offset;
|
||||
const int h = tmptid / out_H_offset;
|
||||
tmptid -= h * out_H_offset;
|
||||
const int w = tmptid / out_W_offset;
|
||||
|
||||
const int noff = n * N_offset;
|
||||
const int foff = f * F_offset * 4;
|
||||
const int hoff = h * H_offset;
|
||||
const int woff = w * W_offset;
|
||||
input += noff + foff + woff + hoff;
|
||||
T res = input[0];
|
||||
if (w != W - 1)
|
||||
res += input[F_offset * 2 + 3];
|
||||
if (h != H - 1) {
|
||||
res += input[F_offset + 3 * H_offset];
|
||||
if (w != W - 1)
|
||||
res += input[F_offset * 3 + 3 * H_offset + 3];
|
||||
}
|
||||
// output[tid] = max(res + bias[f], 0.f);
|
||||
// HACK: temperaly remove bias
|
||||
output[tid] = max(res, 0.f);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void reduceConvRxSToNCHWKernel(
|
||||
float *__restrict__ input, float *__restrict__ bias,
|
||||
float *__restrict__ output, const int act, const int n, const int f,
|
||||
const int h, const int w, const int oh, const int ow, const int r,
|
||||
const int s, const int ph, const int pw, const int dh, const int dw) {
|
||||
// input shape: (n, h, w, f, r, s)
|
||||
// output shape: (n, f, oh, ow)
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int out_N_offset = f * oh * ow, out_F_offset = oh * ow,
|
||||
out_H_offset = ow, out_W_offset = 1;
|
||||
const int num = out_N_offset * n;
|
||||
if (tid < num) {
|
||||
// output index
|
||||
int tmptid = tid;
|
||||
const int nid = (tmptid / out_N_offset);
|
||||
tmptid -= nid * out_N_offset;
|
||||
const int fid = tmptid / out_F_offset;
|
||||
tmptid -= fid * out_F_offset;
|
||||
const int hid = tmptid / out_H_offset;
|
||||
tmptid -= hid * out_H_offset;
|
||||
const int wid = tmptid / out_W_offset;
|
||||
|
||||
// Input index
|
||||
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
|
||||
nchunck = h * hchunk;
|
||||
float *__restrict__ nfinput = input + nid * nchunck + fid * fchunck;
|
||||
float imm = 0.0;
|
||||
const int ihst = hid, iwst = wid;
|
||||
for (int ri = 0; ri < r; ++ri) {
|
||||
for (int si = 0; si < s; ++si) {
|
||||
int ihid = ihst + (ri - r / 2) * dh;
|
||||
int iwid = iwst + (si - s / 2) * dw;
|
||||
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
|
||||
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
|
||||
si);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias) {
|
||||
imm += bias[fid];
|
||||
}
|
||||
if (act) {
|
||||
imm = imm > 0.0 ? imm : 0;
|
||||
}
|
||||
output[tid] = imm;
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
void hetConvToMMReduce(int n, int h, int w, int f, float *input, float *output,
|
||||
float *bias) {
|
||||
const int kBlockSize = 128;
|
||||
vector<int> in_params = {n, h, w, f}; // NHWF
|
||||
vector<int> out_params = {n, f, h, w};
|
||||
int in_base = 10;
|
||||
int out_base = 1;
|
||||
vector<int> in_offsets;
|
||||
vector<int> out_offsets;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
in_offsets.push_back(in_base);
|
||||
in_base *= in_params[3 - i];
|
||||
out_offsets.push_back(out_base);
|
||||
out_base *= out_params[3 - i];
|
||||
}
|
||||
reduce_merge_conv_3x3_1x1<float>
|
||||
<<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
|
||||
input, output, bias, in_params[0], in_params[1], in_params[2],
|
||||
in_params[3], in_offsets[3], in_offsets[2], in_offsets[1],
|
||||
in_offsets[0], out_offsets[3], out_offsets[2], out_offsets[1],
|
||||
out_offsets[0], out_base);
|
||||
}
|
||||
|
||||
void conv5x5ToConv3x3Reduce(int n, int f, int h, int w, float *input,
|
||||
float *output, float *bias) {
|
||||
const int kBlockSize = 128;
|
||||
vector<int> params{n, f, h, w}; // NFHW
|
||||
vector<int> ranges(4);
|
||||
ranges[3] = params[3] + 2;
|
||||
ranges[2] = params[2] + 2;
|
||||
ranges[1] = params[1] * 4;
|
||||
ranges[0] = params[0];
|
||||
|
||||
int in_base = 1;
|
||||
int out_base = 1;
|
||||
vector<int> in_offsets;
|
||||
vector<int> out_offsets;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
in_offsets.push_back(in_base);
|
||||
in_base *= ranges[3 - i];
|
||||
out_offsets.push_back(out_base);
|
||||
out_base *= params[3 - i];
|
||||
}
|
||||
reduce_2<float><<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
|
||||
input, output, bias, params[0], params[1], params[2], params[3],
|
||||
in_offsets[3], in_offsets[2], in_offsets[1], in_offsets[0],
|
||||
out_offsets[3], out_offsets[2], out_offsets[1], out_offsets[0],
|
||||
out_base);
|
||||
}
|
||||
|
||||
// [NHW,FRS] -> [NFHW]
|
||||
void conv3x3ToReduce(int n, int h, int w, int f, float *input, float *output,
|
||||
float *bias) {
|
||||
const int kBlockSize = 128;
|
||||
vector<int> in_params = {n, h, w, f}; // NHWF
|
||||
vector<int> out_params = {n, f, h, w};
|
||||
int in_base = 9;
|
||||
int out_base = 1;
|
||||
vector<int> in_offsets;
|
||||
vector<int> out_offsets;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
in_offsets.push_back(in_base);
|
||||
in_base *= in_params[3 - i];
|
||||
out_offsets.push_back(out_base);
|
||||
out_base *= out_params[3 - i];
|
||||
}
|
||||
reduce_merge_conv_3x3<float>
|
||||
<<<(out_base + kBlockSize - 1) / kBlockSize, kBlockSize>>>(
|
||||
input, output, bias, in_params[0], in_params[1], in_params[2],
|
||||
in_params[3], in_offsets[3], in_offsets[2], in_offsets[1],
|
||||
in_offsets[0], out_offsets[3], out_offsets[2], out_offsets[1],
|
||||
out_offsets[0], out_base, 0);
|
||||
}
|
||||
|
||||
void reduceConvRxSToNCHW(float *input, float *bias, float *output, int act,
|
||||
int n, int h, int w, int f, int r, int s, int oh,
|
||||
int ow, int ph, int pw, int sh, int sw, int dh,
|
||||
int dw) {
|
||||
IT_ASSERT(sh == 1 && sw == 1,
|
||||
"reduceConvRxSToNCHWKernel_kernel only support sh=sw=1");
|
||||
IT_ASSERT(dh == 1 && dw == 1,
|
||||
"reduceConvRxSToNCHWKernel_kernel only support dh=dw=1");
|
||||
const int blocksize = 512;
|
||||
const int gridsize = (n * f * oh * ow + blocksize - 1) / blocksize;
|
||||
|
||||
cudaStream_t stream(cudaStreamPerThread);
|
||||
reduceConvRxSToNCHWKernel<<<gridsize, blocksize, 0, stream>>>(
|
||||
input, bias, output, act, n, f, h, w, oh, ow, r, s, ph, pw, dh, dw);
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -4,10 +4,13 @@ namespace infini {
|
|||
class CopyCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto inData = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
|
||||
cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
|
||||
cudaMemcpyDeviceToDevice);
|
||||
// auto inData = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
// auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
|
||||
// cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
|
||||
// cudaMemcpyDeviceToDevice);
|
||||
|
||||
// HACK: optimization
|
||||
op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
|
||||
}
|
||||
};
|
||||
// reshape/flatten/identity all act as copying from input to output.
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
#include "operators/transpose.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_transpose.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class TransposeCuda : public CudaKernelWithoutConfig {
|
||||
void generic_transpose(const Ref<TransposeObj> &op,
|
||||
const RuntimeObj *context) const {
|
||||
auto input = op->getInputs(0);
|
||||
auto output = op->getOutput();
|
||||
void *const inputData = input->getRawDataPtr<void *>();
|
||||
void *const outputData = output->getRawDataPtr<void *>();
|
||||
const auto &inputShape = input->getDims();
|
||||
const auto &outputShape = output->getDims();
|
||||
|
||||
const auto &perm = op->getPermute();
|
||||
int size = input->size();
|
||||
int nDims = input->getDims().size();
|
||||
|
||||
// Compute strides
|
||||
SmallArray strides, buffer;
|
||||
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
|
||||
int curStride = 1;
|
||||
for (int i = nDims - 1; i >= 0; --i) {
|
||||
buffer.data[i] = curStride;
|
||||
curStride *= inputShape[i];
|
||||
}
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
strides.data[i] = buffer.data[perm[i]];
|
||||
}
|
||||
|
||||
SmallArray outputDims;
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
outputDims.data[i] = outputShape[i];
|
||||
}
|
||||
|
||||
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
|
||||
strides, outputDims, input->getDims(),
|
||||
output->getDims(), perm);
|
||||
}
|
||||
|
||||
void fast_transpose_last_dim(const Ref<TransposeObj> &op,
|
||||
const RuntimeObj *context) const {
|
||||
// Perm 0 2 3 1
|
||||
auto cuda = dynamic_cast<const CudaRuntimeObj *>(context);
|
||||
auto shape = op->getOutput()->getDims();
|
||||
invoke_transpose_last_two_dim(
|
||||
op->getInputs(0)->getRawDataPtr<float *>(),
|
||||
op->getOutput()->getRawDataPtr<float *>(), shape[0],
|
||||
shape[1] * shape[2], shape[3], cuda->getNumSMs());
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<TransposeObj>(_op);
|
||||
const auto &perm = op->getPermute();
|
||||
if (perm == vector{0, 2, 3, 1}) {
|
||||
fast_transpose_last_dim(op, _context);
|
||||
} else {
|
||||
generic_transpose(op, _context);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
|
||||
TransposeCuda, "Transpose_CUDA_Float32");
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,231 @@
|
|||
#include "core/common.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#define CUDA_HOST_DEVICE __forceinline__ __device__ __host__
|
||||
|
||||
// https://github.com/462630221/SampleCode
|
||||
template <typename T> struct QuotientMod {
|
||||
T quotient;
|
||||
T mod;
|
||||
__host__ __device__ QuotientMod(T q, T m) : quotient(q), mod(m) {}
|
||||
};
|
||||
|
||||
template <typename T> struct FastIntDivider {
|
||||
FastIntDivider() {}
|
||||
FastIntDivider(T d) { divisor_ = d; };
|
||||
__forceinline__ __device__ __host__ T div(T n) { return n / divisor_; }
|
||||
__forceinline__ __device__ __host__ T mod(T n) { return n % divisor_; }
|
||||
__forceinline__ __device__ __host__ QuotientMod<T> divmod(T n) {
|
||||
return QuotientMod<T>(n / divisor_, n % divisor_);
|
||||
}
|
||||
T divisor_;
|
||||
};
|
||||
|
||||
template <> struct FastIntDivider<uint32_t> {
|
||||
FastIntDivider(){};
|
||||
|
||||
FastIntDivider(uint32_t d) {
|
||||
assert(d >= 1);
|
||||
divisor_ = d;
|
||||
// if put 0 to __builtin_clz, the result undefined.
|
||||
if (d == 1) {
|
||||
rshift_ = 0;
|
||||
} else {
|
||||
rshift_ = 32 - __builtin_clz(d - 1);
|
||||
}
|
||||
uint64_t magic_t = ((1lu << (32 + rshift_)) + d - 1) / d;
|
||||
magic_ = uint32_t(magic_t);
|
||||
};
|
||||
|
||||
__forceinline__ __device__ __host__ uint32_t div(uint32_t n) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
uint32_t q = __umulhi(n, magic_);
|
||||
#else
|
||||
uint32_t q = (uint64_t(n) * magic_) >> 32;
|
||||
#endif
|
||||
// return (((n - q) >> 1) + q) >> (rshift_ - 1);
|
||||
return (n + q) >> rshift_;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ __host__ QuotientMod<uint32_t>
|
||||
divmod(uint32_t n) {
|
||||
uint32_t q = div(n);
|
||||
return QuotientMod<uint32_t>(q, n - divisor_ * q);
|
||||
}
|
||||
|
||||
uint32_t magic_;
|
||||
uint32_t rshift_;
|
||||
uint32_t divisor_;
|
||||
};
|
||||
|
||||
void test_fast_u32() {
|
||||
uint32_t d = 1;
|
||||
|
||||
FastIntDivider<uint32_t> diver(d);
|
||||
std::cout << "7/3= " << uint32_t(7) / uint32_t(d) << " " << diver.div(7)
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
constexpr unsigned int num_threads() { return 32 * 4; }
|
||||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _transpose_kernel(float *input, float *output, int nDims,
|
||||
int size, infini::SmallArray strides,
|
||||
infini::SmallArray outputShape) {
|
||||
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (outputIdx < size) {
|
||||
int inputIdx = 0;
|
||||
int v = outputIdx;
|
||||
for (int i = nDims - 1; i >= 0; --i) {
|
||||
inputIdx += v % outputShape.data[i] * strides.data[i];
|
||||
v /= outputShape.data[i];
|
||||
}
|
||||
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
|
||||
output[outputIdx] = __ldg(input + inputIdx);
|
||||
#else
|
||||
output[outputIdx] = input[inputIdx];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int NUM> struct Array {
|
||||
CUDA_HOST_DEVICE T &operator[](unsigned int index) { return data[index]; }
|
||||
CUDA_HOST_DEVICE const T &operator[](unsigned int index) const {
|
||||
return data[index];
|
||||
}
|
||||
CUDA_HOST_DEVICE constexpr int size() const { return NUM; }
|
||||
|
||||
CUDA_HOST_DEVICE Array() {
|
||||
#ifndef __CUDA_ARCH__
|
||||
for (int i = 0; i < NUM; i++) {
|
||||
data[i] = T();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
T data[NUM];
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Optimize : Reorganize
|
||||
*
|
||||
*/
|
||||
template <int NUM_AXES, int UNROLL, int BLOCK_SIZE, typename T>
|
||||
__global__ void
|
||||
transpose_kernel_v3(const T *data_in, T *data_out,
|
||||
const Array<uint32_t, NUM_AXES> perm_strides,
|
||||
Array<FastIntDivider<uint32_t>, NUM_AXES> out_strides,
|
||||
const size_t all_cnt) {
|
||||
uint32_t out_offset_reg[UNROLL];
|
||||
uint32_t in_offset_reg[UNROLL];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
out_offset_reg[i] =
|
||||
blockIdx.x * BLOCK_SIZE * UNROLL + threadIdx.x + i * BLOCK_SIZE;
|
||||
in_offset_reg[i] = 0;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < NUM_AXES; ++j) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
QuotientMod<uint32_t> d = out_strides[j].divmod(out_offset_reg[i]);
|
||||
in_offset_reg[i] += d.quotient * perm_strides[j];
|
||||
out_offset_reg[i] = d.mod;
|
||||
}
|
||||
}
|
||||
|
||||
T ld_reg[UNROLL];
|
||||
uint32_t offset = blockIdx.x * BLOCK_SIZE * UNROLL + threadIdx.x;
|
||||
if (offset + BLOCK_SIZE * UNROLL <= all_cnt) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
ld_reg[i] = data_in[in_offset_reg[i]];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
data_out[offset + i * BLOCK_SIZE] = ld_reg[i];
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
if (offset + i * BLOCK_SIZE < all_cnt) {
|
||||
ld_reg[i] = data_in[in_offset_reg[i]];
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; ++i) {
|
||||
if (offset + i * BLOCK_SIZE < all_cnt) {
|
||||
data_out[offset + i * BLOCK_SIZE] = ld_reg[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <typename T> T AccMul(std::vector<T> vec) {
|
||||
return std::accumulate(vec.begin(), vec.end(), T(1), std::multiplies<T>());
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
// void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
// SmallArray strides, SmallArray outputShape) {
|
||||
// int blocksize = block_work_size();
|
||||
// int gridsize = (size + block_work_size() - 1) / block_work_size();
|
||||
// _transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
|
||||
// strides, outputShape);
|
||||
// }
|
||||
|
||||
std::vector<uint32_t> GetStrides(std::vector<uint32_t> dims) {
|
||||
std::vector<uint32_t> strides(dims.size(), 1);
|
||||
for (int i = dims.size() - 2; i >= 0; --i) {
|
||||
strides[i] = strides[i + 1] * dims[i + 1];
|
||||
}
|
||||
return strides;
|
||||
}
|
||||
|
||||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
SmallArray _strides, SmallArray _outputShape,
|
||||
vector<int> _dims_in, vector<int> _dims_out,
|
||||
vector<int> _perms) {
|
||||
constexpr int NUM_AXES = 4;
|
||||
IT_ASSERT(nDims <= NUM_AXES);
|
||||
constexpr int UNROLL = 8 / sizeof(float);
|
||||
constexpr int BLOCK_SIZE = 128;
|
||||
|
||||
vector<uint32_t> dims_in, dims_out, perms;
|
||||
for (auto v : _dims_in)
|
||||
dims_in.push_back(v);
|
||||
for (auto v : _dims_out)
|
||||
dims_out.push_back(v);
|
||||
for (auto v : _perms)
|
||||
perms.push_back(v);
|
||||
|
||||
size_t all_cnt = AccMul(dims_in);
|
||||
|
||||
auto strides_in = GetStrides(dims_in);
|
||||
auto strides_out = GetStrides(dims_out);
|
||||
|
||||
const int grid =
|
||||
(all_cnt + BLOCK_SIZE * UNROLL - 1) / (BLOCK_SIZE * UNROLL);
|
||||
Array<uint32_t, NUM_AXES> perm_strides;
|
||||
Array<FastIntDivider<uint32_t>, NUM_AXES> out_strides_fast;
|
||||
for (int i = 0; i < NUM_AXES; ++i) {
|
||||
out_strides_fast[i] = FastIntDivider<uint32_t>(strides_out[i]);
|
||||
perm_strides[i] = strides_in[perms[i]];
|
||||
}
|
||||
|
||||
transpose_kernel_v3<NUM_AXES, UNROLL, BLOCK_SIZE, float>
|
||||
<<<grid, BLOCK_SIZE, 0>>>(
|
||||
input, output, perm_strides, out_strides_fast, all_cnt);
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,194 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
|
||||
template <int numSM, int numWarp>
|
||||
__global__ void kernel_transpose_last(float *ptrA, float *ptrB, int dim0,
|
||||
int dim1, int dim2) {
|
||||
int laneId = threadIdx.x % 32;
|
||||
int warpId = blockIdx.x * numWarp + threadIdx.x / 32;
|
||||
int n1 = (dim1 + 31) / 32;
|
||||
int n2 = (dim2 + 31) / 32;
|
||||
float bufA[32];
|
||||
for (int i = warpId; i < dim0 * n1 * n2; i += numSM * numWarp) {
|
||||
// clock_t ck0 = clock();
|
||||
int i0 = i / (n1 * n2);
|
||||
int i1 = (i % (n1 * n2)) / n2;
|
||||
int i2 = (i % (n1 * n2)) % n2;
|
||||
int offsetA = i0 * dim1 * dim2 + i2 * 32 * dim1 + i1 * 32;
|
||||
int offsetB = i0 * dim1 * dim2 + i1 * 32 * dim2 + i2 * 32;
|
||||
int ld1 = min(32, dim1 - i1 * 32);
|
||||
int ld2 = min(32, dim2 - i2 * 32);
|
||||
// if (i == 4 && laneId == 0)
|
||||
// printf("%d %d\n", ld1, ld2);
|
||||
|
||||
if (ld2 == 32) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
if ((laneId + i) % 32 < ld1) {
|
||||
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
|
||||
}
|
||||
}
|
||||
} else if (ld2 == 17) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 17; i++) {
|
||||
if ((laneId + i) % 32 < ld1) {
|
||||
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
|
||||
}
|
||||
}
|
||||
} else if (ld2 == 4) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if ((laneId + i) % 32 < ld1) {
|
||||
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < ld2; i++) {
|
||||
if ((laneId + i) % 32 < ld1) {
|
||||
bufA[i] = ptrA[offsetA + i * dim1 + (laneId + i) % 32];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (ld1 == 32) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
if ((i + 32 - laneId) % 32 < ld2) {
|
||||
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
|
||||
bufA[(i + 32 - laneId) % 32];
|
||||
}
|
||||
}
|
||||
} else if (ld1 == 17) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 17; i++) {
|
||||
if ((i + 32 - laneId) % 32 < ld2) {
|
||||
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
|
||||
bufA[(i + 32 - laneId) % 32];
|
||||
}
|
||||
}
|
||||
} else if (ld1 == 4) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if ((i + 32 - laneId) % 32 < ld2) {
|
||||
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
|
||||
bufA[(i + 32 - laneId) % 32];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < ld1; i++) {
|
||||
if ((i + 32 - laneId) % 32 < ld2) {
|
||||
ptrB[offsetB + i * dim2 + (i + 32 - laneId) % 32] =
|
||||
bufA[(i + 32 - laneId) % 32];
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
|
||||
/// @brief
|
||||
/// @param ptrA Input tensor of shape [dim0, dim2, dim1]
|
||||
/// @param ptrB Output tensor of shape [dim0, dim1, dim2]
|
||||
/// @param dim0
|
||||
/// @param dim1
|
||||
/// @param dim2
|
||||
void invoke_transpose_last_two_dim(float *ptrA, float *ptrB, int dim0, int dim1,
|
||||
int dim2, int numSMs) {
|
||||
constexpr int numWarps = 4;
|
||||
dim3 gridDim(numSMs, 1);
|
||||
dim3 blockDim(numWarps * 32, 1);
|
||||
if (numSMs == 80) { // V100
|
||||
kernel_transpose_last<80, numWarps>
|
||||
<<<gridDim, blockDim>>>(ptrA, ptrB, dim0, dim1, dim2);
|
||||
} else if (numSMs == 108) { // A100
|
||||
kernel_transpose_last<108, numWarps>
|
||||
<<<gridDim, blockDim>>>(ptrA, ptrB, dim0, dim1, dim2);
|
||||
} else {
|
||||
IT_TODO_HALT_MSG(std::string("transpose_last_two_dim with ") +
|
||||
std::to_string(numSMs) + " SMs is not implemented");
|
||||
}
|
||||
// cudaCheckError();
|
||||
}
|
||||
|
||||
} // namespace infini
// constexpr int numWarm = 128, numEval = 128;
//
// void eval_transpose_last(const std::vector<int> &shape) {
//     assert(shape.size() == 3);
//     int size = shape[0] * shape[1] * shape[2];
//     float *dataA, *dataB;
//     dataA = (float *)malloc(size * sizeof(float));
//     dataB = (float *)malloc(size * sizeof(float));
//     for (int i0 = 0; i0 < shape[0]; i0++) {
//         for (int i2 = 0; i2 < shape[2]; i2++) {
//             for (int i1 = 0; i1 < shape[1]; i1++) {
//                 dataA[i0 * shape[1] * shape[2] + i2 * shape[1] + i1] =
//                     i0 * shape[1] * shape[2] + i2 * shape[1] + i1;
//             }
//         }
//     }
//     float *ptrA, *ptrB;
//     checkCudaError(cudaMalloc(&ptrA, size * sizeof(float)));
//     checkCudaError(cudaMalloc(&ptrB, size * sizeof(float)));
//     checkCudaError(
//         cudaMemcpy(ptrA, dataA, size * sizeof(float),
//                    cudaMemcpyHostToDevice));
//
//     invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1], shape[2]);
//     checkCudaError(
//         cudaMemcpy(dataB, ptrB, size * sizeof(float),
//                    cudaMemcpyDeviceToHost));
//     for (int i0 = 0; i0 < shape[0]; i0++) {
//         for (int i1 = 0; i1 < shape[1]; i1++) {
//             for (int i2 = 0; i2 < shape[2]; i2++) {
//                 if (dataA[i0 * shape[1] * shape[2] + i1 + i2 * shape[1]] !=
//                     dataB[i0 * shape[1] * shape[2] + i1 * shape[2] + i2]) {
//                     std::cout
//                         << i0 << " " << i1 << " " << i2 << " "
//                         << dataA[i0 * shape[1] * shape[2] + i1 + i2 *
//                                  shape[1]]
//                         << " "
//                         << dataB[i0 * shape[1] * shape[2] + i1 * shape[2] +
//                                  i2]
//                         << std::endl;
//                     exit(-1);
//                 }
//             }
//         }
//     }
//     cudaEvent_t st, ed;
//     checkCudaError(cudaEventCreate(&st));
//     checkCudaError(cudaEventCreate(&ed));
//     for (int i = 0; i < numWarm; i++) {
//         invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1],
//                                       shape[2]);
//     }
//     checkCudaError(cudaEventRecord(st));
//     for (int i = 0; i < numEval; i++) {
//         invoke_transpose_last_two_dim(ptrA, ptrB, shape[0], shape[1],
//                                       shape[2]);
//     }
//     checkCudaError(cudaEventRecord(ed));
//     checkCudaError(cudaEventSynchronize(st));
//     checkCudaError(cudaEventSynchronize(ed));
//     float time;
//     checkCudaError(cudaEventElapsedTime(&time, st, ed));
//     float bandwidth = size * 2 * sizeof(float) * numEval / time / 1e6;
//     std::cout << "transpose_last: " << shape[0] << " " << shape[1] << " "
//               << shape[2] << " time: " << time / numEval
//               << " ms. bandwidth: " << bandwidth << " GB/s" << std::endl;
// }

// Performance evaluation
// int main() {
//     eval_transpose_last({16, 1024, 256});
//     eval_transpose_last({16, 14 * 14, 1024});
//     eval_transpose_last({16, 7 * 7, 2048});
//     eval_transpose_last({16, 7 * 7, 128});
//     eval_transpose_last({1, 14 * 14, 1024});
//     eval_transpose_last({1, 7 * 7, 2048});
//     eval_transpose_last({1, 7 * 7, 128});
// }

@ -24,9 +24,11 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

         cudnnTensorDescriptor_t inputDesc, outputDesc;
-        auto dim = op->getInputs(0)->getDims();
-        if (dim.size() != 4)
-            IT_TODO_HALT();
+        auto _dim = op->getInputs(0)->getDims();
+        IT_ASSERT_TODO(_dim.size() <= 4);
+        vector<int> dim(4, 1);
+        for (int i = 0; i < (int)_dim.size(); i++) // Unsqueeze to 4D
+            dim[i + 4 - _dim.size()] = _dim[i];
         int n = dim[0], c = dim[1], h = dim[2], w = dim[3];

         // get inputs
|
@ -0,0 +1,560 @@
|
|||
#ifdef USE_CUDA
|
||||
#include "core/blob.h"
|
||||
#include "core/dummy_mutator.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "core/search_engine.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "ffi/ffi_callback.h"
|
||||
#include "nnet/nmutator.h"
|
||||
#include "operators/G2BMM.h"
|
||||
#include "operators/GBMM.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/element_wise.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/reshape.h"
|
||||
#include "operators/softmax.h"
|
||||
#include "operators/transpose.h"
|
||||
#include "operators/unary.h"
|
||||
#include "test.h"
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Channel, kernelSize, pad, stride, isTanh
|
||||
using GANConfigs = vector<tuple<int, int, int, int, bool>>;
|
||||
using DetailedConfigs =
|
||||
vector<tuple<int, int, int, int, int, int, int, int, int, int, bool>>;
|
||||
|
||||
static const vector<int> metaRules = {3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90};
|
||||
|
||||
DetailedConfigs getGANConfigs(int id, int batch) {
|
||||
// The first conv can be transformed into gemm without reduction
|
||||
// n, f, h, w, c, r, s, stride,
|
||||
// pad, dilation
|
||||
GANConfigs ret;
|
||||
const DetailedConfigs infoConfigs = {
|
||||
{batch, 228, 1, 1, 448, 2, 2, 1, 0, 1, false},
|
||||
{batch, 448, 2, 2, 256, 4, 4, 2, 1, 1, false},
|
||||
{batch, 256, 4, 4, 128, 4, 4, 2, 1, 1, false},
|
||||
{batch, 128, 8, 8, 64, 4, 4, 2, 1, 1, false},
|
||||
{batch, 64, 16, 16, 3, 4, 4, 2, 1, 1, true}};
|
||||
const DetailedConfigs dcganConfigs = {
|
||||
{batch, 100, 1, 1, 512, 4, 4, 1, 0, 1, false},
|
||||
{batch, 512, 4, 4, 256, 4, 4, 2, 1, 1, false},
|
||||
{batch, 256, 8, 8, 128, 4, 4, 2, 1, 1, false},
|
||||
{batch, 128, 16, 16, 64, 4, 4, 2, 1, 1, false},
|
||||
{batch, 64, 32, 32, 3, 4, 4, 2, 1, 1, true}};
|
||||
DetailedConfigs details;
|
||||
if (id == 0) { // InfoGAN
|
||||
dbg("Use InfoGAN configs");
|
||||
details = infoConfigs;
|
||||
} else if (id == 1) { // DCGAN
|
||||
dbg("Use DCGAN configs");
|
||||
details = dcganConfigs;
|
||||
} else
|
||||
IT_ASSERT(false);
|
||||
return details;
|
||||
}
|
||||
|
||||
// NHWC format
|
||||
Graph getGANGraph(int batch, Runtime runtime, int nLayers, int modelId) {
|
||||
IT_ASSERT(1 <= nLayers && nLayers <= 5);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
vector<Tensor> weights;
|
||||
auto configs = getGANConfigs(modelId, batch);
|
||||
|
||||
Tensor input;
|
||||
{
|
||||
auto &[n, f, h, w, c, r, s, stride, pad, dilation, isTanh] = configs[0];
|
||||
input = g->addTensor({batch, 1, 1, f}, DataType::Float32,
|
||||
TensorType::Input);
|
||||
}
|
||||
for (int i = 0; i < (int)configs.size() && i < nLayers; ++i) {
|
||||
// auto [channel, kernelSize, pad, stride, tanh] = configs[i];
|
||||
auto &[n, f, h, w, c, r, s, stride, pad, dilation, isTanh] = configs[i];
|
||||
IT_ASSERT(input->getDims()[3] == f);
|
||||
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
if (isTanh) {
|
||||
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
|
||||
} else {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
// NHWC
|
||||
Graph getFSRCNNGraph(int batch, Runtime runtime) {
|
||||
// n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU
|
||||
const DetailedConfigs fsrcnn_config = {
|
||||
{batch, 1, 32, 32, 56, 5, 5, 1, 2, 1, true},
|
||||
{batch, 56, 32, 32, 12, 1, 1, 1, 0, 1, true},
|
||||
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
|
||||
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
|
||||
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, false},
|
||||
{batch, 12, 32, 32, 12, 3, 3, 1, 1, 1, true},
|
||||
{batch, 12, 32, 32, 56, 1, 1, 1, 0, 1, true},
|
||||
{batch, 56, 32, 32, 1, 9, 9, 1, 3, 4, false} // ConvTransNHWC
|
||||
// n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU
|
||||
};
|
||||
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
|
||||
Tensor input;
|
||||
{
|
||||
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
|
||||
fsrcnn_config[0];
|
||||
input = g->addTensor({batch, h, w, c}, DataType::Float32,
|
||||
TensorType::Input);
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int)fsrcnn_config.size() - 1; ++i) {
|
||||
// auto [channel, kernelSize, pad, stride, tanh] = configs[i];
|
||||
auto &[n, c, h, w, f, r, s, stride, pad, dilation, has_pReLU] =
|
||||
fsrcnn_config[i];
|
||||
IT_ASSERT(input->getDims()[3] == c);
|
||||
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvNHWCObj>(input, weight, nullptr, pad, pad, stride,
|
||||
stride, 1, 1)
|
||||
->getOutput();
|
||||
if (has_pReLU) {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
// last operator is a ConvTransNHWC
|
||||
{
|
||||
auto &[n, f, h, w, c, r, s, stride, pad, dilation, has_pReLU] =
|
||||
fsrcnn_config[fsrcnn_config.size() - 1];
|
||||
IT_ASSERT(input->getDims()[3] == f);
|
||||
auto weight = g->addTensor({f, r, s, c}, DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
Graph getLongformer(Runtime runtime, int bs) {
|
||||
const int seqlen = 10000, w = 1000, featlen = 512, heads = 8, d = 4;
|
||||
const int hidden = featlen, hiddenPerHead = hidden / heads;
|
||||
assert(hidden % heads == 0);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
|
||||
auto i0 = g->addTensor({bs, seqlen, featlen}, DataType::Float32,
|
||||
TensorType::Input);
|
||||
auto w0 = g->addTensor({featlen, hidden}, DataType::Float32,
|
||||
TensorType::Initialized);
|
||||
auto w1 =
|
||||
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
|
||||
auto w2 =
|
||||
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
|
||||
// Feed forward
|
||||
auto w3 =
|
||||
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
|
||||
auto bias3 =
|
||||
g->addTensor({512}, DataType::Float32, TensorType::Initialized);
|
||||
auto w4 =
|
||||
g->addTensor({512, 512}, DataType::Float32, TensorType::Initialized);
|
||||
auto bias4 =
|
||||
g->addTensor({512}, DataType::Float32, TensorType::Initialized);
|
||||
|
||||
auto q0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto k0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto v0 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
|
||||
auto q1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto k1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto v1 = g->addTensor({bs, seqlen, heads, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
|
||||
auto q2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto k2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto v2 = g->addTensor({bs, heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
|
||||
auto q3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto k3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto v3 = g->addTensor({bs * heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
|
||||
auto prob = g->addTensor({bs * heads, seqlen, 2 * w + 1}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto probSoftmax = g->addTensor({bs * heads, seqlen, 2 * w + 1},
|
||||
DataType::Float32, TensorType::Other);
|
||||
auto attn = g->addTensor({bs * heads, seqlen, hiddenPerHead},
|
||||
DataType::Float32, TensorType::Other);
|
||||
|
||||
auto t00 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto t01 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto t02 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
// auto t10 = g->addTensor({bs, seqlen, hidden});
|
||||
auto t11 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto t12 = g->addTensor({bs, seqlen, hidden}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
auto output = g->addTensor({bs, seqlen, featlen}, DataType::Float32,
|
||||
TensorType::Other);
|
||||
|
||||
g->addOpWithOutputs<MatmulObj>(i0, w0, q0, false, true);
|
||||
g->addOpWithOutputs<MatmulObj>(i0, w1, k0, false, true);
|
||||
g->addOpWithOutputs<MatmulObj>(i0, w2, v0, false, true);
|
||||
g->addOpWithOutputs<ReshapeObj>(q0, q1);
|
||||
g->addOpWithOutputs<ReshapeObj>(k0, k1);
|
||||
g->addOpWithOutputs<ReshapeObj>(v0, v1);
|
||||
// For example, when perm=(1, 0, 2), given an input tensor of shape (1,
|
||||
// 2, 3), the output shape will be (2, 1, 3).
|
||||
g->addOpWithOutputs<TransposeObj>(q1, q2, vector{0, 2, 1, 3});
|
||||
g->addOpWithOutputs<TransposeObj>(k1, k2, vector{0, 2, 1, 3});
|
||||
g->addOpWithOutputs<TransposeObj>(v1, v2, vector{0, 2, 1, 3});
|
||||
g->addOpWithOutputs<ReshapeObj>(q2, q3);
|
||||
g->addOpWithOutputs<ReshapeObj>(k2, k3);
|
||||
g->addOpWithOutputs<ReshapeObj>(v2, v3);
|
||||
// Attention
|
||||
g->addOpWithOutputs<G2BMMObj>(q3, k3, prob, w, d);
|
||||
g->addOpWithOutputs<SoftmaxObj>(prob, probSoftmax, 2);
|
||||
g->addOpWithOutputs<GBMMObj>(probSoftmax, v3, attn, d);
|
||||
auto attn2 = g->addOp<ReshapeObj>(attn, nullptr,
|
||||
vector{bs, heads, seqlen, hiddenPerHead})
|
||||
->getOutput();
|
||||
auto t000 =
|
||||
g->addOp<TransposeObj>(attn2, nullptr, vector{0, 2, 1, 3})->getOutput();
|
||||
g->addOpWithOutputs<ReshapeObj>(t000, t00);
|
||||
|
||||
// Feed forward
|
||||
g->addOpWithOutputs<MatmulObj>(t00, w3, t01, false, true, bias3);
|
||||
g->addOpWithOutputs<ReluObj>(t01, t02);
|
||||
g->addOpWithOutputs<MatmulObj>(t02, w4, t11, false, true, bias4);
|
||||
g->addOpWithOutputs<ReluObj>(t11, t12);
|
||||
g->addOpWithOutputs<AddObj>(t12, i0, output);
|
||||
return g;
|
||||
}
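
To make the permutation convention noted inside getLongformer concrete (output dimension i of TransposeObj takes input dimension perm[i]), here is a small hedged sketch; the graph `g`, shapes and tensor names are illustrative only, not taken from the diff.

// perm = {0, 2, 1, 3}: (bs, seqlen, heads, hiddenPerHead) -> (bs, heads, seqlen, hiddenPerHead)
auto x = g->addTensor({2, 10, 8, 64}, DataType::Float32, TensorType::Other);
auto y = g->addOp<TransposeObj>(x, nullptr, vector{0, 2, 1, 3})->getOutput();
// y->getDims() == vector<int>{2, 8, 10, 64}
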
|
||||
|
||||
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
|
||||
IT_ASSERT(0 <= layerId && layerId < 5);
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
vector<Tensor> weights;
|
||||
vector<tuple<int, int, int, int, bool>> cs{
|
||||
// Channel, kernelSize, pad, stride, isTanh
|
||||
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
|
||||
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
|
||||
};
|
||||
|
||||
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
|
||||
for (int i = layerId; i < layerId + 1; ++i) {
|
||||
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
|
||||
int f = input->getDims()[3]; // n, h, w, f
|
||||
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
|
||||
DataType::Float32,
|
||||
TensorType::Initialized); // f, r, s, c
|
||||
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
|
||||
pad, stride, stride, 1, 1)
|
||||
->getOutput();
|
||||
if (tanh) {
|
||||
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
|
||||
} else {
|
||||
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
void printGraph(Graph g) {
|
||||
g->print();
|
||||
puts("============ Data ============");
|
||||
for (auto t : g->getTensors()) {
|
||||
dbg(t);
|
||||
t->printData();
|
||||
}
|
||||
}
|
||||
|
||||
void initializeGraphTensors(Graph g, double l, double r, bool useInt) {
|
||||
g->dataMalloc();
|
||||
auto gen = RandomGenerator(-0.1, 0.1, 0, useInt);
|
||||
for (auto t : g->getInputs()) {
|
||||
t->setData(gen);
|
||||
}
|
||||
for (auto t : g->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
}
|
||||
|
||||
Graph convertNCHWtoNHWCModel(Runtime runtime, Graph inG) {
|
||||
// Construct new graph
|
||||
// IT_ASSERT(inG->getInputs().size() == 1);
|
||||
IT_ASSERT(inG->getOutputs().size() == 1);
|
||||
bool status = inG->topo_sort();
|
||||
IT_ASSERT(status);
|
||||
auto g = make_ref<GraphObj>(runtime);
|
||||
map<UidBaseType, Tensor> tensors;
|
||||
for (const auto &t : inG->getTensors())
|
||||
if (t->getDims().size() != 4)
|
||||
return nullptr;
|
||||
auto getTensor = [&g, &tensors](const Tensor &inTensor) {
|
||||
auto uid = inTensor->getGuid();
|
||||
if (auto it = tensors.find(uid); it == tensors.end()) {
|
||||
Shape s = inTensor->getDims();
|
||||
s = vector{s[0], s[2], s[3], s[1]};
|
||||
tensors[uid] = g->addTensor(s, inTensor->getDType(),
|
||||
inTensor->getTensorType());
|
||||
}
|
||||
return tensors[uid];
|
||||
};
|
||||
for (auto op : inG->getOperators()) {
|
||||
TensorVec inputs, outputs;
|
||||
for (auto &t : op->getInputs())
|
||||
inputs.emplace_back(getTensor(t));
|
||||
for (auto &t : op->getOutputs())
|
||||
outputs.emplace_back(getTensor(t));
|
||||
if (auto cOp = as<ConvObj>(op)) {
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = cOp->getPadStrideDilation();
|
||||
auto bias =
|
||||
cOp->getBias() ? g->cloneTensor(cOp->getBias()) : nullptr;
|
||||
g->addOpWithOutputs<ConvNHWCObj>(inputs[0], inputs[1], outputs[0],
|
||||
ph, pw, sh, sw, dh, dw, bias,
|
||||
cOp->getAct());
|
||||
} else if (const auto &cOp = as<ConvTransposed2dObj>(op)) {
|
||||
const auto &[ph, pw, sh, sw, dh, dw] = cOp->getPadStrideDilation();
|
||||
const auto &[oph, opw] = cOp->getOutputPadding();
|
||||
auto group = cOp->getNumGroups();
|
||||
auto bias =
|
||||
cOp->getBias() ? g->cloneTensor(cOp->getBias()) : nullptr;
|
||||
g->addOpWithOutputs<ConvTransposed2dNHWCObj>(
|
||||
inputs[0], inputs[1], outputs[0], ph, pw, sh, sw, dh, dw, oph,
|
||||
opw, group, bias, cOp->getAct());
|
||||
} else if (const auto &cOp = as<MaxPoolObj>(op)) {
|
||||
auto t = g->addOp<ReshapeObj>(inputs[0], nullptr,
|
||||
cOp->getInputs(0)->getDims())
|
||||
->getOutput();
|
||||
auto tt = g->addTensor(cOp->getOutput()->getDims(),
|
||||
cOp->getOutput()->getDType());
|
||||
g->cloneOperator(op, {t}, {tt});
|
||||
g->addOpWithOutputs<ReshapeObj>(tt, outputs[0]);
|
||||
} else {
|
||||
dbg(op);
|
||||
g->cloneOperator(op, inputs, outputs);
|
||||
}
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
Graph optimizeModelWithRules(Graph g, Runtime _runtime, vector<int> rules) {
|
||||
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||
// make_ref<NMutator>(NMutator::Mode::RuleBased, metaRules, runtime);
|
||||
Ref<NMutator> mutator =
|
||||
make_ref<NMutator>(NMutator::Mode::RuleBased, rules, runtime);
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(runtime, mutator);
|
||||
g->dataFree();
|
||||
return searchEngine.run(g);
|
||||
}
|
||||
|
||||
Graph optimizeModel(Graph g, Runtime _runtime, string name) {
|
||||
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||
Ref<NMutator> mutator = make_ref<NMutator>(NMutator::Mode::Normal, runtime);
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(runtime, mutator);
|
||||
g->dataFree();
|
||||
return searchEngine.run(g);
|
||||
}
|
||||
|
||||
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
|
||||
vector<int> rules) {
|
||||
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
// vector<int>{3, 2, 2, 5, 8, 8, 6, 90}); // Conv2gemm
|
||||
// vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90}); // TConv
|
||||
Ref<NMutator> mutator;
|
||||
if (mode == NMutator::Mode::Normal) {
|
||||
dbg(mode);
|
||||
mutator = make_ref<NMutator>(mode, runtime);
|
||||
} else if (mode == NMutator::Mode::RuleBased) {
|
||||
dbg(mode, rules);
|
||||
IT_ASSERT_TODO(rules.size() > 0);
|
||||
mutator = make_ref<NMutator>(mode, rules, runtime);
|
||||
} else
|
||||
IT_TODO_HALT();
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(runtime, mutator);
|
||||
g->dataFree();
|
||||
return searchEngine.run(g);
|
||||
|
||||
bestGraphs.emplace_back(searchEngine.run(g));
|
||||
g->topo_sort();
|
||||
dbg(g, bestGraphs[0], bestGraphs.size());
|
||||
g->print();
|
||||
|
||||
g->dataMalloc();
|
||||
map<UidBaseType, Tensor> fuidToInputTensor;
|
||||
for (auto t : g->getInputs()) {
|
||||
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
|
||||
fuidToInputTensor[t->getFuid()] = t;
|
||||
}
|
||||
|
||||
auto gen = RandomGenerator(-0.1, 0.1, 0);
|
||||
for (auto t : g->getInputs()) {
|
||||
t->setData(gen);
|
||||
}
|
||||
for (auto t : g->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
runtime->run(g);
|
||||
// dbg("Baseline graph");
|
||||
// printGraph(g);
|
||||
// dbg(runtime->getPerfTime(g, true));
|
||||
g->dataFree();
|
||||
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
auto bestGraphCpu = bestGraphs[i];
|
||||
auto bestGraph =
|
||||
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
|
||||
bestGraph->topo_sort();
|
||||
|
||||
// bestGraph->dataMalloc();
|
||||
// // Initialize inputs with random data
|
||||
// for (auto t : bestGraph->getInputs()) {
|
||||
// t->copyData(fuidToInputTensor[t->getFuid()]);
|
||||
// }
|
||||
|
||||
// // Initialize outputs with zeros
|
||||
// for (auto t : bestGraph->getOutputs()) {
|
||||
// t->setData(ZeroGenerator());
|
||||
// }
|
||||
|
||||
// dbg(bestGraph);
|
||||
// dbg(bestGraph->getOutputs());
|
||||
|
||||
// if (tuning) {
|
||||
// runtime->run(bestGraph, true); // Tune kernels
|
||||
// runtime->run(bestGraph, false); // Execute transformed graph
|
||||
|
||||
// // FIXME: g is freed
|
||||
// auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
// auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
// dbg(go0->equalData(bgo0, 1e-3));
|
||||
// dbg(runtime->getPerfTime(bestGraph, true));
|
||||
// dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||
// // dbg(runtime->timeWithCudaGraph(bestGraph));
|
||||
// }
|
||||
|
||||
// dbg("Best graph");
|
||||
// printGraph(bestGraph);
|
||||
return bestGraph;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Graph optimizeWithDepthConstraint(Graph g, Runtime _runtime, int maxDepth) {
|
||||
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
Ref<NMutator> mutator = make_ref<NMutator>(NMutator::Mode::Normal, runtime);
|
||||
mutator->setMaxDepth(maxDepth);
|
||||
g->dataFree();
|
||||
SearchEngine searchEngine(runtime, mutator);
|
||||
searchEngine.searchFilter = 1;
|
||||
return searchEngine.run(g);
|
||||
}
|
||||
|
||||
vector<Tensor> runInfoGAN(int nLayers) {
|
||||
auto cuda = make_ref<CudaRuntimeObj>();
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
|
||||
Graph g = getGANGraph(1, cuda, nLayers, 0);
|
||||
|
||||
auto mutator =
|
||||
make_ref<NMutator>(NMutator::Mode::RuleBased,
|
||||
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
|
||||
// // Translate OP to membound without derivation
|
||||
// mutator->setToNaiveMembound();
|
||||
|
||||
vector<Graph> bestGraphs;
|
||||
SearchEngine searchEngine(cuda, mutator);
|
||||
bestGraphs.emplace_back(searchEngine.run(g));
|
||||
g->topo_sort();
|
||||
dbg(g, bestGraphs[0], bestGraphs.size());
|
||||
g->print();
|
||||
|
||||
g->dataMalloc();
|
||||
map<UidBaseType, Tensor> fuidToInputTensor;
|
||||
for (auto t : g->getInputs()) {
|
||||
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
|
||||
fuidToInputTensor[t->getFuid()] = t;
|
||||
}
|
||||
|
||||
auto gen = RandomGenerator(-0.1, 0.1, 0);
|
||||
// auto gen = RandomGenerator(-5, 5, 0, true);
|
||||
for (auto t : g->getInputs()) {
|
||||
t->setData(gen);
|
||||
}
|
||||
for (auto t : g->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
cuda->run(g);
|
||||
dbg("Baseline graph");
|
||||
printGraph(g);
|
||||
dbg(cuda->getPerfTime(g, true));
|
||||
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
auto bestGraphCpu = bestGraphs[i];
|
||||
auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
|
||||
bestGraph->topo_sort();
|
||||
|
||||
bestGraph->dataMalloc();
|
||||
// Initialize inputs with random data
|
||||
for (auto t : bestGraph->getInputs()) {
|
||||
t->copyData(fuidToInputTensor[t->getFuid()]);
|
||||
}
|
||||
|
||||
// Initialize outputs with zeros
|
||||
for (auto t : bestGraph->getOutputs()) {
|
||||
t->setData(ZeroGenerator());
|
||||
}
|
||||
|
||||
dbg(bestGraph);
|
||||
dbg(bestGraph->getOutputs());
|
||||
|
||||
cuda->run(bestGraph, true); // Tune kernels
|
||||
cuda->run(bestGraph, false); // Execute transformed graph
|
||||
|
||||
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
|
||||
bgo0->printData();
|
||||
go0->printData();
|
||||
dbg(cuda->getPerfTime(bestGraph, true));
|
||||
|
||||
dbg("Best graph");
|
||||
printGraph(bestGraph);
|
||||
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
|
||||
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
#endif
|
|
@ -12,12 +12,11 @@ void MatchMemBoundKernel::transform(Formula &origin, int depth, Expr &rCur) {
|
|||
const auto &inputs = InputVisitor().getInputs(rangeOp);
|
||||
auto source =
|
||||
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
|
||||
auto tensor =
|
||||
makeTensor(newTensorName(), rangeOp->getOutputShape(), {}, source);
|
||||
auto tensor = mT(newTensorName(), rangeOp->getOutputShape(), {}, source);
|
||||
// The original code directly appends the candidate, but it seems this
// should be done by the search.
|
||||
// appendCanddiate(as<TensorNode>(tensor), depth);
|
||||
nextStep(origin, depth, rCur, tensor);
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -38,11 +38,10 @@ void Rule3StageSplit::transform(Formula &origin, int depth, Expr &rCur) {
|
|||
|
||||
// if no sum iterator, the stage is redundant
|
||||
assert(!innerSumVars.empty());
|
||||
auto inner =
|
||||
makeRangeOperator(innerLoopVars, innerSumVars, cur->getSummand());
|
||||
auto inner = mL(innerLoopVars, innerSumVars, cur->getSummand());
|
||||
auto subscriptedInner = make_ref<SubscriptNode>(inner, indexForInner);
|
||||
auto outer = makeRangeOperator(cur->getLoopVarRanges(), outerSumVars,
|
||||
subscriptedInner);
|
||||
auto outer =
|
||||
mL(cur->getLoopVarRanges(), outerSumVars, subscriptedInner);
|
||||
outer->setPaddings(cur->getPaddings());
|
||||
|
||||
// next searching step
|
||||
|
@ -79,4 +78,4 @@ Rule3StageSplit::getSplitSummationIters(RangeOp rangeOp) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -25,8 +25,8 @@ void Rule6KenerlMatching::transform(Formula &origin, int depth, Expr &rCur) {
|
|||
}
|
||||
{ // Match element-wise OP
|
||||
auto replaces = matchElementWise(cur);
|
||||
if (!replaces.empty())
|
||||
dbg(rCur);
|
||||
// if (!replaces.empty())
|
||||
// dbg(rCur);
|
||||
for (auto newCur : replaces)
|
||||
nextStep(origin, depth, rCur, newCur);
|
||||
}
|
||||
|
@ -50,8 +50,8 @@ VecExpr Rule6KenerlMatching::matchElementWise(const RangeOp &rangeOp) {
|
|||
const auto &inputs = InputVisitor().getInputs(rangeOp);
|
||||
auto source =
|
||||
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
|
||||
auto newTensor = makeTensor(newTensorName(), newShape, {}, source);
|
||||
auto newTensor = mT(newTensorName(), newShape, {}, source);
|
||||
return {newTensor};
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -265,10 +265,9 @@ Expr Rule8GuidedDLT::guidedDLTMoreVar2(const RangeOp &cur,
|
|||
const auto sourceRoutine = make_ref<ElementWiseNode>(
|
||||
sourceExpr, vector<Tensor>{originalTensor}, newShape);
|
||||
// build stage connections
|
||||
const auto newTensor =
|
||||
makeTensor(newTensorName(), newShape, {}, sourceRoutine);
|
||||
const auto &newSub = makeSubscript(
|
||||
newTensor, VecExpr(tensorDimAxes.begin(), tensorDimAxes.end()));
|
||||
const auto newTensor = mT(newTensorName(), newShape, {}, sourceRoutine);
|
||||
const auto &newSub =
|
||||
mSub(newTensor, VecExpr(tensorDimAxes.begin(), tensorDimAxes.end()));
|
||||
// TODO [1124]: get variable mapping and reorder L according to it
|
||||
// dbg(cur, originalSub, newSub, newVarRanges, replace.toReadable(),
|
||||
// tensorDimAxes, newShape);
|
||||
|
@ -311,7 +310,7 @@ Expr Rule8GuidedDLT::buildGuidedDLTSource(const Subscript &originalSub,
|
|||
vector<VarRangePair> loopVarRangePairs;
|
||||
for (size_t i = 0; i < tensorDimAxes.size(); ++i)
|
||||
loopVarRangePairs.emplace_back(tensorDimAxes[i], pair(0, newShape[i]));
|
||||
return makeRangeOperator(loopVarRangePairs, {}, newSub);
|
||||
return mL(loopVarRangePairs, {}, newSub);
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -47,8 +47,8 @@ Rule90TwoStageElementWise::matchTwoStageElementWise(const RangeOp &rangeOp) {
|
|||
const auto &inputs = InputVisitor().getInputs(rangeOp);
|
||||
auto source =
|
||||
make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
|
||||
auto newTensor = makeTensor(newTensorName(), newShape, {}, source);
|
||||
auto newTensor = mT(newTensorName(), newShape, {}, source);
|
||||
return {newTensor};
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
|
|||
oss << "==> ROOT\n" << root->toReadable() << "\n";
|
||||
for (size_t i = 0; i < q.size(); ++i) {
|
||||
const auto &[name, routine, tensor] = q[i];
|
||||
oss << "==> " << name << " : ";
|
||||
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
|
||||
<< " : ";
|
||||
if (routine) {
|
||||
oss << routine->toReadable() << "\n";
|
||||
if (routine->getExpr()) {
|
||||
|
|
|
@ -45,27 +45,26 @@ VecExpr MatmulTransposeMutator::transpose(const Tensor &tensor) {
|
|||
auto _va = make_ref<VarNode>("transA");
|
||||
auto _vb = make_ref<VarNode>("transB");
|
||||
auto _vc = make_ref<VarNode>("swapAB");
|
||||
auto fakeSub = makeSubscript(matmul->getExpr(), {_va, _vb});
|
||||
auto fakeRangeWrapperForHackHash =
|
||||
makeRangeOperator({{_va, {0, Atrans + 100}},
|
||||
{_vb, {0, Btrans + 100}},
|
||||
{_vc, {0, ABswap + 100}}},
|
||||
{}, fakeSub);
|
||||
auto fakeSub = mSub(matmul->getExpr(), {_va, _vb});
|
||||
auto fakeRangeWrapperForHackHash = mL({{_va, {0, Atrans + 100}},
|
||||
{_vb, {0, Btrans + 100}},
|
||||
{_vc, {0, ABswap + 100}}},
|
||||
{}, fakeSub);
|
||||
Matmul newMatmul =
|
||||
make_ref<MatmulNode>(fakeRangeWrapperForHackHash, inputs[0],
|
||||
inputs[1], b, m, n, k, transa, transb);
|
||||
auto newTensor = makeTensor(derivator.newTensorName(), newShape,
|
||||
newPaddings, newMatmul);
|
||||
auto newTensor =
|
||||
mT(derivator.newTensorName(), newShape, newPaddings, newMatmul);
|
||||
// build output transpose
|
||||
if (ABswap) {
|
||||
vector<Var> vars{derivator.getNewVar(), derivator.getNewVar()};
|
||||
auto sub = makeSubscript(newTensor, {vars[1], vars[0]});
|
||||
auto sub = mSub(newTensor, {vars[1], vars[0]});
|
||||
vector<VarRangePair> loopVRs;
|
||||
// Since the inputs array may be swapped, use the original tensor shape
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
loopVRs.emplace_back(vars[i], Range(0, tensor->getShape(i)));
|
||||
}
|
||||
auto rangeOp = makeRangeOperator(loopVRs, {}, sub);
|
||||
auto rangeOp = mL(loopVRs, {}, sub);
|
||||
ret.emplace_back(rangeOp);
|
||||
} else
|
||||
ret.emplace_back(newTensor);
|
||||
|
@ -85,8 +84,8 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
|
|||
assert(!rangeOp->hasPaddings());
|
||||
// auto paddings = rangeOp->getPaddings();
|
||||
// std::swap(paddings[0], paddings[1]);
|
||||
auto sub = makeSubscript(rangeOp, {loopVRs[1].first, loopVRs[0].first});
|
||||
auto newRangeOp = makeRangeOperator(loopVRs, {}, sub);
|
||||
auto sub = mSub(rangeOp, {loopVRs[1].first, loopVRs[0].first});
|
||||
auto newRangeOp = mL(loopVRs, {}, sub);
|
||||
// ElementWise newElementWise = make_ref<ElementWiseNode>(*ew);
|
||||
auto outputShape = ew->getOutputShape();
|
||||
std::swap(outputShape[0], outputShape[1]);
|
||||
|
@ -97,8 +96,8 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
|
|||
auto tensorPaddings = tensor->getPaddings();
|
||||
std::swap(tensorShape[0], tensorShape[1]);
|
||||
std::swap(tensorPaddings[0], tensorPaddings[1]);
|
||||
ret = makeTensor(derivator.newTensorName(), tensorShape, tensorPaddings,
|
||||
newElementWise);
|
||||
ret = mT(derivator.newTensorName(), tensorShape, tensorPaddings,
|
||||
newElementWise);
|
||||
// } else if (!tensor->getSource()) {
|
||||
} else {
|
||||
nnet_unimplemented_continue();
|
||||
|
@ -107,4 +106,4 @@ optional<Tensor> MatmulTransposeMutator::transposeInput(const Tensor &tensor) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -45,6 +45,8 @@ Expr MergeMemboundMutator::merge(bool allowEmptyMembound, bool allowFailure) {
|
|||
curExpr = sub->getObjectPtr();
|
||||
else
|
||||
break;
|
||||
} else if (auto funcOp = as<BinaryOpNode>(summand)) {
|
||||
break;
|
||||
} else {
|
||||
if (allowFailure)
|
||||
return nullptr;
|
||||
|
@ -143,4 +145,4 @@ Expr MergeMemboundMutator::rule4StageMerging(Expr &rCur,
|
|||
return merged;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -32,8 +32,7 @@ RangeOp PatternMatcher::getOffsetCur() {
|
|||
}
|
||||
auto newSummand = ReplaceKit::replaceMultipleExprs(
|
||||
originalCur->getSummand(), itersFromNonZero, psis);
|
||||
return makeRangeOperator(newLoopVarRanges, originalCur->getSumVarRanges(),
|
||||
newSummand);
|
||||
return mL(newLoopVarRanges, originalCur->getSumVarRanges(), newSummand);
|
||||
}
|
||||
|
||||
VecExpr PatternMatcher::matchKernel(const Pattern &pattern,
|
||||
|
@ -106,9 +105,9 @@ VecExpr PatternMatcher::applyWrapper(const VecExpr &exprs) {
|
|||
}
|
||||
}
|
||||
for (auto &expr : exprs) {
|
||||
auto newSub = makeSubscript(expr, indexes);
|
||||
ret.emplace_back(makeRangeOperator(originalCur->getLoopVarRanges(), {},
|
||||
newSub, originalCur->getPaddings()));
|
||||
auto newSub = mSub(expr, indexes);
|
||||
ret.emplace_back(mL(originalCur->getLoopVarRanges(), {}, newSub,
|
||||
originalCur->getPaddings()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -73,17 +73,52 @@ string Serializer::visit_(const Tensor &c) {
|
|||
return key;
|
||||
}
|
||||
|
||||
bool Serializer::serialize(const Expr &expr, const string &filePath,
|
||||
const string &msg) {
|
||||
string Serializer::visit_(const Func &c) {
|
||||
const string key = std::to_string(id++);
|
||||
j[key]["type"] = c->getType();
|
||||
j[key]["funcType"] = c->getFuncType();
|
||||
j[key]["object"] = dispatch(c->getObject());
|
||||
return key;
|
||||
}
|
||||
|
||||
std::optional<std::string> Serializer::toString(const Expr &expr,
|
||||
const string &msg,
|
||||
vector<Tensor> inputs,
|
||||
double exec_time, string hint) {
|
||||
// Metadata
|
||||
j["Version"] = VERSION;
|
||||
j["Msg"] = msg;
|
||||
j["exec_time"] = exec_time;
|
||||
j["hint"] = hint;
|
||||
// Expressions and routines
|
||||
id = 0;
|
||||
dispatch(expr);
|
||||
std::ofstream fout(filePath);
|
||||
fout << std::setw(4) << j << std::endl;
|
||||
return true;
|
||||
|
||||
// Input tensors
|
||||
vector<string> inputsIndices;
|
||||
for (const auto &tensor : inputs) {
|
||||
inputsIndices.emplace_back(std::to_string(id));
|
||||
dispatch(tensor);
|
||||
}
|
||||
j["nnetInputs"] = inputsIndices;
|
||||
|
||||
// To string
|
||||
std::stringstream ss;
|
||||
ss << std::setw(4) << j << std::endl;
|
||||
return {ss.str()};
|
||||
}
|
||||
|
||||
bool Serializer::toFile(const Expr &expr, const string &filePath,
|
||||
const string &msg, vector<Tensor> inputs,
|
||||
double exec_time, string hint) {
|
||||
if (auto s = toString(expr, msg, inputs, exec_time, hint)) {
|
||||
// Write to file
|
||||
std::ofstream fout(filePath);
|
||||
fout << *s;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
string Serializer::dispatchRoutine(const Routine &c) {
|
||||
|
@ -129,7 +164,15 @@ string Serializer::dispatchRoutine(const Routine &c) {
|
|||
return key;
|
||||
}
|
||||
|
||||
Expr Serializer::deserialize(const string &filePath) {
|
||||
Expr Serializer::fromString(const string &text) {
|
||||
std::stringstream str;
|
||||
str << text;
|
||||
str >> j;
|
||||
assert(j["Version"] == VERSION);
|
||||
return buildExprTree("0");
|
||||
}
|
||||
|
||||
Expr Serializer::fromFile(const string &filePath) {
|
||||
std::ifstream fin(filePath);
|
||||
fin >> j;
|
||||
assert(j["Version"] == VERSION);
|
||||
|
@ -160,7 +203,7 @@ Expr Serializer::buildExprTree(string key) {
|
|||
}
|
||||
auto summand = buildExprTree(j[key]["summand"]);
|
||||
auto paddings = j[key]["paddings"].get<std::vector<int>>();
|
||||
auto rangeOp = makeRangeOperator(loopIters, sumIters, summand);
|
||||
auto rangeOp = mL(loopIters, sumIters, summand);
|
||||
rangeOp->setPaddings(paddings);
|
||||
return rangeOp;
|
||||
}
|
||||
|
@ -180,6 +223,10 @@ Expr Serializer::buildExprTree(string key) {
|
|||
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
|
||||
j[key]["paddings"], source);
|
||||
}
|
||||
case NodeType::FuncNodeType: {
|
||||
auto object = buildExprTree(j[key]["object"]);
|
||||
return make_ref<FuncNode>(object, j[key]["funcType"]);
|
||||
}
|
||||
default: {
|
||||
nnet_unimplemented_halt();
|
||||
break;
|
||||
|
@ -242,4 +289,25 @@ Routine Serializer::buildRoutine(string key) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
tuple<Expr, vector<Tensor>, double, string>
|
||||
Serializer::deserializeAsMemobundOp(const string &filePath) {
|
||||
std::ifstream fin(filePath);
|
||||
fin >> j;
|
||||
assert(j["Version"] == VERSION);
|
||||
vector<Tensor> inputs;
|
||||
for (const auto &input : j["nnetInputs"])
|
||||
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
|
||||
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
|
||||
}
|
||||
|
||||
tuple<Expr, vector<Tensor>, double, string>
|
||||
Serializer::membundOpFromString(const string &data) {
|
||||
j = json::parse(data);
|
||||
assert(j["Version"] == VERSION);
|
||||
vector<Tensor> inputs;
|
||||
for (const auto &input : j["nnetInputs"])
|
||||
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
|
||||
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
|
||||
}
|
||||
|
||||
} // namespace nnet
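
The new string-based entry points make it possible to round-trip an expression without touching the filesystem. A minimal hedged sketch follows; the RangeOp `range` and its nnet input tensors `A` and `B` are assumed to exist already, and the argument values are illustrative.

nnet::Serializer serializer;
// Serialize an existing expression together with its recorded inputs.
auto text = serializer.toString(range, "matmul example", {A, B},
                                /*exec_time=*/0.0, /*hint=*/"");
if (text) {
    // Rebuild the expression tree from the JSON string.
    nnet::Expr restored = serializer.fromString(*text);
    // membundOpFromString(*text) additionally recovers the recorded inputs,
    // exec_time and hint as a tuple.
}
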
|
||||
|
|
|
@ -136,9 +136,10 @@ void Derivator::dfs(Formula &origin, int depth) {
|
|||
}
|
||||
|
||||
Derivator::Derivator(int maxDepth, bool enableHashPruning, LogMode logMode,
|
||||
PassMode passMode)
|
||||
PassMode passMode, bool printAndExit)
|
||||
: maxDepth(maxDepth), logMode(logMode), passMode(passMode),
|
||||
enableHashPruning(enableHashPruning), cntAppliedRules(12) {}
|
||||
enableHashPruning(enableHashPruning), cntAppliedRules(12),
|
||||
printAndExit(printAndExit) {}
|
||||
|
||||
int Derivator::getNumIntermediateStates() { return cntStates; }
|
||||
|
||||
|
@ -405,6 +406,8 @@ Expr Derivator::mergeMemboundStages(VecExpr stages) {
|
|||
void Derivator::appendCanddiate(const Tensor &tensor, int depth) {
|
||||
// if (!CountRoutineVisitor().match(tensor, 1, 0, 3))
|
||||
// return;
|
||||
if (intermediateStates.size() > 1 && printAndExit)
|
||||
printDerivationRules();
|
||||
|
||||
candidates.emplace_back(tensor, depth);
|
||||
// dbg("!!!!!!!!!!!!!!!Success!!!!!!!!!!!!!!!");
|
||||
|
@ -478,6 +481,7 @@ void Derivator::printStatistics() {
|
|||
printf("#Hashed intermediate states = %lu\n", visited.size());
|
||||
printf("#Iteratos = %d\n", nIteratorNames);
|
||||
printf("#Tensors = %d\n", nTensorNames);
|
||||
printf("#Print and Exit mode = %d\n", printAndExit);
|
||||
}
|
||||
|
||||
void Derivator::setDumpFirstSuccess(const string &_logFnPrefix) {
|
||||
|
@ -490,6 +494,9 @@ void Derivator::printIntermediateStates() {
|
|||
// Skip in NoLog mode
|
||||
if (logMode == LogMode::NoLog)
|
||||
return;
|
||||
if (intermediateStates.size() > 1 && printAndExit)
|
||||
printDerivationRules();
|
||||
|
||||
assert(intermediateStates.size() == ruleStates.size());
|
||||
assert(intermediateStates.size() == ruleMsgs.size());
|
||||
for (size_t i = 0; i < intermediateStates.size(); ++i) {
|
||||
|
@ -499,16 +506,17 @@ void Derivator::printIntermediateStates() {
|
|||
std::cout << FullPrinterVisitor().print(intermediateStates[i]) << endl;
|
||||
if (logMode == LogMode::DumpFristCandiate) {
|
||||
Serializer serializer;
|
||||
serializer.serialize(intermediateStates[i],
|
||||
logFnPrefix + to_string(i) + ".expr", msg);
|
||||
serializer.toFile(intermediateStates[i],
|
||||
logFnPrefix + to_string(i) + ".expr", msg);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < intermediateStates.size(); ++i) {
|
||||
if (auto cur = as<RangeOpNode>(intermediateStates[i]))
|
||||
if (CheckOOBVisitor().checkRangeOp(cur)) {
|
||||
printf("OOB detected depth=%lu\n", i);
|
||||
}
|
||||
}
|
||||
// FIXME
|
||||
// for (size_t i = 0; i < intermediateStates.size(); ++i) {
|
||||
// if (auto cur = as<RangeOpNode>(intermediateStates[i]))
|
||||
// if (CheckOOBVisitor().checkRangeOp(cur)) {
|
||||
// printf("OOB detected depth=%lu\n", i);
|
||||
// }
|
||||
// }
|
||||
if (logMode == LogMode::DumpFristCandiate) {
|
||||
puts("Serialization finished.");
|
||||
exit(0);
|
||||
|
@ -528,4 +536,23 @@ Derivator::PassMode Derivator::getPassMode() { return passMode; }
|
|||
|
||||
Derivator::LogMode Derivator::getLogMode() { return logMode; }
|
||||
|
||||
void Derivator::printDerivationRules() {
|
||||
int cntRules = 0, cntNonGuideRules = 0;
|
||||
bool startGuided = false;
|
||||
std::cout << ruleStates.size() << " rules" << std::endl;
|
||||
for (size_t i = 1; i < ruleStates.size(); ++i) {
|
||||
int ruleId = ruleStates[i][4] - '0';
|
||||
if (ruleId != 4)
|
||||
++cntRules;
|
||||
if (ruleId == 8)
|
||||
startGuided = true;
|
||||
if (!startGuided && ruleId != 4)
|
||||
++cntNonGuideRules;
|
||||
}
|
||||
printf("#Steps w/o converging derivation %d, #Steps w/ converging "
|
||||
"derivation %d\n",
|
||||
cntRules, cntNonGuideRules);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
|
|
|
@ -60,7 +60,7 @@ optional<Expr> DLT::apply(const RangeOp &rangeOp, const Subscript &subscript,
|
|||
// Maybe there are bugs...
|
||||
// assert(index != nullptr);
|
||||
if (index == nullptr) {
|
||||
std::cout << "Warning empty" << std::endl;
|
||||
// std::cout << "Warning empty" << std::endl;
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
@ -83,12 +83,11 @@ optional<Expr> DLT::apply(const RangeOp &rangeOp, const Subscript &subscript,
|
|||
// HACK [important] fix this fake tensor.
|
||||
auto elementRoutine = make_ref<ElementWiseNode>(
|
||||
// FIXME: implement transpose
|
||||
// makeTensor(newTensorName + "_DLT", {}), vector<Tensor>{tensor},
|
||||
// mT(newTensorName + "_DLT", {}), vector<Tensor>{tensor},
|
||||
// shape0);
|
||||
makeTensor("__DLT", {}), vector<Tensor>{tensor}, shape0);
|
||||
auto dltedTensor =
|
||||
makeTensor(newTensorName, shape0, dltedPaddings, elementRoutine);
|
||||
auto dltedSubscript = makeSubscript(dltedTensor, index0);
|
||||
mT("__DLT", {}), vector<Tensor>{tensor}, shape0);
|
||||
auto dltedTensor = mT(newTensorName, shape0, dltedPaddings, elementRoutine);
|
||||
auto dltedSubscript = mSub(dltedTensor, index0);
|
||||
return optional<Expr>(std::in_place, dltedSubscript);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "nnet/expr.h"
|
||||
#include "nnet/Visitor/FullPrinterVisitor.h"
|
||||
#include "nnet/Visitor/GetTensorsVisitor.h"
|
||||
|
||||
namespace nnet {
|
||||
|
@ -367,19 +368,19 @@ Expr operator/(const Expr &lhs, const int rhs) {
|
|||
}
|
||||
|
||||
// Wrappers for type deduction
|
||||
Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts) {
|
||||
Subscript mSub(const Expr &tensor, const VecExpr &subscripts) {
|
||||
return make_ref<SubscriptNode>(tensor, subscripts);
|
||||
}
|
||||
|
||||
RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
|
||||
const vector<VarRangePair> &_sumIters, Expr _summand,
|
||||
const vector<int> &paddings) {
|
||||
RangeOp mL(const vector<VarRangePair> &_loopIters,
|
||||
const vector<VarRangePair> &_sumIters, Expr _summand,
|
||||
const vector<int> &paddings) {
|
||||
return make_ref<RangeOpNode>(_loopIters, _sumIters, _summand, paddings);
|
||||
}
|
||||
|
||||
// Wrappers for type deduction
|
||||
Tensor makeTensor(const string &name, const vector<int> &shape,
|
||||
const vector<int> &paddings, const Routine &source) {
|
||||
Tensor mT(const string &name, const vector<int> &shape,
|
||||
const vector<int> &paddings, const Routine &source) {
|
||||
if (paddings.size() == 0)
|
||||
return make_ref<TensorNode>(name, shape,
|
||||
vector<int>((int)shape.size(), 0), source);
|
||||
|
@ -463,4 +464,9 @@ void FuncNode::setObject(Expr e) {
|
|||
object = e;
|
||||
}
|
||||
|
||||
string RangeOpNode::getFullExpression() {
|
||||
FullPrinterVisitor printer;
|
||||
return printer.print(this->shared_from_this());
|
||||
}
|
||||
|
||||
} // namespace nnet
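
The shortened helpers make hand-built expressions considerably more compact. Below is a hedged sketch of how a plain matmul expression might now be written with mSub and mL; it mirrors the MatmulPattern code further down, and the sizes are illustrative only.

using namespace nnet;
const int M = 8, N = 8, K = 16;
auto m = make_ref<VarNode>("m");
auto n = make_ref<VarNode>("n");
auto k = make_ref<VarNode>("k");
auto A = make_ref<TensorNode>("A", vector<int>({M, K}));
auto B = make_ref<TensorNode>("B", vector<int>({N, K}));
auto subA = mSub(A, {m, k}); // A[m, k]
auto subB = mSub(B, {n, k}); // B[n, k]
// Loop over m, n; sum over k: the classic matmul range operator.
auto range = mL({{m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}}, subA * subB);
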
|
||||
|
|
|
@ -296,10 +296,9 @@ const Pattern &MatmulPattern::getMatmulPattern() {
|
|||
auto k = make_ref<VarNode>("_Matmul_k");
|
||||
auto A = make_ref<TensorNode>("_Matmul_A", vector<int>({M, K}));
|
||||
auto B = make_ref<TensorNode>("_Matmul_B", vector<int>({N, K}));
|
||||
auto subA = makeSubscript(A, {m, k});
|
||||
auto subB = makeSubscript(B, {n, k});
|
||||
auto range = makeRangeOperator({{m, {0, M}}, {n, {0, N}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
auto subA = mSub(A, {m, k});
|
||||
auto subB = mSub(B, {n, k});
|
||||
auto range = mL({{m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}}, subA * subB);
|
||||
auto success = exprIT.analyzeExpr(range);
|
||||
assert(success);
|
||||
exprIT.buildTable({0, 1});
|
||||
|
@ -317,11 +316,10 @@ const Pattern &ConvPattern::getPattern() {
|
|||
// auto n = make_ref<VarNode>("_Matmul_n");
|
||||
auto A = make_ref<TensorNode>("_Conv_A", vector<int>({N, C, H, W}));
|
||||
auto B = make_ref<TensorNode>("_Conv_K", vector<int>({F, C, R, S}));
|
||||
auto subA = makeSubscript(A, {n, c, h + r, w + s});
|
||||
auto subB = makeSubscript(B, {f, c, r, s});
|
||||
auto range = makeRangeOperator(
|
||||
{{n, {0, 0}}, {f, {0, 0}}, {h, {0, 0}}, {w, {0, 0}}},
|
||||
{{c, {0, 0}}, {r, {0, 0}}, {s, {0, 0}}}, subA * subB);
|
||||
auto subA = mSub(A, {n, c, h + r, w + s});
|
||||
auto subB = mSub(B, {f, c, r, s});
|
||||
auto range = mL({{n, {0, 0}}, {f, {0, 0}}, {h, {0, 0}}, {w, {0, 0}}},
|
||||
{{c, {0, 0}}, {r, {0, 0}}, {s, {0, 0}}}, subA * subB);
|
||||
auto success = exprIT.analyzeExpr(range);
|
||||
assert(success);
|
||||
exprIT.buildTable({0, 1});
|
||||
|
@ -350,7 +348,7 @@ Expr ConvPattern::buildExpr(
|
|||
auto shape = conv->getShape();
|
||||
auto rangeOpShape = as<RangeOpNode>(expr)->getOutputShape();
|
||||
assert(shape.size() == rangeOpShape.size());
|
||||
dbg(shape, rangeOpShape);
|
||||
// dbg(shape, rangeOpShape);
|
||||
for (size_t i = 0; i < shape.size(); ++i) {
|
||||
if (shape[i] != rangeOpShape[i]) {
|
||||
dbg("Warning: unmatched Conv output", shape, rangeOpShape);
|
||||
|
@ -404,11 +402,10 @@ const Pattern &Sg2bmmPattern::getPattern() {
|
|||
// auto n = make_ref<VarNode>("_Matmul_n");
|
||||
auto A = make_ref<TensorNode>("_Sg2bmm_A", vector<int>{Batch, M, K});
|
||||
auto B = make_ref<TensorNode>("_Sg2bmm_B", vector<int>{Batch, M, K});
|
||||
auto subA = makeSubscript(A, {b, m, k});
|
||||
auto subB = makeSubscript(B, {b, m + w, k});
|
||||
auto range =
|
||||
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {w, {-W, W + 1}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
auto subA = mSub(A, {b, m, k});
|
||||
auto subB = mSub(B, {b, m + w, k});
|
||||
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {w, {-W, W + 1}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
auto success = exprIT.analyzeExpr(range);
|
||||
assert(success);
|
||||
exprIT.buildTableWithDefaultMap();
|
||||
|
@ -458,11 +455,10 @@ const Pattern &LongformerGBMMPattern::getPattern() {
|
|||
auto A =
|
||||
make_ref<TensorNode>("_lo_A", vector<int>{Batch, M, 2 * W + 1});
|
||||
auto B = make_ref<TensorNode>("_lo_B", vector<int>{Batch, M, N});
|
||||
auto subA = makeSubscript(A, {b, m, w});
|
||||
auto subB = makeSubscript(B, {b, m + w, n});
|
||||
auto range =
|
||||
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, M}}},
|
||||
{{w, {-W, W + 1}}}, subA * subB);
|
||||
auto subA = mSub(A, {b, m, w});
|
||||
auto subB = mSub(B, {b, m + w, n});
|
||||
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, M}}},
|
||||
{{w, {-W, W + 1}}}, subA * subB);
|
||||
auto success = exprIT.analyzeExpr(range);
|
||||
assert(success);
|
||||
exprIT.buildTableWithDefaultMap();
|
||||
|
@ -536,11 +532,10 @@ Expr ConvPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W, int F,
|
|||
DEFINE_VAR(f);
|
||||
DEFINE_VAR(r);
|
||||
DEFINE_VAR(s);
|
||||
auto subA = makeSubscript(A, {n, c, h + r - R / 2, w + s - S / 2});
|
||||
auto subB = makeSubscript(K, {f, c, r, s});
|
||||
auto range =
|
||||
makeRangeOperator({{n, {0, N}}, {f, {0, F}}, {h, {0, H}}, {w, {0, W}}},
|
||||
{{c, {0, C}}, {r, {0, R}}, {s, {0, S}}}, subA * subB);
|
||||
auto subA = mSub(A, {n, c, h + r - R / 2, w + s - S / 2});
|
||||
auto subB = mSub(K, {f, c, r, s});
|
||||
auto range = mL({{n, {0, N}}, {f, {0, F}}, {h, {0, H}}, {w, {0, W}}},
|
||||
{{c, {0, C}}, {r, {0, R}}, {s, {0, S}}}, subA * subB);
|
||||
return range;
|
||||
}
|
||||
|
||||
|
@ -572,13 +567,13 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
|
|||
// vector<int>{0, padding, padding, 0});
|
||||
// auto K = make_ref<TensorNode>("K", vector<int>({R, S, F, C}));
|
||||
|
||||
auto subA = makeSubscript(A, {n, x1 + r - 1, y1 + s - 1, f});
|
||||
auto subA = mSub(A, {n, x1 + r - 1, y1 + s - 1, f});
|
||||
auto subK =
|
||||
// makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
|
||||
makeSubscript(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
|
||||
// mSub(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
|
||||
mSub(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
|
||||
// x1=(h+1)//2, x2=(h+1)%2, y1=(w+1)//2
|
||||
|
||||
auto range1 = makeRangeOperator(
|
||||
auto range1 = mL(
|
||||
{
|
||||
{n, {0, N}},
|
||||
{c, {0, C}},
|
||||
|
@ -588,10 +583,10 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
|
|||
{y2, {0, 2}},
|
||||
},
|
||||
{{f, {0, F}}, {r, {0, R / 2}}, {s, {0, S / 2}}}, subA * subK);
|
||||
auto sub0 = makeSubscript(
|
||||
auto sub0 = mSub(
|
||||
range1, {n, c, (h + 1) / 2, (h + 1) % 2, (w + 1) / 2, (w + 1) % 2});
|
||||
auto range0 = makeRangeOperator(
|
||||
{{n, {0, N}}, {h, {0, OH}}, {w, {0, OW}}, {c, {0, C}}}, {}, sub0);
|
||||
auto range0 =
|
||||
mL({{n, {0, N}}, {h, {0, OH}}, {w, {0, OW}}, {c, {0, C}}}, {}, sub0);
|
||||
return range0;
|
||||
}
|
||||
|
||||
|
@ -606,11 +601,10 @@ pair<Expr, pair<Tensor, Tensor>> Sg2bmmPattern::getExpr(int Batch, int M, int K,
|
|||
auto B = make_ref<TensorNode>("B", vector<int>({Batch, M, K}),
|
||||
vector<int>{0, D * W, 0});
|
||||
|
||||
auto subA = makeSubscript(A, {b, m, k});
|
||||
auto subB = makeSubscript(B, {b, m + D * (w - W), k});
|
||||
auto range =
|
||||
makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {w, {0, 2 * W + 1}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
auto subA = mSub(A, {b, m, k});
|
||||
auto subB = mSub(B, {b, m + D * (w - W), k});
|
||||
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {w, {0, 2 * W + 1}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
return {range, {A, B}};
|
||||
}
|
||||
|
||||
|
@ -624,10 +618,10 @@ LongformerGBMMPattern::getExpr(int Batch, int M, int W, int K, int dilation) {
|
|||
vector<int>{0, 0, 0});
|
||||
auto B = make_ref<TensorNode>("B", vector<int>({Batch, M, K}),
|
||||
vector<int>{0, dilation * W, 0});
|
||||
auto subA = makeSubscript(A, {b, m, w});
|
||||
auto subB = makeSubscript(B, {b, m + dilation * w - dilation * W, n});
|
||||
auto range = makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, K}}},
|
||||
{{w, {0, 2 * W + 1}}}, subA * subB);
|
||||
auto subA = mSub(A, {b, m, w});
|
||||
auto subB = mSub(B, {b, m + dilation * w - dilation * W, n});
|
||||
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, K}}},
|
||||
{{w, {0, 2 * W + 1}}}, subA * subB);
|
||||
return {range, {A, B}};
|
||||
}
|
||||
|
||||
|
@ -642,10 +636,10 @@ pair<Expr, pair<Tensor, Tensor>> MatmulPattern::getExpr(bool transA,
|
|||
vector<int>{0, 0, 0});
|
||||
auto B = make_ref<TensorNode>("B", vector<int>({Batch, K, N}),
|
||||
vector<int>{0, 0, 0});
|
||||
auto subA = makeSubscript(A, {b, m, k});
|
||||
auto subB = makeSubscript(B, {b, k, n});
|
||||
auto range = makeRangeOperator({{b, {0, Batch}}, {m, {0, M}}, {n, {0, N}}},
|
||||
{{k, {0, K}}}, subA * subB);
|
||||
auto subA = mSub(A, {b, m, k});
|
||||
auto subB = mSub(B, {b, k, n});
|
||||
auto range = mL({{b, {0, Batch}}, {m, {0, M}}, {n, {0, N}}}, {{k, {0, K}}},
|
||||
subA * subB);
|
||||
return {range, {A, B}};
|
||||
}
|
||||
|
||||
|
|
src/nnet/nmutator.cc: 1157 lines changed (file diff suppressed because it is too large)
|
@ -17,8 +17,8 @@ RangeOp ReplaceKit::replaceRangeOpIterator(const RangeOp &rangeOp,
|
|||
replace.oldIters.size() +
|
||||
replace.newIters.size());
|
||||
// Check the number of loop iterators
|
||||
return makeRangeOperator(newVarRangePairs, rangeOp->getSumVarRanges(),
|
||||
replacedSummand);
|
||||
return mL(newVarRangePairs, rangeOp->getSumVarRanges(),
|
||||
replacedSummand);
|
||||
} else if (replace.iteratorType == IterationType::Sum) {
|
||||
for (const auto &[var, range] : rangeOp->getSumVarRanges()) {
|
||||
if (!replace.isReplaced(var))
|
||||
|
@ -27,8 +27,8 @@ RangeOp ReplaceKit::replaceRangeOpIterator(const RangeOp &rangeOp,
|
|||
assert(newVarRangePairs.size() == rangeOp->getSumVarRanges().size() -
|
||||
replace.oldIters.size() +
|
||||
replace.newIters.size());
|
||||
return makeRangeOperator(rangeOp->getLoopVarRanges(), newVarRangePairs,
|
||||
replacedSummand, rangeOp->getPaddings());
|
||||
return mL(rangeOp->getLoopVarRanges(), newVarRangePairs,
|
||||
replacedSummand, rangeOp->getPaddings());
|
||||
}
|
||||
assert(false);
|
||||
return nullptr;
|
||||
|
@ -55,7 +55,7 @@ Subscript ReplaceKit::buildSubscirptForLoopVarReplace(const RangeOp &inner,
|
|||
// } else
|
||||
// subs.emplace_back(inner->getLoopVar(i));
|
||||
// }
|
||||
return makeSubscript(inner, subs);
|
||||
return mSub(inner, subs);
|
||||
}
|
||||
|
||||
RangeOp
|
||||
|
@ -89,4 +89,4 @@ Expr ReplaceKit::replaceExpr(const Expr &cur, const Expr &pattern,
|
|||
return ret;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
namespace nnet {
|
||||
|
||||
int matchExprResult(Derivator &derivator, string fn) {
|
||||
auto ans = Serializer().deserialize(fn);
|
||||
auto ans = Serializer().fromFile(fn);
|
||||
auto hashAns = HashVisitor()(ans);
|
||||
int match = 0;
|
||||
for (const auto &candidate : derivator.getCandidates()) {
|
||||
|
@ -19,14 +19,14 @@ int matchExprResult(Derivator &derivator, string fn) {
|
|||
bool checkExprLogSame(string fnPrefix, int start, int end) {
|
||||
Serializer serializer;
|
||||
string fn0 = fnPrefix + to_string(start) + ".expr";
|
||||
Expr expr0 = serializer.deserialize(fn0);
|
||||
Expr expr0 = serializer.fromFile(fn0);
|
||||
RangeOp range0 = as<RangeOpNode>(expr0);
|
||||
Interpreter interpreter(range0);
|
||||
auto ans0 = interpreter.interpretUniformSample(range0);
|
||||
dbg(expr0, ans0);
|
||||
for (int i = start + 1; i < end; ++i) {
|
||||
string fn1 = fnPrefix + to_string(i) + ".expr";
|
||||
Expr expr1 = serializer.deserialize(fn1);
|
||||
Expr expr1 = serializer.fromFile(fn1);
|
||||
RangeOp range1 = as<RangeOpNode>(expr1);
|
||||
dbg(fn1, expr1);
|
||||
auto ans1 = interpreter.interpretUniformSample(range1);
|
||||
|
@ -67,4 +67,4 @@ bool checkExprsEquvivalence(VecExpr exprs) {
|
|||
return true;
|
||||
}
|
||||
|
||||
} // namespace nnet
|
||||
} // namespace nnet
|
||||
|
|
|
@@ -0,0 +1,74 @@
#include "operators/any.h"

namespace infini {

AnyObj::AnyObj(GraphObj *graph, const TensorVec &inputs,
               const TensorVec &outputs, const string &kernelName,
               const vector<int> &attr)
    : OperatorObj(OpType::Any, inputs, outputs), kernelName(kernelName),
      attr(attr) {
    IT_ASSERT(checkValid(graph));
    // Outputs must be assigned when constructing an AnyObj
    IT_ASSERT(!outputs.empty());
    for (auto &output : outputs)
        IT_ASSERT(output != nullptr && output->size() > 0);
}

string AnyObj::toString() const {
    std::ostringstream os;
    os << "Any[" << getGuid() << "](";
    for (size_t i = 0; i < inputs.size(); ++i) {
        os << "i" << i << "=" << inputs[i]->getGuid();
        if (i != inputs.size() - 1)
            os << " ";
    }
    os << ", ";
    for (size_t i = 0; i < outputs.size(); ++i) {
        os << "o" << i << "=" << outputs[i]->getGuid();
        if (i != outputs.size() - 1)
            os << " ";
    }
    os << ", ";
    os << "kernel name: " << kernelName << ", ";
    os << "attr = [";
    for (size_t i = 0; i < attr.size(); ++i) {
        os << attr[i];
        if (i != attr.size() - 1)
            os << ", ";
    }
    os << "])\n";
    return os.str();
}

optional<vector<Shape>> AnyObj::inferShape(const TensorVec &inputs) const {
    vector<Shape> ret;
    for (auto output : outputs) {
        ret.emplace_back(output->getDims());
    }
    return ret;
}

const string AnyObj::getKernelName() const { return kernelName; }

vector<int> AnyObj::getOpAttrVector() const { return attr; };

vector<int> AnyObj::getWorkloadVector() const {
    vector<int> ret = {};
    for (auto &input : inputs) {
        auto inputDims = input->getDims();
        ret.insert(ret.end(), inputDims.begin(), inputDims.end());
    }
    for (auto &output : outputs) {
        auto outputDims = output->getDims();
        ret.insert(ret.end(), outputDims.begin(), outputDims.end());
    }
    for (auto c : kernelName) {
        ret.emplace_back(c);
    }
    for (auto at : attr) {
        ret.emplace_back(at);
    }
    return ret;
}

} // namespace infini
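AnyObj is an escape hatch: it carries an opaque kernel name plus an integer attribute vector and relies entirely on its pre-built outputs for shape information. A hedged usage sketch, modeled on the cuda_Any test added later in this diff (g, in, weight, and out are assumed to be an existing Graph and Tensors; the attribute layout shown is specific to the conv2dreduce kernel):

    // Outputs are never inferred, so they must be allocated up front.
    string kernelName = "conv2dreduce_kernel";
    vector<int> attr = {/* PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw */};
    auto any = g->addOpWithOutputs<AnyObj>(TensorVec{in, weight},
                                           TensorVec{out}, kernelName, attr);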
@@ -21,9 +21,8 @@ string ConvBaseObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
    os << "(";
-    if (inputs.size() == 2) {
-        os << vecToString(inputs[0]->getDims()) << ",";
-        os << vecToString(inputs[1]->getDims()) << ",";
+    for (auto &input : inputs) {
+        os << vecToString(input->getDims()) << ",";
    }
    os << "p=[" << ph << "," << pw << "],";
    os << "s=[" << sh << "," << sw << "],";

@@ -114,6 +113,75 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
    return {{{on, oc, oh, ow}}};
}

void ConvNHWCObj::setAuxilaryAttributes(PaddingMode mode) {
    const Tensor &input = inputs[0];
    const Tensor &weight = inputs[1];
    n = input->getDims()[0], c = input->getDims()[3], h = input->getDims()[1],
    w = input->getDims()[2], f = weight->getDims()[0], r = weight->getDims()[1],
    s = weight->getDims()[2];
    if (mode == PaddingMode::Same) {
        int oh = h / sh;
        int ow = w / sw;
        ph = (h - oh * sh + (r - sh) * dh) / 2;
        pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (mode == PaddingMode::Valid) {
        ph = pw = 0;
    }
}

ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight,
                         Tensor output, int ph, int pw, int sh, int sw, int dh,
                         int dw, Tensor bias, ActType act)
    : ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, ph, pw, sh, sw, dh,
                  dw, input, weight, act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(PaddingMode::Other);
    IT_ASSERT(checkValid(graph));
}

ConvNHWCObj::ConvNHWCObj(GraphObj *graph, Tensor input, Tensor weight,
                         Tensor output, PaddingMode mode, int sh, int sw,
                         int dh, int dw, Tensor bias, ActType act)
    : ConvBaseObj(OpType::ConvNHWC, {input, weight}, output, mode, sh, sw, dh,
                  dw, input, weight, act) {
    if (bias)
        IT_TODO_HALT();
    setAuxilaryAttributes(mode);
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> ConvNHWCObj::inferShape(const TensorVec &inputs) const {
    const auto &input = inputs[0], &weight = inputs[1];
    auto n = input->getDims()[0];
    auto h = input->getDims()[1];
    auto w = input->getDims()[2];
    auto f = weight->getDims()[0];
    auto r = weight->getDims()[1];
    auto s = weight->getDims()[2];
    int on = n, oc = f;
    int oh = 0, ow = 0;
    // For the NHWC + FRSC layout, C of the input must be divisible by C of the weight
    if (input->getDims()[3] % weight->getDims()[3] != 0)
        return {};
    // Set padding size
    if (padding == PaddingMode::Other) {
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    } else if (padding == PaddingMode::Same) {
        oh = h / sh;
        ow = w / sw;
        // ph = (h - oh * sh + (r - sh) * dh) / 2;
        // pw = (w - ow * sw + (s - sw) * dw) / 2;
    } else if (padding == PaddingMode::Valid) {
        int ph = 0;
        int pw = 0;
        oh = (h - (r - sh) * dh + ph * 2) / sh;
        ow = (w - (s - sw) * dw + pw * 2) / sw;
    }
    return {{{on, oh, ow, oc}}};
}

ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
                                         Tensor weight, Tensor output, int ph,
                                         int pw, int sh, int sw, int dh, int dw,
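For PaddingMode::Other the output extent follows the integer formula in ConvNHWCObj::inferShape above. A hedged worked example of that arithmetic, using the sizes from the cuDNN NHWC test added later in this diff (input 1x4x4x3, weight 2x3x3x3, ph = pw = 1, sh = 2, sw = 1, dh = 1, dw = 2); this is only a standalone check, not part of the library:

    #include <cstdio>

    int main() {
        int h = 4, w = 4, r = 3, s = 3;                 // input and kernel spatial sizes
        int ph = 1, pw = 1, sh = 2, sw = 1, dh = 1, dw = 2;
        int oh = (h - (r - sh) * dh + ph * 2) / sh;     // (4 - 1 + 2) / 2 = 2
        int ow = (w - (s - sw) * dw + pw * 2) / sw;     // (4 - 4 + 2) / 1 = 2
        std::printf("oh=%d ow=%d\n", oh, ow);           // output NHWC shape {1, 2, 2, 2}
    }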
@@ -0,0 +1,98 @@
#include "operators/conv2dreduce.h"

namespace infini {

Conv2dReduceBase::Conv2dReduceBase(OpType opType, Tensor input, Tensor bias_,
                                   Tensor output, bool PReLU_, float paramReLU_,
                                   int ph_, int pw_, int sh_, int sw_, int dh_,
                                   int dw_)
    : OperatorObj(opType, {input}, {output}), bias(bias_), ph(ph_), pw(pw_),
      sh(sh_), sw(sw_), dh(dh_), dw(dw_), PReLU(PReLU_), paramReLU(paramReLU_) {
    // Expected input shape: (n, h, w, f, r, s)
    auto inputShape = input->getDims();
    IT_ASSERT(inputShape.size() == 6);
    n = inputShape[0];
    h = inputShape[1];
    w = inputShape[2];
    f = inputShape[3];
    r = inputShape[4];
    s = inputShape[5];

    if (bias) {
        auto biasShape = bias->getDims();
        IT_ASSERT(biasShape.size() == 1);
        IT_ASSERT(biasShape[0] == f);
    }
}

std::string Conv2dReduceBase::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
    os << "(";
    if (inputs.size() == 2) {
        os << vecToString(inputs[0]->getDims()) << ",";
        os << vecToString(inputs[1]->getDims()) << ",";
    } else {
        os << vecToString(inputs[0]->getDims()) << ",";
    }
    os << "p=[" << ph << "," << pw << "],";
    os << "s=[" << sh << "," << sw << "],";
    os << "d=[" << dh << "," << dw << "],";
    os << "PReLU=" << (PReLU ? "true" : "false") << ",";
    // os << "act=" << enum_to_underlying(act) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    if (bias != nullptr) {
        os << "bias=" << bias->getGuid() << ",";
    }
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

std::vector<int> Conv2dReduceBase::getWorkloadVector() const {
    return {enum_to_underlying(type), n, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
}

std::vector<int> Conv2dReduceBase::getOpAttrVector() const {
    return {enum_to_underlying(type), ph, pw, sh, sw, dh, dw};
}

Conv2dReduce::Conv2dReduce(GraphObj *graph, Tensor input, Tensor bias,
                           Tensor output, bool PReLU_, float paramReLU_,
                           int ph_, int pw_, int sh_, int sw_, int dh_, int dw_)
    : Conv2dReduceBase(OpType::Conv2dReduce, input, bias, output, PReLU_,
                       paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
Conv2dReduce::inferShape(const TensorVec &inputs) const {
    // const auto &input = inputs[0], &bias = inputs[1];
    int on = n, of = f;
    int oh = (h + ph * 2 - dh * (r - 1) - 1) / sh + 1;
    int ow = (w + pw * 2 - dw * (s - 1) - 1) / sw + 1;

    return {{{on, oh, ow, of}}};
}

Conv2dReduceTranspose::Conv2dReduceTranspose(GraphObj *graph, Tensor input,
                                             Tensor bias, Tensor output,
                                             bool PReLU_, float paramReLU_,
                                             int ph_, int pw_, int sh_, int sw_,
                                             int dh_, int dw_)
    : Conv2dReduceBase(OpType::Conv2dReduceTranspose, input, bias, output,
                       PReLU_, paramReLU_, ph_, pw_, sh_, sw_, dh_, dw_) {
    IT_ASSERT(dh_ == 1);
    IT_ASSERT(dw_ == 1);
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
Conv2dReduceTranspose::inferShape(const TensorVec &inputs) const {
    // const auto &input = inputs[0], &bias = inputs[1];
    int on = n, of = f;
    int oh = (h - 1) * sh - 2 * ph + dh * (r - 1) + 1;
    int ow = (w - 1) * sw - 2 * pw + dw * (s - 1) + 1;

    return {{{on, oh, ow, of}}};
}
} // namespace infini
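Conv2dReduce applies the standard convolution output-size rule on the (h, w) axes of its 6-D input, and the transposed variant inverts it. A small arithmetic check of both formulas with assumed sizes (h = w = 4, r = s = 3, padding 1, stride 1, dilation 1), again just a standalone sketch:

    #include <cstdio>

    int main() {
        int h = 4, r = 3, ph = 1, sh = 1, dh = 1;
        int oh = (h + ph * 2 - dh * (r - 1) - 1) / sh + 1;   // forward reduce: 4
        int ohT = (h - 1) * sh - 2 * ph + dh * (r - 1) + 1;  // transposed reduce: 4
        std::printf("reduce oh=%d, transposed oh=%d\n", oh, ohT);
    }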
@@ -2,6 +2,7 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
+#include "nnet/Visitor/Serializer.h"

namespace infini {

@@ -9,8 +10,8 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
                         const TensorVec &output,
                         const std::vector<nnet::Tensor> &nnetInputs,
                         nnet::Expr expr, double exec_time, std::string hint)
-    : OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
-      expr(expr), exec_time(exec_time), hint(hint) {
+    : OperatorObj(OpType::MemBound, input, output), expr(expr),
+      nnetInputs(nnetInputs), exec_time(exec_time), hint(hint) {
    IT_ASSERT(checkValid(graph));
    IT_ASSERT(!checkOOB(expr));
    hash = calcHash(expr);

@@ -45,7 +46,7 @@ string MemBoundObj::toString() const {
    os << "exec_time=" << exec_time << ", ";
    os << "NNet Inputs=[";
    for (const auto &tensor : nnetInputs)
-        os << tensor->toReadable() << ",";
+        os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
    os << "]";
    os << ", ExprHash=" << hash;
    os << ", SimplifiedExprHash=" << simplifiedHash;

@@ -60,11 +61,18 @@ string MemBoundObj::toString() const {

optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
    // inputs have to match nnetInputs exactly
-    if (inputs.size() != nnetInputs.size())
+    if (inputs.size() != nnetInputs.size()) {
+        std::cout << "Num mismatch" << inputs.size() << " "
+                  << nnetInputs.size();
        return {};
+    }
    for (size_t i = 0; i < inputs.size(); ++i)
-        if (inputs[i]->getDims() != nnetInputs[i]->getShape())
+        if (inputs[i]->getDims() != nnetInputs[i]->getShape()) {
+            std::cout << "Shape mismatch " << inputs[i]
+                      << vecToString(inputs[i]->getDims()) << " "
+                      << vecToString(nnetInputs[i]->getShape());
            return {};
+        }
    return {{nnet::as<nnet::RangeOpNode>(expr)->getOutputShape()}};
}

@@ -83,4 +91,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
                             nnet::as<nnet::RangeOpNode>(expr));
}

+string MemBoundObj::toJson() const {
+    return *nnet::Serializer().toString(expr, "MemBoundObj::toJson", nnetInputs,
+                                        exec_time, hint);
+}

} // namespace infini
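The new toJson method exposes the stored analytical expression (together with its NNet inputs, execution time, and hint) as a serialized string via nnet::Serializer::toString. A hedged usage sketch for dumping it to disk (op is assumed to be an existing Ref<MemBoundObj>; the file name is a placeholder):

    #include <fstream>

    // Persist the membound operator's expression for offline inspection.
    std::ofstream("membound_expr.json") << op->toJson();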
@@ -2,7 +2,8 @@

namespace infini {
ReshapeObj::ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims)
-    : OperatorObj(OpType::Reshape, {input}, {output}), dims(std::move(dims)) {
+    : OperatorObj(OpType::Reshape, {input}, {output}),
+      dims(dims.size() == 0 ? output->getDims() : dims) {
    IT_ASSERT(checkValid(graph));
}

@@ -19,9 +20,9 @@ optional<vector<Shape>> ReshapeObj::inferShape(const TensorVec &inputs) const {
std::string ReshapeObj::toString() const {
    std::ostringstream os;
    os << "Reshape[" << getGuid() << "]";
-    os << "(";
+    os << "(input dim=";
    os << vecToString(inputs[0]->getDims()) << ",";
-    os << "dims=" << vecToString(dims) << ",";
+    os << "output dims=" << vecToString(dims) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
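With this change an empty dims argument is no longer kept verbatim: ReshapeObj falls back to the shape of the supplied output tensor. A hedged sketch of the two call patterns (g, t, and out are assumed to be an existing Graph and Tensors; addOpWithOutputs follows the pattern used for AnyObj earlier in this diff):

    // Explicit target shape.
    auto r1 = g->addOpWithOutputs<ReshapeObj>(t, out, Shape{2, 3, 4});
    // Empty shape: the dims are taken from the pre-built output tensor `out`.
    auto r2 = g->addOpWithOutputs<ReshapeObj>(t, out, Shape{});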
@@ -4,13 +4,7 @@ namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
                           vector<int> permute)
    : OperatorObj(OpType::Transpose, {input}, {output}) {
-    if (permute.size() != 4) {
-        IT_TODO_HALT();
-    }
-    transposePermute[0] = permute[0];
-    transposePermute[1] = permute[1];
-    transposePermute[2] = permute[2];
-    transposePermute[3] = permute[3];
+    transposePermute = permute;
    IT_ASSERT(checkValid(graph));
}

@@ -20,7 +14,8 @@ TransposeObj::inferShape(const TensorVec &inputs) const {
    auto input = A->getDims();
    auto output = input;

-    for (int i = 0; i < 4; ++i) {
+    auto nDims = input.size();
+    for (size_t i = 0; i < nDims; ++i) {
        output[i] = input[transposePermute[i]];
    }
    return {{output}};

@@ -32,7 +27,8 @@ std::string TransposeObj::toString() const {
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
-    os << "output=" << outputs[0]->getGuid() << ")";
+    os << "output=" << outputs[0]->getGuid() << ",";
+    os << "perm=" << vecToString(transposePermute) << ")";
    return os.str();
}
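Shape inference now follows the stored permutation for arbitrary rank instead of a hard-coded rank-4 loop. The rule is simply output[i] = input[permute[i]]; a self-contained sketch of that rule, independent of the library types:

    #include <cassert>
    #include <vector>

    // out[i] = in[perm[i]], for any rank.
    std::vector<int> permuteShape(const std::vector<int> &in,
                                  const std::vector<int> &perm) {
        assert(in.size() == perm.size());
        std::vector<int> out(in.size());
        for (size_t i = 0; i < in.size(); ++i)
            out[i] = in[perm[i]];
        return out;
    }
    // permuteShape({1, 3, 224, 224}, {0, 2, 3, 1}) == {1, 224, 224, 3}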
@@ -7,9 +7,10 @@ namespace infini {
TEST(Handler, matmul) {
    auto runtime = NativeCpuRuntimeObj::getInstance();
    auto handler = make_ref<GraphHandlerObj>(runtime);
-    auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32);
-    auto w = handler->tensor({1, 3, 4}, OnnxDType::UINT32);
-    auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32);
+    auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32, TensorType::Input);
+    auto w =
+        handler->tensor({1, 3, 4}, OnnxDType::UINT32, TensorType::Initialized);
+    auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32, TensorType::Other);
    handler->matmul(i, w, o, false, false, nullptr, ActType::None);
}
@@ -48,7 +48,7 @@ TEST(SubGraphRewriter, subGraphMatch1) {
    SubGraphRewriter v(g);
    vector<MatchGraph> subgs = v.findMatch(subG);

-    EXPECT_TRUE(subgs.size() == 2);
+    EXPECT_TRUE(subgs.size() == 2u);
}

TEST(MatchGraph, single_input) {

@@ -116,12 +116,12 @@ TEST(MatchGraph, single_input) {

    auto o4 = v.addSubGraph(subG, TensorVec{add1->getOutput(0)});

-    EXPECT_EQ(g->getOperators().size(), 52);
+    EXPECT_EQ(g->getOperators().size(), 52u);
    vector<MatchGraph> subgs = v.findMatch(subG);
-    EXPECT_TRUE(subgs.size() == 5);
+    EXPECT_TRUE(subgs.size() == 5u);

    vector<MatchGraph> subgs1 = v.findMatch(subG1);
-    EXPECT_TRUE(subgs1.size() == 4);
+    EXPECT_TRUE(subgs1.size() == 4u);

    // test replace
    Tensor sii0 =

@@ -135,7 +135,7 @@ TEST(MatchGraph, single_input) {
    }

    v.replaceSubGraph(subG, subG2);
-    EXPECT_EQ(g->getOperators().size(), 37);
+    EXPECT_EQ(g->getOperators().size(), 37u);
}

TEST(MatchGraph, multi_input) {

@@ -186,17 +186,17 @@ TEST(MatchGraph, multi_input) {
                             nullptr);

        auto matches = v.findMatch(subG);
-        EXPECT_EQ(2, matches.size());
+        EXPECT_EQ(2u, matches.size());

        auto div0 = g->addOp<DivObj>(reduce1->getOutput(0), i2, nullptr);
        auto add1 =
            g->addOp<AddObj>(sub0->getOutput(), div0->getOutput(), nullptr);
        matches = v.findMatch(subG);
-        EXPECT_EQ(1, matches.size());
+        EXPECT_EQ(1u, matches.size());

        // two matched subgraphs overlapped, so only one subgraph is replaced
        v.replaceSubGraph(subG, replaceG);
-        EXPECT_EQ(1, v.findMatch(replaceG).size());
+        EXPECT_EQ(1u, v.findMatch(replaceG).size());
    }
}

@@ -240,7 +240,7 @@ TEST(MatchGraph, multi_output) {
    {
        auto input = g->cloneTensor(i);
        auto outs = v.addSubGraph(subg0, {input});
-        EXPECT_EQ(2, outs.size());
+        EXPECT_EQ(2u, outs.size());
        Tensor w0 = g->addTensor(Shape{96, 64, 3, 3}, DataType::UInt32);
        auto conv0 = g->addOp<ConvObj>(outs[0], w0, nullptr, 1, 1);
        auto relu0 = g->addOp<ReluObj>(conv0->getOutput(0), nullptr);

@@ -263,11 +263,11 @@ TEST(MatchGraph, multi_output) {
    }

    auto matches = v.findMatch(subg0);
-    EXPECT_EQ(1, matches.size());
+    EXPECT_EQ(1u, matches.size());

    v.replaceSubGraph(subg0, subg1);
    auto matches2 = v.findMatch(subg1);
-    EXPECT_EQ(1, matches2.size());
+    EXPECT_EQ(1u, matches2.size());
}

// gcn

@@ -354,16 +354,16 @@ TEST(MatchGraph, multi_input_output) {
            v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
        auto out1 =
            v.addSubGraph(subg1, {maxPool->getOutput(0), relu->getOutput(0)});
-        EXPECT_EQ(2, out0.size());
-        EXPECT_EQ(2, out1.size());
+        EXPECT_EQ(2u, out0.size());
+        EXPECT_EQ(2u, out1.size());
        auto div = g->addOp<DivObj>(out0[0], out1[1], nullptr);
        auto sub = g->addOp<SubObj>(out0[1], out1[0], nullptr);
    }

-    EXPECT_EQ(2, v.findMatch(subg0).size());
-    EXPECT_EQ(2, v.findMatch(subg1).size());
+    EXPECT_EQ(2u, v.findMatch(subg0).size());
+    EXPECT_EQ(2u, v.findMatch(subg1).size());
    v.replaceSubGraph(subg0, subg2);
-    EXPECT_EQ(v.findMatch(subg2).size(), 2);
+    EXPECT_EQ(v.findMatch(subg2).size(), 2u);
}

/* One Node having two or more successors is not supported yet.
@@ -0,0 +1,57 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"

#include "test.h"

namespace infini {
TEST(cuda_Any, anyKernel) {
    // conv2dreduce
    {
        // Construct Runtime and graph for CPU and CUDA
        Runtime cpu =
            NativeCpuRuntimeObj::getInstance(); // the CPU runtime is a singleton
        Graph gCpu = make_ref<GraphObj>(cpu);
        Runtime cuda = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cuda);

        auto generator = IncrementalGenerator();

        int PRelu = 0, n = 1, h = 4, w = 4, f = 2, r = 3, s = 3, oh = 4, ow = 4,
            ph = 1, pw = 1, sh = 1, sw = 1, dh = 1, dw = 1;
        string kernelName = "conv2dreduce_kernel";
        vector<int> attr{PRelu, n,  h,  w,  f,  r,  s,  oh,
                         ow,    ph, pw, sh, sw, dh, dw};

        // Build input data on CPU
        Tensor i0Cpu = gCpu->addTensor({n, 1, h, w}, DataType::Float32);
        Tensor w0Cpu = gCpu->addTensor({f, 1, r, s}, DataType::Float32);
        // Malloc data for all tensors in a graph. Do we need implicit
        // allocation?
        gCpu->dataMalloc();
        i0Cpu->setData(generator);
        w0Cpu->setData(generator);
        // Copy input tensors from CPU to CUDA
        Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
        Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
        Tensor o0Cuda = gCuda->addTensor({n, f, oh, ow});
        auto anyOp = gCuda->addOpWithOutputs<AnyObj>(
            TensorVec{i0Cuda, w0Cuda}, TensorVec{o0Cuda}, kernelName, attr);
        anyOp->print();
        // allocate CUDA memory
        gCuda->dataMalloc();
        std::cout << "data malloc success..." << std::endl;
        // Execute on CUDA
        cuda->run(gCuda);
        std::cout << "cuda run success..." << std::endl;
        // copy output from CUDA to CPU
        auto o0Cpu = gCpu->cloneTensor(anyOp->getOutput());
        // check results on CPU
        EXPECT_TRUE(1);
        // print a tensor/operator/graph by print()
        gCuda->print();
    }
}
} // namespace infini
@@ -43,6 +43,42 @@ void testConvCudnn(
    gCuda->print();
}

void testConvNHWCCudnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    vector<float> ansVec) {
    // Construct Runtime and graph for CPU and CUDA
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // the CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cuda);
    // Set input data on CPU in a CPU Graph
    Tensor i0Cpu = gCpu->addTensor({1, 4, 4, 3}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);

    // Copy input tensors from CPU to CUDA
    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
    // Build CUDA graph
    auto conv =
        gCuda->addOp<ConvNHWCObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
    // allocate CUDA memory
    gCuda->dataMalloc();
    // Execute on CUDA
    cuda->run(gCuda);
    // copy output from CUDA to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    o0Cpu->print();
    o0Cpu->printData();
    // check results on CPU
    EXPECT_TRUE(o0Cpu->equalData(ansVec));
    // print a tensor/operator/graph by print()
    gCuda->print();
}

TEST(cuDNN_Conv, run) {
    testConvCudnn(OneGenerator(),
                  vector<float>{12, 12, 18, 18, 12, 12, 18, 18});

@@ -51,6 +87,14 @@ TEST(cuDNN_Conv, run) {
        vector<float>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
}

TEST(cuDNN_Conv, runNHWC) {
    testConvNHWCCudnn(OneGenerator(),
                      vector<float>{12., 12., 12., 12., 18., 18., 18., 18.});
    testConvNHWCCudnn(
        IncrementalGenerator(),
        vector<float>{3350, 7562, 2306, 5546, 9480, 24546, 7185, 20793});
}

TEST(cuDNN_Conv, tune) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // the CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
@@ -68,16 +68,16 @@ TEST(cuBLAS_Matmul, tune) {
    const int B = 1, M = 4, N = 4096, K = 448;
    const bool transA = true, transB = false;
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    cudaRuntime->setEnableTF32(true);
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto a = g->addTensor(transA ? Shape{B, K, M} : Shape{B, M, K});
    auto b = g->addTensor(transB ? Shape{B, N, K} : Shape{B, K, N});
-    // allocate CUDA memory

+    auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
    g->dataMalloc();
    a->setData(IncrementalGenerator());
    b->setData(IncrementalGenerator());

-    auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
    matmul->print();
    double time = cudaRuntime->getPerfTime(g);
    EXPECT_GT(time, 1e-3);
    EXPECT_LT(time, 1);
Some files were not shown because too many files have changed in this diff.