forked from jiuyuan/InfiniTensor
Add search engine (#64)
* Add: tensor fuid
* [Intermediate state] Add: Graph ctor for OpVec
* Add: clone for operators
* tmp: search_engine
* search: init search engine.
* Add: dummy mutator for the test of search engine
* search: add print graph.
* search: add partition.
* search: update comments.
* Fix: remain FUID in Tensor::clone
* Chore: rename GUidBaseType to UidBaseType
* Fix: connect NMutator to SearchEngine
* Chore: output
* Fix test_memboundOp: nmutator uses input runtime
* Chore: clang-format
* Chore: clang-format
* Fix: comments in the review

---------

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
Co-authored-by: mazx <dyxdy@live.com>
This commit is contained in:
parent 14c9c82dab
commit c7ec9ee6e7
@@ -0,0 +1,15 @@
+#pragma once
+#include "core/mutator.h"
+
+namespace infini {
+
+class DummyMutator : public Mutator {
+  public:
+    DummyMutator(int candidatesLimit) : Mutator(candidatesLimit){};
+
+    virtual vector<Graph> run(const Graph &inGraph) override;
+    virtual vector<Graph> mergeMultiBranch(const Graph &inGraph) override;
+    virtual bool isMultiBranchMergable(const Graph &inGraph) override;
+};
+
+} // namespace infini
@@ -8,19 +8,22 @@ class GraphObj : public Object {
   protected:
     Runtime runtime;
     TensorVec tensors;
-    TensorVec inputs;
-    TensorVec outputs;
+    // TODO: whether to record input and output tensors
+    // TensorVec inputs;
+    // TensorVec outputs;
     OpVec ops;

   public:
     GraphObj(Runtime runtime) : runtime(runtime){};
+    GraphObj(Runtime runtime, OpVec ops_in);
     string toString() const override;
+    Runtime getRuntime() const { return runtime; }

     Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
+    Tensor addTensor(const Tensor &tensor);
+    TensorVec addTensor(const TensorVec &tensors);
     Tensor cloneTensor(const Tensor &tensor) {
-        auto ret = addTensor(tensor->getDims(), tensor->getDType());
-        ret->dataMalloc();
-        ret->copyData(tensor);
+        auto ret = addTensor(tensor->clone(runtime));
         return ret;
     }

@@ -45,12 +48,22 @@ class GraphObj : public Object {
     }

     const TensorVec &getTensors() const { return tensors; }
-    const TensorVec &getInputs() const { return inputs; }
-    const TensorVec &getOutputs() const { return outputs; }
+    const TensorVec getInputs() const {
+        TensorVec ret;
+        for (auto t : tensors)
+            if (!t->getOutputOf())
+                ret.emplace_back(t);
+        return ret;
+    }
+    const TensorVec getOutputs() const {
+        TensorVec ret;
+        for (auto t : tensors)
+            if (t->getInputOf().empty())
+                ret.emplace_back(t);
+        return ret;
+    }
     const OpVec &getOperators() const { return ops; }
     OpVec getComputeOps() const;
-    // TensorVec &getInputs();
-    // TensorVec &getOutputs();

     void dataMalloc();

@@ -106,7 +106,7 @@ class KernelRegistry {
                   "Kernel not found for key {" +
                       to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
                       ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
-                      ", " + std::get<2>(kernelAttrs).toString());
+                      ", " + std::get<2>(kernelAttrs).toString() + "}");
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@@ -8,12 +8,27 @@ class Mutator {
     int candidatesLimit;
     // // Statistical data
     // int numTotalCandidates;
+  protected:
+    Runtime runtime;
+
   public:
-    Mutator(int candidatesLimit) : candidatesLimit(candidatesLimit){};
+    Mutator(int candidatesLimit, Runtime runtime = CpuRuntimeObj::getInstance())
+        : candidatesLimit(candidatesLimit), runtime(runtime){};
     virtual ~Mutator(){};

     virtual vector<Graph> run(const Graph &in_graph) = 0;
+    /**
+     * @brief Merge a multi-branch graph into single branch graphs
+     *
+     * @param in_graph
+     * @return vector<Graph> Transformed graphs except the original one.
+     */
+    virtual vector<Graph> mergeMultiBranch(const Graph &in_graph) {
+        IT_TODO_HALT();
+    }
+    virtual bool isMultiBranchMergable(const Graph &in_graph) {
+        IT_TODO_HALT();
+    }
 };

 } // namespace infini

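To illustrate the contract, here is a minimal hypothetical subclass (not part of this commit) that satisfies the new interface; only run() is pure virtual, while the two merge hooks keep their IT_TODO_HALT() defaults:

    // Hypothetical, for illustration only: the smallest valid Mutator.
    class NopMutator : public Mutator {
      public:
        NopMutator(int candidatesLimit) : Mutator(candidatesLimit) {}
        vector<Graph> run(const Graph &inGraph) override {
            // The input graph itself is always a valid candidate.
            return {inGraph};
        }
    };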
@@ -4,27 +4,44 @@

 namespace infini {

-using GuidBaseType = int;
+using UidBaseType = int;

-class Guid {
+class Uid {
   private:
-    GuidBaseType guid;
+    UidBaseType uid;

+  public:
+    Uid(UidBaseType uid) : uid(uid) {}
+    Uid &operator=(const Uid &rhs) = delete;
+
+    operator UidBaseType() const { return uid; }
+};
+
+class Guid : public Uid {
+  private:
-    GuidBaseType generateGuid() {
-        static GuidBaseType guidCnt = 0;
+    UidBaseType generateGuid() {
+        static UidBaseType guidCnt = 0;
         return ++guidCnt;
     }

   public:
-    Guid() { guid = generateGuid(); }
-    Guid(const Guid &rhs) { guid = generateGuid(); }
-    Guid &operator=(const Guid &rhs) {
-        guid = generateGuid();
-        return *this;
-    }
-
-    operator GuidBaseType() const { return guid; }
+    Guid() : Uid(generateGuid()) {}
+    Guid(const Guid &rhs) : Uid(generateGuid()) {}
+};
+
+/**
+ * @brief Family unique ID. Cloned tensors share the same FUID.
+ */
+class Fuid : public Uid {
+  private:
+    UidBaseType generateFuid() {
+        static UidBaseType fuidCnt = 0;
+        return ++fuidCnt;
+    }
+
+  public:
+    Fuid() : Uid(generateFuid()) {}
+    Fuid(const Fuid &fuid) : Uid(fuid) {}
 };

 class Object {
@@ -35,7 +52,7 @@ class Object {
     virtual ~Object(){};
     virtual string toString() const = 0;
     void print() { std::cout << toString() << std::endl; }
-    GuidBaseType getGuid() const { return guid; }
+    UidBaseType getGuid() const { return guid; }
 };

 inline std::ostream &operator<<(std::ostream &os, const Object &obj) {
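The Uid split encodes two identity semantics: copying a Guid mints a fresh id (every object stays globally unique), while copying a Fuid preserves the id (clones stay in one family). A sketch of the observable behavior, assuming the classes above:

    // Sketch: Guid changes across copies, Fuid does not.
    infini::Guid g1;
    infini::Guid g2 = g1; // copy ctor calls generateGuid()
    // (UidBaseType)g1 != (UidBaseType)g2

    infini::Fuid f1;
    infini::Fuid f2 = f1; // copy ctor forwards the stored uid
    // (UidBaseType)f1 == (UidBaseType)f2

This is what makes FUID-based tensor deduplication in the new Graph constructor work: the TensorObj copy constructor copies its Fuid member unchanged.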
@@ -197,6 +197,16 @@ class OperatorObj : public Object {
     virtual int numInputs() const = 0;
     virtual int numOutputs() const = 0;

+    /**
+     * @brief Clone this operator and replace its inputs and outputs.
+     *
+     * @param newInputs
+     * @param newOutputs
+     * @return Operator
+     */
+    virtual Operator clone(const TensorVec &newInputs,
+                           const TensorVec &newOutputs) const = 0;
+
   protected:
     optional<vector<Shape>> inferShape() const;
     vector<DataType> inferDataType() const;
@@ -215,6 +225,18 @@ class OperatorObj : public Object {
     virtual vector<int> getWorkloadVector() const { IT_TODO_HALT(); }
 };

+#define OP_CLONE(OpObj)                                                        \
+    virtual Operator clone(const TensorVec &newInputs,                         \
+                           const TensorVec &newOutputs) const override {       \
+        auto op = infini::make_ref<OpObj>(*this);                              \
+        op->inputs = newInputs;                                                \
+        op->outputs = newOutputs;                                              \
+        op->predecessors.clear();                                              \
+        op->successors.clear();                                                \
+        IT_ASSERT(op->checkValid(nullptr));                                    \
+        return op;                                                             \
+    }
+
 } // namespace infini

 namespace std {
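Written out for one concrete operator, OP_CLONE(MatmulObj) expands to roughly the following (shown here only to make the macro readable; this is the same body with OpObj substituted):

    virtual Operator clone(const TensorVec &newInputs,
                           const TensorVec &newOutputs) const override {
        auto op = infini::make_ref<MatmulObj>(*this); // copy all attributes
        op->inputs = newInputs;                       // rebind tensors
        op->outputs = newOutputs;
        op->predecessors.clear();                     // drop old graph wiring
        op->successors.clear();
        IT_ASSERT(op->checkValid(nullptr));           // re-validate the op
        return op;
    }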
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "common.h"
+#include "graph.h"
+#include "mutator.h"
+
+#include <unordered_map>
+
+namespace infini {
+class SearchEngine {
+  private:
+    Runtime runtimeExec;
+    Ref<Mutator> mutator;
+
+  public:
+    SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
+        runtimeExec = _runtime;
+        mutator = _mutator;
+    }
+    ~SearchEngine() {}
+
+  private: // Configurations
+    size_t partitionThreshold =
+        3;                  // cut nodes whose #in + #out >= partitionThreshold
+    size_t GRAPH_SIZE = 16; // num of best graphs.
+
+  private: // Composed objects
+    std::shared_ptr<Mutator> mutationEngine;
+
+  public:
+    std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
+    struct GroupEdge {
+        int v, next;
+        GroupEdge() = delete;
+    };
+
+    struct Candidate { // a graph with perf
+        std::shared_ptr<Graph> graph;
+        double perf = INFINITY;
+    };
+    class MetaGraph { // a graph of subgraphs, for searching.
+      public:
+        MetaGraph() {}
+        ~MetaGraph() {}
+        struct Node {
+            Graph graph;
+            std::vector<int> suc;
+            std::vector<int> pre;
+            int type, cnt;
+        };
+        std::vector<Node> nodes;
+    };
+
+    Graph run(const Graph graph);                  // entrance of search engine.
+    std::vector<Graph> search(const Graph &graph); // search for a partition.
+
+  private:
+    std::vector<Graph> partitionGraph(const Graph graph);
+    std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
+    std::shared_ptr<MetaGraph>
+    buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
+                           const std::vector<int> &plan);
+    // search horizontal merges
+    std::vector<std::shared_ptr<MetaGraph>>
+    searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
+    void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
+                        std::vector<int> &plan, std::vector<int> &frontier,
+                        std::vector<std::vector<int>> &plans,
+                        std::unordered_set<uint64_t> &planSet);
+    std::vector<Graph>
+    searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
+
+    void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
+    /**
+     * @brief Check whether a multi-branch graph can be merged into a single
+     * branch.
+     */
+    bool isMultiBranchMergable(const Graph graph);
+};
+} // namespace infini
@@ -10,6 +10,8 @@ using Shape = vector<ShapeElem>;
 class TensorObj : public TensorBaseObj {
   private:
     Shape shape;
+    Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
+               // scratch have a new id.

   public:
     TensorObj(const Shape &shape, DataType dtype, Runtime runtime);
@@ -25,6 +27,7 @@ class TensorObj : public TensorBaseObj {
     using TensorBaseObj::getData;
     VType getData(const Shape &pos) const;
     void dataMalloc();
+    UidBaseType getFuid() const { return fuid; }

     void load(std::string file_path);
     void save(std::string file_path);
@@ -51,10 +54,24 @@ class TensorObj : public TensorBaseObj {
         }
         generator(data->getPtr<void *>(), size(), dtype);
     }
-    Tensor clone(Runtime runtime) {
-        auto obj = make_ref<TensorObj>(shape, dtype, runtime);
-        obj->dataMalloc();
-        obj->copyData(this);
+    Tensor clone() const {
+        auto obj = make_ref<TensorObj>(*this);
+        obj->freeData();
+        obj->inputOf.clear();
+        obj->outputOf.reset();
+        return obj;
+    }
+    // TODO: clarify whether clone copies data
+    Tensor clone(Runtime runtime) const {
+        auto obj = make_ref<TensorObj>(*this);
+        obj->runtime = runtime;
+        obj->freeData();
+        obj->inputOf.clear();
+        obj->outputOf.reset();
+        if (hasData()) {
+            obj->dataMalloc();
+            obj->copyData(this);
+        }
         return obj;
     }

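The two overloads split clone semantics: clone() yields a detached tensor (no data, no graph wiring) on the same runtime, while clone(runtime) retargets the runtime and copies the payload when one exists; since the copy constructor copies fuid, both clones stay in the source tensor's family. A usage sketch under those assumptions (g and cpuRuntime are placeholder variables):

    // Sketch of the intended semantics; `g` and `cpuRuntime` are placeholders.
    Tensor t = g->addTensor({2, 3}, DataType::Float32);
    t->dataMalloc();

    Tensor detached = t->clone();        // same fuid, same runtime, no data
    Tensor moved = t->clone(cpuRuntime); // same fuid, data copied over

    // detached->getFuid() == t->getFuid(); detached->hasData() is false
    // moved->hasData() is true because t had data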
@@ -33,6 +33,8 @@ class TensorBaseObj : public Object {
         data = blob;
     }
     Blob getDataBlob() const { return data; }
+    bool hasData() const { return data != nullptr; }
+    void freeData() { data = nullptr; }
     template <typename T> T getRawDataPtr() const {
         static_assert(std::is_pointer_v<T>,
                       "Raw data pointer has a type of pointer");
@@ -47,6 +47,7 @@ class CudaRuntimeObj : public RuntimeObj {
     CudaPtr alloc(size_t size) override {
         void *ptr;
         checkCudaError(cudaMalloc(&ptr, size));
+        // printf("cuda malloc: %p %lu bytes\n", ptr, size);
         return ptr;
     }
     void dealloc(void *ptr) override { checkCudaError(cudaFree(ptr)); }
@@ -26,6 +26,7 @@ class G2BMMObj : public OperatorObj {
     G2BMMObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, const int width,
              const int dilation, Tensor bias = nullptr,
              ActType act = ActType::None);
+    OP_CLONE(G2BMMObj);

     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -24,6 +24,7 @@ class GBMMObj : public OperatorObj {
     */
     GBMMObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, const int dilation,
             Tensor bias = nullptr, ActType act = ActType::None);
+    OP_CLONE(GBMMObj);

     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -10,6 +10,7 @@ class BatchNormObj : public OperatorObj {
     BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean,
                  Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
                  float eps = 1e-5, bool training = false);
+    OP_CLONE(BatchNormObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

     std::string toString() const override;

@@ -7,6 +7,7 @@ class ConcatObj : public OperatorObj {

   public:
     ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
+    OP_CLONE(ConcatObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -77,6 +77,7 @@ class ConvObj : public ConvBaseObj {
             PaddingMode mode = PaddingMode::Same, int sh = 1, int sw = 1,
             int dh = 1, int dw = 1, Tensor bias = nullptr,
             ActType act = ActType::None);
+    OP_CLONE(ConvObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     ActType getAct() const { return act; }

@@ -104,6 +105,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
                         int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                         int oph = 0, int opw = 0, int group = 1,
                         Tensor bias = nullptr, ActType act = ActType::None);
+    OP_CLONE(ConvTransposed2dObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     ActType getAct() const { return act; }

@@ -23,6 +23,7 @@ class ElementWiseObj : public OperatorObj {
         prefix##Obj(GraphObj *graph, Tensor input0, Tensor input1,             \
                     Tensor output)                                             \
             : ElementWiseObj(type, graph, input0, input1, output) {}           \
+        OP_CLONE(prefix##Obj);                                                 \
     };

 DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add)

@@ -8,6 +8,7 @@ class ExtendObj : public OperatorObj {
   public:
     ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
               int num = 1);
+    OP_CLONE(ExtendObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

     std::string toString() const override;

@@ -9,6 +9,7 @@ class GatherObj : public OperatorObj {
   public:
     GatherObj(GraphObj *graph, Tensor input, Tensor index, Tensor output,
               int axis);
+    OP_CLONE(GatherObj);
     std::string toString() const override;
     int numInputs() const override { return 2; }
     int numOutputs() const override { return 1; }

@@ -29,6 +29,7 @@ class MatmulObj : public OperatorObj {
     MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
               bool transA = false, bool transB = false, Tensor bias = nullptr,
               ActType act = ActType::None);
+    OP_CLONE(MatmulObj);

     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -17,6 +17,7 @@ class MemBoundObj : public OperatorObj {
                 const TensorVec &output,
                 const std::vector<nnet::Tensor> &nnetInputs, nnet::Expr expr,
                 double exec_time, std::string hint = {});
+    OP_CLONE(MemBoundObj);

     std::string toString() const override;
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -10,6 +10,7 @@ class PadObj : public OperatorObj {
     // pad for appointed axes; if axis is empty, then pad for all axes.
     PadObj(GraphObj *graph, Tensor input, Tensor output,
            const vector<int> &pads, const optional<const vector<int>> &axis);
+    OP_CLONE(PadObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;

@@ -14,6 +14,7 @@ class PoolingObj : public OperatorObj {
   public:
     PoolingObj(GraphObj *graph, OpType optype, Tensor input, Tensor output,
                int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw);
+    OP_CLONE(PoolingObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;

@@ -10,6 +10,7 @@ class ReduceMeanObj : public OperatorObj {
     ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
                   const optional<const vector<int>> &axis,
                   bool keepDims = true);
+    OP_CLONE(ReduceMeanObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

     std::string toString() const override;

@@ -8,6 +8,7 @@ class ReshapeObj : public OperatorObj {

   public:
     ReshapeObj(GraphObj *graph, Tensor input, Tensor output, const Shape &dims);
+    OP_CLONE(ReshapeObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -24,6 +25,7 @@ class FlattenObj : public OperatorObj {

   public:
     FlattenObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(FlattenObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -40,6 +42,7 @@ class IdentityObj : public OperatorObj {

   public:
     IdentityObj(GraphObj *graph, Tensor input, Tensor output);
+    OP_CLONE(IdentityObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -51,6 +51,7 @@ class ResizeObj : public OperatorObj {
               Tensor roi, ECoeffMode mode,
               EKeepAspectRatioPolicy ratioPolicy = EKeepAspectRatioPolicy::none,
               ECoordinateTransMode coordTransMode = ECoordinateTransMode::halfPixel);
+    OP_CLONE(ResizeObj);

     // Operator clone(TensorVec inputs, TensorVec outputs) override;
     vector<DataType> inferDataType(const TensorVec &inputs) const override;

@@ -10,6 +10,7 @@ class SliceObj : public OperatorObj {
              const vector<int> &starts, const vector<int> &ends,
              const optional<vector<int>> &axis,
              const optional<vector<int>> &steps);
+    OP_CLONE(SliceObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;

@@ -10,6 +10,7 @@ class SplitObj : public OperatorObj {
              int dim, int num);
     SplitObj(GraphObj *graph, Tensor input, std::optional<TensorVec> outputs,
              int dim, const vector<int> &ratio);
+    OP_CLONE(SplitObj);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

@@ -21,6 +21,7 @@ class UnaryObj : public OperatorObj {
       public:                                                                  \
         prefix##Obj(GraphObj *graph, Tensor input, Tensor output)              \
             : UnaryObj(type, graph, input, output) {}                          \
+        OP_CLONE(prefix##Obj);                                                 \
     };

 DEFINE_UNARY_OBJ(Relu, OpType::Relu)
@@ -68,10 +68,9 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu
     func_code = func.imported_modules[0].get_source()
     invoke_code = "%s_kernel0<<<dim3(%s), dim3(%s)>>>(%s, %s);" % (
         func_name, ", ".join(map(str, block_dim)), ", ".join(
-            map(str, thread_dim)),
-        output_name, ", ".join(input_names))
+            map(str, thread_dim)), ", ".join(input_names), output_name)
     invoke_params = block_dim + thread_dim

     ctx = tvm.cuda(0)
     input_a = []
     for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
@@ -91,4 +90,4 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu
     print("Time")
     print(conv_time)

     return func_code, invoke_code, conv_time, invoke_params  # ms
@@ -0,0 +1,65 @@
+#include "core/dummy_mutator.h"
+#include "operators/concat.h"
+#include "operators/conv.h"
+#include "operators/matmul.h"
+#include "operators/split.h"
+#include "operators/unary.h"
+
+namespace infini {
+
+vector<Graph> DummyMutator::run(const Graph &inGraph) {
+    if (inGraph->getOperators().size() > 1)
+        return {inGraph};
+    // Conv -> Conv + Relu
+    auto op0 = as<ConvObj>(inGraph->getOperators()[0]);
+    auto g = make_ref<GraphObj>(runtime);
+    auto a0 = g->cloneTensor(op0->getInputs()[0]),
+         w0 = g->cloneTensor(op0->getInputs()[1]),
+         o0 = g->cloneTensor(op0->getOutput());
+    auto [ph, pw, sh, sw, dh, dw] = op0->getPadStrideDilation();
+    auto t =
+        g->addOp<ConvObj>(a0, w0, nullptr, ph, pw, sh, sw, dh, dw)->getOutput();
+    g->addOpWithOutputs<ReluObj>(t, o0);
+    return {inGraph, g};
+}
+
+vector<Graph> DummyMutator::mergeMultiBranch(const Graph &inGraph) {
+    // Two Matmuls of the same shapes -> One batched Matmul
+    if (!isMultiBranchMergable(inGraph))
+        return {};
+    auto op0 = as<MatmulObj>(inGraph->getOperators()[0]);
+    auto op1 = as<MatmulObj>(inGraph->getOperators()[1]);
+    auto [b, m, n, k, transA, transB] = op0->getBMNKTransAB();
+    auto g = make_ref<GraphObj>(runtime);
+    auto a0 = g->cloneTensor(op0->getInputs()[0]),
+         w0 = g->cloneTensor(op0->getInputs()[1]),
+         o0 = g->cloneTensor(op0->getOutput());
+    auto a1 = g->cloneTensor(op1->getInputs()[0]),
+         w1 = g->cloneTensor(op1->getInputs()[1]),
+         o1 = g->cloneTensor(op1->getOutput());
+    auto a = g->addOp<ConcatObj>(TensorVec{a0, a1}, nullptr, 0)->getOutput();
+    auto w = g->addOp<ConcatObj>(TensorVec{w0, w1}, nullptr, 0)->getOutput();
+    auto t = g->addOp<MatmulObj>(a, w, nullptr, transA, transB);
+    g->addOpWithOutputs<SplitObj>(t->getOutput(), TensorVec{o0, o1}, 0, 2);
+    return {g};
+}
+
+bool DummyMutator::isMultiBranchMergable(const Graph &inGraph) {
+    if (inGraph->getOperators().size() != 2)
+        return false;
+    for (auto op : inGraph->getOperators()) {
+        if (op->getOpType() != OpType::Matmul)
+            return false;
+        if (op->getPredecessors().size() > 0)
+            return false;
+        if (op->getSuccessors().size() > 0)
+            return false;
+    }
+    auto op0 = as<MatmulObj>(inGraph->getOperators()[0]);
+    auto op1 = as<MatmulObj>(inGraph->getOperators()[1]);
+    auto args0 = op0->getBMNKTransAB();
+    auto args1 = op1->getBMNKTransAB();
+    return args0 == args1;
+}
+
+} // namespace infini
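A sketch of how a test might drive this mutator (hypothetical driver, not taken from this PR; the tensor shapes and the addOp call are illustrative, based on the ConvObj signature shown earlier in this diff):

    // Hypothetical driver, for illustration only.
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i = g->addTensor({1, 3, 32, 32}, DataType::Float32);
    Tensor w = g->addTensor({2, 3, 3, 3}, DataType::Float32);
    g->addOp<ConvObj>(i, w, nullptr, 1, 1); // padding ph = pw = 1

    auto mutator = make_ref<DummyMutator>(10);
    // For this single-Conv graph, run() returns {original, Conv+Relu variant}.
    auto candidates = mutator->run(g);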
@@ -1,7 +1,32 @@
 #include "core/graph.h"
+#include <queue>

 namespace infini {

+GraphObj::GraphObj(Runtime runtime, OpVec ops_in) : runtime(runtime) {
+    map<UidBaseType, Tensor> tensorPool;
+    // Clone tensors
+    for (const auto &op : ops_in) {
+        for (const auto &t : op->getInputs())
+            if (tensorPool.find(t->getFuid()) == tensorPool.end())
+                tensorPool[t->getFuid()] = t->clone();
+        for (const auto &t : op->getOutputs())
+            if (tensorPool.find(t->getFuid()) == tensorPool.end())
+                tensorPool[t->getFuid()] = t->clone();
+    }
+    for (const auto &[_, t] : tensorPool)
+        addTensor(t);
+    // Clone operators and add connections
+    for (const auto &op : ops_in) {
+        TensorVec inputs, outputs;
+        for (const auto &t : op->getInputs())
+            inputs.emplace_back(tensorPool.at(t->getFuid()));
+        for (const auto &t : op->getOutputs())
+            outputs.emplace_back(tensorPool.at(t->getFuid()));
+        addOperatorAndConnect(op->clone(inputs, outputs));
+    }
+}
+
 void GraphObj::addOperatorAndConnect(const Operator &op) {
     ops.push_back(op);
     for (auto &input : op->getInputs()) {
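This constructor is what lets the search engine stitch operator lists from different candidate graphs into one well-formed graph: tensors are pooled by FUID, so ops that shared a tensor keep sharing its clone, and each op is then cloned onto the pooled tensors via the OP_CLONE-generated clone. A sketch of the intended use (graphA, graphB, and runtime are placeholders):

    // Sketch: merge the ops of two candidate graphs into a fresh graph.
    std::vector<Operator> ops;
    for (auto op : graphA->getOperators())
        ops.emplace_back(op);
    for (auto op : graphB->getOperators())
        ops.emplace_back(op);
    // Tensors shared across the two graphs (same FUID) are cloned only once.
    auto merged = make_ref<GraphObj>(runtime, ops);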
@@ -28,7 +53,7 @@ string GraphObj::toString() const {

     oss << "Graph operators:\n";
     for (const auto &op : ops) {
-        vector<GuidBaseType> preds, succs;
+        vector<UidBaseType> preds, succs;
         for (auto &o : op->getPredecessors())
             preds.emplace_back(o->getGuid());
         for (auto &o : op->getSuccessors())
@@ -53,6 +78,18 @@ Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
     return tensor;
 }

+Tensor GraphObj::addTensor(const Tensor &tensor) {
+    IT_ASSERT(tensor->getRuntime() == runtime, "Tensor runtime mismatch");
+    tensors.emplace_back(tensor);
+    return tensor;
+}
+
+TensorVec GraphObj::addTensor(const TensorVec &tensors) {
+    for (auto &t : tensors)
+        addTensor(t);
+    return tensors;
+}
+
 OpVec GraphObj::getComputeOps() const {
     OpVec opList;
     for (auto op : ops)
@@ -2,6 +2,7 @@
 #include "core/blob.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
+#include "utils/data_generator.h"
 #include <chrono>
 #include <cstring>
 namespace infini {
@@ -73,8 +74,27 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
         PerfRecord record;
         // Tune the kernel if there is no record
         if (!perfData) {
+            // TODO: should tensors automatically allocate when accessing data?
+            // allocate memory for empty tensors and release it after profiling
+            TensorVec allocatedTensors;
+            for (auto t : op->getInputs())
+                if (!t->hasData())
+                    allocatedTensors.emplace_back(t);
+            for (auto t : op->getOutputs())
+                if (!t->hasData())
+                    allocatedTensors.emplace_back(t);
+            for (auto t : allocatedTensors) {
+                t->dataMalloc();
+                t->setData(IncrementalGenerator());
+            }
+
+            // Profile operators and record the results
             record = kernel->tune(op, this);
             perfEngine.setPerfData(perfKey, record);
+
+            // Free allocated memory
+            for (auto t : allocatedTensors)
+                t->freeData();
         } else
             record = perfData;

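With this change, callers no longer need to pre-allocate every tensor before profiling; empty inputs and outputs are filled with IncrementalGenerator() data just for tuning and freed afterwards. The call pattern this enables, as a sketch (runtime and ops are placeholders from the surrounding examples):

    // Sketch: g may contain tensors without allocated data.
    Graph g = make_ref<GraphObj>(runtime, ops);
    double ms = runtime->getPerfTime(g); // tunes kernels, allocating as needed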
@@ -0,0 +1,441 @@
+#include "core/search_engine.h"
+#include "core/hash.h"
+#include "core/runtime.h"
+
+#include <algorithm>
+#include <iostream>
+#include <unordered_set>
+
+namespace infini {
+
+void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
+    for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
+        auto &node = metaGraph->nodes[i];
+        std::cout << "id: " << i << std::endl;
+        node.graph->print();
+        std::cout << "type: " << node.type << std::endl;
+        std::cout << "pre: ";
+        for (auto &x : node.pre) {
+            std::cout << x << " ";
+        }
+        std::cout << std::endl;
+        std::cout << "suc: ";
+        for (auto &x : node.suc) {
+            std::cout << x << " ";
+        }
+        std::cout << std::endl;
+    }
+    std::cout << std::endl;
+}
+
+Graph SearchEngine::run(const Graph graph) {
+    IT_ASSERT(runtimeExec == graph->getRuntime());
+    std::cout << "[INFO] original graph: " << std::endl;
+    std::cout << graph->toString();
+    std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
+              << std::endl;
+
+    std::vector<Graph> partitions = partitionGraph(graph);
+
+    std::cout << "[INFO] Partition num: " << partitions.size() << std::endl;
+    std::vector<Graph> bestGraphs = {nullptr};
+    for (size_t pid = 0; pid < partitions.size(); pid++) {
+        auto &subGraph = partitions[pid];
+        std::cout << "[INFO] Partition: " << pid << std::endl;
+        std::vector<Graph> candidates = search(subGraph);
+        std::cout << "[INFO] size: " << candidates.size() << std::endl;
+        IT_ASSERT(candidates.size() > 0);
+        std::cout << subGraph->toString() << std::endl;
+        std::vector<Graph> nextGraphs;
+        for (auto lastGraph : bestGraphs) {
+            for (auto thisGraph : candidates) {
+                std::vector<Operator> ops;
+                if (lastGraph != nullptr) {
+                    for (auto op : lastGraph->getOperators()) {
+                        ops.emplace_back(op);
+                    }
+                }
+                if (thisGraph != nullptr) {
+                    for (auto op : thisGraph->getOperators()) {
+                        ops.emplace_back(op);
+                    }
+                }
+                auto tmp = make_ref<GraphObj>(runtimeExec, ops);
+                tmp->dataMalloc();
+                nextGraphs.emplace_back(tmp);
+            }
+        }
+        std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
+            return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
+        });
+        if (nextGraphs.size() > GRAPH_SIZE) {
+            nextGraphs.resize(GRAPH_SIZE);
+        }
+        bestGraphs.clear();
+        for (size_t i = 0; i < nextGraphs.size(); i++) {
+            bestGraphs.emplace_back(nextGraphs[i]);
+        }
+    }
+
+    std::cout << "[INFO] unfused graph: " << std::endl;
+    for (size_t i = 0; i < bestGraphs.size(); i++) {
+        std::cout << "bestGraph " << i << ":" << std::endl;
+        std::cout << bestGraphs[i]->toString();
+        std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
+                  << std::endl;
+    }
+
+    return bestGraphs[0];
+}
+
+std::vector<Graph> SearchEngine::search(const Graph &graph) {
+    auto metaGraph = buildMetaGraphWithGraph(graph);
+    auto mergedGraphs = searchMerge(metaGraph);
+    std::cout << "[INFO] merged graphs: " << mergedGraphs.size() << std::endl;
+
+    std::vector<Graph> results;
+    for (auto mergedGraph : mergedGraphs) {
+        auto mutatedGraphs = searchMutation(mergedGraph);
+        for (size_t i = 0; i < std::min(mutatedGraphs.size(), GRAPH_SIZE);
+             i++) {
+            results.emplace_back(mutatedGraphs[i]);
+        }
+    }
+
+    sort(results.begin(), results.end(), [&](Graph x, Graph y) {
+        return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
+    }); // compare with perf time
+    if (results.size() > GRAPH_SIZE) {
+        results.resize(GRAPH_SIZE);
+    }
+    return results;
+}
+
+// Build metagraph with a graph, each operator is a node.
+std::shared_ptr<SearchEngine::MetaGraph>
+SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
+    auto metaGraph = std::make_shared<MetaGraph>();
+
+    int numOps = graph->getOperators().size();
+    std::vector<int> cnt(numOps, 0);
+    std::unordered_map<int, int> opMap;
+    metaGraph->nodes.clear();
+    std::vector<int> q(0);
+    for (size_t i = 0; i < graph->getOperators().size(); i++) {
+        auto &op = graph->getOperators()[i];
+        MetaGraph::Node node;
+        std::vector<Operator> ops;
+        ops.emplace_back(op);
+        node.graph = make_ref<GraphObj>(runtimeExec, ops);
+        node.type = op->isComputeOp();
+        node.cnt = op->getPredecessors().size();
+        opMap.emplace(op->getGuid(), i);
+        metaGraph->nodes.emplace_back(node);
+    }
+    for (size_t i = 0; i < graph->getOperators().size(); i++) {
+        auto &op = graph->getOperators()[i];
+        std::unordered_set<int> set;
+        set.clear();
+        set.emplace(i);
+        for (auto preOp : op->getPredecessors()) {
+            int id = opMap[preOp->getGuid()];
+            if (set.find(id) == set.end()) {
+                metaGraph->nodes[i].pre.emplace_back(id);
+                set.emplace(id);
+            }
+        }
+        for (auto sucOp : op->getSuccessors()) {
+            int id = opMap[sucOp->getGuid()];
+            if (set.find(id) == set.end()) {
+                metaGraph->nodes[i].suc.emplace_back(id);
+                set.emplace(id);
+            }
+        }
+    }
+    return metaGraph;
+}
+
+// Build a metagraph with graph and a plan; a plan determines which ops form a
+// node.
+std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
+    const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
+    const std::vector<int> &plan) {
+    int numGroups = 0;
+    for (auto i : plan) {
+        if (i > numGroups) {
+            numGroups = i;
+        }
+    }
+
+    std::vector<std::vector<int>> groups(numGroups + 1, std::vector<int>(0));
+    for (size_t i = 0; i < plan.size(); i++) {
+        groups[plan[i]].emplace_back(i);
+    }
+
+    auto resultMetaGraph = make_ref<MetaGraph>();
+    for (auto &group : groups) {
+        std::vector<Operator> ops;
+        std::unordered_set<int> preSet, sucSet;
+        for (auto id : group) {
+            MetaGraph::Node node;
+            for (auto op : metaGraph->nodes[id].graph->getOperators()) {
+                ops.emplace_back(op);
+            }
+            for (auto suc : metaGraph->nodes[id].suc) {
+                if (sucSet.find(plan[suc]) == sucSet.end()) {
+                    node.suc.emplace_back(plan[suc]);
+                    sucSet.emplace(plan[suc]);
+                }
+            }
+            for (auto pre : metaGraph->nodes[id].pre) {
+                IT_ASSERT(sucSet.find(plan[pre]) == sucSet.end());
+                if (preSet.find(plan[pre]) == preSet.end()) {
+                    node.pre.emplace_back(plan[pre]);
+                    preSet.emplace(plan[pre]);
+                }
+            }
+            node.graph = make_ref<GraphObj>(runtimeExec, ops);
+            node.cnt = node.pre.size();
+            node.type = ops[0]->isComputeOp();
+            resultMetaGraph->nodes.emplace_back(node);
+        }
+    }
+    return resultMetaGraph;
+}
+
+// Search how to merge multiple ops.
+std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
+SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
+    IT_ASSERT(metaGraph != nullptr);
+    std::vector<int> plan(metaGraph->nodes.size());
+    for (size_t i = 0; i < plan.size(); i++) {
+        plan[i] = i;
+    }
+    std::vector<int> frontier;
+    for (size_t i = 0; i < plan.size(); i++) {
+        if (metaGraph->nodes[i].cnt == 0) {
+            frontier.emplace_back(i);
+        }
+    }
+
+    std::vector<std::vector<int>> plans;
+    std::unordered_set<HashType> planSet;
+    searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
+
+    std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
+    for (auto &curPlan : plans) {
+        metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
+    }
+    return metaGraphs;
+}
+
+// DFS impl for search merge.
+void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
+                                  std::vector<int> &plan,
+                                  std::vector<int> &frontier,
+                                  std::vector<std::vector<int>> &plans,
+                                  std::unordered_set<uint64_t> &planSet) {
+    if (frontier.size() == 0) {
+        // remark id
+        std::unordered_map<int, int> id_map;
+        int cnt = 0;
+        for (size_t i = 0; i < plan.size(); i++) {
+            if (id_map.find(plan[i]) == id_map.end()) {
+                id_map.emplace(plan[i], cnt++);
+            }
+            plan[i] = id_map[plan[i]];
+        }
+        auto hash = hashVector(plan);
+        if (planSet.find(hash) != planSet.end()) {
+            return;
+        }
+        planSet.emplace(hash);
+        plans.emplace_back(plan);
+        return;
+    }
+
+    int numNonCompute = 0;
+    for (auto x : frontier) {
+        if (metaGraph->nodes[x].type == 0) {
+            numNonCompute++;
+        }
+    }
+
+    auto planBackup = plan;
+    auto metaGraphBackup = metaGraph;
+    // DFS non compute ops.
+    if (numNonCompute > 0) {
+        std::vector<int> nextFrontier;
+        for (auto x : frontier) {
+            if (metaGraph->nodes[x].type == 0) {
+                for (auto y : metaGraph->nodes[x].suc) {
+                    metaGraph->nodes[y].cnt--;
+                    if (metaGraph->nodes[y].cnt == 0) {
+                        nextFrontier.emplace_back(y);
+                    }
+                }
+            } else {
+                nextFrontier.emplace_back(x);
+            }
+        }
+        searchMergeDfs(metaGraph, plan, nextFrontier, plans, planSet);
+        metaGraph = metaGraphBackup;
+        return;
+    }
+
+    // DFS compute ops.
+    for (int mask = (1 << frontier.size()) - 1; mask > 0; mask--) {
+        int mergedId = -1;
+        std::vector<int> nextFrontier;
+        std::vector<Operator> ops;
+        for (size_t i = 0; i < frontier.size(); i++) {
+            if ((1 << i) & mask) {
+                if (mergedId == -1) {
+                    mergedId = plan[frontier[i]];
+                } else {
+                    plan[frontier[i]] = mergedId;
+                }
+                for (auto y : metaGraph->nodes[frontier[i]].suc) {
+                    metaGraph->nodes[y].cnt--;
+                    if (metaGraph->nodes[y].cnt == 0) {
+                        nextFrontier.emplace_back(y);
+                    }
+                }
+                for (auto op :
+                     metaGraph->nodes[frontier[i]].graph->getOperators()) {
+                    ops.emplace_back(op);
+                }
+            } else {
+                nextFrontier.emplace_back(frontier[i]);
+            }
+        }
+        auto graph = make_ref<GraphObj>(runtimeExec, ops);
+        if (ops.size() == 1 || isMultiBranchMergable(graph)) {
+            searchMergeDfs(metaGraph, plan, nextFrontier, plans, planSet);
+        }
+        plan = planBackup;
+        metaGraph = metaGraphBackup;
+    }
+    return;
+}
+
+// Search mutation for each compute op.
+std::vector<Graph> SearchEngine::searchMutation(
+    const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
+    std::vector<Graph> graphs = {nullptr};
+    // Append a node to all existing candidates
+    for (auto &node : metaGraph->nodes) {
+        std::vector<Graph> nextGraphs;
+        if (node.type == 1) { // If it has computing OPs
+            auto mutatedGraphs = mutator->run(node.graph);
+            for (auto graph : graphs) {
+                for (auto mutatedGraph : mutatedGraphs) {
+                    std::vector<Operator> ops;
+                    if (graph != nullptr) {
+                        for (auto op : graph->getOperators()) {
+                            ops.emplace_back(op);
+                        }
+                    }
+                    for (auto op : mutatedGraph->getOperators()) {
+                        ops.emplace_back(op);
+                    }
+                    nextGraphs.emplace_back(
+                        make_ref<GraphObj>(runtimeExec, ops));
+                }
+            }
+        } else {
+            for (auto graph : graphs) {
+                std::vector<Operator> ops;
+                if (graph != nullptr) {
+                    for (auto op : graph->getOperators()) {
+                        ops.emplace_back(op);
+                    }
+                }
+                for (auto op : node.graph->getOperators()) {
+                    ops.emplace_back(op);
+                }
+                nextGraphs.emplace_back(make_ref<GraphObj>(runtimeExec, ops));
+            }
+        }
+        for (auto g : nextGraphs) {
+            g->dataMalloc();
+        }
+        std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
+            return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
+        });
+        if (nextGraphs.size() > GRAPH_SIZE) {
+            nextGraphs.resize(GRAPH_SIZE);
+        }
+        graphs = nextGraphs;
+    }
+    return graphs;
+}
+
+bool SearchEngine::isMultiBranchMergable(const Graph graph) {
+    return mutationEngine->isMultiBranchMergable(graph);
+}
+
+// Split a graph into multiple independent graphs. The search engine will
+// search for each one.
+std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
+    std::vector<Graph> partitions;
+    // Reversed DFS post-order is topo-order.
+    std::unordered_map<size_t, size_t> preOrder, postOrder;
+    std::vector<Operator> ops;
+    int preCnt = 0, postCnt = 0;
+    std::function<void(Operator)> dfs = [&](Operator op) {
+        if (preOrder.count(op->getGuid())) {
+            return;
+        }
+        preOrder[op->getGuid()] = preCnt++;
+        for (auto &&next : op->getSuccessors()) {
+            dfs(next);
+        }
+        postOrder[op->getGuid()] = postCnt++;
+        ops.emplace_back(op);
+    };
+    for (auto &&op : graph->getOperators()) {
+        dfs(op);
+    }
+
+    std::vector<Operator> headOps;
+    for (size_t i = 0; i < ops.size(); i++) {
+        auto &op = ops[i];
+        headOps.emplace_back(op);
+        if (op->getPredecessors().size() + op->getSuccessors().size() >=
+                (size_t)partitionThreshold &&
+            !op->isComputeOp()) {
+            auto preOrderI = preOrder[op->getGuid()];
+            auto postOrderI = postOrder[op->getGuid()];
+            for (size_t j = 0; j < i; j++) {
+                // True predecessor
+                if (preOrder[ops[j]->getGuid()] < preOrderI) {
+                    for (auto nextOp : ops[j]->getSuccessors()) {
+                        if (postOrder[nextOp->getGuid()] < postOrderI) {
+                            // FIXME: DO NOT USE goto
+                            goto fail;
+                        }
+                    }
+                }
+            }
+            std::cout << "partition!!!: " << i << std::endl;
+            for (auto op : headOps) {
+                std::cout << op->toString() << std::endl;
+            }
+            auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
+            tmp->dataMalloc();
+            partitions.emplace_back(tmp);
+            headOps.clear();
+        }
+    fail:;
+    }
+    if (!headOps.empty()) {
+        auto tmp = make_ref<GraphObj>(runtimeExec, headOps);
+        tmp->dataMalloc();
+        partitions.emplace_back(tmp);
+    }
+    std::reverse(partitions.begin(), partitions.end());
+    return partitions;
+}
+
+} // namespace infini
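Putting the pieces together: SearchEngine::run partitions the input graph, searches merges and mutations per partition, stitches candidates back with the GraphObj(runtime, ops) constructor, and returns the best graph by measured perf. A usage sketch (hypothetical driver; the DummyMutator choice and shapes are illustrative):

    // Hypothetical end-to-end driver.
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    auto a = g->addTensor({1, 64, 64}, DataType::Float32);
    auto b = g->addTensor({1, 64, 64}, DataType::Float32);
    g->addOp<MatmulObj>(a, b, nullptr);

    SearchEngine searchEngine(runtime, make_ref<DummyMutator>(10));
    Graph best = searchEngine.run(g); // partition -> merge -> mutate -> pick best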
@@ -14,9 +14,10 @@ VType TensorObj::getData(const Shape &pos) const {
 }

 string TensorObj::toString() const {
-    string ret = "Tensor " + std::to_string(guid) + ", shape " +
-                 vecToString(shape) + ", dtype " + dtype.toString();
-    vector<GuidBaseType> inputOfGuid;
+    string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
+                 std::to_string(fuid) + ", shape " + vecToString(shape) +
+                 ", dtype " + dtype.toString();
+    vector<UidBaseType> inputOfGuid;
     for (const auto &op : inputOf)
         inputOfGuid.emplace_back(op.lock()->getGuid());
     if (auto o = outputOf.lock())
@@ -69,7 +69,7 @@ void CudaRuntimeObj::run(const Graph &graph, bool runTune,
     sync();
 }

-void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
+void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }

 string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }

@@ -127,19 +127,16 @@ class MemboundTVM : public Kernel {
             cuModuleLoadDataEx(&module, ret->ptx.data(), 0, nullptr, nullptr));
         checkCUresult(cuModuleGetFunction(&kernel, module, kernelName.c_str()));
         std::vector<void *> args;
-        for (auto &&in : op->getInputs()) {
+        for (auto &&in : op->getInputs())
             args.push_back(in->getRawDataPtr<void *>());
-        }
         args.push_back(op->getOutput()->getRawDataPtr<void *>());
         std::vector<void *> argsPtr;
-        for (auto &arg : args) {
+        for (auto &arg : args)
             argsPtr.push_back(&arg);
-        }

         // Evaluate the kernel
         ret->time = timeit(
             [&]() {
-                // TODO: run the kernel
                 cuLaunchKernel(kernel, invokeParams[0], invokeParams[1],
                                invokeParams[2], invokeParams[3],
                                invokeParams[4], invokeParams[5], 0, NULL,
@@ -22,18 +22,12 @@ NMutator::~NMutator() {}
 void NMutator::setToNaiveMembound() { mode = Mode::ToNaiveMembound; }

 vector<Graph> NMutator::run(const Graph &in_graph) {
-    vector<Graph> out_graphs;
+    vector<Graph> out_graphs{in_graph};
     // Test helper: naively transform one Op to Membound
     if (mode == Mode::ToNaiveMembound) {
         runSingleOpToNaiveMembound(in_graph, out_graphs);
-        dbg(out_graphs.size());
         return out_graphs;
     }
-    // // Hack for HetConv fusion
-    // if (statGraph(in_graph) == NMutator::SGType::HetConv) {
-    //     dbg("Start fuse HetConv");
-    //     out_graphs.emplace_back(fuseHetConv(nullptr, in_graph));
-    // }
     // Clear input names maps with tensor
     inputsNameNToTensorT.clear();
     OpVec computeOps = in_graph->getComputeOps();
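Seeding out_graphs with in_graph changes the contract of NMutator::run: the unmodified input graph is always candidate 0 and real mutants follow it, which is exactly why the assertions in test_memboundOp further down move from 1u/mutations[0] to 2u/mutations[1]. From a caller's point of view:

    // Illustrative caller; mirrors the updated tests below.
    NMutator nmutator(NMutator::Mode::ToNaiveMembound);
    auto mutations = nmutator.run(g);
    // mutations[0] is g itself (unchanged); mutations[1] is the MemBound rewrite.
    Graph gNew = mutations[1];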
@@ -51,15 +45,25 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
     OpVec computeOps = in_graph->getComputeOps();
     assert(computeOps.size() == 1);
     const auto &computeOp = computeOps[0];
-    auto g = infini::make_ref<GraphObj>(CpuRuntimeObj::getInstance());
+    auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
     auto expr = opToExpression(computeOp);
     auto inputsN = nnet::GetTensorsVisitor().get(expr);
-    dbg(inputsN);
-    dbg(expr);
-    // FIXME: tensors should be copied?
-    g->addOpWithOutputs<MemBoundObj>(
-        computeOp->getInputs(), computeOp->getOutputs(),
-        vector<nnet::Tensor>{inputsN.at("A"), inputsN.at("B")}, expr, 0.);
+    dbg(inputsN, expr);
+    IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
+              "Which one is the second input tensor?");
+    vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};
+    if (inputsN.count("B"))
+        inputsVectorN.emplace_back(inputsN["B"]);
+    else
+        inputsVectorN.emplace_back(inputsN["K"]);
+    // clone IT inputs and outputs into the new graph
+    TensorVec inputsT, outputsT;
+    for (auto t : computeOp->getInputs())
+        inputsT.emplace_back(g->cloneTensor(t));
+    for (auto t : computeOp->getOutputs())
+        outputsT.emplace_back(g->cloneTensor(t));
+    g->addOpWithOutputs<MemBoundObj>(inputsT, outputsT, inputsVectorN, expr,
+                                     0.);
     g->print();
     out_graphs.emplace_back(g);
 }
@@ -226,62 +230,62 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {

 nnet::Expr NMutator::opToExpression(Operator op) {
     // IT_TODO_HALT();
-    // if (auto convOp = dynamic_cast<ConvOp *>(op)) {
-    //     const auto &inputs = convOp->getInputs();
-    //     const auto &AT = inputs[0];
-    //     const auto &KT = inputs[1];
-    //     const auto &[n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, g, bi, ac] =
-    //         convOp->getArgs(0);
-    //     dbg(n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw);
-    //     if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
-    //         return nullptr;
-    //     assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
-    //     inputsNameNToTensorT["A"] = AT;
-    //     inputsNameNToTensorT["K"] = KT;
-    //     const auto A = nnet::makeTensor("A", AT->getDims(),
-    //                                     std::vector<int>{0, 0, ph, pw});
-    //     const auto K = nnet::makeTensor("K", KT->getDims());
-    //     return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
+    if (auto convOp = as<ConvObj>(op)) {
+        const auto &inputs = convOp->getInputs();
+        const auto &AT = inputs[0];
+        const auto &KT = inputs[1];
+        const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
+        const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
+        if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
+            return nullptr;
+        assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
+        inputsNameNToTensorT["A"] = AT;
+        inputsNameNToTensorT["K"] = KT;
+        const auto A = nnet::makeTensor("A", AT->getDims(),
+                                        std::vector<int>{0, 0, ph, pw});
+        const auto K = nnet::makeTensor("K", KT->getDims());
+        return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
     // } else if (auto convOp = dynamic_cast<ConvTransOp *>(op)) {
     //     const auto &AT = convOp->getInputs()[0];
     //     const auto &KT = convOp->getInputs()[1];
     //     inputsNameNToTensorT["A"] = AT;
     //     inputsNameNToTensorT["K"] = KT;
     //     const auto &[n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, g, bi, ac] =
     //         convOp->getArgs(0);
     //     if (r != 4) {
     //         dbg("ConvTranspose R!=4. Skipped.", r);
     //         return nullptr;
     //     }
     //     int padding = 1 * (r - 1) - 1;
     //     const auto A = nnet::makeTensor(
     //         "A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
     //     const auto K = nnet::makeTensor("K", KT->getDims());
     //     return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
     // } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
     //     const auto &AT = g2bmmOp->getInputs()[0];
     //     const auto &BT = g2bmmOp->getInputs()[1];
     //     const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();
     //     const auto &[expr, inputsN] =
     //         nnet::Sg2bmmPattern::getExpr(b, m, k, width, dilation);
     //     inputsNameNToTensorT[inputsN.first->getName()] = AT;
     //     inputsNameNToTensorT[inputsN.second->getName()] = BT;
     //     return expr;
     // } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
     //     const auto &AT = gbmmlOp->getInputs()[0];
     //     const auto &BT = gbmmlOp->getInputs()[1];
     //     const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();
     //     const auto &[expr, inputsN] =
     //         nnet::LongformerGBMMPattern::getExpr(b, m, w, k, dilation);
     //     inputsNameNToTensorT[inputsN.first->getName()] = AT;
     //     inputsNameNToTensorT[inputsN.second->getName()] = BT;
     //     dbg(b, m, w, k, dilation, expr);
     //     return expr;
-    // } else
-    if (auto matmulOp = as<MatmulObj>(op)) {
+    } else if (auto matmulOp = as<MatmulObj>(op)) {
         const auto &AT = matmulOp->getInputs()[0];
         const auto &BT = matmulOp->getInputs()[1];
         const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
@@ -2,6 +2,7 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "operators/matmul.h"
+#include "operators/unary.h"
 #include "test.h"

 namespace infini {
@@ -57,4 +58,55 @@ TEST(Graph, perf_engine) {
     EXPECT_TRUE(matmul->getOutput()->equalData(ans));
 }

+TEST(Graph, test_tensor_id) {
+    Runtime runtime = CpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
+    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
+    Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
+    g->dataMalloc();
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    auto i1 = g->addTensor(i0->clone());
+    auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
+    g->print();
+    EXPECT_NE(i0->getGuid(), i1->getGuid());
+    EXPECT_EQ(i0->getFuid(), i1->getFuid());
+    EXPECT_NE(i0->getDataBlob(), nullptr);
+    EXPECT_EQ(i1->getDataBlob(), nullptr);
+}
+
+TEST(Graph, test_OpVec_ctor) {
+    Runtime runtime = CpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
+    Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
+    Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
+    g->dataMalloc();
+    i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    auto o1 = g->addTensor(o0->clone());
+    auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
+    g->addOp<ReluObj>(o1, nullptr);
+    g->print();
+    puts("=========");
+    OpVec ops = g->getOperators();
+    Graph g2 = make_ref<GraphObj>(runtime, ops);
+    g2->print();
+    // Check if the two tensors with the same FUID (o0, o1) remain only one in g2
+    EXPECT_EQ(g2->getTensors().size(), 4u);
+    EXPECT_EQ(g2->getOperators().size(), 2u);
+    map<pair<int, int>, int> inputOutput2Cnt = {
+        {{1, 0}, 2}, {{1, 1}, 1}, {{0, 1}, 1}};
+    for (auto t : g2->getTensors()) {
+        pair<int, int> key = {t->getInputOf().size(),
+                              t->getOutputOf() != nullptr};
+        EXPECT_GE(inputOutput2Cnt[key], 0);
+        inputOutput2Cnt[key]--;
+    }
+    for (auto [u, v] : inputOutput2Cnt) {
+        EXPECT_EQ(v, 0);
+    }
+}
+
 } // namespace infini
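test_OpVec_ctor depends on the GraphObj(Runtime, OpVec) constructor merging tensors that share a FUID (o0 and its clone o1) into a single node, which is how g2 ends up with 4 tensors and 2 operators instead of 5 and 2. A sketch of the deduplication idea, assuming the constructor walks every op's inputs and outputs (illustrative, not the actual implementation):

    // Sketch: FUID-keyed canonicalization while importing tensors from ops.
    std::unordered_map<UidBaseType, Tensor> fuidToTensor;
    auto canonical = [&](const Tensor &t) {
        auto [it, inserted] = fuidToTensor.try_emplace(t->getFuid(), t);
        return it->second; // clones sharing a FUID collapse to one tensor
    };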
@@ -0,0 +1,86 @@
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/matmul.h"
#include "operators/unary.h"
#include "test.h"

namespace infini {

// TEST(Graph, search) {
//     Runtime runtime = CpuRuntimeObj::getInstance();
//     Graph g = make_ref<GraphObj>(runtime);
//     Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
//     Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
//     Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
//     g->dataMalloc();
//     i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
//     w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
//     auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
//     g->print();
//     // check inputOf and outputsOf for tensor
//     SearchEngine searchEngine(runtime, make_ref<NMutator>());
//     searchEngine.run(g);
//     // check execution results
// }

TEST(Graph, search_withdm) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1, 3, 224, 224});
    Tensor w0 = g->addTensor({3, 3, 3, 3});
    Tensor t1 = g->addTensor({1, 3, 224, 224});
    Tensor t2 = g->addTensor({1, 3, 224, 224});
    Tensor t3 = g->addTensor({1, 3, 224, 224});
    Tensor w3 = g->addTensor({3, 3, 3, 3});
    Tensor t4 = g->addTensor({1, 3, 224, 224});
    Tensor t5 = g->addTensor({1, 3, 224, 224});
    Tensor t6 = g->addTensor({1, 3, 224, 224});
    auto conv0 = g->addOpWithOutputs<ConvObj>(t0, w0, t1, 1, 1);
    auto add0 = g->addOpWithOutputs<AddObj>(t1, t2, t3);
    auto conv1 = g->addOpWithOutputs<ConvObj>(t3, w3, t4, 1, 1);
    auto add1 = g->addOpWithOutputs<AddObj>(t4, t5, t6);
    g->dataMalloc();
    // check inputOf and outputsOf for tensor
    SearchEngine searchEngine(runtime, make_ref<DummyMutator>(10));
    searchEngine.run(g);
    // check execution results
}
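The trailing `// check execution results` is still a TODO; a plausible continuation, using only APIs that appear elsewhere in this commit (hypothetical, not part of the test):

    // Capture the best graph, materialize its buffers, and execute it.
    auto bestGraph = searchEngine.run(g);
    bestGraph->dataMalloc();
    runtime->run(bestGraph);
    bestGraph->print();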
// TEST(DummyMutator, run) {
//     Runtime runtime = CpuRuntimeObj::getInstance();
//     Graph g = make_ref<GraphObj>(runtime);
//     Tensor i0 = g->addTensor({1, 3, 224, 224});
//     Tensor w0 = g->addTensor({2, 3, 3, 3});
//     auto matmul = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1);
//     DummyMutator m(10);
//     auto mutations = m.run(g);
//     g->print();
//     for (auto gg : mutations) {
//         gg->print();
//     }
// }

// TEST(DummyMutator, fuse) {
//     Runtime runtime = CpuRuntimeObj::getInstance();
//     Graph g = make_ref<GraphObj>(runtime);
//     Tensor i0 = g->addTensor({1, 2, 3});
//     Tensor w0 = g->addTensor({1, 3, 4});
//     Tensor i1 = g->addTensor({1, 2, 3});
//     Tensor w1 = g->addTensor({1, 3, 4});
//     auto matmul0 = g->addOp<MatmulObj>(i0, w0, nullptr);
//     auto matmul1 = g->addOp<MatmulObj>(i1, w1, nullptr);
//     DummyMutator m(10);
//     auto mutations = m.mergeMultiBranch(g);
//     g->print();
//     for (auto gg : mutations) {
//         gg->print();
//     }
// }

} // namespace infini
@@ -1,407 +0,0 @@
#include "code_engine.h"
#include "nnet/nmutator.h"
#include "operator.h"
#include "search_engine.h"
#include "tensor.h"
#include "gtest/gtest.h"
using namespace std;
using namespace infini;

// TEST(Mutator, Conv9x9) {
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, 1, 224, 224});

//     auto w1 = g->tensor({64, 1, 9, 9});

//     g->conv(i0, w1, 4, 4);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, TConv_1) {
//     auto g = new tpm::Graph();

//     auto i0 = g->tensor({1, 1, 1, 228});
//     auto w1 = g->tensor({228, 2, 2, 448});

//     // g->conv(i0, w1, 4, 4);
//     g->convTrans(i0, w1, 0, 0, 1, 1);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, TConv_3) {
//     auto g = new tpm::Graph();

//     auto i0 = g->tensor({1, 2, 2, 448});
//     auto w1 = g->tensor({448, 4, 4, 256});

//     // g->conv(i0, w1, 4, 4);
//     g->convTrans(i0, w1, 1, 1, 2, 2, 1, 1);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// FIXME: failed since implicit transpose for DLT
TEST(Mutator, DISABLED_InfoGAN_TConv_3_correctness) {
    // verifyNaiveMembound True: subgraph after transformation
    // verifyNaiveMembound False: subgraph of one single membound (eOP)
    const bool verifyNaiveMembound = false;
    auto g = new tpm::Graph();

    // {n, h, w, f} * {r, s, f, c}
    // {n, f, h, w} * {f, c, r, s}
    auto i0 = g->tensor({1, 448, 2, 2});
    auto w1 = g->tensor({448, 256, 4, 4});

    g->convTrans(i0, w1, 1, 1, 2, 2, 1, 1);

    g->updateConnection();

    printf("--- Init Finished ---\n");

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    vector<tpm::SubGraph *> outGraphs;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    graph->print();
    printf("--- Graph Finished ---\n");

    auto mutationEngine = make_shared<tpm::NMutator>();
    if (verifyNaiveMembound)
        mutationEngine->setToNaiveMembound();
    tpm::SearchEngine searchEngine(mutationEngine);

    printf("--- SearchEngine Finished ---\n");

    tpm::NMutator mutator;
    if (verifyNaiveMembound)
        mutator.setToNaiveMembound();
    mutator.run(graph.get(), outGraphs);

    printf("--- Mutator Finished ---\n");

    bestGraph = shared_ptr<tpm::SubGraph>(outGraphs.back());
    bestGraph->print();

    printf("--- BestGraph Finished ---\n");

    EXPECT_TRUE(graph->verification(bestGraph.get(), true));

    // // Codegen (independent from the above)
    // searchEngine.run(graph, bestGraph);
    // tpm::CodeEngine codeEngine;
    // auto perfEngine = searchEngine.exportPerfEngine();
    // codeEngine.importPerfEngine(perfEngine);
    // codeEngine.genCode(bestGraph, "res.cu");
}

// TEST(Mutator, G2BMM) {
//     auto g = new tpm::Graph();

//     int nHeads = 8, seq_len = 10000, feat_len = 64, w = 1000, d = 4;
//     auto i0 = g->tensor({nHeads, seq_len, feat_len});
//     auto i1 = g->tensor({nHeads, seq_len, feat_len});

//     g->g2bmm(i0, i1, w, d);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(graph, "res.cu");
//     // codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, GBMML) {
//     auto g = new tpm::Graph();

//     int nHeads = 8, seq_len = 10000, feat_len = 64, w = 1000, d = 4;
//     auto i0 = g->tensor({nHeads, seq_len, 2 * w + 1});
//     auto i1 = g->tensor({nHeads, seq_len, feat_len});

//     g->gbmml(i0, i1, d);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(graph, "res.cu");
//     // codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, Conv5x5) {
//     //
//     conv7x7->relu->conv3x3->relu->conv3x3->relu->conv3x3->relu->conv3x3->relu
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, 32, 224, 224});

//     auto w1 = g->tensor({1, 32, 5, 5});

//     g->conv(i0, w1, tpm::ConvOp::PaddingMode::Same);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, BMM) {
//     const int m = 16, n = 1024, k = 1024;
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, m, k});
//     auto w0 = g->tensor({1, k, n});
//     auto w1 = g->tensor({1, k, n});
//     auto w2 = g->tensor({1, k, n});

//     g->matmul(i0, w0);
//     g->matmul(i0, w1);
//     g->matmul(i0, w2);
//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

TEST(Mutator, Conv2gemm1x1_bs1_mutator) {
    const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    vector<tpm::SubGraph *> out_graphs;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    const vector<int> rules = {3, 2, 2, 8, 8, 6, 6};
    auto mutator = make_shared<tpm::NMutator>(rules);
    mutator->run(graph.get(), out_graphs);
    tpm::SearchEngine searchEngine(mutator);
    int maxNReshapes = 0;
    for (const auto &graph : out_graphs) {
        searchEngine.getPerf(make_shared<tpm::SubGraph>(*graph), true);
        int nReshapes = 0, nTrans = 0;
        for (auto op : graph->getOperators()) {
            nReshapes += op->isReshapeOp();
            if (auto matmul = dynamic_cast<MatmulOp *>(op))
                nTrans = matmul->getTransA() + matmul->getTransB();
        }
        maxNReshapes = max(maxNReshapes, nReshapes);
        // Number of Reshapes for KxA and AxK
        EXPECT_TRUE((nReshapes == 3 - nTrans) || (nReshapes == nTrans));
    }
    // Matmul K^N A^N -> no Membound
    EXPECT_EQ(maxNReshapes, 3);
}

TEST(Mutator, Conv2gemm1x1_searchEngine_ruleBased) {
    const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    const vector<int> rules = {3, 2, 2, 8, 8, 6, 6};
    tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
    searchEngine.run(graph, bestGraph);

    // clang-format off
    // ========== PET graph getPerf ============
    // Reshape(in=0,out=126)
    // op_time 0.000000
    // Reshape(in=1,out=125)
    // op_time 0.000000
    // Matmul([A,B,act=0],A=125,B=126,C=124, TTbmnk: 0, 0, 1, 512, 49, 512)
    // op_time 0.013799
    // Reshape(in=124,out=3)
    // op_time 0.000000
    // Op         Cnt   T_tot  Percent  T_mean
    // Matmul       1   0.014    100.0   0.014
    // Reshape      3   0.000      0.0   0.000
    // Origin Perf: 0.0553319
    // Best Perf without correction: 0.0137989
    // Best Perf with correction: 0.0137989
    // clang-format on
    EXPECT_EQ(bestGraph->getOperators().size(), 4u);
    auto cntOps = bestGraph->countOps();
    EXPECT_EQ(cntOps["Matmul"], 1);
    EXPECT_EQ(cntOps["Reshape"], 3);
    bestGraph->print();
}

TEST(Mutator, Conv2gemm1x1_searchEngine_search) {
    const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>());
    searchEngine.run(graph, bestGraph);

    EXPECT_EQ(bestGraph->getOperators().size(), 4u);
    auto cntOps = bestGraph->countOps();
    EXPECT_EQ(cntOps["Matmul"], 1);
    EXPECT_EQ(cntOps["Reshape"], 3);
    bestGraph->print();
}

TEST(Mutator, Conv2gemm1x7_searchEngine_ruleBased) {
    const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 1,
              S = 7; // gcn_Conv_137
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
    tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
    searchEngine.run(graph, bestGraph);

    // clang-format off
    // ========== PET graph getPerf ============
    // Reshape(in=0,out=309)
    // op_time 0.000000
    // MemBound[124644277](i0=1, o0=308, exec_time=0.0683594, NNet Inputs=[K,])
    // L<c:0:2048><i52:0:896>Sum ... [i52,c]
    // {L<i52:0:896><c:0:2048>Sum ... [(i52 / 7),c,((i52 / 7) % 1),(i52 % 7)]
    // {K}}
    // op_time 0.000000
    // Matmul([A^T,B,act=0],A=308,B=309,C=307, TTbmnk: 1, 0, 1, 896, 49, 2048)
    // op_time 0.024471
    // MemBound[124644277](i0=307, o0=3, exec_time=0.001, NNet Inputs=[T49,])
    // L<n:0:1><f:0:128><h:0:7><w:0:7>Sum<r:0:1><s:0:7> ... [(h + r),r,(w + s),s,n,f]
    // {L<i45:0:7><i46:0:1><i26:3:10><i27:0:7><n:0:1><f:0:128><pad=0,0,3,0,0,0,>Sum ... [(((7 * f) + (7 * i46)) + i27),(((49 * n) + (7 * i45)) + (i26 + -3))]
    // {T49}}
    // op_time 0.001000
    // Op         Cnt   T_tot  Percent  T_mean
    // Matmul       1   0.024     96.1   0.024
    // Reshape      1   0.000      0.0   0.000
    // MemBound     2   0.001      3.9   0.001
    // Origin Perf: 0.405595
    // Best Perf without correction: 0.0254715
    // Best Perf with correction: 0.0254715
    // Transpose perf: 0
    // clang-format on
    EXPECT_EQ(bestGraph->getOperators().size(), 4u);
    auto cntOps = bestGraph->countOps();
    EXPECT_EQ(cntOps["Matmul"], 1);
    EXPECT_EQ(cntOps["Reshape"], 1);
    EXPECT_EQ(cntOps["MemBound"], 2);
    bestGraph->print();
    EXPECT_TRUE(graph->verification(bestGraph.get(), true));
}

TEST(Mutator, Conv2gemm7x1_searchEngine_ruleBased) {
    const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 7,
              S = 1; // gcn_Conv_137
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
    tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
    searchEngine.run(graph, bestGraph);

    EXPECT_EQ(bestGraph->getOperators().size(), 4u);
    auto cntOps = bestGraph->countOps();
    EXPECT_EQ(cntOps["Matmul"], 1);
    EXPECT_EQ(cntOps["Reshape"], 1);
    EXPECT_EQ(cntOps["MemBound"], 2);
    bestGraph->print();
    EXPECT_TRUE(graph->verification(bestGraph.get(), true));
}

TEST(Mutator, Conv2gemm7x1_searchEngine_search) {
    const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 7,
              S = 1; // gcn_Conv_137
    auto g = new tpm::Graph();
    auto i0 = g->tensor({N, C, H, W});
    auto w1 = g->tensor({F, C, R, S});
    g->conv(i0, w1, R / 2, S / 2);

    g->updateConnection();

    std::shared_ptr<tpm::SubGraph> graph, bestGraph;
    graph = std::make_shared<tpm::SubGraph>(g->getOperators());
    // const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
    tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>());
    searchEngine.run(graph, bestGraph);

    EXPECT_EQ(bestGraph->getOperators().size(), 4u);
    auto cntOps = bestGraph->countOps();
    EXPECT_EQ(cntOps["Matmul"], 1);
    EXPECT_EQ(cntOps["Reshape"], 1);
    EXPECT_EQ(cntOps["MemBound"], 2);
    bestGraph->print();
    EXPECT_TRUE(graph->verification(bestGraph.get(), true));
}
@@ -23,8 +23,8 @@ TEST(nnet, MemboundOpInterpretation) {
     g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
     NMutator nmutator(NMutator::Mode::ToNaiveMembound);
     auto mutations = nmutator.run(g);
-    ASSERT_EQ(mutations.size(), 1u);
-    Graph gNew = mutations[0];
+    ASSERT_EQ(mutations.size(), 2u);
+    Graph gNew = mutations[1];
     gNew->print();

     gNew->dataMalloc();
@@ -54,8 +54,8 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
     g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
     NMutator nmutator(NMutator::Mode::ToNaiveMembound);
     auto mutations = nmutator.run(g);
-    ASSERT_EQ(mutations.size(), 1u);
-    Graph gNew = mutations[0];
+    ASSERT_EQ(mutations.size(), 2u);
+    Graph gNew = mutations[1];
     gNew->print();
     gNew->dataMalloc();
     runtime->run(gNew, true); // tune kernels
@@ -0,0 +1,421 @@
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "test.h"

namespace infini {

// TEST(Mutator, Conv9x9) {
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, 1, 224, 224});

//     auto w1 = g->tensor({64, 1, 9, 9});

//     g->conv(i0, w1, 4, 4);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, TConv_1) {
//     auto g = new tpm::Graph();

//     auto i0 = g->tensor({1, 1, 1, 228});
//     auto w1 = g->tensor({228, 2, 2, 448});

//     // g->conv(i0, w1, 4, 4);
//     g->convTrans(i0, w1, 0, 0, 1, 1);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, TConv_3) {
//     auto g = new tpm::Graph();

//     auto i0 = g->tensor({1, 2, 2, 448});
//     auto w1 = g->tensor({448, 4, 4, 256});

//     // g->conv(i0, w1, 4, 4);
//     g->convTrans(i0, w1, 1, 1, 2, 2, 1, 1);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// // FIXME: failed since implicit transpose for DLT
// TEST(Mutator, InfoGAN_TConv_3_correctness) {
//     // verifyNaiveMembound True: subgraph after transformation
//     // verifyNaiveMembound False: subgraph of one single membound (eOP)
//     const bool verifyNaiveMembound = false;
//     auto g = new tpm::Graph();
//     // {n, h, w, f} * {r, s, f, c}
//     // {n, f, h, w} * {f, c, r, s}
//     auto i0 = g->tensor({1, 448, 2, 2});
//     auto w1 = g->tensor({448, 256, 4, 4});
//     g->convTrans(i0, w1, 1, 1, 2, 2, 1, 1);
// }

TEST(Mutator, NaiveConvWithInterpreter) {
    // verifyNaiveMembound True: subgraph after transformation
    // verifyNaiveMembound False: subgraph of one single membound (eOP)
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    // const bool verifyNaiveMembound = false;

    auto i0 = g->addTensor({1, 3, 32, 32}, DataType::UInt32);
    auto w1 = g->addTensor({2, 3, 3, 3}, DataType::UInt32);
    g->addOp<ConvObj>(i0, w1, nullptr, 1, 1);
    printf("--- Init Finished ---\n");

    auto mutator = make_ref<NMutator>();
    mutator->setToNaiveMembound();
    SearchEngine searchEngine(runtime, mutator);
    // g->dataMalloc();
    auto bestGraph = searchEngine.run(g);
    bestGraph->print();
    printf("--- SearchEngine Finished ---\n");

    auto mutatedGraphs = mutator->run(g);
    IT_ASSERT(mutatedGraphs.size() == 2);
    printf("--- Mutator Finished ---\n");

    auto gg = mutatedGraphs[1];
    g->dataMalloc();
    gg->dataMalloc();
    for (auto t : g->getTensors()) {
        if (t->getFuid() <= 2)
            t->setData(IncrementalGenerator());
    }
    for (auto t : gg->getTensors()) {
        if (t->getFuid() <= 2)
            t->setData(IncrementalGenerator());
    }
    runtime->run(g);
    runtime->run(gg);
    gg->print();

    EXPECT_TRUE(g->getOutputs()[0]->equalData(gg->getOutputs()[0]));
    EXPECT_TRUE(g->getOutputs()[0]->getRawDataPtr<void *>() !=
                gg->getOutputs()[0]->getRawDataPtr<void *>());
}
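The block above is the correctness idiom this commit introduces: give both graphs identical inputs keyed by FUID, execute each, then compare outputs while also checking that the two graphs do not alias storage. Folded into a helper, purely as a sketch (FUIDs 1 and 2 are the conv's input and weight in this particular test):

    // Hypothetical helper capturing the comparison idiom used above.
    bool sameResult(Runtime runtime, Graph a, Graph b) {
        a->dataMalloc();
        b->dataMalloc();
        for (auto graph : {a, b})
            for (auto t : graph->getTensors())
                if (t->getFuid() <= 2)
                    t->setData(IncrementalGenerator());
        runtime->run(a);
        runtime->run(b);
        return a->getOutputs()[0]->equalData(b->getOutputs()[0]);
    }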
// TEST(Mutator, G2BMM) {
//     auto g = new tpm::Graph();

//     int nHeads = 8, seq_len = 10000, feat_len = 64, w = 1000, d = 4;
//     auto i0 = g->tensor({nHeads, seq_len, feat_len});
//     auto i1 = g->tensor({nHeads, seq_len, feat_len});

//     g->g2bmm(i0, i1, w, d);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(graph, "res.cu");
//     // codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, GBMML) {
//     auto g = new tpm::Graph();

//     int nHeads = 8, seq_len = 10000, feat_len = 64, w = 1000, d = 4;
//     auto i0 = g->tensor({nHeads, seq_len, 2 * w + 1});
//     auto i1 = g->tensor({nHeads, seq_len, feat_len});

//     g->gbmml(i0, i1, d);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(graph, "res.cu");
//     // codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, Conv5x5) {
//     //
//     conv7x7->relu->conv3x3->relu->conv3x3->relu->conv3x3->relu->conv3x3->relu
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, 32, 224, 224});

//     auto w1 = g->tensor({1, 32, 5, 5});

//     g->conv(i0, w1, tpm::ConvOp::PaddingMode::Same);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, BMM) {
//     const int m = 16, n = 1024, k = 1024;
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({1, m, k});
//     auto w0 = g->tensor({1, k, n});
//     auto w1 = g->tensor({1, k, n});
//     auto w2 = g->tensor({1, k, n});

//     g->matmul(i0, w0);
//     g->matmul(i0, w1);
//     g->matmul(i0, w2);
//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine<tpm::NMutator> searchEngine;
//     searchEngine.run(graph, bestGraph);
//     tpm::CodeEngine codeEngine;
//     auto perfEngine = searchEngine.exportPerfEngine();
//     codeEngine.importPerfEngine(perfEngine);
//     codeEngine.genCode(bestGraph, "res.cu");
// }

// TEST(Mutator, Conv2gemm1x1_bs1_mutator) {
//     const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     vector<tpm::SubGraph *> out_graphs;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     const vector<int> rules = {3, 2, 2, 8, 8, 6, 6};
//     auto mutator = make_shared<tpm::NMutator>(rules);
//     mutator->run(graph.get(), out_graphs);
//     tpm::SearchEngine searchEngine(mutator);
//     int maxNReshapes = 0;
//     for (const auto &graph : out_graphs) {
//         searchEngine.getPerf(make_shared<tpm::SubGraph>(*graph), true);
//         int nReshapes = 0, nTrans = 0;
//         for (auto op : graph->getOperators()) {
//             nReshapes += op->isReshapeOp();
//             if (auto matmul = dynamic_cast<MatmulOp *>(op))
//                 nTrans = matmul->getTransA() + matmul->getTransB();
//         }
//         maxNReshapes = max(maxNReshapes, nReshapes);
//         // Number of Reshapes for KxA and AxK
//         EXPECT_TRUE((nReshapes == 3 - nTrans) || (nReshapes == nTrans));
//     }
//     // Matmul K^N A^N -> no Membound
//     EXPECT_EQ(maxNReshapes, 3);
// }

// TEST(Mutator, Conv2gemm1x1_searchEngine_ruleBased) {
//     const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     const vector<int> rules = {3, 2, 2, 8, 8, 6, 6};
//     tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
//     searchEngine.run(graph, bestGraph);

//     // clang-format off
//     // ========== PET graph getPerf ============
//     // Reshape(in=0,out=126)
//     // op_time 0.000000
//     // Reshape(in=1,out=125)
//     // op_time 0.000000
//     // Matmul([A,B,act=0],A=125,B=126,C=124, TTbmnk: 0, 0, 1, 512, 49, 512)
//     // op_time 0.013799
//     // Reshape(in=124,out=3)
//     // op_time 0.000000
//     // Op         Cnt   T_tot  Percent  T_mean
//     // Matmul       1   0.014    100.0   0.014
//     // Reshape      3   0.000      0.0   0.000
//     // Origin Perf: 0.0553319
//     // Best Perf without correction: 0.0137989
//     // Best Perf with correction: 0.0137989
//     // clang-format on
//     EXPECT_EQ(bestGraph->getOperators().size(), 4u);
//     auto cntOps = bestGraph->countOps();
//     EXPECT_EQ(cntOps["Matmul"], 1);
//     EXPECT_EQ(cntOps["Reshape"], 3);
//     bestGraph->print();
// }

// TEST(Mutator, Conv2gemm1x1_searchEngine_search) {
//     const int N = 1, H = 7, W = 7, C = 512, F = 512, R = 1, S = 1;
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>());
//     searchEngine.run(graph, bestGraph);

//     EXPECT_EQ(bestGraph->getOperators().size(), 4u);
//     auto cntOps = bestGraph->countOps();
//     EXPECT_EQ(cntOps["Matmul"], 1);
//     EXPECT_EQ(cntOps["Reshape"], 3);
//     bestGraph->print();
// }

// TEST(Mutator, Conv2gemm1x7_searchEngine_ruleBased) {
//     const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 1,
//               S = 7; // gcn_Conv_137
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
//     tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
//     searchEngine.run(graph, bestGraph);

//     // clang-format off
//     // ========== PET graph getPerf ============
//     // Reshape(in=0,out=309)
//     // op_time 0.000000
//     // MemBound[124644277](i0=1, o0=308, exec_time=0.0683594, NNet Inputs=[K,])
//     // L<c:0:2048><i52:0:896>Sum ... [i52,c]
//     // {L<i52:0:896><c:0:2048>Sum ... [(i52 / 7),c,((i52 / 7) % 1),(i52 % 7)]
//     // {K}}
//     // op_time 0.000000
//     // Matmul([A^T,B,act=0],A=308,B=309,C=307, TTbmnk: 1, 0, 1, 896, 49, 2048)
//     // op_time 0.024471
//     // MemBound[124644277](i0=307, o0=3, exec_time=0.001, NNet Inputs=[T49,])
//     // L<n:0:1><f:0:128><h:0:7><w:0:7>Sum<r:0:1><s:0:7> ... [(h + r),r,(w + s),s,n,f]
//     // {L<i45:0:7><i46:0:1><i26:3:10><i27:0:7><n:0:1><f:0:128><pad=0,0,3,0,0,0,>Sum ... [(((7 * f) + (7 * i46)) + i27),(((49 * n) + (7 * i45)) + (i26 + -3))]
//     // {T49}}
//     // op_time 0.001000
//     // Op         Cnt   T_tot  Percent  T_mean
//     // Matmul       1   0.024     96.1   0.024
//     // Reshape      1   0.000      0.0   0.000
//     // MemBound     2   0.001      3.9   0.001
//     // Origin Perf: 0.405595
//     // Best Perf without correction: 0.0254715
//     // Best Perf with correction: 0.0254715
//     // Transpose perf: 0
//     // clang-format on
//     EXPECT_EQ(bestGraph->getOperators().size(), 4u);
//     auto cntOps = bestGraph->countOps();
//     EXPECT_EQ(cntOps["Matmul"], 1);
//     EXPECT_EQ(cntOps["Reshape"], 1);
//     EXPECT_EQ(cntOps["MemBound"], 2);
//     bestGraph->print();
//     EXPECT_TRUE(graph->verification(bestGraph.get(), true));
// }

// TEST(Mutator, Conv2gemm7x1_searchEngine_ruleBased) {
//     const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 7,
//               S = 1; // gcn_Conv_137
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
//     tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>(rules));
//     searchEngine.run(graph, bestGraph);

//     EXPECT_EQ(bestGraph->getOperators().size(), 4u);
//     auto cntOps = bestGraph->countOps();
//     EXPECT_EQ(cntOps["Matmul"], 1);
//     EXPECT_EQ(cntOps["Reshape"], 1);
//     EXPECT_EQ(cntOps["MemBound"], 2);
//     bestGraph->print();
//     EXPECT_TRUE(graph->verification(bestGraph.get(), true));
// }

// TEST(Mutator, Conv2gemm7x1_searchEngine_search) {
//     const int N = 1, C = 2048, H = 7, W = 7, F = 128, R = 7,
//               S = 1; // gcn_Conv_137
//     auto g = new tpm::Graph();
//     auto i0 = g->tensor({N, C, H, W});
//     auto w1 = g->tensor({F, C, R, S});
//     g->conv(i0, w1, R / 2, S / 2);

//     g->updateConnection();

//     std::shared_ptr<tpm::SubGraph> graph, bestGraph;
//     graph = std::make_shared<tpm::SubGraph>(g->getOperators());
//     // const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
//     tpm::SearchEngine searchEngine(make_shared<tpm::NMutator>());
//     searchEngine.run(graph, bestGraph);

//     EXPECT_EQ(bestGraph->getOperators().size(), 4u);
//     auto cntOps = bestGraph->countOps();
//     EXPECT_EQ(cntOps["Matmul"], 1);
//     EXPECT_EQ(cntOps["Reshape"], 1);
//     EXPECT_EQ(cntOps["MemBound"], 2);
//     bestGraph->print();
//     EXPECT_TRUE(graph->verification(bestGraph.get(), true));
// }

} // namespace infini