InfiniTensor/include/core/runtime.h

#pragma once
#include "core/common.h"
#include "core/communicator.h"
#include "core/op_type.h"
#include "core/ref.h"
#include <memory>
namespace infini {
/***************** Forward declaration begin *****************/
class TensorBaseObj;
class TensorObj;
class OperatorObj;
class GraphObj;
class GraphHandlerObj;
class RuntimeObj;
class BlobObj;
template <typename T> class WorkspaceObj;
using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
using Operator = Ref<OperatorObj>;
using Graph = Ref<GraphObj>;
using GraphHandler = Ref<GraphHandlerObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;
using OpLists = list<Operator>;
using VType = uint32_t;
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
/***************** Forward declaration end *****************/
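
/**
 * Usage sketch for the handles above. Ref<T> (from core/ref.h) behaves like
 * a shared pointer and make_ref like std::make_shared, as seen in
 * NativeCpuRuntimeObj::getInstance() below; the dispatch shown here is
 * purely illustrative.
 * @code
 * Runtime runtime = NativeCpuRuntimeObj::getInstance();
 * if (runtime->isCpu()) {
 *     // take a CPU kernel path
 * }
 * @endcode
 */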
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
  protected:
    Device device;
    int deviceId;

  public:
    explicit RuntimeObj(Device device, int deviceId = 0)
        : device(device), deviceId(deviceId) {}
    RuntimeObj(RuntimeObj &other) = delete;
    RuntimeObj &operator=(RuntimeObj const &) = delete;
    virtual ~RuntimeObj() {}
    /**
     * @brief Execute a graph.
     *
     * @param graph The graph to execute.
     * @param tune Whether to tune an operator when it has no performance
     * record yet. Tuning could be split into an independent method.
     * @param profiling Whether to print a per-operator time breakdown.
     */
    virtual void run(const Graph &graph, bool tune = false,
                     bool profiling = false) const = 0;
    virtual void *alloc(size_t size) = 0;
    virtual void dealloc(void *ptr) = 0;
    /**
     * @brief Sum the recorded execution time of each operator in the graph
     * from its performance records. No actual execution happens.
     *
     * @param graph The graph to estimate.
     * @param profiling Whether to print a per-operator time breakdown.
     * @return double The sum of the recorded time of all operators.
     */
    double getPerfTime(const Graph &graph, bool profiling = false) const;
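    /**
     * Sketch combining run() and getPerfTime(); building the Graph itself
     * is outside this header and omitted:
     * @code
     * runtime->run(graph, true);   // tune = true: tune ops without records
     * double t = runtime->getPerfTime(graph, true); // estimate, no execution
     * @endcode
     */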
    Blob allocBlob(size_t size);
    bool isCpu() const {
        return device == Device::CPU || device == Device::INTELCPU;
    }
    bool isCuda() const { return device == Device::CUDA; }
    bool isBang() const { return device == Device::BANG; }
    bool isKUNLUN() const { return device == Device::KUNLUN; }
    void copyBlob(const TensorObj *dst, const TensorObj *src) const;
    // TODO: unify these copy APIs
    virtual void copyBlobFromCPU(void *dst, const void *src,
                                 size_t bytes) const = 0;
    virtual void copyBlobToCPU(void *dst, const void *src,
                               size_t bytes) const = 0;
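    /**
     * Sketch of the allocation and copy APIs above; the buffer name and
     * element count are made up for illustration:
     * @code
     * std::vector<float> host(n);
     * void *dev = runtime->alloc(n * sizeof(float));
     * runtime->copyBlobFromCPU(dev, host.data(), n * sizeof(float));
     * runtime->copyBlobToCPU(host.data(), dev, n * sizeof(float));
     * runtime->dealloc(dev);
     * @endcode
     */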
    virtual string toString() const = 0;
    int getDeviceId() const { return deviceId; }
    virtual void initComm(const string &name, int worldSize, int rank) = 0;
    virtual CommunicatorObj &getCommunicator() const = 0;
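    /**
     * Distributed-setup sketch; the group name, world size, and rank are
     * placeholders, and the concrete communicator backend is left to each
     * runtime:
     * @code
     * runtime->initComm("default", 2, 0); // worldSize = 2, rank = 0
     * CommunicatorObj &comm = runtime->getCommunicator();
     * @endcode
     */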
  protected:
    void printProfilingData(double totTime,
                            const std::map<OpType, double> &opTime,
                            const std::map<OpType, int> &opCnt) const;
    virtual void copyBlobInsideRuntime(void *dst, const void *src,
                                       size_t bytes) const = 0;
};

class CpuRuntimeObj : public RuntimeObj {
  public:
    CpuRuntimeObj(Device dev) : RuntimeObj(dev) {}

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const override;
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override;
    void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override;
    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
};

class NativeCpuRuntimeObj : public CpuRuntimeObj {
  public:
    NativeCpuRuntimeObj() : CpuRuntimeObj(Device::CPU) {}

    static Ref<NativeCpuRuntimeObj> &getInstance() {
        static Ref<NativeCpuRuntimeObj> instance =
            make_ref<NativeCpuRuntimeObj>();
        return instance;
    }
    void dealloc(void *ptr) override { free(ptr); }
    void *alloc(size_t size) override {
        // Round the request up to whole uint64_t words; calloc then returns
        // a zero-initialized buffer of at least `size` bytes.
        return calloc((size + sizeof(uint64_t) - 1) / sizeof(uint64_t),
                      sizeof(uint64_t));
    }
    string toString() const override;
};
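
/**
 * Singleton usage sketch for the native CPU runtime. A 100-byte request is
 * rounded up to 13 uint64_t words (104 bytes) and zero-filled by calloc:
 * @code
 * auto runtime = NativeCpuRuntimeObj::getInstance();
 * void *buf = runtime->alloc(100);
 * runtime->dealloc(buf);
 * @endcode
 */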
} // namespace infini