diff --git a/CMakeLists.txt b/CMakeLists.txt index dd37869a..3fa588d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,6 @@ if(BUILD_TEST) endif() file(GLOB_RECURSE SRC src/*.cc src/*.cu) -# file(GLOB_RECURSE TEST test/*.cc) # file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc) # list(REMOVE_ITEM SRC ${TEST} ${FFI}) @@ -62,7 +61,6 @@ add_library(InfiniTensor SHARED ${SRC}) if(BUILD_TEST) enable_testing() - # Build all tests file( GLOB TEST_SOURCES test/test_sg2bmm.cc ) file(GLOB_RECURSE TEST_SOURCES test/*.cc) foreach(testsourcefile ${TEST_SOURCES}) get_filename_component(testname ${testsourcefile} NAME_WE) diff --git a/include/core/common.h b/include/core/common.h index 801ce908..365b40f4 100644 --- a/include/core/common.h +++ b/include/core/common.h @@ -29,14 +29,14 @@ using std::vector; // Aliases using dtype = float; -// Utilities +// Metaprogramming utilities #define _CAT(A, B) A##B #define _SELECT(NAME, NUM) _CAT(NAME##_, NUM) #define _GET_COUNT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, ...) COUNT #define _VA_SIZE(...) _GET_COUNT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) #define _VA_SELECT(NAME, ...) _SELECT(NAME, _VA_SIZE(__VA_ARGS__))(__VA_ARGS__) -// Assert +// Assert: conditions should have no side effect #define _IT_ASSERT_2(name, info) \ (static_cast(name) \ ? void(0) \ @@ -49,4 +49,11 @@ using dtype = float; #define IT_TODO_HALT(...) IT_ASSERT(false, "Unimplemented") #define IT_TODO_SKIP(...) puts("Unimplemented " __FILE__ ":" __LINE__) +// Other utilities + +// std::to_underlying is avaiable since C++23 +template auto enum_to_underlying(T e) { + return static_cast>(e); +} + } // namespace it diff --git a/include/core/graph.h b/include/core/graph.h index a3db32bb..64db2330 100644 --- a/include/core/graph.h +++ b/include/core/graph.h @@ -23,13 +23,14 @@ class GraphNode : public Object { // TensorVec &getInputs(); // TensorVec &getOutputs(); - Tensor addTensor(Shape dim) { - Tensor tensor = make_ref(dim); + Tensor addTensor(Shape dim, DataType dtype = DataType::Int32) { + Tensor tensor = make_ref(dim, dtype); tensors.emplace_back(tensor); return tensor; } void updateConnection(); + void dataMalloc(); // TODO // bool compute(); diff --git a/include/core/kernel.h b/include/core/kernel.h new file mode 100644 index 00000000..74b01168 --- /dev/null +++ b/include/core/kernel.h @@ -0,0 +1,56 @@ +#pragma once +#include "core/common.h" +#include "core/operator.h" +#include "core/tensor.h" + +namespace it { + +enum class Device { CPU = 1, CUDA }; + +class Kernel { + public: + Kernel() {} + virtual ~Kernel() {} + + virtual void compute(const Operator &op) const = 0; +}; + +class KernelRegistry { + public: + using Key = std::tuple; + + public: + ~KernelRegistry() { + for (auto &[k, v] : kernels) + delete v; + } + static KernelRegistry &getInstance() { + static KernelRegistry instance; + return instance; + } + bool registerKernel(const Key &key, Kernel *kernel) { + // TODO: kernels with priority + IT_ASSERT(kernels.find(key) == kernels.end(), + "Kernel already registered"); + kernels.emplace(key, kernel); + return true; + } + Kernel *getKernel(Device device, OpType opType, DataType dataType) const { + return kernels.at(Key{device, opType, dataType}); + } + + private: + std::map kernels; +}; + +#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, cnt) \ + namespace it { \ + static const bool _CAT(_register_kernel_, cnt) = \ + KernelRegistry::getInstance().registerKernel( \ + KernelRegistry::Key{device, opType, dataType}, new kernel()); \ + } + +#define REGISTER_KERNEL(device, opType, dataType, kernel) \ + _REGISTER_KERNEL_1(device, opType, dataType, kernel, __COUNTER__) + +} // namespace it diff --git a/include/core/operator.h b/include/core/operator.h index 9565e78f..18b8e842 100644 --- a/include/core/operator.h +++ b/include/core/operator.h @@ -3,7 +3,7 @@ namespace it { -enum OpType { +enum class OpType { Unknown = 0, // linear Conv = 100, @@ -41,7 +41,7 @@ class OpRegistry { public: std::string getOpName(OpType opType) { #define FOP(op) \ - case op: \ + case OpType::op: \ return #op switch (opType) { @@ -83,7 +83,7 @@ class OpRegistry { } }; -enum ActType { +enum class ActType { None, Relu, Sigmoid, @@ -100,26 +100,19 @@ class OperatorNode : public Object { // vector> successors; public: - OperatorNode(TensorVec inputs, TensorVec outputs) - : inputs(inputs), outputs(outputs) {} + OperatorNode(OpType opType, TensorVec inputs, TensorVec outputs) + : type(opType), inputs(inputs), outputs(outputs) {} virtual vector computeShape() const = 0; public: // check Op type - bool isLinearOp() const { return type >= 100 && type < 200; } - bool isElementWiseOp() const { return type >= 200 && type < 300; } - bool isSplitOp() const { return type == Split; } - bool isConcatOp() const { return type == Concat; } - bool isComputeOp() const { - return type == Conv || type == Matmul || type == ConvTrans || - type == G2BMM || type == GBMML; - } - bool isTransposeOp() const { return type == Transpose; } - - bool isReshapeOp() const { return type == Reshape; } - - bool isMemBoundOp() const { - return type == MemBound || type == Activation || type == Transpose; - } + bool isLinearOp() const; + bool isElementWiseOp() const; + bool isSplitOp() const; + bool isConcatOp() const; + bool isComputeOp() const; + bool isTransposeOp() const; + bool isReshapeOp() const; + bool isMemBoundOp() const; public: // getter and setter // TensorVec getInputs() { return inputs; } @@ -131,6 +124,7 @@ class OperatorNode : public Object { IT_ASSERT(outputs.size() == 1, "Unimplemented"); return outputs[0]; } + OpType getOpType() const { return type; } virtual int numInputs() const = 0; virtual int numOutputs() const = 0; @@ -152,7 +146,8 @@ class MatmulNode : public OperatorNode { public: MatmulNode(Tensor A, Tensor B, Tensor C, bool transA = false, - bool transB = false, Tensor bias = nullptr, ActType act = None); + bool transB = false, Tensor bias = nullptr, + ActType act = ActType::None); std::string toString() const override; vector computeShape() const override; diff --git a/include/core/run_enigne.h b/include/core/run_enigne.h new file mode 100644 index 00000000..ee8ad644 --- /dev/null +++ b/include/core/run_enigne.h @@ -0,0 +1,24 @@ +#include "core/graph.h" +#include "core/kernel.h" + +namespace it { + +class RunEngine { + public: + RunEngine(Device device) : device(device) {} + ~RunEngine() {} + + void run(Graph graph) const { + const auto &kernelRegistry = KernelRegistry::getInstance(); + for (auto &op : graph->getOperators()) { + // HACK: set correct data type + Kernel *kernel = kernelRegistry.getKernel(device, op->getOpType(), + DataType::Int32); + kernel->compute(op); + } + } + + private: + Device device; +}; +} // namespace it \ No newline at end of file diff --git a/include/core/tensor.h b/include/core/tensor.h index 569d7294..eb9117ab 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -11,22 +11,21 @@ class TensorNode : public TensorBaseNode { Shape shape; public: - TensorNode(const Shape &shape, DataType dtype = DataType::Float32); + TensorNode(const Shape &shape, DataType dtype); virtual ~TensorNode() {} string toString() const override; - int size(); - - void dataMalloc(size_t size) { - IT_ASSERT(data == nullptr); - data = make_ref>(size); - } + size_t size() const; + void dataMalloc(); Shape getDims() const { return shape; } size_t getOffset(const Shape &ds) const; using TensorBaseNode::getData; VType getData(const Shape &pos) const; + void copyData(VType *dptr); + void printData() const; + bool equalData(const Tensor &rhs) const; // void setDims(const Dim &dms) { dims = dms; } // bool dataRand(int seed = 0) { @@ -47,17 +46,6 @@ class TensorNode : public TensorBaseNode { // return true; // } - // bool setData(VType *dptr) { - // if (dptr == nullptr) - // return false; - // auto sz = size(); - // #pragma omp parallel for - // for (size_t i = 0; i < sz; ++i) - // data[i] = dptr[i]; - // computed = ComputedFull; - // return true; - // } - // bool setScalar(VType val) { // if (data == nullptr || !dims.empty()) // return false; @@ -137,60 +125,9 @@ class TensorNode : public TensorBaseNode { // } // } - // size_t size() const { - // size_t sz = 1; - // auto dm = dims.size(); - // while (dm > 0) - // sz *= dims[--dm]; - // return sz; - // } - // TensorType getType() const { return type; } // void setType(TensorType ty) { type = ty; } - // void print() { - // if (type == Invalid) { - // std::cout << "Invalid tensor" << std::endl; - // return; - // } - - // if (data == nullptr || dims.size() == 0) { - // std::cout << "Empty tensor" << std::endl; - // return; - // } - - // // TODO: can be uncommented after tensor's compute type is - // correctly set if (computed == NotComputed) { - // std::cout << "Uncomputed tensor" << std::endl; - // return; - // } - - // std::cout << "Tensor: " << guid << std::endl; - // auto numDims = dims.size(); - // auto dimSzVec = std::vector(numDims, 1); - // dimSzVec[numDims - 1] = dims[numDims - 1]; - // for (int i = numDims - 1; i != 0; --i) - // dimSzVec[i - 1] = dimSzVec[i] * dims[i - 1]; - // for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { - // for (size_t j = 0; j < numDims; ++j) { - // if (i % dimSzVec[j] == 0) { - // std::cout << "["; - // } - // } - // std::cout << data[i]; - // for (size_t j = 0; j < numDims; ++j) { - // if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { - // std::cout << "]"; - // } - // } - // if (i != size() - 1) - // std::cout << ", "; - // if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - - // 1) - // std::cout << std::endl; - // } - // } - // static inline void initFastrand() { // assert(omp_get_max_threads() <= 256); // // srand(0); // constant seed for test diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h index 368c3a65..c8a5d8dd 100644 --- a/include/core/tensor_base.h +++ b/include/core/tensor_base.h @@ -20,13 +20,13 @@ using OpVec = vector; using VType = uint32_t; +enum class DataType { + Float32, + Int32, +}; + class TensorBaseNode : public Object { public: - enum DataType { - Float32, - Int32, - }; - // enum TensorType { // Input, // Weight, @@ -49,7 +49,8 @@ class TensorBaseNode : public Object { DataType dtype; vector> inputOf; WRef outputOf; - Ref> data; + // TODO: use a blob instead of vector + Ref data; // ComputeState computed; // static int random_seed[256 * 16]; // static bool random_inited; @@ -58,7 +59,7 @@ class TensorBaseNode : public Object { TensorBaseNode(int dim, DataType dtype); virtual ~TensorBaseNode() {} - // Ref> getDataPtr() const { return data; } + Ref getDataPtr() const { return data; } VType getData(size_t offset) const; DataType getDType() const { return dtype; } @@ -77,12 +78,6 @@ class TensorBaseNode : public Object { // Operator *getOutputOf() { return outputOf; } // std::pair getOutputOfWithIndex(); - // bool dataMalloc() { - // if (data == nullptr) - // data = new VType[size()]; - // return data != nullptr; - // } - // const Dim &getDims() const { return dims; } // void setDims(const Dim &dms) { dims = dms; } @@ -104,17 +99,6 @@ class TensorBaseNode : public Object { // return true; // } - // bool setData(VType *dptr) { - // if (dptr == nullptr) - // return false; - // auto sz = size(); - // #pragma omp parallel for - // for (size_t i = 0; i < sz; ++i) - // data[i] = dptr[i]; - // computed = ComputedFull; - // return true; - // } - // bool setScalar(VType val) { // if (data == nullptr || !dims.empty()) // return false; @@ -234,49 +218,6 @@ class TensorBaseNode : public Object { // TensorType getType() const { return type; } // void setType(TensorType ty) { type = ty; } - // void print() { - // if (type == Invalid) { - // std::cout << "Invalid tensor" << std::endl; - // return; - // } - - // if (data == nullptr || dims.size() == 0) { - // std::cout << "Empty tensor" << std::endl; - // return; - // } - - // // TODO: can be uncommented after tensor's compute type is - // correctly set if (computed == NotComputed) { - // std::cout << "Uncomputed tensor" << std::endl; - // return; - // } - - // std::cout << "Tensor: " << guid << std::endl; - // auto numDims = dims.size(); - // auto dimSzVec = std::vector(numDims, 1); - // dimSzVec[numDims - 1] = dims[numDims - 1]; - // for (int i = numDims - 1; i != 0; --i) - // dimSzVec[i - 1] = dimSzVec[i] * dims[i - 1]; - // for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { - // for (size_t j = 0; j < numDims; ++j) { - // if (i % dimSzVec[j] == 0) { - // std::cout << "["; - // } - // } - // std::cout << data[i]; - // for (size_t j = 0; j < numDims; ++j) { - // if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { - // std::cout << "]"; - // } - // } - // if (i != size() - 1) - // std::cout << ", "; - // if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - - // 1) - // std::cout << std::endl; - // } - // } - // static inline void initFastrand() { // assert(omp_get_max_threads() <= 256); // // srand(0); // constant seed for test diff --git a/src/core/graph.cc b/src/core/graph.cc index 5b9bbe23..453cba7e 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -12,4 +12,9 @@ string GraphNode::toString() const { return oss.str(); } +void GraphNode::dataMalloc() { + for (auto &tensor : tensors) + tensor->dataMalloc(); +} + } // namespace it \ No newline at end of file diff --git a/src/core/operator.cc b/src/core/operator.cc index 003c24a8..122a7dfd 100644 --- a/src/core/operator.cc +++ b/src/core/operator.cc @@ -2,6 +2,33 @@ namespace it { +bool OperatorNode::isLinearOp() const { + return enum_to_underlying(type) >= 100 && enum_to_underlying(type) < 200; +} + +bool OperatorNode::isElementWiseOp() const { + return enum_to_underlying(type) >= 200 && enum_to_underlying(type) < 300; +} + +bool OperatorNode::isSplitOp() const { return type == OpType::Split; } + +bool OperatorNode::isConcatOp() const { return type == OpType::Concat; } + +bool OperatorNode::isComputeOp() const { + return type == OpType::Conv || type == OpType::Matmul || + type == OpType::ConvTrans || type == OpType::G2BMM || + type == OpType::GBMML; +} + +bool OperatorNode::isTransposeOp() const { return type == OpType::Transpose; } + +bool OperatorNode::isReshapeOp() const { return type == OpType::Reshape; } + +bool OperatorNode::isMemBoundOp() const { + return type == OpType::MemBound || type == OpType::Activation || + type == OpType::Transpose; +} + vector MatmulNode::computeShape() const { Shape ret{args.b, args.m, args.n}; return {ret}; @@ -9,16 +36,14 @@ vector MatmulNode::computeShape() const { MatmulNode::MatmulNode(Tensor A, Tensor B, Tensor C, bool transA, bool transB, Tensor bias, ActType act) - : OperatorNode({A, B, bias}, {C}), args{.b = A->getDims()[0], - .m = transA ? A->getDims()[2] - : A->getDims()[1], - .n = transB ? B->getDims()[1] - : B->getDims()[2], - .k = transA ? A->getDims()[1] - : A->getDims()[2], - .transA = transA, - .transB = transB, - .act = act} { + : OperatorNode(OpType::Matmul, {A, B, bias}, {C}), + args{.b = A->getDims()[0], + .m = transA ? A->getDims()[2] : A->getDims()[1], + .n = transB ? B->getDims()[1] : B->getDims()[2], + .k = transA ? A->getDims()[1] : A->getDims()[2], + .transA = transA, + .transB = transB, + .act = act} { IT_ASSERT(checkValid(inputs)); } diff --git a/src/core/tensor.cc b/src/core/tensor.cc index 9f20c224..877e0396 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -4,6 +4,12 @@ namespace it { TensorNode::TensorNode(const Shape &shape, DataType dtype) : TensorBaseNode(shape.size(), dtype), shape(shape) {} +void TensorNode::dataMalloc() { + IT_ASSERT(data == nullptr); + // initialized to zero + data.reset(reinterpret_cast(calloc(size(), sizeof(VType)))); +} + VType TensorNode::getData(const Shape &pos) const { return getData(getOffset(pos)); } @@ -26,4 +32,59 @@ size_t TensorNode::getOffset(const Shape &pos) const { return idx; } +size_t TensorNode::size() const { + size_t ret = 1; + for (const auto &d : shape) + ret *= d; + return ret; +} + +void TensorNode::copyData(VType *dptr) { + IT_ASSERT(data != nullptr); + size_t sz = size(); +#pragma omp parallel for + for (size_t i = 0; i < sz; ++i) { + data[i] = dptr[i]; + } +} + +void TensorNode::printData() const { + IT_ASSERT(data != nullptr); + std::cout << "Tensor: " << guid << std::endl; + auto numDims = shape.size(); + auto dimSzVec = std::vector(numDims, 1); + dimSzVec[numDims - 1] = shape[numDims - 1]; + for (int i = numDims - 1; i != 0; --i) + dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; + for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { + for (size_t j = 0; j < numDims; ++j) { + if (i % dimSzVec[j] == 0) { + std::cout << "["; + } + } + std::cout << data[i]; + for (size_t j = 0; j < numDims; ++j) { + if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { + std::cout << "]"; + } + } + if (i != size() - 1) + std::cout << ", "; + if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1) + std::cout << std::endl; + } +} + +bool TensorNode::equalData(const Tensor &rhs) const { + IT_ASSERT(data != nullptr); + IT_ASSERT(rhs->data != nullptr); + if (shape != rhs->getDims()) + return false; + size_t sz = size(); + for (size_t i = 0; i < sz; ++i) + if (data[i] != rhs->data[i]) + return false; + return true; +} + }; // namespace it \ No newline at end of file diff --git a/src/core/tensor_base.cc b/src/core/tensor_base.cc index 29bb40d3..3275a84f 100644 --- a/src/core/tensor_base.cc +++ b/src/core/tensor_base.cc @@ -4,6 +4,6 @@ namespace it { TensorBaseNode::TensorBaseNode(int dim, DataType dtype) : dim(dim), dtype(dtype) {} -VType TensorBaseNode::getData(size_t offset) const { return data->at(offset); } +VType TensorBaseNode::getData(size_t offset) const { return data[offset]; } }; // namespace it \ No newline at end of file diff --git a/src/kerels/cpu/matmul.cc b/src/kerels/cpu/matmul.cc new file mode 100644 index 00000000..7f53dfb3 --- /dev/null +++ b/src/kerels/cpu/matmul.cc @@ -0,0 +1,30 @@ +#include "core/kernel.h" + +namespace it { + +template class NaiveMatmul : public Kernel { + void compute(const Operator &_op) const override { + auto op = as(_op); + T *A = reinterpret_cast(op->getInputs(0)->getDataPtr().get()); + T *B = reinterpret_cast(op->getInputs(1)->getDataPtr().get()); + T *C = reinterpret_cast(op->getOutput()->getDataPtr().get()); + const auto args = op->getArgs(); + IT_ASSERT(args.transA == false && args.transB == false); + IT_ASSERT(args.act == ActType::None); + const int M = args.m, N = args.n, K = args.k; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < K; k++) { + C[i * N + j] += A[i * K + k] * B[k * N + j]; + } + } + } + } +}; + +REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Int32, + NaiveMatmul); +REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Float32, + NaiveMatmul); + +} // namespace it \ No newline at end of file diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc index 54849783..206a7a7f 100644 --- a/test/core/test_graph.cc +++ b/test/core/test_graph.cc @@ -1,15 +1,24 @@ #include "core/graph.h" +#include "core/run_enigne.h" #include "test.h" namespace it { TEST(Graph, build) { Graph g = make_ref(); - Tensor i0 = g->addTensor({1, 2, 3}); - Tensor w0 = g->addTensor({1, 3, 4}); - Tensor o0 = g->addTensor({1, 2, 4}); + Tensor i0 = g->addTensor({1, 2, 3}, DataType::Int32); + Tensor w0 = g->addTensor({1, 3, 4}, DataType::Int32); + Tensor o0 = g->addTensor({1, 2, 4}, DataType::Int32); + g->dataMalloc(); + i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data()); + w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}.data()); g->addOp(make_ref(i0, w0, o0)); - g->print(); + RunEngine(Device::CPU).run(g); + // check answer + auto ans = make_ref(Shape{1, 2, 4}, DataType::Int32); + ans->dataMalloc(); + ans->copyData(vector{38, 44, 50, 56, 83, 98, 113, 128}.data()); + EXPECT_TRUE(o0->equalData(ans)); } } // namespace it \ No newline at end of file