InfiniTensor/include/operators/all_reduce.h

#pragma once
#include "core/operator.h"

namespace infini {
/**
 * @brief The AllReduce operation performs a reduction (sum, prod, min, max,
 * or avg) on data across devices and writes the result into the receive
 * buffer of every rank. For example, in an allreduce operation between k
 * ranks performing a sum, each rank provides an array Vk of N values and
 * receives an identical array S of N values, where S[i] =
 * V0[i]+V1[i]+…+Vk-1[i].
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce
 */
class AllReduceBaseObj : public OperatorObj {
  public:
    /**
     * @brief Builds the part shared by every AllReduce operator. Meant to be
     * invoked from the constructors of the derived classes rather than used
     * directly.
     *
     * @param graph The computation graph that owns this operator.
     * @param opType The operation type; each derived class supplies it.
     * @param input This rank's input tensor.
     * @param output The output tensor, which has the same size as the input.
     */
    AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
                     Tensor output);
    OP_CLONE(AllReduceBaseObj);

    /// @return 1, the number of input tensors of this operator.
    int numInputs() const override { return 1; }
    /// @return 1, the number of output tensors of this operator.
    int numOutputs() const override { return 1; }

    /**
     * @brief Shape inference for the single output.
     *
     * @param inputs The operator inputs; only the first element is read.
     * @return A one-element list holding the dimensions of the first input.
     */
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        const auto &source = inputs[0];
        return {{source->getDims()}};
    }

    /// @brief String form of the operator; the definition is out of line.
    std::string toString() const override;

  private:
    // Both vectors are defined out of line; their contents are not visible
    // from this header.
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;

    /**
     * @brief Data type inference for the single output.
     *
     * @param inputs The operator inputs; only the first element is read.
     * @return A one-element list holding the data type of the first input.
     */
    vector<DataType> inferDataType(const TensorVec &inputs) const override {
        const auto &source = inputs[0];
        return {source->getDType()};
    }
};

/**
 * @brief AllReduce operator for the sum reduction (as the class name
 * indicates). Only the constructor is declared here; its definition is not
 * part of this header.
 */
class AllReduceSumObj : public AllReduceBaseObj {
  public:
    /**
     * @brief Same parameters as the AllReduceBaseObj constructor, minus
     * opType. Presumably the matching OpType is passed on to the base class;
     * confirm against the out-of-line definition.
     *
     * @param graph The computation graph, as for AllReduceBaseObj.
     * @param input The input tensor, as for AllReduceBaseObj.
     * @param output The output tensor, as for AllReduceBaseObj.
     */
    AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);
};

/**
 * @brief AllReduce operator for the product reduction (as the class name
 * indicates). Only the constructor is declared here; its definition is not
 * part of this header.
 */
class AllReduceProdObj : public AllReduceBaseObj {
  public:
    /**
     * @brief Same parameters as the AllReduceBaseObj constructor, minus
     * opType. Presumably the matching OpType is passed on to the base class;
     * confirm against the out-of-line definition.
     *
     * @param graph The computation graph, as for AllReduceBaseObj.
     * @param input The input tensor, as for AllReduceBaseObj.
     * @param output The output tensor, as for AllReduceBaseObj.
     */
    AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);
};

/**
 * @brief AllReduce operator for the minimum reduction (as the class name
 * indicates). Only the constructor is declared here; its definition is not
 * part of this header.
 */
class AllReduceMinObj : public AllReduceBaseObj {
  public:
    /**
     * @brief Same parameters as the AllReduceBaseObj constructor, minus
     * opType. Presumably the matching OpType is passed on to the base class;
     * confirm against the out-of-line definition.
     *
     * @param graph The computation graph, as for AllReduceBaseObj.
     * @param input The input tensor, as for AllReduceBaseObj.
     * @param output The output tensor, as for AllReduceBaseObj.
     */
    AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);
};

/**
 * @brief AllReduce operator for the maximum reduction (as the class name
 * indicates). Only the constructor is declared here; its definition is not
 * part of this header.
 */
class AllReduceMaxObj : public AllReduceBaseObj {
  public:
    /**
     * @brief Same parameters as the AllReduceBaseObj constructor, minus
     * opType. Presumably the matching OpType is passed on to the base class;
     * confirm against the out-of-line definition.
     *
     * @param graph The computation graph, as for AllReduceBaseObj.
     * @param input The input tensor, as for AllReduceBaseObj.
     * @param output The output tensor, as for AllReduceBaseObj.
     */
    AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);
};

/**
 * @brief AllReduce operator for the average reduction (as the class name
 * indicates). Only the constructor is declared here; its definition is not
 * part of this header.
 */
class AllReduceAvgObj : public AllReduceBaseObj {
  public:
    /**
     * @brief Same parameters as the AllReduceBaseObj constructor, minus
     * opType. Presumably the matching OpType is passed on to the base class;
     * confirm against the out-of-line definition.
     *
     * @param graph The computation graph, as for AllReduceBaseObj.
     * @param input The input tensor, as for AllReduceBaseObj.
     * @param output The output tensor, as for AllReduceBaseObj.
     */
    AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);
};

} // namespace infini
impl distributed launch with NCCL (#106) * add cmake bits about NCCL * move example to examples/NNmodel * impl NCCL communicator * add comm related function to Runtime * export runtime interface * add launch.py * use unique name to distingush the the NCCL ID file * add timeout to communicator init * expose communicator obj from runtime obj, add unit test for nccl communicator * reformat files * Add allReduce operator and cuda nccl allReduce kernel * impl model parallel for resnet * add allGather nccl kernel and operator * Add allreduce allgather operator tests, change allgather kernel to output list of tensor, fix shape infer, handle nullptr output * fix format of onnx.py * use concat following AllGather * get tensor parallel for resnet * fix format of graph_handler.cc * change BUILD_DIST default to OFF * polish code of communicator * update .gitignore * Add broadcast operator and cuda kernel * Add comments for operators * remove const of class member * move communicator to CudaRuntimeObj * Add an empty line at EOF. --------- Co-authored-by: panzezhong <panzezhong@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-09-05 09:47:35 +08:00			`#pragma once`
			`#include "core/operator.h"`

			`namespace infini {`
			`/**`
			`* @brief The AllReduce operation performs a reduction (sum, prod, min, max,`
			`* or avg) on data across devices and writes the result into the receive`
			`* buffer of every rank. For example, in an allreduce operation between k`
			`* ranks performing a sum, each rank provides an array Vk of N values and`
			`* receives an identical array S of N values, where S[i] =`
			`* V0[i]+V1[i]+…+Vk-1[i].`
			`*`
			`* For more details:`
			`* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce`
			`*/`
			`class AllReduceBaseObj : public OperatorObj {`

			`public:`
			`/**`
			`* @brief Construct a new AllReduce base object. Should be called by every`
			`* child class constructor, but not directly.`
			`*`
			`* @param graph The computation graph that this operator belongs to.`
			`* @param opType The operation type. This param is taken care of by child`
			`* classes.`
			`* @param input The input tensor from this rank.`
			`* @param output The output tensor, same size as input.`
			`*/`
			`AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,`
			`Tensor output);`
			`OP_CLONE(AllReduceBaseObj);`

			`int numInputs() const override { return 1; }`
			`int numOutputs() const override { return 1; }`

support Dynamic tensor infer shape and fix memory pool (#176) * feat: support dynamic tensor part1 * feat: support dynamic-tensor part2 * feat: support dynamic tensor part 3 * fix: fix some .. * - add kvcache example * feat: support concat to identity kernel * add a simple mempory pool for allocator * fix: rebase to master * fix bug after merging * - remove outdated script * fix: fix as review --------- Co-authored-by: kilinchange <kilinchange@163.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-11-23 13:11:50 +08:00			`optional<vector<Shape>> inferShape(const TensorVec &inputs) override {`
impl distributed launch with NCCL (#106) * add cmake bits about NCCL * move example to examples/NNmodel * impl NCCL communicator * add comm related function to Runtime * export runtime interface * add launch.py * use unique name to distingush the the NCCL ID file * add timeout to communicator init * expose communicator obj from runtime obj, add unit test for nccl communicator * reformat files * Add allReduce operator and cuda nccl allReduce kernel * impl model parallel for resnet * add allGather nccl kernel and operator * Add allreduce allgather operator tests, change allgather kernel to output list of tensor, fix shape infer, handle nullptr output * fix format of onnx.py * use concat following AllGather * get tensor parallel for resnet * fix format of graph_handler.cc * change BUILD_DIST default to OFF * polish code of communicator * update .gitignore * Add broadcast operator and cuda kernel * Add comments for operators * remove const of class member * move communicator to CudaRuntimeObj * Add an empty line at EOF. --------- Co-authored-by: panzezhong <panzezhong@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-09-05 09:47:35 +08:00			`return {{inputs[0]->getDims()}};`
			`};`

			`std::string toString() const override;`

			`private:`
			`vector<int> getWorkloadVector() const override;`
			`vector<int> getOpAttrVector() const override;`
			`vector<DataType> inferDataType(const TensorVec &inputs) const override {`
			`return {inputs[0]->getDType()};`
			`};`
			`};`

			`class AllReduceSumObj : public AllReduceBaseObj {`
			`public:`
			`AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);`
			`};`

			`class AllReduceProdObj : public AllReduceBaseObj {`
			`public:`
			`AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);`
			`};`

			`class AllReduceMinObj : public AllReduceBaseObj {`
			`public:`
			`AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);`
			`};`

			`class AllReduceMaxObj : public AllReduceBaseObj {`
			`public:`
			`AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);`
			`};`

			`class AllReduceAvgObj : public AllReduceBaseObj {`
			`public:`
			`AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);`
			`};`

			`} // namespace infini`