forked from jiuyuan/InfiniTensor
impl distributed launch with NCCL (#106)
* add cmake bits about NCCL * move example to examples/NNmodel * impl NCCL communicator * add comm related function to Runtime * export runtime interface * add launch.py * use unique name to distingush the the NCCL ID file * add timeout to communicator init * expose communicator obj from runtime obj, add unit test for nccl communicator * reformat files * Add allReduce operator and cuda nccl allReduce kernel * impl model parallel for resnet * add allGather nccl kernel and operator * Add allreduce allgather operator tests, change allgather kernel to output list of tensor, fix shape infer, handle nullptr output * fix format of onnx.py * use concat following AllGather * get tensor parallel for resnet * fix format of graph_handler.cc * change BUILD_DIST default to OFF * polish code of communicator * update .gitignore * Add broadcast operator and cuda kernel * Add comments for operators * remove const of class member * move communicator to CudaRuntimeObj * Add an empty line at EOF. --------- Co-authored-by: panzezhong <panzezhong@qiyuanlab.com> Co-authored-by: Haojie Wang <haojie0429@gmail.com>
parent b4eda85e67
commit f60767a770
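For orientation, the per-rank flow this commit enables looks roughly like the sketch below. This is a hedged summary based on the launch.py added by this commit, not code from the commit itself; the communicator name, rank values, and model path are placeholders, and it assumes a CUDA build configured with BUILD_DIST=ON so NCCL support is compiled in.

    # Sketch of one worker process, loosely following the launch.py added here.
    import onnx
    import numpy as np
    from pyinfinitensor.onnx import OnnxStub, backend

    rank, world_size = 0, 2                       # one process per GPU (placeholder values)
    runtime = backend.CudaRuntime(rank)           # CudaRuntime now takes a device id
    runtime.init_comm("demo", world_size, rank)   # NCCL rendezvous via a shared ID file
    stub = OnnxStub(onnx.load(f"dist_model_rank{rank}.onnx"), runtime)
    data = np.load("inputs.npy")                  # placeholder input file
    next(iter(stub.inputs.items()))[1].copyin_float(data.reshape(-1).tolist())
    stub.run()                                    # collective ops execute through NCCL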
@@ -42,3 +42,5 @@ build_debug/
 # onnx model
 *.onnx
+*.pb
+*.npy
@@ -11,5 +11,5 @@
 path = 3rd-party/backward-cpp
 url = git@github.com:bombela/backward-cpp.git
 [submodule "example"]
-path = example
+path = examples/NNmodel
 url = git@github.com:wanghailu0717/NNmodel.git
@@ -8,6 +8,7 @@ option(USE_BANG "Support BANG MLU" OFF)
 option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
 option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
+option(BUILD_DIST "Build project for distributed running" OFF)
 option(BUILD_TEST "Build tests" OFF)

 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
@@ -194,6 +195,13 @@ if(USE_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit) # For nvrtc and cuda driver
     target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
+    if (BUILD_DIST)
+        message(STATUS "Add BUILD_DIST, use NCCL with CUDA")
+        list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+        find_package(NCCL REQUIRED)
+        add_compile_definitions(INFINI_USE_NCCL=1)
+        target_link_libraries(InfiniTensor nccl)
+    endif()
 endif()

 if(USE_BANG)
@@ -261,6 +269,7 @@ if(BUILD_TEST)
     build_test(test/operators/*.cc)
     if (USE_CUDA)
         build_test(test/kernels/cuda/*.cc)
+        build_test(test/cuda/*.cc)
     endif()
     if (USE_BANG)
        build_test(test/kernels/bang/*.cc)
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
#    and IDIAP Research Institute nor the names of its contributors may be
#    used to endorse or promote products derived from this software without
#    specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
#  NCCL_ROOT: Base directory where all NCCL components are found
#  NCCL_INCLUDE_DIR: Directory where NCCL header is found
#  NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
#  NCCL_FOUND
#  NCCL_INCLUDE_DIRS
#  NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if ($ENV{NCCL_ROOT_DIR})
  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
  MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
  SET(NCCL_LIBNAME "nccl_static")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
else()
  SET(NCCL_LIBNAME "nccl")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
endif()

find_library(NCCL_LIBRARIES
  NAMES ${NCCL_LIBNAME}
  HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND)  # obtaining NCCL version and some sanity checks
  set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
  message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
  list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
  include(CheckCXXSymbolExists)
  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

  if (NCCL_VERSION_DEFINED)
    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
    file(WRITE ${file} "
      #include <iostream>
      #include <nccl.h>
      int main()
      {
        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
        int x;
        ncclGetVersion(&x);
        return x == NCCL_VERSION_CODE;
      }
")
    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
          RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
          LINK_LIBRARIES ${NCCL_LIBRARIES})
    if (NOT NCCL_VERSION_MATCHED)
      message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
    endif()
    message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
  else()
    # message(STATUS "NCCL version < 2.3.5-5")
  endif ()
  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
@@ -0,0 +1,100 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
import numpy as np
from parallel import parallel_model


def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
        "--nproc_per_node", type=int, default=1, help="number of processes per node"
    )
    parser.add_argument(
        "--model", type=str, required=True, help="path to the ONNX model file."
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return args.num_nodes, args.nproc_per_node, args.model


def run_stub(stub: OnnxStub, inputs: np.array, n=100):
    # warm up
    next(stub.inputs.items().__iter__())[1].copyin_float(inputs.reshape(-1).tolist())
    stub.tune()
    for _ in range(20):
        stub.run()
    outputs = np.array(next(stub.outputs.items().__iter__())[1].copyout_float())

    # bench
    next(stub.inputs.items().__iter__())[1].copyin_float(inputs.reshape(-1).tolist())
    begin = time.time()
    for _ in range(n):
        stub.run()
    end = time.time()
    outputs = np.array(next(stub.outputs.items().__iter__())[1].copyout_float())
    print("outputs sum:", outputs.sum())
    # np.save("results", outputs)
    results = np.load("results.npy")
    print("max diff:", abs(outputs - results).max())
    assert np.allclose(outputs, results, rtol=1e-6, atol=1e-6)
    avg_time = (end - begin) / n
    return avg_time


def start_worker(
    dist_name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
):
    print("start worker")
    runtime = backend.CudaRuntime(local_rank)
    print("init comm")
    runtime.init_comm(
        dist_name,
        world_size,
        rank,
    )
    model = parallel_model(model, world_size, rank)
    onnx.save(model, f"dist_model_rank{rank}.onnx")
    print("load model")
    stub = OnnxStub(model, runtime)
    data = np.load("inputs.npy")
    print("run model")
    avg_time = run_stub(stub, data)
    print(f"average time: {avg_time}")


def main():
    nnodes, nproc_per_node, model_path = parse_args()
    world_size = nnodes * nproc_per_node

    model = onnx.load(model_path)
    # generate standard results
    # runtime = backend.CudaRuntime(0)
    # stub = OnnxStub(model, runtime)
    # data = np.random.randn(1, 3, 224, 224)
    # np.save("inputs", data)
    # run_stub(stub, data)
    # del stub

    dist_name = f"dist_{os.getpid()}"
    workers = [
        mp.Process(
            target=start_worker,
            args=(dist_name, world_size, rank, rank % nproc_per_node, model),
        )
        for rank in range(world_size)
    ]

    for w in workers:
        w.start()

    for w in workers:
        w.join()


if __name__ == "__main__":
    main()
@@ -0,0 +1,103 @@
import onnx
from onnx import (
    ModelProto,
    TensorProto,
    NodeProto,
    AttributeProto,
)
from onnx import helper, numpy_helper
from typing import Dict, Any


def parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
    for attr in node.attribute:
        if attr.name in attrs:
            if attr.type == AttributeProto.INT:
                attrs[attr.name] = attr.i
            elif attr.type == AttributeProto.INTS:
                attrs[attr.name] = attr.ints
            elif attr.type == AttributeProto.FLOAT:
                attrs[attr.name] = attr.f
            elif attr.type == AttributeProto.STRING:
                attrs[attr.name] = attr.s
            elif attr.type == AttributeProto.TENSOR:
                attrs[attr.name] = attr.t
            else:
                assert False, "Unsupported Attribute Type: {}".format(attr.type)
    return attrs


def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    data = {init.name: init for init in model.graph.initializer}
    nodes = list(model.graph.node)

    def shard_tensor(tensor: TensorProto, dim: int):
        array = numpy_helper.to_array(tensor)
        if dim >= array.ndim:
            dim = array.ndim - 1
        assert array.shape[dim] % tp_world_size == 0
        seg = array.shape[dim] // tp_world_size
        array = array[tp_rank * seg : (tp_rank + 1) * seg]
        return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")

    def shard_gemm(node: NodeProto):
        attrs = parse_attribute(
            node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
        )
        trans = [attrs["transA"], attrs["transB"]]
        dim = 0
        for i, (input, t) in enumerate(zip(node.input, trans)):
            if input in data:
                dim = i
                sharded = shard_tensor(data[input], dim ^ t)
                node.input[i] = sharded.name
                data[input] = sharded
        if len(node.input) > 2:
            input = node.input[2]
            sharded = shard_tensor(data[input], dim)
            node.input[2] = sharded.name
            data[input] = sharded

        node.output[0] += f":sharded({dim})"
        return dim

    for i, node in enumerate(nodes):
        if node.op_type == "Gemm":
            output = node.output[0]
            dim = shard_gemm(node)
            gathered = [node.output[0] + f".{i}" for i in range(tp_world_size)]
            # all_gather
            nodes.insert(
                i + 1,
                helper.make_node(
                    op_type="AllGather",
                    inputs=[node.output[0]],
                    outputs=gathered,
                    name=node.name + "/allgather",
                    # domain="infini", # shape inference fails for custom domain
                ),
            )
            # concat
            nodes.insert(
                i + 2,
                helper.make_node(
                    op_type="Concat",
                    inputs=gathered,
                    outputs=[output],
                    name=node.name + "/concat",
                    axis=dim,
                ),
            )
    graph = helper.make_graph(
        nodes,
        model.graph.name + f"_{tp_rank}",
        model.graph.input,
        model.graph.output,
        data.values(),
        doc_string=model.graph.doc_string,
        value_info=model.graph.value_info,
    )
    model = helper.make_model(graph)

    onnx.shape_inference.infer_shapes(model)
    return model
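The transform above is applied per rank; as shown below, a minimal usage sketch (mirroring how launch.py calls it, with a placeholder model path; this is an illustration, not code from the commit):

    # Produce the rank-0 shard of a 2-way tensor-parallel model.
    # parallel_model shards Gemm initializers and inserts AllGather + Concat
    # so the full activation is reassembled after each sharded Gemm.
    import onnx
    from parallel import parallel_model  # the module added in this commit

    model = onnx.load("model.onnx")                        # placeholder path
    sharded = parallel_model(model, tp_world_size=2, tp_rank=0)
    onnx.save(sharded, "dist_model_rank0.onnx")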
@@ -0,0 +1,22 @@
#pragma once
#include "object.h"
#include "ref.h"

namespace infini {

// base class
class CommunicatorObj : public Object {
  protected:
    int worldSize;
    int rank;

  public:
    CommunicatorObj(int worldSize, int rank)
        : worldSize(worldSize), rank(rank) {}

    virtual ~CommunicatorObj() = default;
    virtual int getWorldSize() const { return worldSize; }
    virtual int getRank() const { return rank; }
};

} // namespace infini
@@ -74,6 +74,14 @@ class GraphHandlerObj {
     Tensor expand(Tensor input, Tensor output, Shape dims);
     Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);

+    Tensor allReduceSum(Tensor input, Tensor output);
+    Tensor allReduceProd(Tensor input, Tensor output);
+    Tensor allReduceMin(Tensor input, Tensor output);
+    Tensor allReduceMax(Tensor input, Tensor output);
+    Tensor allReduceAvg(Tensor input, Tensor output);
+    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
+    Tensor broadcast(Tensor input, Tensor output, int root);
+
     //------ modifiers

     inline bool topo_sort() { return g->topo_sort(); }
@@ -221,6 +221,15 @@ struct OpType {
         FloorMod,
         Square,
         SquaredDifference,
+
+        // Communication Ops
+        AllReduceSum,
+        AllReduceProd,
+        AllReduceMin,
+        AllReduceMax,
+        AllReduceAvg,
+        AllGather,
+        Broadcast,
     } type;

     constexpr OpType(decltype(type) t) : type(t) {}
@@ -1,5 +1,6 @@
 #pragma once
 #include "core/common.h"
+#include "core/communicator.h"
 #include "core/op_type.h"
 #include "core/ref.h"
 #include <memory>
@@ -35,9 +36,11 @@ enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
   protected:
     Device device;
+    int deviceId;

   public:
-    RuntimeObj(Device device) : device(device) {}
+    explicit RuntimeObj(Device device, int deviceId = 0)
+        : device(device), deviceId(deviceId) {}
     RuntimeObj(RuntimeObj &other) = delete;
     RuntimeObj &operator=(RuntimeObj const &) = delete;
     virtual ~RuntimeObj() {}
@@ -77,6 +80,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                                   size_t bytes) const = 0;
     virtual string toString() const = 0;

+    int getDeviceId() const { return deviceId; }
+
+    virtual void initComm(const string &name, int worldSize, int rank) = 0;
+
+    virtual CommunicatorObj &getCommunicator() const = 0;
+
   protected:
     void printProfilingData(double totTime,
                             const std::map<OpType, double> &opTime,
@@ -97,6 +106,9 @@ class CpuRuntimeObj : public RuntimeObj {
     void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
     void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override;
+    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
+
+    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
 };

 class NativeCpuRuntimeObj : public CpuRuntimeObj {
@@ -1,6 +1,9 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
+#ifdef INFINI_USE_NCCL
+#include "cuda/nccl_communicator.h"
+#endif

 namespace infini {

@@ -8,12 +11,15 @@ class CudaRuntimeObj : public RuntimeObj {
   private:
     cudnnHandle_t cudnn;
     cublasHandle_t cublas;
+    std::unique_ptr<CommunicatorObj> comm;
     CudaPtr workspace;
     size_t workspaceSize;

   public:
-    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
+    explicit CudaRuntimeObj(int deviceId = 0)
+        : RuntimeObj(Device::CUDA, deviceId) {
+
+        checkCudaError(cudaSetDevice(deviceId));
         checkCudnnError(cudnnCreate(&cudnn));
         checkCublasError(cublasCreate(&cublas));
         // 10GB for Longformer
@@ -69,6 +75,11 @@ class CudaRuntimeObj : public RuntimeObj {

     void runWithoutSync(const Graph &graph) const;

+    // init communicator
+    void initComm(const string &name, int worldSize, int rank) final;
+
+    CommunicatorObj &getCommunicator() const final { return *comm; }
+
   private:
     void tune(const Graph &graph, bool profiling) const;
 };
@@ -0,0 +1,70 @@
#pragma once
#include "core/communicator.h"
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <nccl.h>
#include <thread>

#define checkNcclError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (ncclSuccess != err) {                                              \
            fprintf(stderr, "NCCL error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    ncclGetErrorString(err));                                  \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }

namespace infini {

class NcclCommunicatorObj final : public CommunicatorObj {
  private:
    ncclComm_t comm;

  public:
    NcclCommunicatorObj(const string &name, int worldSize, int rank)
        : CommunicatorObj(worldSize, rank) {
        const std::string filePath("./" + name + "_nccl_id.bin");
        ncclUniqueId commId;
        if (rank == 0) {
            checkNcclError(ncclGetUniqueId(&commId));
            std::ofstream ofs(filePath, std::ios::binary);
            ofs.write((char *)&commId, sizeof(ncclUniqueId));

        } else {
            auto begin = std::chrono::steady_clock::now();
            while (!std::filesystem::exists(filePath)) {
                auto now = std::chrono::steady_clock::now();
                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
                             "time limit (10s) exceeded.");
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            std::ifstream ifs(filePath, std::ios::binary);
            ifs.read((char *)&commId, sizeof(ncclUniqueId));
        }
        checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));
        if (rank == 0) {
            std::filesystem::remove(filePath);
        }
    }

    // Get the actual ncclComm_t
    ncclComm_t getNcclComm() { return comm; }

    void finalize() { checkNcclError(ncclCommFinalize(comm)); }

    ~NcclCommunicatorObj() final {
        finalize();
        checkNcclError(ncclCommDestroy(comm));
    }

    virtual string toString() const final {
        std::ostringstream oss;
        oss << "NCCL communicator";
        return oss.str();
    }
};

} // namespace infini
@@ -0,0 +1,44 @@
#pragma once
#include "core/operator.h"

namespace infini {
/**
 * @brief The AllGather operation gathers N values from k ranks into
 * an output of size k*N, and distributes that result to all ranks.
 * The output is ordered by rank index.
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allgather
 */
class AllGatherObj : public OperatorObj {

  public:
    /**
     * @brief Construct a new AllGather object
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor from this rank.
     * @param outputs A list of output tensors collected from all ranks.
     * @param world_size Total number of ranks.
     */
    AllGatherObj(GraphObj *graph, Tensor input, std::optional<TensorVec>,
                 int world_size);
    OP_CLONE(AllGatherObj);

    int numInputs() const override { return 1; }
    int numOutputs() const override { return world_size; }
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;

    int getWorldSize() const { return world_size; }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;

  protected:
    int world_size;
};
} // namespace infini
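As a quick illustration of the semantics described in the header comment (not part of the commit): gathering the per-rank chunks and concatenating them, which is exactly the AllGather + Concat pair that parallel_model inserts, reconstructs the full tensor.

    # Toy numpy illustration of AllGather followed by Concat, assuming 2 ranks.
    import numpy as np

    chunks = [np.array([[1.0, 2.0]]), np.array([[3.0, 4.0]])]  # one chunk per rank
    gathered = list(chunks)                  # AllGather: every rank now holds all chunks
    full = np.concatenate(gathered, axis=1)  # Concat along the sharded axis
    print(full)                              # [[1. 2. 3. 4.]]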
@@ -0,0 +1,75 @@
#pragma once
#include "core/operator.h"

namespace infini {
/**
 * @brief The AllReduce operation performs reductions on data (sum, prod, min,
 * max, or avg) across devices and writes the result in the
 * receive buffers of every rank. For example, in an allreduce operation between
 * k ranks performing a sum, each rank provides an array Vk of N values,
 * and receives an identical array S of N values, where S[i] =
 * V0[i]+V1[i]+…+Vk-1[i].
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce
 */
class AllReduceBaseObj : public OperatorObj {

  public:
    /**
     * @brief Construct a new AllReduce base object. Should be called by every
     * child class constructor, but not directly.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param opType The operation type. This param is taken care of by child
     * classes.
     * @param input The input tensor from this rank.
     * @param output The output tensor, same size as input.
     */
    AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
                     Tensor output);
    OP_CLONE(AllReduceBaseObj);

    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }

    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
        return {{inputs[0]->getDims()}};
    };

    std::string toString() const override;

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override {
        return {inputs[0]->getDType()};
    };
};

class AllReduceSumObj : public AllReduceBaseObj {
  public:
    AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);
};

class AllReduceProdObj : public AllReduceBaseObj {
  public:
    AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);
};

class AllReduceMinObj : public AllReduceBaseObj {
  public:
    AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);
};

class AllReduceMaxObj : public AllReduceBaseObj {
  public:
    AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);
};

class AllReduceAvgObj : public AllReduceBaseObj {
  public:
    AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);
};

} // namespace infini
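To make the formula in the comment concrete (an illustration only, not part of the commit): with k ranks each contributing a vector, AllReduceSum leaves every rank holding the same element-wise sum S.

    # Toy numpy illustration of AllReduceSum semantics for k = 3 ranks.
    import numpy as np

    v = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
    s = np.sum(v, axis=0)   # every rank receives S, where S[i] = V0[i] + V1[i] + V2[i]
    print(s)                # [ 9. 12.]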
@@ -0,0 +1,49 @@
#pragma once
#include "core/operator.h"

namespace infini {
/**
 * @brief The Broadcast operation copies an N-element buffer on the root rank to
 * all ranks.
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#broadcast
 */
class BroadcastObj : public OperatorObj {
  public:
    /**
     * @brief Construct a new Broadcast object.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor. Only root needs to initialize it with
     * data.
     * @param output The output tensor, same size as input.
     * @param root The root rank who performs the broadcast.
     */
    BroadcastObj(GraphObj *graph, Tensor input, Tensor output, int root);
    OP_CLONE(BroadcastObj);

    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }

    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
        return {{inputs[0]->getDims()}};
    };

    std::string toString() const override;

    int getRoot() const { return root; }

  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override {
        return {inputs[0]->getDType()};
    };

  protected:
    // The rank who broadcasts data among this communication group
    int root;
};

} // namespace infini
@@ -591,6 +591,54 @@ class OnnxStub:
                     tensors.get(node.output[0]),
                     next((attr.i for attr in node.attribute if attr.name == "to")),
                 )
+            elif node.op_type == "AllReduceSum":
+                tensors[node.output[0]] = self.handler.allReduceSum(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
+            elif node.op_type == "AllReduceProd":
+                tensors[node.output[0]] = self.handler.allReduceProd(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
+            elif node.op_type == "AllReduceMin":
+                tensors[node.output[0]] = self.handler.allReduceMin(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
+            elif node.op_type == "AllReduceMax":
+                tensors[node.output[0]] = self.handler.allReduceMax(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
+            elif node.op_type == "AllReduceAvg":
+                tensors[node.output[0]] = self.handler.allReduceAvg(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                )
+            elif node.op_type == "AllGather":
+                for name, tensor in zip(
+                    node.output,
+                    self.handler.allGather(
+                        tensors[node.input[0]],
+                        None,
+                        len(node.output),
+                    ),
+                ):
+                    tensors[name] = tensor
+            elif node.op_type == "Broadcast":
+                tensors[node.output[0]] = self.handler.broadcast(
+                    tensors[node.input[0]],
+                    tensors.get(node.output[0]),
+                    next(
+                        (
+                            attr.i
+                            for attr in node.attribute
+                            if attr.name == "root"
+                        ),
+                        0,
+                    ),
+                )
             elif node.op_type == "Expand":
                 shape = _parse_data(data[node.input[1]])
                 tensors[node.output[0]] = self.handler.expand(
@@ -329,6 +329,83 @@ class TestStringMethods(unittest.TestCase):
                 [pads_data],
             )
         )
+
+    def test_allReduceSum(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        allReduceSum = make_node(
+            "AllReduceSum", ["input"], ["output"], name="allReduceSum"
+        )
+        graph = make_graph([allReduceSum], "allReduceSum", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_allReduceProd(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        allReduceProd = make_node(
+            "AllReduceProd", ["input"], ["output"], name="allReduceProd"
+        )
+        graph = make_graph([allReduceProd], "allReduceProd", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_allReduceMin(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        allReduceMin = make_node(
+            "AllReduceMin", ["input"], ["output"], name="allReduceMin"
+        )
+        graph = make_graph([allReduceMin], "allReduceMin", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_allReduceMax(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        allReduceMax = make_node(
+            "AllReduceMax", ["input"], ["output"], name="allReduceMax"
+        )
+        graph = make_graph([allReduceMax], "allReduceMax", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_allReduceAvg(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        allReduceAvg = make_node(
+            "AllReduceAvg", ["input"], ["output"], name="allReduceAvg"
+        )
+        graph = make_graph([allReduceAvg], "allReduceAvg", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_split(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        split = make_node(
+            "Split", ["input"], ["output"], name="split", axis=0
+        )
+        make_and_import_model(make_graph([split], "split", [input], []))
+
+    def test_allBroadcast(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
+        broadcast = make_node(
+            "Broadcast", ["input"], ["output"], name="broadcast", root=1
+        )
+        graph = make_graph([broadcast], "broadcast", [input], [output])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
+    def test_allGather(self):
+        input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
+        world_size = make_tensor_value_info("world_size", TensorProto.INT32, [1])
+        allGather = make_node(
+            "AllGather", ["input", "world_size"], ["output"], name="allGather"
+        )
+        graph = make_graph([allGather], "allGather", [input, world_size], [])
+        model = make_model(graph)
+        from_onnx(model, backend.cpu_runtime())
+
     # see <https://onnx.ai/onnx/intro/python.html#a-simple-example-a-linear-regression>
     def test_linear(self):
@@ -1,5 +1,8 @@
 #include "core/graph_handler.h"
+#include "operators/all_gather.h"
+#include "operators/all_reduce.h"
 #include "operators/batch_norm.h"
+#include "operators/broadcast.h"
 #include "operators/concat.h"
 #include "operators/conv.h"
 #include "operators/element_wise.h"
@@ -300,6 +303,73 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
     }
 }

+Tensor GraphHandlerObj::allReduceSum(Tensor input, Tensor output) {
+    if (output) {
+        g->addOpWithOutputs<AllReduceSumObj>(std::move(input), output);
+        return output;
+    } else {
+        return g->addOp<AllReduceSumObj>(std::move(input), output)->getOutput();
+    }
+}
+
+Tensor GraphHandlerObj::allReduceProd(Tensor input, Tensor output) {
+    if (output) {
+        g->addOpWithOutputs<AllReduceProdObj>(std::move(input), output);
+        return output;
+    } else {
+        return g->addOp<AllReduceProdObj>(std::move(input), output)
+            ->getOutput();
+    }
+}
+
+Tensor GraphHandlerObj::allReduceMin(Tensor input, Tensor output) {
+    if (output) {
+        g->addOpWithOutputs<AllReduceMinObj>(std::move(input), output);
+        return output;
+    } else {
+        return g->addOp<AllReduceMinObj>(std::move(input), output)->getOutput();
+    }
+}
+
+Tensor GraphHandlerObj::allReduceMax(Tensor input, Tensor output) {
+    if (output) {
+        g->addOpWithOutputs<AllReduceMaxObj>(std::move(input), output);
+        return output;
+    } else {
+        return g->addOp<AllReduceMaxObj>(std::move(input), output)->getOutput();
+    }
+}
+
+Tensor GraphHandlerObj::allReduceAvg(Tensor input, Tensor output) {
+    if (output) {
+        g->addOpWithOutputs<AllReduceAvgObj>(std::move(input), output);
+        return output;
+    } else {
+        return g->addOp<AllReduceAvgObj>(std::move(input), output)->getOutput();
+    }
+}
+
+TensorVec GraphHandlerObj::allGather(Tensor input,
+                                     std::optional<TensorVec> outputs, int n) {
+    if (outputs) {
+        g->addOpWithOutputs<AllGatherObj>(std::move(input), outputs, n);
+        return *outputs;
+    } else {
+        return g->addOp<AllGatherObj>(std::move(input), outputs, n)
+            ->getOutputs();
+    }
+}
+
+Tensor GraphHandlerObj::broadcast(Tensor input, Tensor output, int root) {
+    if (output) {
+        g->addOpWithOutputs<BroadcastObj>(std::move(input), output, root);
+        return output;
+    } else {
+        return g->addOp<BroadcastObj>(std::move(input), output, root)
+            ->getOutput();
+    }
+}
+
 Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
     if (output) {
         g->addOpWithOutputs<CastObj>(std::move(input), output,
@@ -214,6 +214,15 @@ const char *OpType::toString() const {
         CASE(FloorMod);
         CASE(Square);
         CASE(SquaredDifference);
+
+        // Communication
+        CASE(AllReduceSum);
+        CASE(AllReduceProd);
+        CASE(AllReduceMin);
+        CASE(AllReduceMax);
+        CASE(AllReduceAvg);
+        CASE(AllGather);
+        CASE(Broadcast);
     default:
         return "Unknown";
     }
@@ -2,6 +2,9 @@
 #include "core/kernel.h"
 #include "core/perf_engine.h"
 #include "core/runtime.h"
+#ifdef INFINI_USE_NCCL
+#include "cuda/nccl_communicator.h"
+#endif
 #include "operators/conv.h"
 #include "operators/matmul.h"

@@ -96,4 +99,15 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }

 string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }

+void CudaRuntimeObj::initComm(const string &name, int worldSize, int rank) {
+    IT_ASSERT(worldSize > 0);
+    IT_ASSERT(rank >= 0);
+    IT_ASSERT(rank < worldSize);
+#ifdef INFINI_USE_NCCL
+    comm = std::make_unique<NcclCommunicatorObj>(name, worldSize, rank);
+#else
+    IT_TODO_HALT_MSG("Not compiled with NCCL.");
+#endif
+}
+
 } // namespace infini
@@ -143,7 +143,10 @@ static int tensor_dtype(Tensor t) {
 }

 #ifdef USE_CUDA
-static Ref<CudaRuntimeObj> cuda_runtime() { return make_ref<CudaRuntimeObj>(); }
+// NOTE(lizhouyang): deprecate this, use CudaRuntime directly.
+[[deprecated]] static Ref<CudaRuntimeObj> cuda_runtime() {
+    return make_ref<CudaRuntimeObj>(0);
+}
 #endif

 #ifdef USE_BANG
@@ -311,7 +314,9 @@ void init_graph_builder(py::module &m) {
                RuntimeObj>(m, "CpuRuntime");
 #ifdef USE_CUDA
     py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
-        m, "CudaRuntime");
+        m, "CudaRuntime")
+        .def(py::init<int>(), py::arg("device") = 0)
+        .def("init_comm", &CudaRuntimeObj::initComm);
 #endif
 #ifdef USE_BANG
     py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
@@ -435,6 +440,13 @@ void init_graph_builder(py::module &m) {
         .def("reduce_mean", &Handler::reduceMean, policy::move)
         .def("slice", &Handler::slice, policy::move)
         .def("pad", &Handler::pad, policy::move)
+        .def("allReduceSum", &Handler::allReduceSum, policy::move)
+        .def("allReduceProd", &Handler::allReduceProd, policy::move)
+        .def("allReduceMin", &Handler::allReduceMin, policy::move)
+        .def("allReduceMax", &Handler::allReduceMax, policy::move)
+        .def("allReduceAvg", &Handler::allReduceAvg, policy::move)
+        .def("allGather", &Handler::allGather, policy::move)
+        .def("broadcast", &Handler::broadcast, policy::move)
         .def("cast", &Handler::cast, policy::move)
         .def("expand", &Handler::expand, policy::move)
         .def("erf", &Handler::erf, policy::move)
@@ -0,0 +1,46 @@
#ifdef INFINI_USE_NCCL
#include "operators/all_gather.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"

namespace infini {
class AllGatherNCCL : public CudaKernelWithoutConfig {
  public:
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AllGatherObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        int world_size = op->getWorldSize();
        // Check if world size info in operator matches runtime
        IT_ASSERT(world_size == context->getCommunicator().getWorldSize());

        void *input = op->getInputs(0)->getRawDataPtr<void *>();
        CudaPtr output_temp =
            context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
        // void *output = op->getOutput()->getRawDataPtr<void *>();
        IT_ASSERT(op->getDType() == DataType::Float32);
        size_t bytes = op->getInputs(0)->getBytes();
        size_t count = bytes / op->getDType().getSize();

        ncclComm_t comm =
            dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
                .getNcclComm();
        // TODO: Using default stream 0 for now.
        checkNcclError(
            ncclAllGather(input, output_temp, count, ncclFloat, comm, 0));

        for (int i = 0; i < world_size; ++i) {
            Tensor output = op->getOutput(i);
            context->copyBlobInsideRuntime(
                output->getRawDataPtr<float *>(),
                static_cast<float *>(output_temp) + i * count, bytes);
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::AllGather, DataType::Float32,
                AllGatherNCCL, "AllGather_NCCL_CUDA_Float32");
} // namespace infini

#endif
@@ -0,0 +1,58 @@
#ifdef INFINI_USE_NCCL
#include "operators/all_reduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"

namespace infini {
class AllReduceNCCL : public CudaKernelWithoutConfig {
  public:
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AllReduceBaseObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        void *input = op->getInputs(0)->getRawDataPtr<void *>();
        void *output = op->getOutput()->getRawDataPtr<void *>();
        IT_ASSERT(op->getDType() == DataType::Float32);
        size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();

        ncclComm_t comm =
            dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
                .getNcclComm();
        // TODO: Using default stream 0 for now.
        checkNcclError(ncclAllReduce(input, output, count, ncclFloat,
                                     getRedOp(), comm, 0));
    }

    virtual ncclRedOp_t getRedOp() const = 0;
};

class AllReduceSumNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclSum; }
};
class AllReduceProdNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclProd; }
};
class AllReduceMinNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclMin; }
};
class AllReduceMaxNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclMax; }
};
class AllReduceAvgNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclAvg; }
};

REGISTER_KERNEL(Device::CUDA, OpType::AllReduceSum, DataType::Float32,
                AllReduceSumNCCL, "AllReduce_Sum_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceProd, DataType::Float32,
                AllReduceProdNCCL, "AllReduce_Prod_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMin, DataType::Float32,
                AllReduceMinNCCL, "AllReduce_Min_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMax, DataType::Float32,
                AllReduceMaxNCCL, "AllReduce_Max_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceAvg, DataType::Float32,
                AllReduceAvgNCCL, "AllReduce_Avg_NCCL_CUDA_Float32");

} // namespace infini
#endif
@@ -0,0 +1,32 @@
#ifdef INFINI_USE_NCCL
#include "operators/broadcast.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"

namespace infini {
class BroadcastNCCL : public CudaKernelWithoutConfig {
  public:
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<BroadcastObj>(_op);
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        void *input = op->getInputs(0)->getRawDataPtr<void *>();
        void *output = op->getOutput()->getRawDataPtr<void *>();
        IT_ASSERT(op->getDType() == DataType::Float32);
        size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();

        ncclComm_t comm =
            dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
                .getNcclComm();
        // TODO: Using default stream 0 for now.
        checkNcclError(ncclBroadcast(input, output, count, ncclFloat,
                                     op->getRoot(), comm, 0));
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Broadcast, DataType::Float32,
                BroadcastNCCL, "Broadcast_NCCL_CUDA_Float32");
} // namespace infini

#endif
@ -0,0 +1,49 @@
#include "operators/all_gather.h"

namespace infini {
AllGatherObj::AllGatherObj(GraphObj *graph, Tensor input,
                           std::optional<TensorVec> outputs, int world_size)
    : OperatorObj(
          OpType::AllGather, {input},
          ((!outputs) ? TensorVec(world_size, nullptr) : std::move(*outputs))),
      world_size(world_size) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
AllGatherObj::inferShape(const TensorVec &inputs) const {
    Shape input_shape = inputs[0]->getDims();
    vector<Shape> output_shapes(getWorldSize(), input_shape);
    return output_shapes;
}

vector<DataType> AllGatherObj::inferDataType(const TensorVec &inputs) const {
    return vector<DataType>(world_size, inputs[0]->getDType());
}

std::string AllGatherObj::toString() const {
    std::ostringstream os;
    os << "AllGather"
       << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=";
    for (auto i = 0; i < world_size; i++)
        os << outputs[i]->getGuid() << ",";
    os << ")";
    return os.str();
}

vector<int> AllGatherObj::getWorkloadVector() const {
    vector<int> ret{type.underlying()};
    const Shape shape = inputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    ret.emplace_back(world_size);
    return ret;
}

vector<int> AllGatherObj::getOpAttrVector() const {
    return {type.underlying(), world_size};
}
} // namespace infini
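Note: AllGather's inferShape simply replicates the input shape world_size times, and inferDataType replicates the input dtype the same way. For instance, with world_size = 2 and a one-dimensional Float32 input of length 2 (as in the CUDA test below), the operator produces two Float32 outputs of shape {2}.
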
@ -0,0 +1,45 @@
#include "operators/all_reduce.h"

namespace infini {
AllReduceBaseObj::AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
                                   Tensor output)
    : OperatorObj(opType, {input}, {output}) {
    IT_ASSERT(checkValid(graph));
}

std::string AllReduceBaseObj::toString() const {
    std::ostringstream os;
    os << type.toString() << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ",";
    return os.str();
}

vector<int> AllReduceBaseObj::getWorkloadVector() const {
    vector<int> ret{type.underlying()};
    const Shape shape = outputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> AllReduceBaseObj::getOpAttrVector() const {
    return {type.underlying()};
}

AllReduceSumObj::AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output)
    : AllReduceBaseObj(graph, OpType::AllReduceSum, input, output) {}

AllReduceProdObj::AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output)
    : AllReduceBaseObj(graph, OpType::AllReduceProd, input, output) {}

AllReduceMinObj::AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output)
    : AllReduceBaseObj(graph, OpType::AllReduceMin, input, output) {}

AllReduceMaxObj::AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output)
    : AllReduceBaseObj(graph, OpType::AllReduceMax, input, output) {}

AllReduceAvgObj::AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output)
    : AllReduceBaseObj(graph, OpType::AllReduceAvg, input, output) {}
} // namespace infini
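Note: every AllReduce variant is a thin subclass that only fixes the OpType; shape and type handling live in the shared base, so a {1, 3, 2, 4} Float32 input yields a {1, 3, 2, 4} Float32 output regardless of the reduction, as the shape-inference tests further down verify.
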
@ -0,0 +1,33 @@
#include "operators/broadcast.h"

namespace infini {
BroadcastObj::BroadcastObj(GraphObj *graph, Tensor input, Tensor output,
                           int root)
    : OperatorObj(OpType::Broadcast, {input}, {output}), root(root) {
    IT_ASSERT(checkValid(graph));
}

vector<int> BroadcastObj::getWorkloadVector() const {
    vector<int> ret{type.underlying()};
    const Shape shape = inputs[0]->getDims();
    ret.insert(ret.end(), shape.begin(), shape.end());
    return ret;
}

vector<int> BroadcastObj::getOpAttrVector() const {
    return {type.underlying()};
}

std::string BroadcastObj::toString() const {
    std::ostringstream os;
    os << "Broadcast"
       << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ",";
    os << "root=" << root;
    os << ")";
    return os.str();
}
} // namespace infini
@ -0,0 +1,55 @@
#ifdef INFINI_USE_NCCL
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"
#include "test.h"
#include <thread>

static int WORLD_SIZE = 2;

namespace infini {

void allReduceSum(float *data, int deviceId) {
    // Create Runtime and setup communication
    CudaRuntimeObj *cuda_runtime = new CudaRuntimeObj(deviceId);
    int rank = deviceId;
    cuda_runtime->initComm("test_nccl_comm", WORLD_SIZE, rank);
    ncclComm_t comm =
        dynamic_cast<NcclCommunicatorObj &>(cuda_runtime->getCommunicator())
            .getNcclComm();

    // Copy data
    float *data_gpu;
    checkCudaError(cudaMalloc(&data_gpu, sizeof(float)));
    checkCudaError(
        cudaMemcpy(data_gpu, data, sizeof(float), cudaMemcpyHostToDevice));

    // Do AllReduce
    checkNcclError(
        ncclAllReduce(data_gpu, data_gpu, 1, ncclFloat, ncclSum, comm, 0));

    // Copy data back and sync device
    checkCudaError(
        cudaMemcpy(data, data_gpu, sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaError(cudaDeviceSynchronize());
}

// Setup communication between 2 threads, each controlling 1 GPU.
// Do AllReduce Sum on {1.0, 4.0}. Results should be {5.0, 5.0}.
TEST(NCCL, multi_gpu_communication) {
    int num_threads = WORLD_SIZE;
    float data[] = {1.0, 4.0};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < num_threads; ++gpu) {
        threads.emplace_back(allReduceSum, &data[gpu], gpu);
    }
    for (auto &thread : threads) {
        thread.join();
    }

    for (int i = 0; i < num_threads; ++i) {
        ASSERT_EQ(data[i], 5.0f);
    }
}

} // namespace infini
#endif
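Note on the test structure: one host thread is spawned per GPU, and each thread constructs its own CudaRuntimeObj and calls initComm with a common task name, the world size, and its device ID as the rank. The communicators are assumed to rendezvous on a shared NCCL unique ID keyed by that task name before the collective can run.
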
@ -0,0 +1,51 @@
#ifdef INFINI_USE_NCCL
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/all_gather.h"
#include "test.h"
#include <nccl.h>
#include <thread>

static int WORLD_SIZE = 2;

namespace infini {

void allGather(const string taskName, int deviceID, vector<float> data,
               vector<vector<float>> ans) {
    // Create Runtimes and initiate communication
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Runtime cudaRuntime = make_ref<CudaRuntimeObj>(deviceID);
    cudaRuntime->initComm(taskName, WORLD_SIZE, deviceID);
    // Create Graph and insert allGather operation
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto input =
        g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
    auto op = g->addOp<AllGatherObj>(input, std::nullopt, WORLD_SIZE);
    // Copy data from CPU to GPU
    g->dataMalloc();
    input->copyin(data);
    // Run operation
    cudaRuntime->run(g);
    // Copy output from GPU to CPU
    for (int i = 0; i < WORLD_SIZE; ++i) {
        auto result = op->getOutputs()[i]->clone(cpuRuntime);
        EXPECT_TRUE(result->equalData(ans[i]));
    }
}

TEST(CUDA_AllGather, run) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<vector<float>> ans = {{2., 3.}, {5., 6.}};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allGather, "test_all_gather", gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}
} // namespace infini
#endif
@ -0,0 +1,109 @@
#ifdef INFINI_USE_NCCL
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/all_reduce.h"
#include "test.h"
#include <nccl.h>
#include <thread>

static int WORLD_SIZE = 2;

namespace infini {

template <typename OperatorObj>
void allReduce(const string taskName, int deviceID, vector<float> data,
               vector<float> ans) {
    // Create Runtimes and initiate communication
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Runtime cudaRuntime = make_ref<CudaRuntimeObj>(deviceID);
    cudaRuntime->initComm(taskName, WORLD_SIZE, deviceID);
    // Create Graph and insert allReduce operation
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto input =
        g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
    auto op = g->addOp<OperatorObj>(input, nullptr);
    // Copy data from CPU to GPU
    g->dataMalloc();
    input->copyin(data);
    // Run operation
    cudaRuntime->run(g);
    // Copy output from GPU to CPU
    auto result = op->getOutput()->clone(cpuRuntime);

    EXPECT_TRUE(result->equalData(ans));
}

TEST(CUDA_AllReduce, sum) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {7., 9.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceSumObj>, "test_allreduce_sum",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, prod) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {10., 18.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceProdObj>, "test_allreduce_prod",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, min) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {2., 3.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceMinObj>, "test_allreduce_min",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, max) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {5., 6.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceMaxObj>, "test_allreduce_max",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, avg) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {3.5, 4.5};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceAvgObj>, "test_allreduce_avg",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

} // namespace infini
#endif
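Note: the expected vectors follow element-wise from the two rank inputs {2, 3} and {5, 6}: sum {7, 9}, prod {10, 18}, min {2, 3}, max {5, 6}, and avg {3.5, 4.5} (for example, (2 + 5) / 2 = 3.5).
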
@ -0,0 +1,56 @@
#ifdef INFINI_USE_NCCL
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/broadcast.h"
#include "test.h"
#include <nccl.h>
#include <thread>

static int WORLD_SIZE = 2;
static int root = 0;

namespace infini {

void broadcast(const string taskName, int deviceID, vector<float> data,
               vector<float> ans) {
    // Create Runtimes and initiate communication
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Runtime cudaRuntime = make_ref<CudaRuntimeObj>(deviceID);
    cudaRuntime->initComm(taskName, WORLD_SIZE, deviceID);
    // Create Graph and insert broadcast operation
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto input =
        g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
    auto op = g->addOp<BroadcastObj>(input, nullptr, root);
    // Copy data from CPU to GPU
    g->dataMalloc();
    // Only rank 0 has the data
    if (deviceID == root) {
        input->copyin(data);
    }
    // Run broadcast operation
    cudaRuntime->run(g);
    // Copy output from GPU to CPU
    auto result = op->getOutput()->clone(cpuRuntime);

    EXPECT_TRUE(result->equalData(ans));
}

TEST(CUDA_Broadcast, run) {
    // Only 1 device gets data. Every rank should have the same data after
    // broadcast.
    vector<float> data = {2., 3., 5., 6.};
    vector<float> ans = {2., 3., 5., 6.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(broadcast, "test_broadcast", gpu, data, ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}
} // namespace infini
#endif
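Note: only the root rank copies data into its input tensor before the run; the other rank leaves its freshly allocated input untouched, and after the broadcast every rank's output is expected to equal the root's data.
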
@ -0,0 +1,23 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/all_gather.h"
#include "test.h"

namespace infini {
TEST(AllGather, ShapeTypeInfer) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    int world_size = 8;
    {
        Shape shape = {1, 3, 2, 4};
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor(shape, DataType::Float32);
        auto op = g->addOp<AllGatherObj>(input, std::nullopt, world_size);
        EXPECT_EQ(op->getOpType(), OpType::AllGather);
        EXPECT_EQ(op->numOutputs(), world_size);
        for (int i = 0; i < world_size; ++i) {
            EXPECT_EQ(op->getOutput(i)->getDims(), shape);
            EXPECT_EQ(op->getOutput(i)->getDType(), DataType::Float32);
        }
    }
}
} // namespace infini
@ -0,0 +1,50 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/all_reduce.h"
#include "test.h"

namespace infini {
TEST(AllReduce, ShapeTypeInfer) {
    auto runtime = NativeCpuRuntimeObj::getInstance();
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<AllReduceSumObj>(input, nullptr);
        EXPECT_EQ(op->getOpType(), OpType::AllReduceSum);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<AllReduceProdObj>(input, nullptr);
        EXPECT_EQ(op->getOpType(), OpType::AllReduceProd);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<AllReduceMinObj>(input, nullptr);
        EXPECT_EQ(op->getOpType(), OpType::AllReduceMin);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<AllReduceMaxObj>(input, nullptr);
        EXPECT_EQ(op->getOpType(), OpType::AllReduceMax);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<AllReduceAvgObj>(input, nullptr);
        EXPECT_EQ(op->getOpType(), OpType::AllReduceAvg);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
}
} // namespace infini
@ -0,0 +1,19 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/broadcast.h"
#include "test.h"

namespace infini {
TEST(Broadcast, ShapeTypeInfer) {
    auto runtime = NativeCpuRuntimeObj::getInstance();
    int root = 0;
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor input = g->addTensor({1, 3, 2, 4}, DataType::Float32);
        auto op = g->addOp<BroadcastObj>(input, nullptr, root);
        EXPECT_EQ(op->getOpType(), OpType::Broadcast);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
        EXPECT_EQ(op->getOutput()->getDType(), DataType::Float32);
    }
}
} // namespace infini
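Closing note: a rough sketch of how one rank of a multi-GPU job is assumed to use these pieces end to end, mirroring the test harnesses above. The function name runRank and the task name "example_all_gather" are illustrative only.

#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/all_gather.h"
#include <optional>
#include <vector>

// One rank of a `worldSize`-GPU job: every rank contributes its local shard
// and receives all shards back as a list of output tensors.
void runRank(int rank, int worldSize, const std::vector<float> &shardData) {
    using namespace infini;
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Runtime cuda = make_ref<CudaRuntimeObj>(rank);
    // Every rank calls initComm with the same task name so the communicators
    // can find each other.
    cuda->initComm("example_all_gather", worldSize, rank);

    Graph g = make_ref<GraphObj>(cuda);
    auto shard = g->addTensor(Shape{static_cast<int>(shardData.size())},
                              DataType::Float32);
    auto gather = g->addOp<AllGatherObj>(shard, std::nullopt, worldSize);

    g->dataMalloc();
    shard->copyin(shardData);
    cuda->run(g);

    // Pull each gathered shard back to the host for inspection.
    for (int i = 0; i < worldSize; ++i) {
        auto hostCopy = gather->getOutputs()[i]->clone(cpu);
        (void)hostCopy; // sketch only: nothing further is done with the copy
    }
}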