InfiniTensor/test/cuda/test_nccl_comm.cc

#ifdef INFINI_USE_NCCL
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"
#include "test.h"
static int WORLD_SIZE = 2;
namespace infini {
void allReduceSum(float *data, int deviceId) {
    // Create Runtime and setup communication
    CudaRuntimeObj *cuda_runtime = new CudaRuntimeObj(deviceId);
    int rank = deviceId;
    cuda_runtime->initComm("test_nccl_comm", WORLD_SIZE, rank);
    ncclComm_t comm =
        dynamic_cast<NcclCommunicatorObj &>(cuda_runtime->getCommunicator())
            .getNcclComm();

    // Copy the single input element to the GPU controlled by this thread
    float *data_gpu;
    checkCudaError(cudaMalloc(&data_gpu, sizeof(float)));
    checkCudaError(
        cudaMemcpy(data_gpu, data, sizeof(float), cudaMemcpyHostToDevice));

    // Do AllReduce (one float, sum, on the default stream)
    checkNcclError(
        ncclAllReduce(data_gpu, data_gpu, 1, ncclFloat, ncclSum, comm, 0));

    // Copy data back, sync device, and release the device buffer
    checkCudaError(
        cudaMemcpy(data, data_gpu, sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaError(cudaDeviceSynchronize());
    checkCudaError(cudaFree(data_gpu));
}

// Setup communication between 2 threads, each controlling 1 GPU.
// Do AllReduce Sum on {1.0, 4.0}. Results should be {5.0, 5.0}.
TEST(NCCL, multi_gpu_communication) {
    int num_threads = WORLD_SIZE;
    float data[] = {1.0, 4.0};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < num_threads; ++gpu) {
        threads.emplace_back(allReduceSum, &data[gpu], gpu);
    }
    for (auto &thread : threads) {
        thread.join();
    }

    for (int i = 0; i < num_threads; ++i) {
        ASSERT_EQ(data[i], 5.0f);
    }
}
} // namespace infini
#endif