#ifdef INFINI_USE_NCCL
#include "cuda/cuda_runtime.h"
#include "cuda/nccl_communicator.h"
#include "test.h"
#include <thread>
#include <vector>

static int WORLD_SIZE = 2;

namespace infini {

void allReduceSum(float *data, int deviceId) {
    // Create Runtime and setup communication
    CudaRuntimeObj *cuda_runtime = new CudaRuntimeObj(deviceId);
    int rank = deviceId;
    cuda_runtime->initComm("test_nccl_comm", WORLD_SIZE, rank);
    ncclComm_t comm =
        dynamic_cast<NcclCommunicatorObj &>(cuda_runtime->getCommunicator())
            .getNcclComm();

    // Copy one float from host to device
    float *data_gpu;
    checkCudaError(cudaMalloc(&data_gpu, sizeof(float)));
    checkCudaError(
        cudaMemcpy(data_gpu, data, sizeof(float), cudaMemcpyHostToDevice));

    // Do AllReduce (in place, 1 element, sum, on the default stream)
    checkNcclError(
        ncclAllReduce(data_gpu, data_gpu, 1, ncclFloat, ncclSum, comm, 0));

    // Copy data back and sync device
    checkCudaError(
        cudaMemcpy(data, data_gpu, sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaError(cudaDeviceSynchronize());
}

// Setup communication between 2 threads, each controlling 1 GPU.
// Do AllReduce Sum on {1.0, 4.0}. Results should be {5.0, 5.0}.
TEST(NCCL, multi_gpu_communication) {
    int num_threads = WORLD_SIZE;
    float data[] = {1.0, 4.0};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < num_threads; ++gpu) {
        threads.emplace_back(allReduceSum, &data[gpu], gpu);
    }
    for (auto &thread : threads) {
        thread.join();
    }

    for (int i = 0; i < num_threads; ++i) {
        ASSERT_EQ(data[i], 5.0f);
    }
}
} // namespace infini
#endif