From c73f78703b07bacdb6a9358a35d481545697cb69 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Thu, 9 Sep 2021 00:19:05 +0800 Subject: [PATCH 1/4] Add GitHub Action for build and test in CI Add GitHub Action for build and test in CI: * build msccl * build nccl-tests * run tests with validation check --- .github/workflows/nccl-tests.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/nccl-tests.yml diff --git a/.github/workflows/nccl-tests.yml b/.github/workflows/nccl-tests.yml new file mode 100644 index 00000000..09da847e --- /dev/null +++ b/.github/workflows/nccl-tests.yml @@ -0,0 +1,31 @@ +name: CI + +on: + pull_request: + branches: + - msccl/* + +jobs: + nccl-tests: + name: Build and run nccl tests + runs-on: [self-hosted, linux, x64, gpu] + container: + image: nvcr.io/nvidia/pytorch:20.12-py3 + options: --privileged --net=host --ipc=host --gpus=all + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Build and install + run: | + make src.build -j + make install + - name: Build nccl-tests + run: | + git clone https://github.com/nvidia/nccl-tests /nccl-tests + cd /nccl-tests + make MPI=1 MPI_HOME=/usr/local/mpi -j + - name: Test local all reduce + run: | + mpirun \ + -allow-run-as-root -H localhost:4 -np 4 \ + /nccl-tests/build/all_reduce_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50 From c1d51a5106e78cef792822ecaf9a78e8a6059ddf Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Thu, 9 Sep 2021 00:40:11 +0800 Subject: [PATCH 2/4] Remove `--network` to avoid conflict Remove `--network` to avoid conflict. --- .github/workflows/nccl-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nccl-tests.yml b/.github/workflows/nccl-tests.yml index 09da847e..be304ed1 100644 --- a/.github/workflows/nccl-tests.yml +++ b/.github/workflows/nccl-tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: [self-hosted, linux, x64, gpu] container: image: nvcr.io/nvidia/pytorch:20.12-py3 - options: --privileged --net=host --ipc=host --gpus=all + options: --privileged --ipc=host --gpus=all steps: - name: Checkout uses: actions/checkout@v2 From 87f0c7f6ce8e93edd39d1aa75c176e530bc18eb9 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 10 Sep 2021 23:09:16 +0800 Subject: [PATCH 3/4] Update Update. --- .github/workflows/nccl-tests.yml | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nccl-tests.yml b/.github/workflows/nccl-tests.yml index be304ed1..eae245b3 100644 --- a/.github/workflows/nccl-tests.yml +++ b/.github/workflows/nccl-tests.yml @@ -10,22 +10,31 @@ jobs: name: Build and run nccl tests runs-on: [self-hosted, linux, x64, gpu] container: - image: nvcr.io/nvidia/pytorch:20.12-py3 + image: nvcr.io/nvidia/pytorch:19.12-py3 options: --privileged --ipc=host --gpus=all steps: - - name: Checkout + - name: Checkout msccl uses: actions/checkout@v2 - - name: Build and install + - name: Checkout nccl-tests + uses: actions/checkout@v2 + with: + repository: nvidia/nccl-tests + path: ./nccl-tests + - name: Build msccl run: | make src.build -j - make install - name: Build nccl-tests run: | - git clone https://github.com/nvidia/nccl-tests /nccl-tests - cd /nccl-tests - make MPI=1 MPI_HOME=/usr/local/mpi -j + make MPI=1 MPI_HOME=/usr/local/mpi NCCL_HOME=./build -j -C ./nccl-tests - name: Test local all reduce run: | mpirun \ - -allow-run-as-root -H localhost:4 -np 4 \ - /nccl-tests/build/all_reduce_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50 + -allow-run-as-root -H localhost:4 -np 4 -mca btl ^openib \ + -x NCCL_DEBUG=VERSION \ + ./nccl-tests/build/all_reduce_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50 + - name: Test local alltoall + run: | + mpirun \ + -allow-run-as-root -H localhost:4 -np 4 -mca btl ^openib \ + -x NCCL_DEBUG=VERSION \ + ./nccl-tests/build/alltoall_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50 From 8d74ffd0245cc3be6112dd8185ae14f2da745d52 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 10 Sep 2021 23:14:51 +0800 Subject: [PATCH 4/4] Fix build path Fix build path. --- .github/workflows/nccl-tests.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nccl-tests.yml b/.github/workflows/nccl-tests.yml index eae245b3..469c7b8d 100644 --- a/.github/workflows/nccl-tests.yml +++ b/.github/workflows/nccl-tests.yml @@ -20,21 +20,22 @@ jobs: with: repository: nvidia/nccl-tests path: ./nccl-tests - - name: Build msccl + - name: Build and install msccl run: | make src.build -j + make install - name: Build nccl-tests run: | - make MPI=1 MPI_HOME=/usr/local/mpi NCCL_HOME=./build -j -C ./nccl-tests + make MPI=1 MPI_HOME=/usr/local/mpi NCCL_HOME=../../build -j -C ./nccl-tests - name: Test local all reduce run: | mpirun \ -allow-run-as-root -H localhost:4 -np 4 -mca btl ^openib \ - -x NCCL_DEBUG=VERSION \ + -x LD_LIBRARY_PATH=/usr/local/lib -x NCCL_DEBUG=VERSION \ ./nccl-tests/build/all_reduce_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50 - name: Test local alltoall run: | mpirun \ -allow-run-as-root -H localhost:4 -np 4 -mca btl ^openib \ - -x NCCL_DEBUG=VERSION \ + -x LD_LIBRARY_PATH=/usr/local/lib -x NCCL_DEBUG=VERSION \ ./nccl-tests/build/alltoall_perf -b 1K -e 256M -f 2 -g 1 -c 1 -w 20 -n 50