From 1d10a2c8f4eba63b46cece0dcb20501e9e920c38 Mon Sep 17 00:00:00 2001 From: arhag23 Date: Wed, 23 Jul 2025 13:44:37 -0600 Subject: [PATCH 1/9] created scratch target for testing new patterns --- CMakeLists.txt | 6 +- CommFunctions/CMakeLists.txt | 5 + CommFunctions/comm.h | 6 +- main.cpp | 328 ++++++++++++++++-------------- scratch.cpp | 22 ++ tests/ipc/CMakeLists.txt | 2 +- tests/ipc/ipc_get_gather_test.cpp | 2 +- tests/mpi/CMakeLists.txt | 2 +- tests/xccl/CMakeLists.txt | 2 +- 9 files changed, 215 insertions(+), 160 deletions(-) create mode 100644 scratch.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fb140f..9615ff5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,15 +83,19 @@ add_subdirectory(CommFunctions) # add subdirectory after all configuration, may # Add your source files and executable here add_executable(${PROJECT_NAME} main.cpp) +add_executable(scratch scratch.cpp) -target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags OpenMP::OpenMP_CXX) +target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(scratch PUBLIC CommFunctions commbench_compiler_flags) install(TARGETS ${PROJECT_NAME} DESTINATION bin) install(DIRECTORY ${PROJECT_SOURCE_DIR}/CommFunctions DESTINATION include FILES_MATCHING PATTERN "*.h") if (USE_CUDA) set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(scratch.cpp PROPERTIES LANGUAGE CUDA) elseif (USE_HIP) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(scratch.cpp PROPERTIES LANGUAGE HIP) endif() if (USE_JSONCPP) diff --git a/CommFunctions/CMakeLists.txt b/CommFunctions/CMakeLists.txt index ccf3da4..bc33c82 100644 --- a/CommFunctions/CMakeLists.txt +++ b/CommFunctions/CMakeLists.txt @@ -2,6 +2,11 @@ add_library(CommFunctions INTERFACE commbench.h validate.h comm.h util.h) target_include_directories(CommFunctions INTERFACE ${MPI_CXX_COMPILER_INCLUDE_DIRS}) target_link_libraries(CommFunctions INTERFACE MPI::MPI_CXX OpenMP::OpenMP_CXX) +if (NOT USE_GASNET) + target_compile_definitions(CommFunctions INTERFACE "USE_MPI") +else() + target_compile_definitions(CommFunctions INTERFACE "USE_GASNET") +endif() if (USE_GTL) target_link_libraries(CommFunctions INTERFACE mpi_gtl_hsa) diff --git a/CommFunctions/comm.h b/CommFunctions/comm.h index b82249a..ad7e495 100644 --- a/CommFunctions/comm.h +++ b/CommFunctions/comm.h @@ -473,7 +473,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numsend]); #endif if(error) - printf("IpcOpenMemHandle error %d\n", error); + printf("comm.h:476 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numsend], recvid); } #ifdef IPC_ze @@ -508,7 +508,7 @@ // send(&memhandle, sendid); #endif if(error) - printf("IpcGetMemHandle error %d\n", error); + printf("comm.h:511 IpcGetMemHandle error %d\n", error); send(&sendoffset, recvid); } break; @@ -642,7 +642,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numrecv]); #endif if(error) - printf("IpcOpenMemHandle error %d\n", error); + printf("comm.h:645 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numrecv], sendid); } #ifdef IPC_ze diff --git a/main.cpp b/main.cpp index 4f183d9..6c2dd8f 100644 --- a/main.cpp +++ b/main.cpp @@ -1,4 +1,5 @@ #include "commbench.h" +#include #define ROOT 0 #include "validate.h" #include @@ -9,11 +10,21 @@ #include #include #include -#include using namespace CommBench; -enum pattern { p2p, gather, scatter, broadcast, reduce, alltoall, allgather }; +enum pattern { + p2p, + gather, + scatter, + broadcast, + reduce, + alltoall, + allgather, + rail, + asymmetric_fill, + symmetric_fill +}; int myid_loc; struct step { @@ -21,11 +32,11 @@ struct step { library lib; }; -/*template void FATAL_ERROR(const char *fmt, Args... args) { +template void FATAL_ERROR(const char *fmt, Args... args) { if (myid_loc == printid) { fprintf(stderr, "FATAL ERROR: "); fprintf(stderr, fmt, args...); - std::abort(); + std::exit(EXIT_FAILURE); } } @@ -41,12 +52,12 @@ template void WARNING(const char *fmt, Args... args) { fprintf(stderr, "WARNING: "); fprintf(stderr, fmt, args...); } -}*/ +} std::unordered_map> parseArgs(int argc, char *argv[]) { - static const std::array valid_args = {"use", "pattern", "validate", - "nbytes", "file"}; + static const std::array valid_args = { + "use", "pattern", "validate", "nbytes", "file"}; int i = 1; std::unordered_map> args; std::string prev = ""; @@ -63,9 +74,9 @@ parseArgs(int argc, char *argv[]) { break; } if (!valid) { -//FATAL_ERROR("unknown argument \"%s\"\n", argv[i]); - std::cout << "unknown argument \"%s\"\n" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("unknown argument \"%s\"\n", argv[i]); + // std::cout << "unknown argument \"%s\"\n" << std::endl; + // std::exit(EXIT_FAILURE); } if (args.find(arg) == args.end()) args.insert({arg, {}}); @@ -75,9 +86,9 @@ parseArgs(int argc, char *argv[]) { args[prev].push_back(cur); } else { -//FATAL_ERROR("unknown argument", argv[i]); - std::cout << "unknown argument" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("unknown argument", argv[i]); + // std::cout << "unknown argument" << std::endl; + // std::exit(EXIT_FAILURE); } i++; } @@ -89,39 +100,40 @@ library parseLib(const std::string &libStr) { return library::MPI; else if (libStr == "ipc_put") { #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI)) - //FATAL_ERROR("Cannot use IPC without compiling for CUDA, " - // "ROCm, or OneAPI\n"); - std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("Cannot use IPC without compiling for CUDA, " + "ROCm, or OneAPI\n"); + // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" + // << std::endl; std::exit(EXIT_FAILURE); #endif return library::IPC; } else if (libStr == "ipc_get") { #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI)) - //FATAL_ERROR("Cannot use IPC without compiling for CUDA, " - // "ROCm, or OneAPI\n"); - std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("Cannot use IPC without compiling for CUDA, " + "ROCm, or OneAPI\n"); + // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" + // << std::endl; std::exit(EXIT_FAILURE); #endif return library::IPC_get; } else if (libStr == "xccl") { #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI)) - //FATAL_ERROR("Cannot use IPC without compiling for CUDA, " - // "ROCm, or OneAPI\n"); - std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("Cannot use IPC without compiling for CUDA, " + "ROCm, or OneAPI\n"); + // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" + // << std::endl; std::exit(EXIT_FAILURE); #endif #ifndef CAP_NCCL - //FATAL_ERROR("Not compiled for using XCCL\n"); - std::cout << "Not compiled for using XCCL" << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("Not compiled for using XCCL\n"); + // std::cout << "Not compiled for using XCCL" << std::endl; + // std::exit(EXIT_FAILURE); #endif return library::NCCL; } else { - //FATAL_ERROR("Unknown communication library option \"%s\". Please " - // "specify one of: mpi, ipc_put, ipc_get, or xccl.\n", - // libStr.c_str()); - std::cout << "Unknown communication library option. Please specify one of: mpi, ipc_put, ipc_get, or xccl." << std::endl; - std::exit(EXIT_FAILURE); + FATAL_ERROR("Unknown communication library option \"%s\". Please " + "specify one of: mpi, ipc_put, ipc_get, or xccl.\n", + libStr.c_str()); + // std::cout << "Unknown communication library option. Please specify one + // of: mpi, ipc_put, ipc_get, or xccl." << std::endl; + // std::exit(EXIT_FAILURE); } } @@ -138,13 +150,20 @@ pattern parsePattern(const std::string &patStr) { return pattern::alltoall; else if (patStr == "allgather") return pattern::allgather; + else if (patStr == "rail") + return pattern::rail; + else if (patStr == "asymmetric_fill") + return pattern::asymmetric_fill; + else if (patStr == "symmetric_fill") + return pattern::symmetric_fill; else { - std::cout << "Unknown communication pattern. Please use one of: p2p, broadcast, gather, scatter, alltoall, or allgather." << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR( - // "Unknown communication pattern \"%s\". Please use one " - // "of: p2p, broadcast, gather, scatter, alltoall, or allgather.\n", - // patStr.c_str()); + // std::cout << "Unknown communication pattern. Please use one of: p2p, + // broadcast, gather, scatter, alltoall, or allgather." << std::endl; + // std::exit(EXIT_FAILURE); + FATAL_ERROR( + "Unknown communication pattern \"%s\". Please use one " + "of: p2p, broadcast, gather, scatter, alltoall, or allgather.\n", + patStr.c_str()); } } @@ -158,144 +177,149 @@ int main(int argc, char *argv[]) { parseArgs(argc, argv); if (args.find("pattern") != args.end() && args.find("file") != args.end()) { - std::cout << "Cannot use both the --file and --pattern options." << std::endl; - std::exit(EXIT_FAILURE); + // std::cout << "Cannot use both the --file and --pattern options." << + // std::endl; std::exit(EXIT_FAILURE); + FATAL_ERROR("Cannot use both the --file and --pattern options.\n"); } - //FATAL_ERROR("Cannot use both the --file and --pattern options.\n"); library lib_def = library::MPI; if (args["use"].size() != 0) { lib_def = parseLib(args["use"][0]); } else { - std::cout << "No communication library specified, using MPI by default" << std::endl; - //WARNING("No communication library specified, using MPI by default\n"); + // std::cout << "No communication library specified, using MPI by default" + // << std::endl; + WARNING("No communication library specified, using MPI by default\n"); } std::vector steps; if (args.find("file") != args.end()) { - #ifndef CONFIG_FILE - std::cout << "Cannot use the --file flag unless compiled with jsoncpp support." << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR("Cannot use the --file flag unless compiled with jsoncpp support.\n"); - #else +#ifndef CONFIG_FILE + // std::cout << "Cannot use the --file flag unless compiled with jsoncpp + // support." << std::endl; std::exit(EXIT_FAILURE); + FATAL_ERROR( + "Cannot use the --file flag unless compiled with jsoncpp support.\n"); +#else std::ifstream file(args["file"][0], std::ifstream::binary); if (!file.is_open()) - std::cout << "Could not open file." << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR("Could not open file \"%s\"\n", args["file"][0]); + // std::cout << "Could not open file." << std::endl; + // std::exit(EXIT_FAILURE); + FATAL_ERROR("Could not open file \"%s\"\n", args["file"][0]); - Json::Value root; - Json::CharReaderBuilder builder; - std::string errs; - int step = 1; + Json::Value root; + Json::CharReaderBuilder builder; + std::string errs; + int step = 1; - if (!Json::parseFromStream(builder, file, &root, &errs)) - std::cout << "Failed to parse JSON" << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR("Failed to parse JSON\n"); + if (!Json::parseFromStream(builder, file, &root, &errs)) + // std::cout << "Failed to parse JSON" << std::endl; + // std::exit(EXIT_FAILURE); + FATAL_ERROR("Failed to parse JSON\n"); - if (root.isMember("steps") && root["steps"].isArray()) { - Json::Value &stepsArray = root["steps"]; - for (const auto &stepNumber : stepsArray) { - std::cout << "STEP " << step << std::endl; - step = step + 1; - library lib = lib_def; - if (stepNumber.isMember("library")) - lib = parseLib(stepNumber["library"].asString()); - std::vector patterns; - Json::Value patternsArray = stepNumber["patterns"]; - for (const auto &testType : patternsArray) - patterns.push_back(parsePattern(testType.asString())); - steps.push_back({patterns, lib}); - } - } - #endif - } else { - std::vector patterns; - for (int i = 0; i < args["pattern"].size(); i++) { - patterns.push_back(parsePattern(args["pattern"][i])); + if (root.isMember("steps") && root["steps"].isArray()) { + Json::Value &stepsArray = root["steps"]; + for (const auto &stepNumber : stepsArray) { + std::cout << "STEP " << step << std::endl; + step = step + 1; + library lib = lib_def; + if (stepNumber.isMember("library")) + lib = parseLib(stepNumber["library"].asString()); + std::vector patterns; + Json::Value patternsArray = stepNumber["patterns"]; + for (const auto &testType : patternsArray) + patterns.push_back(parsePattern(testType.asString())); + steps.push_back({patterns, lib}); } + } +#endif + } else { + std::vector patterns; + for (int i = 0; i < args["pattern"].size(); i++) { + patterns.push_back(parsePattern(args["pattern"][i])); + } - if (patterns.size() == 0) { - std::cout << "No communication pattern specified, using P2P by default." << std::endl; - //WARNING("No communication pattern specified, using P2P by default\n"); - patterns.push_back(p2p); - } - steps.push_back({patterns, lib_def}); + if (patterns.size() == 0) { + // std::cout << "No communication pattern specified, using P2P by + // default." << std::endl; + WARNING("No communication pattern specified, using P2P by default\n"); + patterns.push_back(p2p); } + steps.push_back({patterns, lib_def}); + } - bool run_validate = false; - if (args.find("validate") != args.end()) - run_validate = true; + bool run_validate = false; + if (args.find("validate") != args.end()) + run_validate = true; - int *sendbuf; - int *recvbuf; - size_t numbytes = 1e8; - if (args.find("nbytes") != args.end()) { - if (args["nbytes"].size() == 0) - std::cout << "Missing number of bytes argument for --nbytes" << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR("Missing number of bytes argument for --nbytes"); - try { - numbytes = std::stoull(args["nbytes"][0]); - } catch (...) { - std::cout << "Invalid input to --nbytes" << std::endl; - std::exit(EXIT_FAILURE); - //FATAL_ERROR("Invalid input \"%s\" to --nbytes.", args["nbytes"][0]); - } + int *sendbuf; + int *recvbuf; + size_t numbytes = 1e8; + if (args.find("nbytes") != args.end()) { + if (args["nbytes"].size() == 0) + // std::cout << "Missing number of bytes argument for --nbytes" << + // std::endl; + // std::exit(EXIT_FAILURE); + FATAL_ERROR("Missing number of bytes argument for --nbytes"); + try { + numbytes = std::stoull(args["nbytes"][0]); + } catch (...) { + // std::cout << "Invalid input to --nbytes" << std::endl; + // std::exit(EXIT_FAILURE); + FATAL_ERROR("Invalid input \"%s\" to --nbytes.", + args["nbytes"][0].c_str()); } + } - allocate(sendbuf, numbytes * numproc); - allocate(recvbuf, numbytes * numproc); - for (int j = 0; j < steps.size(); j++) { - const auto &patterns = steps[j].patterns; - for (int i = 0; i < patterns.size(); i++) { - Comm test(steps[j].lib); - switch (patterns[i]) { - case pattern::p2p: - test.add(sendbuf, recvbuf, numbytes, 0, 1); - break; - case pattern::broadcast: - for (int p = 0; p < numproc; p++) - test.add(sendbuf, 0, recvbuf, 0, numbytes, ROOT, p); - break; - case pattern::gather: - for (int p = 0; p < numproc; p++) - test.add(sendbuf, 0, recvbuf, p * numbytes, numbytes, p, ROOT); - break; - case pattern::scatter: - for (int p = 0; p < numproc; p++) - test.add(sendbuf, p * numbytes, recvbuf, 0, numbytes, ROOT, p); - break; - case pattern::alltoall: - for (int sender = 0; sender < numproc; sender++) - for (int recver = 0; recver < numproc; recver++) - test.add(sendbuf, recver * numbytes, recvbuf, sender * numbytes, - numbytes, sender, recver); - break; - case pattern::allgather: - for (int sender = 0; sender < numproc; sender++) - for (int recver = 0; recver < numproc; recver++) - test.add(sendbuf, 0, recvbuf, sender * numbytes, numbytes, sender, - recver); - break; - default:; // error? - } - if (run_validate) - validate(sendbuf, recvbuf, numbytes, patterns[i], test); - else { -#ifndef BENCH_CALIPER - test.measure(5, 10, numbytes * numproc); + allocate(sendbuf, numbytes * numproc); + allocate(recvbuf, numbytes * numproc); + for (int j = 0; j < steps.size(); j++) { + const auto &patterns = steps[j].patterns; + for (int i = 0; i < patterns.size(); i++) { + Comm test(steps[j].lib); + switch (patterns[i]) { + case pattern::p2p: + test.add(sendbuf, recvbuf, numbytes, 0, 1); + break; + case pattern::broadcast: + for (int p = 0; p < numproc; p++) + test.add(sendbuf, 0, recvbuf, 0, numbytes, ROOT, p); + break; + case pattern::gather: + for (int p = 0; p < numproc; p++) + test.add(sendbuf, 0, recvbuf, p * numbytes, numbytes, p, ROOT); + break; + case pattern::scatter: + for (int p = 0; p < numproc; p++) + test.add(sendbuf, p * numbytes, recvbuf, 0, numbytes, ROOT, p); + break; + case pattern::alltoall: + for (int sender = 0; sender < numproc; sender++) + for (int recver = 0; recver < numproc; recver++) + test.add(sendbuf, recver * numbytes, recvbuf, sender * numbytes, + numbytes, sender, recver); + break; + case pattern::allgather: + for (int sender = 0; sender < numproc; sender++) + for (int recver = 0; recver < numproc; recver++) + test.add(sendbuf, 0, recvbuf, sender * numbytes, numbytes, sender, + recver); + break; + default:; // error? + } + if (run_validate) + validate(sendbuf, recvbuf, numbytes, patterns[i], test); + else { +#ifdef BENCH_CALIPER + test.measure_caliper(5, 10); #else - test.measure_caliper(5, 10); + test.measure(5, 10, numbytes * numproc); #endif - } } } - free(sendbuf); - free(recvbuf); + } + free(sendbuf); + free(recvbuf); - MPI_Finalize(); + MPI_Finalize(); - return 0; - } + return 0; +} diff --git a/scratch.cpp b/scratch.cpp new file mode 100644 index 0000000..b7a655a --- /dev/null +++ b/scratch.cpp @@ -0,0 +1,22 @@ +#include "commbench.h" +#include + +using namespace CommBench; + +int main(int argc, char* argv[]) { + init(); + + int *sendbuf, *recvbuf; + size_t num_bytes = 1e8; + const unsigned N = 2, G = 4, K = 1; + + allocate(sendbuf, num_bytes * numproc); + allocate(recvbuf, num_bytes * numproc); + + Comm test(library::MPI); + + // Rail + + + MPI_Finalize(); +} \ No newline at end of file diff --git a/tests/ipc/CMakeLists.txt b/tests/ipc/CMakeLists.txt index d03e1f0..f745f85 100644 --- a/tests/ipc/CMakeLists.txt +++ b/tests/ipc/CMakeLists.txt @@ -93,4 +93,4 @@ set_tests_properties(IPC_GET_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "P add_test(NAME IPC_GET_ALLGATHER_Test COMMAND ${MPIEXEC} -n 3 $ ) -set_tests_properties(IPC_GET_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") \ No newline at end of file +set_tests_properties(IPC_GET_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") diff --git a/tests/ipc/ipc_get_gather_test.cpp b/tests/ipc/ipc_get_gather_test.cpp index 54062ce..ffd9cd4 100644 --- a/tests/ipc/ipc_get_gather_test.cpp +++ b/tests/ipc/ipc_get_gather_test.cpp @@ -18,7 +18,7 @@ int main() { allocate(sendbuf, numbytes * numproc); allocate(recvbuf, numbytes * numproc); - Comm test1(IPC); + Comm test1(IPC_get); for(int p = 0; p < numproc; p++){ test1.add(sendbuf, 0, recvbuf, p * numbytes, numbytes, p, ROOT); } diff --git a/tests/mpi/CMakeLists.txt b/tests/mpi/CMakeLists.txt index 3470b4f..dbc6253 100644 --- a/tests/mpi/CMakeLists.txt +++ b/tests/mpi/CMakeLists.txt @@ -49,4 +49,4 @@ set_tests_properties(MPI_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSE add_test(NAME MPI_ALLGATHER_Test COMMAND ${MPIEXEC} -n 3 $ ) -set_tests_properties(MPI_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") \ No newline at end of file +set_tests_properties(MPI_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") diff --git a/tests/xccl/CMakeLists.txt b/tests/xccl/CMakeLists.txt index 1fb1f7e..94a34fc 100644 --- a/tests/xccl/CMakeLists.txt +++ b/tests/xccl/CMakeLists.txt @@ -48,4 +48,4 @@ set_tests_properties(XCCL_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASS add_test(NAME XCCL_ALLGATHER_Test COMMAND ${MPIEXEC} -n 3 $ ) -set_tests_properties(XCCL_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") \ No newline at end of file +set_tests_properties(XCCL_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!") From 1867acf2ec018d3fc5c4b9aacdb10d222a7f5912 Mon Sep 17 00:00:00 2001 From: arhag23 Date: Mon, 28 Jul 2025 15:24:37 -0600 Subject: [PATCH 2/9] initial setup --- CMakeLists.txt | 8 +++--- CommFunctions/validate.h | 6 ++--- bench.cpp | 54 ++++++++++++++++++++++++++++++++++++++++ main.cpp | 27 +++++++++++++++++--- scratch.cpp | 22 ---------------- 5 files changed, 84 insertions(+), 33 deletions(-) create mode 100644 bench.cpp delete mode 100644 scratch.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9615ff5..7cc9361 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,19 +83,19 @@ add_subdirectory(CommFunctions) # add subdirectory after all configuration, may # Add your source files and executable here add_executable(${PROJECT_NAME} main.cpp) -add_executable(scratch scratch.cpp) +add_executable(bench bench.cpp) target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags) -target_link_libraries(scratch PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench PUBLIC CommFunctions commbench_compiler_flags) install(TARGETS ${PROJECT_NAME} DESTINATION bin) install(DIRECTORY ${PROJECT_SOURCE_DIR}/CommFunctions DESTINATION include FILES_MATCHING PATTERN "*.h") if (USE_CUDA) set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(scratch.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench.cpp PROPERTIES LANGUAGE CUDA) elseif (USE_HIP) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) - set_source_files_properties(scratch.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench.cpp PROPERTIES LANGUAGE HIP) endif() if (USE_JSONCPP) diff --git a/CommFunctions/validate.h b/CommFunctions/validate.h index 8d03cd7..4c26da9 100644 --- a/CommFunctions/validate.h +++ b/CommFunctions/validate.h @@ -1,5 +1,5 @@ template -void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll) { +void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll, int source = 0, int dest = 1) { int myid = CommBench::myid; int numproc = CommBench::numproc; @@ -26,8 +26,8 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c switch(pattern) { case 0: { - if(myid == 0) printf("VERIFY P2P\n"); - if(myid == 1) { + if(myid == source) printf("VERIFY P2P\n"); + if(myid == dest) { for(size_t i = 0; i < count; i++) { // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != i) diff --git a/bench.cpp b/bench.cpp new file mode 100644 index 0000000..b241710 --- /dev/null +++ b/bench.cpp @@ -0,0 +1,54 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + int *sendbuf, *recvbuf; + int max_num_bytes = 17; + library libs[] = {library::MPI, library::IPC_get, library::IPC, + library::NCCL}; + std::string libs_str[] = {"mpi", "ipc_get", "ipc_put", "xccl"}; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 0; i < 4; i++) { + if (myid == 0) + fprintf(stderr, "Testing library %s\n", libs_str[i].c_str()); + for (int source = 0; source < gpus; source++) { + for (int dest = 0; dest < gpus; dest++) { + // fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); + if (myid == 0) + fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); + for (int size = 4; size < max_num_bytes; size += 1) { + Comm bench(libs[i]); + // if (myid == 0) { + // sprintf(file_name, "data_CPX_all_prelim/%s/%d_%d_%llu.out", + // libs_str[i].c_str(), source, dest, 1ull << size); + // freopen(file_name, "w", stdout); + // } + bench.add(sendbuf, recvbuf, 1ull << size, source, dest); + bench.measure(5, 10); + // bench.clear(); + } + double time = MPI_Wtime(); + while (MPI_Wtime() < time + 1.0) ; + } + } + + // fprintf(stderr, "\033[2K\rCompleted testing library %s\n", + // libs_str[i].c_str()); + if (myid == 0) + fprintf(stderr, "Completed testing library %s\n", libs_str[i].c_str()); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} \ No newline at end of file diff --git a/main.cpp b/main.cpp index 6c2dd8f..51361c9 100644 --- a/main.cpp +++ b/main.cpp @@ -56,8 +56,8 @@ template void WARNING(const char *fmt, Args... args) { std::unordered_map> parseArgs(int argc, char *argv[]) { - static const std::array valid_args = { - "use", "pattern", "validate", "nbytes", "file"}; + static const std::array valid_args = { + "use", "pattern", "validate", "nbytes", "file", "dest", "source"}; int i = 1; std::unordered_map> args; std::string prev = ""; @@ -269,6 +269,25 @@ int main(int argc, char *argv[]) { } } + int source = 0, dest = 1; + + if (args.find("source") != args.end()) { + try { + source = std::stoi(args["source"][0]); + } catch (...) { + FATAL_ERROR("invalid input \"%s\" to --source.", args["source"][0].c_str()); + } + } + + if (args.find("dest") != args.end()) { + try { + dest = std::stoi(args["dest"][0]); + } catch (...) { + FATAL_ERROR("invalid input \"%s\" to --dest.", args["dest"][0].c_str()); + } + } + + allocate(sendbuf, numbytes * numproc); allocate(recvbuf, numbytes * numproc); for (int j = 0; j < steps.size(); j++) { @@ -277,7 +296,7 @@ int main(int argc, char *argv[]) { Comm test(steps[j].lib); switch (patterns[i]) { case pattern::p2p: - test.add(sendbuf, recvbuf, numbytes, 0, 1); + test.add(sendbuf, recvbuf, numbytes, source, dest); break; case pattern::broadcast: for (int p = 0; p < numproc; p++) @@ -306,7 +325,7 @@ int main(int argc, char *argv[]) { default:; // error? } if (run_validate) - validate(sendbuf, recvbuf, numbytes, patterns[i], test); + validate(sendbuf, recvbuf, numbytes, patterns[i], test, source, dest); else { #ifdef BENCH_CALIPER test.measure_caliper(5, 10); diff --git a/scratch.cpp b/scratch.cpp deleted file mode 100644 index b7a655a..0000000 --- a/scratch.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "commbench.h" -#include - -using namespace CommBench; - -int main(int argc, char* argv[]) { - init(); - - int *sendbuf, *recvbuf; - size_t num_bytes = 1e8; - const unsigned N = 2, G = 4, K = 1; - - allocate(sendbuf, num_bytes * numproc); - allocate(recvbuf, num_bytes * numproc); - - Comm test(library::MPI); - - // Rail - - - MPI_Finalize(); -} \ No newline at end of file From 453c4b62e1771f18aefb92e8838cf4b6b30bb812 Mon Sep 17 00:00:00 2001 From: arhag23 <35051569+arhag23@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:52:36 -0600 Subject: [PATCH 3/9] Update bench.cpp remove wait --- bench.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bench.cpp b/bench.cpp index b241710..35ce7aa 100644 --- a/bench.cpp +++ b/bench.cpp @@ -38,8 +38,6 @@ int main(int argc, char *argv[]) { bench.measure(5, 10); // bench.clear(); } - double time = MPI_Wtime(); - while (MPI_Wtime() < time + 1.0) ; } } @@ -51,4 +49,4 @@ int main(int argc, char *argv[]) { free(sendbuf); free(recvbuf); MPI_Finalize(); -} \ No newline at end of file +} From 62c8ac70b9e26ae12fd239abcce042bfb7bb0012 Mon Sep 17 00:00:00 2001 From: arhag23 Date: Wed, 30 Jul 2025 16:12:35 -0600 Subject: [PATCH 4/9] Benchmarking and plotting utils --- CommFunctions/comm.h | 116 +++++++++++++++++------------------ CommFunctions/commbench.h | 98 +++++++++++++++--------------- CommFunctions/util.h | 124 +++++++++++++++++++------------------- CommFunctions/validate.h | 34 +++++------ bench.cpp | 56 +++++++++++------ parse.py | 22 +++++++ visualize.py | 44 ++++++++++++++ visualize_df.py | 47 +++++++++++++++ 8 files changed, 337 insertions(+), 204 deletions(-) create mode 100644 parse.py create mode 100644 visualize.py create mode 100644 visualize_df.py diff --git a/CommFunctions/comm.h b/CommFunctions/comm.h index ad7e495..d1d31f7 100644 --- a/CommFunctions/comm.h +++ b/CommFunctions/comm.h @@ -131,28 +131,28 @@ numrecv = 0; if(myid == printid) { - printf("printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc); - printf(" Port: "); + fprintf(stderr, "printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc); + fprintf(stderr, " Port: "); #ifdef PORT_CUDA - printf("CUDA, "); + fprintf(stderr, "CUDA, "); #elif defined PORT_HIP - printf("HIP, "); + fprintf(stderr, "HIP, "); #elif defined PORT_ONEAPI - printf("ONEAPI, "); + fprintf(stderr, "ONEAPI, "); #else - printf("CPU, "); + fprintf(stderr, "CPU, "); #endif #ifdef CAP_NCCL - printf("NCCL, "); + fprintf(stderr, "NCCL, "); #elif CAP_ONECCL - printf("ONECCL, "); + fprintf(stderr, "ONECCL, "); #endif #ifdef IPC_kernel - printf("IPC will call a kernel, \n"); + fprintf(stderr, "IPC will call a kernel, \n"); #endif - printf("Library: "); + fprintf(stderr, "Library: "); print_lib(lib); - printf("\n"); + fprintf(stderr, "\n"); } if(lib == NCCL) { #ifdef CAP_NCCL @@ -165,7 +165,7 @@ broadcast(&id); ncclCommInitRank(&comm_nccl, numproc, id, myid); // this is where it the third gpu hangs if(myid == printid) - printf("******************** NCCL COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** NCCL COMMUNICATOR IS CREATED\n"); } #ifdef PORT_CUDA cudaStreamCreate(&stream_nccl); @@ -195,7 +195,7 @@ auto ctx = ccl::create_context(CommBench::q.get_context()); comm_ccl = new ccl::communicator(ccl::create_communicator(numproc, myid, dev, ctx, kvs)); if(myid == printid) - printf("******************** ONECCL COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** ONECCL COMMUNICATOR IS CREATED\n"); stream_ccl = new ccl::stream(ccl::create_stream(CommBench::q)); } #endif @@ -215,7 +215,7 @@ zeDeviceGetCommandQueueGroupProperties(hDevice, &numQueueGroups, queueProperties.data()); int n_commands_lists = 0; if(myid == printid) - printf("device descovery:\n"); + fprintf(stderr, "device descovery:\n"); for (uint32_t i = 0; i < numQueueGroups; i++) { bool isCompute = false; bool isCopy = false; @@ -224,10 +224,10 @@ if ((queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) isCopy = true; if(myid == printid) - printf("group %d isCompute %d isCopy %d\n", i, isCompute, isCopy); + fprintf(stderr, "group %d isCompute %d isCopy %d\n", i, isCompute, isCopy); for (uint32_t j = 0; j < queueProperties[i].numQueues; j++) { if(myid == printid) - printf(" queue: %d\n", j); + fprintf(stderr, " queue: %d\n", j); n_commands_lists++; } } @@ -253,7 +253,7 @@ } } if(myid == printid) - printf("number of command queues: %ld\n", command_queue.size()); + fprintf(stderr, "number of command queues: %ld\n", command_queue.size()); } #endif } @@ -281,7 +281,7 @@ buffer_list.clear(); buffer_count.clear(); if(myid == printid) - printf("memory freed.\n"); + fprintf(stderr, "memory freed.\n"); } template @@ -310,9 +310,9 @@ T *ptr = nullptr; // MPI_Recv(&ptr, sizeof(T*), MPI_BYTE, i, 0, comm_mpi, MPI_STATUS_IGNORE); recv(&ptr, i); - printf("Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count); + fprintf(stderr, "Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count); print_data(count * sizeof(T)); - printf(")\n"); + fprintf(stderr, ")\n"); } } }*/ @@ -334,7 +334,7 @@ // OMIT ZERO MESSAGE SIZE if(count == 0) { if(myid == printid) - printf("Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid); + fprintf(stderr, "Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid); return; } // ADJUST MESSAGE SIZE @@ -365,11 +365,11 @@ pair(&sendoffset, &sendoffset_temp, sendid, printid); pair(&recvoffset, &recvoffset_temp, recvid, printid); if(myid == printid) { - printf("Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count); + fprintf(stderr, "Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count); print_data(count * sizeof(T)); - printf(") "); + fprintf(stderr, ") "); print_lib(lib); - printf("\n"); + fprintf(stderr, "\n"); } } numcomm++; @@ -398,13 +398,13 @@ // PUT (SENDER INITIALIZES) pair(&queue, queue_temp, sendid, printid); if(myid == printid) - printf("selected put queue: %d\n", queue_temp); + fprintf(stderr, "selected put queue: %d\n", queue_temp); } if(lib == IPC_get) { // GET (RECVER INITIALIZES) pair(&queue, queue_temp, recvid, printid); if(myid == printid) - printf("selected get queue: %d\n", queue_temp); + fprintf(stderr, "selected get queue: %d\n", queue_temp); } } } @@ -448,7 +448,7 @@ if(sendid != recvid) { int error = -1; #ifdef PORT_CUDA - printf("trying ipc \n"); + fprintf(stderr, "trying ipc \n"); cudaIpcMemHandle_t memhandle; recv(&memhandle, recvid); error = cudaIpcOpenMemHandle((void**)&remotebuf[numsend], memhandle, cudaIpcMemLazyEnablePeerAccess); @@ -473,7 +473,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numsend]); #endif if(error) - printf("comm.h:476 IpcOpenMemHandle error %d\n", error); + fprintf(stderr, "comm.h:476 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numsend], recvid); } #ifdef IPC_ze @@ -508,7 +508,7 @@ // send(&memhandle, sendid); #endif if(error) - printf("comm.h:511 IpcGetMemHandle error %d\n", error); + fprintf(stderr, "comm.h:511 IpcGetMemHandle error %d\n", error); send(&sendoffset, recvid); } break; @@ -596,7 +596,7 @@ // send(&memhandle, sendid); #endif if(error) - printf("IpcGetMemHandle error %d\n", error); + fprintf(stderr, "IpcGetMemHandle error %d\n", error); send(&recvoffset, sendid); } break; @@ -642,7 +642,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numrecv]); #endif if(error) - printf("comm.h:645 IpcOpenMemHandle error %d\n", error); + fprintf(stderr, "comm.h:645 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numrecv], sendid); } #ifdef IPC_ze @@ -702,14 +702,14 @@ double maxTime; double avgTime; CommBench::measure(warmup, numiter, minTime, medTime, maxTime, avgTime, *this); - if(myid == printid) { + if(myid == 0) { size_t data = count * sizeof(T); - printf("data: "); print_data(data); printf("\n"); - printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); - printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); - printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); - printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); - printf("\n"); + // fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n"); + fprintf(stdout, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); + fprintf(stdout, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); + fprintf(stdout, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); + fprintf(stdout, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); + // fprintf(stderr, "\n"); } }; @@ -735,19 +735,19 @@ std::vector matrix = getMatrix(); if(myid == printid) { - printf("\nCommBench %d: ", benchid); + fprintf(stderr, "\nCommBench %d: ", benchid); print_lib(lib); - printf(" communication matrix (reciever x sender) nnz: %d\n", numcomm); + fprintf(stderr, " communication matrix (reciever x sender) nnz: %d\n", numcomm); for(int recver = 0; recver < numproc; recver++) { for(int sender = 0; sender < numproc; sender++) { size_t count = matrix[recver * numproc + sender]; if(count) - printf("%ld ", count); - // printf("1 "); + fprintf(stderr, "%ld ", count); + // fprintf(stderr, "1 "); else - printf(". "); + fprintf(stderr, ". "); } - printf("\n"); + fprintf(stderr, "\n"); } } long sendTotal = 0; @@ -775,24 +775,24 @@ MPI_Allreduce(MPI_IN_PLACE, &total_count, 1, MPI_LONG, MPI_SUM, comm_mpi); if(myid == printid) { for(int p = 0; p < numproc; p++) { - printf("proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]); + fprintf(stderr, "proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]); print_data(total_counts[p] * sizeof(T)); - printf("\n"); + fprintf(stderr, "\n"); } - printf("total pieces: %d count %ld ", total_buff, total_count); + fprintf(stderr, "total pieces: %d count %ld ", total_buff, total_count); print_data(total_count * sizeof(T)); - printf("\n"); + fprintf(stderr, "\n"); } }*/ if(myid == printid) { - printf("send footprint: %ld ", sendTotal); + fprintf(stderr, "send footprint: %ld ", sendTotal); print_data(sendTotal * sizeof(T)); - printf("\n"); - printf("recv footprint: %ld ", recvTotal); + fprintf(stderr, "\n"); + fprintf(stderr, "recv footprint: %ld ", recvTotal); print_data(recvTotal * sizeof(T)); - printf("\n"); - printf("\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); } } template @@ -817,12 +817,12 @@ /* if(myid == printid) { char filename[2048]; - sprintf(filename, "matrix_%d.txt", benchid); + sfprintf(stderr, filename, "matrix_%d.txt", benchid); FILE *matfile = fopen(filename, "w"); for(int recver = 0; recver < numproc; recver++) { for(int sender = 0; sender < numproc; sender++) - fprintf(matfile, "%ld ", matrix[sender * numproc + recver]); - fprintf(matfile, "\n"); + ffprintf(stderr, matfile, "%ld ", matrix[sender * numproc + recver]); + ffprintf(stderr, matfile, "\n"); } fclose(matfile); }*/ @@ -969,7 +969,7 @@ #endif default: print_lib(lib); - printf(" option is not implemented!\n"); + fprintf(stderr, " option is not implemented!\n"); break; } } @@ -1038,7 +1038,7 @@ #endif default: print_lib(lib); - printf(" option is not implemented!\n"); + fprintf(stderr, " option is not implemented!\n"); break; } } diff --git a/CommFunctions/commbench.h b/CommFunctions/commbench.h index 7078bc0..20f0016 100644 --- a/CommFunctions/commbench.h +++ b/CommFunctions/commbench.h @@ -92,7 +92,11 @@ namespace CommBench { +#ifdef PRINT_DEBUG static int printid = 0; +#else + static int printid = -1; +#endif static int numbench = 0; static std::vector benchlist; static int mydevice = -1; @@ -124,26 +128,26 @@ namespace CommBench static void print_data(size_t data) { if (data < 1e3) - printf("%d bytes", (int)data); + fprintf(stderr, "%d bytes", (int)data); else if (data < 1e6) - printf("%.4f KB", data / 1e3); + fprintf(stderr, "%.4f KB", data / 1e3); else if (data < 1e9) - printf("%.4f MB", data / 1e6); + fprintf(stderr, "%.4f MB", data / 1e6); else if (data < 1e12) - printf("%.4f GB", data / 1e9); + fprintf(stderr, "%.4f GB", data / 1e9); else - printf("%.4f TB", data / 1e12); + fprintf(stderr, "%.4f TB", data / 1e12); } static void print_lib(library lib) { switch(lib) { - case dummy : printf("dummy"); break; - case IPC : printf("IPC (PUT)"); break; - case IPC_get : printf("IPC (GET)"); break; - case MPI : printf("MPI"); break; - case NCCL : printf("NCCL"); break; - case GEX : printf("GASNET (PUT)"); break; - case GEX_get : printf("GASNET (GET)"); break; - case numlib : printf("numlib"); break; + case dummy : fprintf(stderr, "dummy"); break; + case IPC : fprintf(stderr, "IPC (PUT)"); break; + case IPC_get : fprintf(stderr, "IPC (GET)"); break; + case MPI : fprintf(stderr, "MPI"); break; + case NCCL : fprintf(stderr, "NCCL"); break; + case GEX : fprintf(stderr, "GASNET (PUT)"); break; + case GEX_get : fprintf(stderr, "GASNET (GET)"); break; + case numlib : fprintf(stderr, "numlib"); break; } } @@ -289,12 +293,12 @@ namespace CommBench MPI_Comm_size(comm_mpi, &numproc); if(myid == printid) { if(!init_mpi) { - printf("#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n"); + fprintf(stderr, "#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n"); int provided; MPI_Query_thread(&provided); - printf("provided thread support: %d\n", provided); + fprintf(stderr, "provided thread support: %d\n", provided); } - printf("******************** MPI COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** MPI COMMUNICATOR IS CREATED\n"); } } #endif @@ -307,7 +311,7 @@ namespace CommBench myep.push_back(ep_primordial); // primordial is index 0 myep_ptr.push_back(nullptr); // primordial segment is 0 if(myid == printid) - printf("******************** GASNET CLIENT IS CREATED\n"); + fprintf(stderr, "******************** GASNET CLIENT IS CREATED\n"); #ifdef USE_GASNET gex_AM_Entry_t handlers[] = { {am_recv_index, (gex_AM_Fn_t)am_recv, GEX_FLAG_AM_REQUEST | GEX_FLAG_AM_SHORT, 0}, @@ -371,19 +375,19 @@ namespace CommBench int numiter = times.size(); if(myid == printid) { - printf("%d measurement iterations (sorted):\n", numiter); + fprintf(stderr, "%d measurement iterations (sorted):\n", numiter); for(int iter = 0; iter < numiter; iter++) { - printf("time: %.4e", times[iter] * 1e6); + fprintf(stderr, "time: %.4e", times[iter] * 1e6); if(iter == 0) - printf(" -> min\n"); + fprintf(stderr, " -> min\n"); else if(iter == numiter / 2) - printf(" -> median\n"); + fprintf(stderr, " -> median\n"); else if(iter == numiter - 1) - printf(" -> max\n"); + fprintf(stderr, " -> max\n"); else - printf("\n"); + fprintf(stderr, "\n"); } - printf("\n"); + fprintf(stderr, "\n"); } double minTime = times[0]; double medTime = times[numiter / 2]; @@ -393,12 +397,12 @@ namespace CommBench avgTime += times[iter]; avgTime /= numiter; if(myid == printid) { - printf("data: "); print_data(data); printf("\n"); - printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); - printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); - printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); - printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); - printf("\n"); + fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n"); + fprintf(stderr, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); + fprintf(stderr, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); + fprintf(stderr, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); + fprintf(stderr, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); + fprintf(stderr, "\n"); } } @@ -457,7 +461,7 @@ namespace CommBench } //for(int i = 0; i < numproc; i++) - // printf("myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]); + // fprintf(stderr, "myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]); T *sendbuf; T *recvbuf; allocate(sendbuf, senddispl[numproc]); @@ -495,7 +499,7 @@ namespace CommBench double starts[numiter]; if(myid == printid) - printf("%d warmup iterations (in order):\n", warmup); + fprintf(stderr, "%d warmup iterations (in order):\n", warmup); for (int iter = -warmup; iter < numiter; iter++) { for(int send = 0; send < comm.numsend; send++) { #if defined PORT_CUDA @@ -519,7 +523,7 @@ namespace CommBench allreduce_max(&time); if(iter < 0) { if(myid == printid) - printf("startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6); + fprintf(stderr, "startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6); } else { starts[iter] = start; @@ -530,19 +534,19 @@ namespace CommBench std::sort(starts, starts + numiter, [](const double & a, const double & b) -> bool {return a < b;}); if(myid == printid) { - printf("%d measurement iterations (sorted):\n", numiter); + fprintf(stderr, "%d measurement iterations (sorted):\n", numiter); for(int iter = 0; iter < numiter; iter++) { - printf("start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6); + fprintf(stderr, "start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6); if(iter == 0) - printf(" -> min\n"); + fprintf(stderr, " -> min\n"); else if(iter == numiter / 2) - printf(" -> median\n"); + fprintf(stderr, " -> median\n"); else if(iter == numiter - 1) - printf(" -> max\n"); + fprintf(stderr, " -> max\n"); else - printf("\n"); + fprintf(stderr, "\n"); } - printf("\n"); + fprintf(stderr, "\n"); } minTime = times[0]; medTime = times[numiter / 2]; @@ -560,18 +564,18 @@ namespace CommBench allgather(&memory, memory_all.data()); if(myid == printid) { size_t memory_total = 0; - printf("\n"); - printf("CommBench memory report:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "CommBench memory report:\n"); for(int i = 0; i < numproc; i++) { - printf("proc: %d memory ", i); + fprintf(stderr, "proc: %d memory ", i); print_data(memory_all[i]); - printf("\n"); + fprintf(stderr, "\n"); memory_total += memory_all[i]; } - printf("total memory: "); + fprintf(stderr, "total memory: "); print_data(memory_total); - printf("\n"); - printf("\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); } } diff --git a/CommFunctions/util.h b/CommFunctions/util.h index 4c94f05..36d28f1 100644 --- a/CommFunctions/util.h +++ b/CommFunctions/util.h @@ -29,7 +29,7 @@ void setup_gpu() { cudaGetDeviceCount(&deviceCount); #ifdef CAP_NCCL if (numproc > deviceCount) { - printf("Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n"); + fprintf(stderr, "Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n"); } #endif int device = myid % deviceCount; @@ -37,10 +37,10 @@ void setup_gpu() { set_device(device); if(!init) { if(myid == printid) - printf("CUDA PORT\n"); + fprintf(stderr, "CUDA PORT\n"); // SET DEVICE if(myid == printid) - printf("deviceCount: %d\n", deviceCount); + fprintf(stderr, "deviceCount: %d\n", deviceCount); // REPORT if(myid == printid){ int error = system("nvidia-smi"); @@ -48,21 +48,21 @@ void setup_gpu() { int device; cudaGetDevice(&device); cudaGetDeviceCount(&deviceCount); - printf("Device %d Count: %d\n", device, deviceCount); + fprintf(stderr, "Device %d Count: %d\n", device, deviceCount); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp,0); - printf("Device %d name: %s\n",0,deviceProp.name); - printf("Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9); - printf("Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor); - printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); - printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem); - printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); - printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); - printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); - printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); - printf("Warp size: %d\n",deviceProp.warpSize); - printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); - printf("\n"); + fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name); + fprintf(stderr, "Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9); + fprintf(stderr, "Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor); + fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); + fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem); + fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); + fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize); + fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); + fprintf(stderr, "\n"); } } #elif defined PORT_HIP @@ -73,10 +73,10 @@ void setup_gpu() { set_device(device); if(!init) { if(myid == printid) - printf("HIP PORT\n"); + fprintf(stderr, "HIP PORT\n"); //DEVICE MANAGEMENT if(myid == printid) - printf("deviceCount: %d\n", deviceCount); + fprintf(stderr, "deviceCount: %d\n", deviceCount); // REPORT if(myid == printid) { system("rocm-smi"); @@ -84,34 +84,34 @@ void setup_gpu() { int device; hipGetDevice(&device); hipGetDeviceCount(&deviceCount); - printf("Device %d Count: %d\n", device, deviceCount); + fprintf(stderr, "Device %d Count: %d\n", device, deviceCount); hipDeviceProp_t deviceProp; hipGetDeviceProperties(&deviceProp,0); - printf("Device %d name: %s\n",0,deviceProp.name); - printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); - printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); - printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); - printf("Warp size: %d\n",deviceProp.warpSize); - printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); - printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); - printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); - printf("Clock frequency: %d khz\n",deviceProp.clockRate); - printf("Global memory frequency: %d khz\n", deviceProp.memoryClockRate); - printf("Global memory bus width: %d bits\n", deviceProp.memoryBusWidth); - printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem); - printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor); - printf("Number of multi-processors: %d\n", deviceProp.multiProcessorCount); - printf("L2 cache size: %d\n", deviceProp.l2CacheSize); - printf("Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor); - printf("Compute mode: %d\n", deviceProp.computeMode); - printf("Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate); - printf("\n"); + fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name); + fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); + fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); + fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); + fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize); + fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf(stderr, "Clock frequency: %d khz\n",deviceProp.clockRate); + fprintf(stderr, "Global memory frequency: %d khz\n", deviceProp.memoryClockRate); + fprintf(stderr, "Global memory bus width: %d bits\n", deviceProp.memoryBusWidth); + fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem); + fprintf(stderr, "Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor); + fprintf(stderr, "Number of multi-processors: %d\n", deviceProp.multiProcessorCount); + fprintf(stderr, "L2 cache size: %d\n", deviceProp.l2CacheSize); + fprintf(stderr, "Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor); + fprintf(stderr, "Compute mode: %d\n", deviceProp.computeMode); + fprintf(stderr, "Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate); + fprintf(stderr, "\n"); } } #elif defined PORT_SYCL if(!init) { if(CommBench::myid == CommBench::printid) - printf("SYCL PORT\n"); + fprintf(stderr, "SYCL PORT\n"); // Initialize the driver zeInit(0); // Discover all the driver instances @@ -134,34 +134,34 @@ void setup_gpu() { if(CommBench::myid == CommBench::printid) { if(ZE_DEVICE_TYPE_GPU == device_properties.type) - printf("driverCount %d deviceCount %d GPU\n", driverCount, deviceCount); + fprintf(stderr, "driverCount %d deviceCount %d GPU\n", driverCount, deviceCount); else - printf("GPU not found!\n"); - printf("type %d\n", device_properties.type); - printf("vendorId %d\n", device_properties.vendorId); - printf("deviceId %d\n", device_properties.deviceId); - printf("flags %d\n", device_properties.flags); - printf("subdeviceId %d\n", device_properties.subdeviceId); - printf("coreClockRate %d\n", device_properties.coreClockRate); - printf("maxMemAllocSize %ld\n", device_properties.maxMemAllocSize); - printf("maxHardwareContexts %d\n", device_properties.maxHardwareContexts); - printf("maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority); - printf("numThreadsPerEU %d\n", device_properties.numThreadsPerEU); - printf("physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth); - printf("numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice); - printf("numSlices %d\n", device_properties.numSlices); - printf("timerResolution %ld\n", device_properties.timerResolution); - printf("timestampValidBits %d\n", device_properties.timestampValidBits); - printf("kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits); + fprintf(stderr, "GPU not found!\n"); + fprintf(stderr, "type %d\n", device_properties.type); + fprintf(stderr, "vendorId %d\n", device_properties.vendorId); + fprintf(stderr, "deviceId %d\n", device_properties.deviceId); + fprintf(stderr, "flags %d\n", device_properties.flags); + fprintf(stderr, "subdeviceId %d\n", device_properties.subdeviceId); + fprintf(stderr, "coreClockRate %d\n", device_properties.coreClockRate); + fprintf(stderr, "maxMemAllocSize %ld\n", device_properties.maxMemAllocSize); + fprintf(stderr, "maxHardwareContexts %d\n", device_properties.maxHardwareContexts); + fprintf(stderr, "maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority); + fprintf(stderr, "numThreadsPerEU %d\n", device_properties.numThreadsPerEU); + fprintf(stderr, "physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth); + fprintf(stderr, "numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice); + fprintf(stderr, "numSlices %d\n", device_properties.numSlices); + fprintf(stderr, "timerResolution %ld\n", device_properties.timerResolution); + fprintf(stderr, "timestampValidBits %d\n", device_properties.timestampValidBits); + fprintf(stderr, "kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits); //for(int j = 0; j < ZE_MAX_DEVICE_UUID_SIZE; j++) - // printf("uuid %d\n", device_properties.uuid.id[j]); - printf("name %s\n", device_properties.name); - printf("\n"); + // fprintf(stderr, "uuid %d\n", device_properties.uuid.id[j]); + fprintf(stderr, "name %s\n", device_properties.name); + fprintf(stderr, "\n"); } } /*ze_bool_t test = false; zeDeviceCanAccessPeer(allDevices[0], allDevices[1], &test); - printf("can access peer %d\n", test);*/ + fprintf(stderr, "can access peer %d\n", test);*/ delete[] allDevices; } delete[] allDrivers; @@ -169,7 +169,7 @@ void setup_gpu() { #else if(!init) if(CommBench::myid == CommBench::printid) - printf("CPU VERSION\n"); + fprintf(stderr, "CPU VERSION\n"); #endif init = true; } diff --git a/CommFunctions/validate.h b/CommFunctions/validate.h index 4c26da9..7dbb31c 100644 --- a/CommFunctions/validate.h +++ b/CommFunctions/validate.h @@ -26,10 +26,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c switch(pattern) { case 0: { - if(myid == source) printf("VERIFY P2P\n"); + if(myid == source) fprintf(stderr, "VERIFY P2P\n"); if(myid == dest) { for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != i) pass = false; } @@ -38,11 +38,11 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 1: { - if(myid == ROOT) printf("VERIFY GATHER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY GATHER\n"); if(myid == ROOT) { for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != i) pass = false; } @@ -51,9 +51,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 2: { - if(myid == ROOT) printf("VERIFY SCATTER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY SCATTER\n"); for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != myid * count + i) pass = false; } @@ -61,9 +61,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 3: { - if(myid == ROOT) printf("VERIFY BCAST\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY BCAST\n"); for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != i) pass = false; } @@ -71,16 +71,16 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 4: { - if(myid == ROOT) printf("REDUCE IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "REDUCE IS NOT TESTED\n"); pass = false; } break; case 5: { - if(myid == ROOT) printf("VERIFY ALL-TO-ALL\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY ALL-TO-ALL\n"); for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != myid * count + i) pass = false; } @@ -88,10 +88,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 6: { - if(myid == ROOT) printf("VERIFY ALL-GATHER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY ALL-GATHER\n"); for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != i) pass = false; } @@ -99,13 +99,13 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 7: { - if(myid == ROOT) printf("REDUCE-SCATTER IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "REDUCE-SCATTER IS NOT TESTED\n"); pass = false; } break; case 8: { - if(myid == ROOT) printf("ALL-REDUCE IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "ALL-REDUCE IS NOT TESTED\n"); pass = false; } break; @@ -113,9 +113,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c pass = CommBench::allreduce_land(pass); if(myid == ROOT) { if(pass) - printf("PASSED!\n"); + fprintf(stderr, "PASSED!\n"); else - printf("FAILED!!!\n"); + fprintf(stderr, "FAILED!!!\n"); } CommBench::freeHost(sendbuf); diff --git a/bench.cpp b/bench.cpp index b241710..4aec884 100644 --- a/bench.cpp +++ b/bench.cpp @@ -8,45 +8,61 @@ using namespace CommBench; int main(int argc, char *argv[]) { init(); + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + int *sendbuf, *recvbuf; - int max_num_bytes = 17; - library libs[] = {library::MPI, library::IPC_get, library::IPC, - library::NCCL}; - std::string libs_str[] = {"mpi", "ipc_get", "ipc_put", "xccl"}; + int max_num_bytes = 31; char file_name[100]; int gpus = numproc; allocate(sendbuf, 1ull << max_num_bytes); allocate(recvbuf, 1ull << max_num_bytes); - for (int i = 0; i < 4; i++) { + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } if (myid == 0) - fprintf(stderr, "Testing library %s\n", libs_str[i].c_str()); + fprintf(stderr, "Testing library %s\n", lib_str); for (int source = 0; source < gpus; source++) { for (int dest = 0; dest < gpus; dest++) { - // fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); if (myid == 0) - fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); - for (int size = 4; size < max_num_bytes; size += 1) { - Comm bench(libs[i]); - // if (myid == 0) { - // sprintf(file_name, "data_CPX_all_prelim/%s/%d_%d_%llu.out", - // libs_str[i].c_str(), source, dest, 1ull << size); - // freopen(file_name, "w", stdout); - // } + fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); + // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder, + lib_str, source, dest, 1ull << size); + freopen(file_name, "w", stdout); + } bench.add(sendbuf, recvbuf, 1ull << size, source, dest); bench.measure(5, 10); // bench.clear(); } - double time = MPI_Wtime(); - while (MPI_Wtime() < time + 1.0) ; } } - // fprintf(stderr, "\033[2K\rCompleted testing library %s\n", - // libs_str[i].c_str()); if (myid == 0) - fprintf(stderr, "Completed testing library %s\n", libs_str[i].c_str()); + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + // fprintf(stderr, "Completed testing library %s\n", libs_str[i].c_str()); } free(sendbuf); free(recvbuf); diff --git a/parse.py b/parse.py new file mode 100644 index 0000000..18d87f7 --- /dev/null +++ b/parse.py @@ -0,0 +1,22 @@ +import os +import pickle as pkl +import pandas as pd +import numpy as np + +folder = "data_CPX_all" +out_file = "CPX.pkl" +df = pd.DataFrame({}); + +for comm in os.listdir(folder): + data = [] + for file_name in os.listdir(f"{folder}/{comm}"): + parts = file_name.removesuffix(".out").split(sep='_') + with open(f"{folder}/{comm}/{file_name}", "r") as file: + times = [np.float64(line.split(sep=' ')[1]) for line in file] # min, med, max, avg + data.append((np.int64(parts[2]), np.int32(parts[0]), np.int32(parts[1]), *times)) + df[comm] = data # size, source, dest, min, med, max, avg + +print(df) # df.explode() ? + +with open(out_file, mode="wb") as file: + pkl.dump(df, file) \ No newline at end of file diff --git a/visualize.py b/visualize.py new file mode 100644 index 0000000..afd0e17 --- /dev/null +++ b/visualize.py @@ -0,0 +1,44 @@ +import matplotlib.pyplot as plt +import numpy as np +import os + +def get_data(folder: str, gpus: int, size: int) -> np.ndarray: + arr = np.empty((gpus, gpus)) + + for file_name in os.listdir(folder): + parts = file_name.removesuffix(".out").split(sep='_') + if parts[2] == str(size): + with open(f"{folder}/{file_name}", "r") as file: + min_time = file.readline().split(sep=' ')[1] + arr[int(parts[0])][int(parts[1])] = min_time + + return arr + +nproc = 12 +fig, axes = plt.subplots(2, 2, figsize=(24, 24), constrained_layout=True) +folder = "data_aws_xccl" +lib = "xccl" +lib_title = "XCCL" + +for i in range(4): + ax = axes[i // 2][i % 2] + nbytes = 2 ** (8 * (i + 1) - 2) + arr = get_data(f"{folder}/{lib}", nproc, nbytes) + + ax.imshow(arr, origin="lower") + ax.set_xticks(np.arange(nproc)) + ax.set_yticks(np.arange(nproc)) + + ax.set_xlabel("Source") + ax.set_ylabel("Destination") + ax.set_title(f"{nbytes * 4} bytes") + + # Loop over data dimensions and create text annotations. + for i in range(nproc): + for j in range(nproc): + entry = arr[i, j] + text = ax.text(j, i, f"{entry:.1f}", + ha="center", va="center", color="w", fontsize=6) + +fig.suptitle(f"{lib_title} Latency") +fig.savefig(f"vis/{lib}.png", dpi=400) diff --git a/visualize_df.py b/visualize_df.py new file mode 100644 index 0000000..d0629ee --- /dev/null +++ b/visualize_df.py @@ -0,0 +1,47 @@ +import pickle as pkl +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +with open("SPX.pkl", mode="rb") as file: + df: pd.DataFrame = pkl.load(file) + +cols = ("size", "source", "dest", "min", "med", "max", "avg") +sizes = (2 ** 18, 2 ** 22, 2 ** 26, 2 ** 30) +nproc = 4 + +col_names = { "ipc_get": "IPC Get", "ipc_put": "IPC Put", "ipc_get_blit": "IPC Get BLIT Kernel", "ipc_put_blit": "IPC Put BLIT Kernel", "xccl": "XCCL", "mpi": "MPI" } + +for col in df.columns: + df_loc = pd.DataFrame([[*row] for row in df[col].values], columns=cols) + groups = df_loc.groupby(["size"]) + fig, axes = plt.subplots(2, 2, figsize=(8, 8), constrained_layout=True) + for plot_i, size in enumerate(sizes): + df_size = groups.get_group(size) + # print(df_size) + arr = np.empty((nproc, nproc), dtype=np.float64) + nbytes = size * 4 + # df_size.apply(lambda row: arr[row["source"]][row["dest"]] = nbytes / row["min"] * 1e-9) + for i, j, val in df_size[["source", "dest", "min"]].itertuples(index=False): + arr[i][j] = nbytes / val * 1e-3 + + ax = axes[plot_i // 2][plot_i % 2] + + ax.imshow(arr, origin="lower") + ax.set_xticks(np.arange(nproc)) + ax.set_yticks(np.arange(nproc)) + + ax.set_xlabel("Source") + ax.set_ylabel("Destination") + ax.set_title(f"{nbytes} bytes") + + # Loop over data dimensions and create text annotations. + for i in range(nproc): + for j in range(nproc): + entry = arr[i, j] + text = ax.text(j, i, f"{entry:.4f}", + ha="center", va="center", color="w", fontsize=6) + + fig.suptitle(f"{col_names[col]} Bandwidth in GB/s") + fig.savefig(f"vis_SPX_bandwidth/{col}.png", dpi=400) + From 97863c71fb33d34efd426b39f319b830b399a391 Mon Sep 17 00:00:00 2001 From: arhag23 Date: Thu, 7 Aug 2025 11:32:13 -0600 Subject: [PATCH 5/9] instructions --- CMakeLists.txt | 21 ++++++++--- CommFunctions/CMakeLists.txt | 4 +++ CommFunctions/comm.h | 10 +++--- CommFunctions/commbench.h | 0 CommFunctions/util.h | 0 CommFunctions/validate.h | 0 bench_alltoall.cpp | 66 ++++++++++++++++++++++++++++++++++ bench_broadcast.cpp | 66 ++++++++++++++++++++++++++++++++++ bench_gather.cpp | 67 +++++++++++++++++++++++++++++++++++ bench_p2p.cpp | 68 ++++++++++++++++++++++++++++++++++++ instructions.md | 61 ++++++++++++++++++++++++++++++++ main.cpp | 8 ++--- 12 files changed, 358 insertions(+), 13 deletions(-) mode change 100644 => 100755 CMakeLists.txt mode change 100644 => 100755 CommFunctions/CMakeLists.txt mode change 100644 => 100755 CommFunctions/comm.h mode change 100644 => 100755 CommFunctions/commbench.h mode change 100644 => 100755 CommFunctions/util.h mode change 100644 => 100755 CommFunctions/validate.h create mode 100755 bench_alltoall.cpp create mode 100755 bench_broadcast.cpp create mode 100755 bench_gather.cpp create mode 100755 bench_p2p.cpp create mode 100644 instructions.md mode change 100644 => 100755 main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index 7cc9361..e8aa377 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(USE_BLIT_KERNEL "Use Blit/Copy Kernel for IPC communication" OFF) option(USE_CALIPER "Enable Caliper benchmarking support" OFF) option(USE_JSONCPP "Enable support for passing a config file" OFF) +option(PRINT_DEBUG "Enable debug output" OFF) # Create a variable for NCCL path @@ -83,19 +84,31 @@ add_subdirectory(CommFunctions) # add subdirectory after all configuration, may # Add your source files and executable here add_executable(${PROJECT_NAME} main.cpp) -add_executable(bench bench.cpp) +add_executable(bench_p2p bench_p2p.cpp) +add_executable(bench_alltoall bench_alltoall.cpp) +add_executable(bench_broadcast bench_broadcast.cpp) +add_executable(bench_gather bench_gather.cpp) target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags) -target_link_libraries(bench PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_p2p PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_alltoall PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_broadcast PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_gather PUBLIC CommFunctions commbench_compiler_flags) install(TARGETS ${PROJECT_NAME} DESTINATION bin) install(DIRECTORY ${PROJECT_SOURCE_DIR}/CommFunctions DESTINATION include FILES_MATCHING PATTERN "*.h") if (USE_CUDA) set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(bench.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE CUDA) elseif (USE_HIP) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) - set_source_files_properties(bench.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE HIP) endif() if (USE_JSONCPP) diff --git a/CommFunctions/CMakeLists.txt b/CommFunctions/CMakeLists.txt old mode 100644 new mode 100755 index bc33c82..1b11657 --- a/CommFunctions/CMakeLists.txt +++ b/CommFunctions/CMakeLists.txt @@ -8,6 +8,10 @@ else() target_compile_definitions(CommFunctions INTERFACE "USE_GASNET") endif() +if (PRINT_DEBUG) + target_compile_definitions(CommFunctions INTERFACE "PRINT_DEBUG") +endif() + if (USE_GTL) target_link_libraries(CommFunctions INTERFACE mpi_gtl_hsa) diff --git a/CommFunctions/comm.h b/CommFunctions/comm.h old mode 100644 new mode 100755 index d1d31f7..ea87586 --- a/CommFunctions/comm.h +++ b/CommFunctions/comm.h @@ -831,10 +831,10 @@ #ifdef IPC_kernel template - __global__ void copy_kernel(T *output, T *input, size_t count) { + __global__ void copy_kernel(T *output, const T *input) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if(tid < count) - output[tid] = input[tid]; + // if(tid < count) + output[tid] = input[tid]; } #endif @@ -902,7 +902,7 @@ for(int send = 0; send < numsend; send++) { #ifdef IPC_kernel #if defined(PORT_CUDA) || defined(PORT_HIP) - copy_kernel<<<(sendcount[send] + 255) / 256, 256, 0, stream_ipc[send]>>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send]); + copy_kernel<<>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send]); #elif defined PORT_ONEAPI && !defined IPC_ze // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T)); #endif @@ -931,7 +931,7 @@ for(int recv = 0; recv < numrecv; recv++) { #ifdef IPC_kernel #if defined(PORT_CUDA) || defined(PORT_HIP) - copy_kernel<<<(recvcount[recv] + 255) / 256, 256, 0, stream_ipc[recv]>>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv], recvcount[recv]); + copy_kernel<<>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv]); #elif defined PORT_ONEAPI && !defined IPC_ze // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T)); #endif diff --git a/CommFunctions/commbench.h b/CommFunctions/commbench.h old mode 100644 new mode 100755 diff --git a/CommFunctions/util.h b/CommFunctions/util.h old mode 100644 new mode 100755 diff --git a/CommFunctions/validate.h b/CommFunctions/validate.h old mode 100644 new mode 100755 diff --git a/bench_alltoall.cpp b/bench_alltoall.cpp new file mode 100755 index 0000000..1ada1ce --- /dev/null +++ b/bench_alltoall.cpp @@ -0,0 +1,66 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 25; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, (1ull << max_num_bytes) * numproc); + allocate(recvbuf, (1ull << max_num_bytes) * numproc); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rTesting size: %llu bytes", 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%llu.out", folder, + lib_str, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int sender = 0; sender < numproc; sender++) + for (int recver = 0; recver < numproc; recver++) + bench.add(sendbuf, recver * (1ull << size), recvbuf, sender * (1ull << size), + 1ull << size, sender, recver); + bench.measure(5, 10); + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_broadcast.cpp b/bench_broadcast.cpp new file mode 100755 index 0000000..8da3cfe --- /dev/null +++ b/bench_broadcast.cpp @@ -0,0 +1,66 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 31; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int source = 0; source < gpus; source++) { + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rBroadcaster: %d, size: %llu", source, 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%llu.out", folder, + lib_str, source, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int p = 0; p < numproc; p++) + bench.add(sendbuf, 0, recvbuf, 0, 1ull << size, source, p); + bench.measure(5, 10); + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_gather.cpp b/bench_gather.cpp new file mode 100755 index 0000000..447631b --- /dev/null +++ b/bench_gather.cpp @@ -0,0 +1,67 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 27; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, (1ull << max_num_bytes) * numproc); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int dest = 0; dest < gpus; dest++) { + // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rGatherer: %d, size: %llu", dest, 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%llu.out", folder, + lib_str, dest, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int p = 0; p < numproc; p++) + bench.add(sendbuf, 0, recvbuf, p * (1ull << size), 1ull << size, p, dest); + bench.measure(5, 10); + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_p2p.cpp b/bench_p2p.cpp new file mode 100755 index 0000000..c848960 --- /dev/null +++ b/bench_p2p.cpp @@ -0,0 +1,68 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 31; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int source = 0; source < gpus; source++) { + for (int dest = 0; dest < gpus; dest++) { + if (myid == 0) + fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder, + lib_str, source, dest, 1ull << size); + freopen(file_name, "w", stdout); + } + bench.add(sendbuf, recvbuf, 1ull << size, source, dest); + // bench.add(sendbuf, recvbuf, 1ull << size, dest, source); + bench.measure(5, 10); + } + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/instructions.md b/instructions.md new file mode 100644 index 0000000..6482e2e --- /dev/null +++ b/instructions.md @@ -0,0 +1,61 @@ +# Overview of Usage + +This branch provides the same `CommBench` executable with the standard usage flags, but also adds the `--source` and `--dest` flags that can be used for P2P, gather, and broadcast patterns. Gather only supports `--dest` and broadcast only supports `--source`. + +In addition, we provide four new targets. `bench_p2p`, `bench_gather`, `bench_broadcast` and `bench_alltoall` for these respective patterns. These provide support for "mass benchmarking tests", testing all combinations of the available processors. The usage for these is `bench_ ` where you can provide a space separated list of libraries (out of `ipc_get`, `ipc_put`, `mpi`, and `xccl`) to test. + +This will then dump the output for each of these benchmarks into a lot of files in the directories `output_folder/` for each specified library. These can be parsed and visualized later. These DO NOT create the directories themselves, so make sure that the directory `output_folder/` exists for every library you want to test. Otherwise the benchmarks will silently go on + +# Building CommBench for El Dorado Benchmarking + +In this branch, most of the "debug" output is disabled by default and when enabled is directed to `stderr`. To enable this output, configure CMake with the `-DPRINT_DEBUG=ON` flag, but by default it is off so that only the necessary output is shown. + +To build, run the following in the CommBench directory: +```sh +cmake -S . -B build -DUSE_HIP=ON -DUSE_XCCL=ON -DXCCL_PATH=/opt/rocm-6.3.1 -DUSE_GTL=ON -DGTL_PATH=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0 -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE -DMPI_C_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicc -DMPI_CXX_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicxx +``` +and if you want to use CommBench's `IPC_kernel` for the `ipc_put` and `ipc_get` communications, add the flag `-DUSE_BLIT_KERNEL=ON`. It may be helpful to have two separate build folders, one with and one without this flag. Adjust `-B ` accordingly. + +Make sure to have the `PrgEnv-amd/8.6.0`, `craype-accel-amd-gfx942`, `cmake/3.24.2` and `rocm/6.3.1` modules loaded. To build for `rocm/6.4.1`, load `amd/6.4.1` and `rocm/6.4.1`. + +# Running Benchmarks + +Make sure to have to have `MPICH_GPU_SUPPORT_ENABLED=1` set. Experiment with the `HSA_ENABLE_SDMA=0` environment variable as well. + +Set up the output folders before running the benchmark. The scripts expect this file structure: +``` +pattern +├── CPX +│ ├── lib1 +│ ├── lib2 +│ └── lib3 +├── SPX +│ ├── lib1 +│ ├── lib2 +│ └── lib3 +└── TPX + ├── lib1 + ├── lib2 + └── lib3 +``` +You can technically have different libraries specified for the partitions, but as mentioned before, make sure the folders are created inside the partition. + +Allocate an instance with +```sh +flux alloc -N1 --setattr=gpumode= --conf=resource.rediscover=true --time-limit=