From 6953e92d00be85b312f266d36a00a3a9935f842e Mon Sep 17 00:00:00 2001 From: beinggod Date: Tue, 3 Feb 2026 15:43:17 +0800 Subject: [PATCH 1/3] [ROCm] feat: support rocm backend for Hygon DCU --- 3rdparty/cmake/FindPaddle.cmake | 23 +- 3rdparty/find_dependencies.cmake | 263 ++++++++++++------ 3rdparty/gflags/LICENSE | 28 ++ 3rdparty/gflags/gflags.cmake | 44 +++ 3rdparty/glog/LICENSE | 8 + 3rdparty/glog/glog.cmake | 47 ++++ 3rdparty/hipify_torch/LICENSE | 21 ++ 3rdparty/hipify_torch/hipify_torch.cmake | 15 + 3rdparty/mkl/tbb.cmake | 5 + 3rdparty/stdgpu/stdgpu.cmake | 37 ++- CMakeLists.txt | 60 +++- cmake/Open3DMakeRocmArchitectures.cmake | 52 ++++ cmake/Open3DPrintConfigurationSummary.cmake | 1 + cmake/Open3DSetGlobalProperties.cmake | 9 + cmake/Open3DShowAndAbortOnWarning.cmake | 51 +++- cpp/open3d/CMakeLists.txt | 35 +++ cpp/open3d/core/CMakeLists.txt | 20 +- cpp/open3d/core/CUDAUtils.cpp | 99 +++++++ cpp/open3d/core/CUDAUtils.h | 45 +++ cpp/open3d/core/ParallelFor.h | 18 +- .../hashmap/CUDA/CUDAHashBackendBuffer.cu | 4 + .../CUDA/CUDAHashBackendBufferAccessor.h | 8 + .../core/hashmap/CUDA/SlabHashBackend.h | 63 +++++ cpp/open3d/core/hashmap/CUDA/SlabMacros.h | 2 +- .../core/hashmap/CUDA/SlabNodeManager.h | 20 ++ cpp/open3d/core/hashmap/Dispatch.h | 2 +- cpp/open3d/core/linalg/LinalgHeadersCUDA.h | 9 +- cpp/open3d/core/nns/NeighborSearchCommon.h | 2 +- cpp/open3d/core/nns/kernel/DeviceDefs.cuh | 12 +- cpp/open3d/core/nns/kernel/MergeNetwork.cuh | 15 +- cpp/open3d/core/nns/kernel/Pair.cuh | 4 + cpp/open3d/core/nns/kernel/PtxUtils.cuh | 95 +++++++ cpp/open3d/core/nns/kernel/Reduction.cuh | 4 + cpp/open3d/core/nns/kernel/ReductionOps.cuh | 4 + cpp/open3d/core/nns/kernel/Select.cuh | 4 +- cpp/open3d/core/nns/kernel/StaticUtils.cuh | 4 + cpp/open3d/core/nns/kernel/WarpShuffle.cuh | 37 ++- cpp/open3d/ml/Helper.h | 13 + cpp/open3d/ml/contrib/CMakeLists.txt | 16 ++ .../impl/continuous_conv/ContinuousConv.cuh | 28 ++ .../ContinuousConvBackpropFilter.cuh | 30 +- .../ContinuousConvCUDAKernels.cu | 1 + .../ContinuousConvTranspose.cuh | 30 +- .../ContinuousConvTransposeBackpropFilter.cuh | 31 ++- .../ml/impl/misc/NeighborSearchCommon.h | 2 +- cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh | 28 ++ .../sparse_conv/SparseConvBackpropFilter.cuh | 32 ++- .../impl/sparse_conv/SparseConvCUDAKernels.cu | 1 + .../impl/sparse_conv/SparseConvTranspose.cuh | 30 +- .../SparseConvTransposeBackpropFilter.cuh | 32 ++- cpp/open3d/ml/paddle/CMakeLists.txt | 41 ++- cpp/open3d/ml/paddle/PaddleHelper.cpp | 2 +- cpp/open3d/ml/paddle/PaddleHelper.h | 10 +- cpp/open3d/t/geometry/kernel/CMakeLists.txt | 17 ++ cpp/open3d/t/geometry/kernel/GeometryMacros.h | 4 +- cpp/open3d/t/geometry/kernel/ImageImpl.h | 18 +- cpp/open3d/t/geometry/kernel/NPPImage.cpp | 29 ++ cpp/open3d/t/geometry/kernel/PointCloudImpl.h | 34 +-- cpp/open3d/t/geometry/kernel/TransformImpl.h | 8 +- .../t/geometry/kernel/TriangleMeshImpl.h | 8 +- .../t/geometry/kernel/VoxelBlockGridImpl.h | 58 ++-- cpp/open3d/t/pipelines/kernel/CMakeLists.txt | 16 ++ cpp/open3d/t/pipelines/kernel/FeatureImpl.h | 4 +- .../pipelines/kernel/FillInLinearSystemImpl.h | 10 +- .../kernel/RGBDOdometryJacobianImpl.h | 2 +- .../t/pipelines/kernel/RegistrationImpl.h | 224 ++++++++------- cpp/open3d/utility/Logging.h | 3 +- cpp/open3d/utility/MiniVec.h | 2 + cpp/pybind/_build_config.py.in | 3 +- cpp/pybind/t/geometry/geometry.h | 1 - hipify_custom_map.json | 43 +++ python/open3d/ml/paddle/ops/__init__.py | 25 +- python/requirements_style.txt | 1 + python/test/ml_ops/test_cublas.py | 2 +- 74 files changed, 1656 insertions(+), 353 deletions(-) create mode 100644 3rdparty/gflags/LICENSE create mode 100644 3rdparty/gflags/gflags.cmake create mode 100644 3rdparty/glog/LICENSE create mode 100644 3rdparty/glog/glog.cmake create mode 100644 3rdparty/hipify_torch/LICENSE create mode 100644 3rdparty/hipify_torch/hipify_torch.cmake create mode 100644 cmake/Open3DMakeRocmArchitectures.cmake create mode 100644 hipify_custom_map.json diff --git a/3rdparty/cmake/FindPaddle.cmake b/3rdparty/cmake/FindPaddle.cmake index fe139cf14..dd1a32eed 100644 --- a/3rdparty/cmake/FindPaddle.cmake +++ b/3rdparty/cmake/FindPaddle.cmake @@ -20,8 +20,15 @@ if(NOT Paddle_FOUND) endif() if(BUILD_CUDA_MODULE) - find_package(CUDAToolkit REQUIRED) - string(SUBSTRING ${CUDAToolkit_VERSION} 0 4 CUDA_VERSION) + if (WITH_CUDA) + find_package(CUDAToolkit REQUIRED) + string(SUBSTRING ${CUDAToolkit_VERSION} 0 4 CUDA_VERSION) + elseif(WITH_ROCM) + find_package(HIP REQUIRED) + # Reuse CUDAToolkit variables for HIP + set(CUDAToolkit_INCLUDE_DIRS ${ROCM_PATH}/include) + set(CUDAToolkit_LIBRARY_DIR ${ROCM_PATH}/lib) + endif() endif() message(STATUS "Getting Paddle properties ...") @@ -81,14 +88,20 @@ if(NOT Paddle_FOUND) list(APPEND PADDLE_LIBRARY_DIRS "${PADDLE_LIB}") if(BUILD_CUDA_MODULE) - find_library(CUDART_LIB NAMES cudart PATHS "${CUDAToolkit_LIBRARY_DIR}") - list(APPEND PADDLE_LIBRARY_DIRS "${CUDART_LIB}") + if (WITH_CUDA) + find_library(CUDART_LIB NAMES cudart PATHS "${CUDAToolkit_LIBRARY_DIR}") + list(APPEND PADDLE_LIBRARY_DIRS "${CUDART_LIB}") + endif() endif() # handle compile flags set(PADDLE_CXX_FLAGS) if(BUILD_CUDA_MODULE) - set(PADDLE_CXX_FLAGS "-DPADDLE_WITH_CUDA ${PADDLE_CXX_FLAGS}") + if (WITH_CUDA) + set(PADDLE_CXX_FLAGS -DPADDLE_WITH_CUDA ${PADDLE_CXX_FLAGS}) + elseif(WITH_ROCM) + set(PADDLE_CXX_FLAGS -DPADDLE_WITH_HIP -DPADDLE_WITH_CUSTOM_KERNEL ${PADDLE_CXX_FLAGS}) + endif() endif() set_target_properties(paddle PROPERTIES diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index 169bfbe1a..b8dbb64a7 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -473,7 +473,19 @@ ProcessorCount(NPROC) # CUDAToolkit (required at this point for subsequent checks and targets) if(BUILD_CUDA_MODULE) - find_package(CUDAToolkit REQUIRED) + if (WITH_CUDA) + find_package(CUDAToolkit REQUIRED) + elseif (WITH_ROCM) + find_package(HIP REQUIRED) + + find_package(hipblaslt REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipsparse REQUIRED) + find_package(hipsolver REQUIRED) + + # hipify_torch will automatically become available. + include(${Open3D_3RDPARTY_DIR}/hipify_torch/hipify_torch.cmake) + endif() endif() # Threads @@ -532,37 +544,49 @@ endif() # CUB (already included in CUDA 11.0+) if(BUILD_CUDA_MODULE AND CUDAToolkit_VERSION VERSION_LESS "11.0") - include(${Open3D_3RDPARTY_DIR}/cub/cub.cmake) - open3d_import_3rdparty_library(3rdparty_cub - INCLUDE_DIRS ${CUB_INCLUDE_DIRS} - DEPENDS ext_cub - ) - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub) + if (WITH_CUDA) + include(${Open3D_3RDPARTY_DIR}/cub/cub.cmake) + open3d_import_3rdparty_library(3rdparty_cub + INCLUDE_DIRS ${CUB_INCLUDE_DIRS} + DEPENDS ext_cub + ) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub) + elseif (WITH_ROCM) + set(CUB_INCLUDE_DIRS ${ROCM_PATH}/include) + open3d_import_3rdparty_library(3rdparty_cub + INCLUDE_DIRS ${CUB_INCLUDE_DIRS} + ) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub) + endif() endif() # cutlass if(BUILD_CUDA_MODULE) - if(USE_SYSTEM_CUTLASS) - find_path(3rdparty_cutlass_INCLUDE_DIR NAMES cutlass/cutlass.h) - if(3rdparty_cutlass_INCLUDE_DIR) - add_library(3rdparty_cutlass INTERFACE) - target_include_directories(3rdparty_cutlass INTERFACE ${3rdparty_cutlass_INCLUDE_DIR}) - add_library(Open3D::3rdparty_cutlass ALIAS 3rdparty_cutlass) - if(NOT BUILD_SHARED_LIBS) - install(TARGETS 3rdparty_cutlass EXPORT ${PROJECT_NAME}Targets) + if (WITH_CUDA) + if(USE_SYSTEM_CUTLASS) + find_path(3rdparty_cutlass_INCLUDE_DIR NAMES cutlass/cutlass.h) + if(3rdparty_cutlass_INCLUDE_DIR) + add_library(3rdparty_cutlass INTERFACE) + target_include_directories(3rdparty_cutlass INTERFACE ${3rdparty_cutlass_INCLUDE_DIR}) + add_library(Open3D::3rdparty_cutlass ALIAS 3rdparty_cutlass) + if(NOT BUILD_SHARED_LIBS) + install(TARGETS 3rdparty_cutlass EXPORT ${PROJECT_NAME}Targets) + endif() + else() + set(USE_SYSTEM_CUTLASS OFF) endif() - else() - set(USE_SYSTEM_CUTLASS OFF) endif() + if(NOT USE_SYSTEM_CUTLASS) + include(${Open3D_3RDPARTY_DIR}/cutlass/cutlass.cmake) + open3d_import_3rdparty_library(3rdparty_cutlass + INCLUDE_DIRS ${CUTLASS_INCLUDE_DIRS} + DEPENDS ext_cutlass + ) + endif() + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cutlass) + elseif (WITH_ROCM) + message(WARNING "ROCM is not supported for cutlass.") endif() - if(NOT USE_SYSTEM_CUTLASS) - include(${Open3D_3RDPARTY_DIR}/cutlass/cutlass.cmake) - open3d_import_3rdparty_library(3rdparty_cutlass - INCLUDE_DIRS ${CUTLASS_INCLUDE_DIRS} - DEPENDS ext_cutlass - ) - endif() - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cutlass) endif() # Dirent @@ -1756,80 +1780,99 @@ endif() # if(OPEN3D_USE_ONEAPI_PACKAGES) # cuBLAS if(BUILD_CUDA_MODULE) - if(WIN32) - # Nvidia does not provide static libraries for Windows. We don't release - # pip wheels for Windows with CUDA support at the moment. For the pip - # wheels to support CUDA on Windows out-of-the-box, we need to either - # ship the CUDA toolkit with the wheel (e.g. PyTorch can make use of the - # cudatoolkit conda package), or have a mechanism to locate the CUDA - # toolkit from the system. - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM CUDA::cusolver CUDA::cublas) - else() - # CMake docs : https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html - # cusolver 11.0: https://docs.nvidia.com/cuda/archive/11.0/cusolver/index.html#static-link-lapack - # cublas 11.0: https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library - # The link order below is important. Theoretically we should use - # open3d_find_package_3rdparty_library, but we have to insert - # liblapack_static.a in the middle of the targets. - add_library(3rdparty_cublas INTERFACE) - if(CUDAToolkit_VERSION VERSION_LESS "12.0") - target_link_libraries(3rdparty_cublas INTERFACE - CUDA::cusolver_static - ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a - CUDA::cusparse_static - CUDA::cublas_static - CUDA::cublasLt_static - CUDA::culibos - ) + if (WITH_CUDA) + if(WIN32) + # Nvidia does not provide static libraries for Windows. We don't release + # pip wheels for Windows with CUDA support at the moment. For the pip + # wheels to support CUDA on Windows out-of-the-box, we need to either + # ship the CUDA toolkit with the wheel (e.g. PyTorch can make use of the + # cudatoolkit conda package), or have a mechanism to locate the CUDA + # toolkit from the system. + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM CUDA::cusolver CUDA::cublas) else() - # In CUDA12.0 the liblapack_static.a is deprecated and removed. - # Use the libcusolver_lapack_static.a instead. - target_link_libraries(3rdparty_cublas INTERFACE - CUDA::cusolver_static - ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a - CUDA::cusparse_static - CUDA::cublas_static - CUDA::cublasLt_static - CUDA::culibos - ) + # CMake docs : https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html + # cusolver 11.0: https://docs.nvidia.com/cuda/archive/11.0/cusolver/index.html#static-link-lapack + # cublas 11.0: https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library + # The link order below is important. Theoretically we should use + # open3d_find_package_3rdparty_library, but we have to insert + # liblapack_static.a in the middle of the targets. + add_library(3rdparty_cublas INTERFACE) + if(CUDAToolkit_VERSION VERSION_LESS "12.0") + target_link_libraries(3rdparty_cublas INTERFACE + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a + CUDA::cusparse_static + CUDA::cublas_static + CUDA::cublasLt_static + CUDA::culibos + ) + else() + # In CUDA12.0 the liblapack_static.a is deprecated and removed. + # Use the libcusolver_lapack_static.a instead. + target_link_libraries(3rdparty_cublas INTERFACE + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a + CUDA::cusparse_static + CUDA::cublas_static + CUDA::cublasLt_static + CUDA::culibos + ) + endif() + if(NOT BUILD_SHARED_LIBS) + # Listed in ${CMAKE_INSTALL_PREFIX}/lib/cmake/Open3D/Open3DTargets.cmake. + install(TARGETS 3rdparty_cublas EXPORT Open3DTargets) + list(APPEND Open3D_3RDPARTY_EXTERNAL_MODULES "CUDAToolkit") + endif() + add_library(Open3D::3rdparty_cublas ALIAS 3rdparty_cublas) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cublas) endif() + elseif (WITH_ROCM) + add_library(3rdparty_hipblas INTERFACE) + target_link_libraries(3rdparty_hipblas INTERFACE + roc::hipblas + roc::hipsparse + roc::hipsolver + ) if(NOT BUILD_SHARED_LIBS) # Listed in ${CMAKE_INSTALL_PREFIX}/lib/cmake/Open3D/Open3DTargets.cmake. - install(TARGETS 3rdparty_cublas EXPORT Open3DTargets) - list(APPEND Open3D_3RDPARTY_EXTERNAL_MODULES "CUDAToolkit") + install(TARGETS 3rdparty_hipblas EXPORT Open3DTargets) endif() - add_library(Open3D::3rdparty_cublas ALIAS 3rdparty_cublas) - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cublas) + add_library(Open3D::3rdparty_hipblas ALIAS 3rdparty_hipblas) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_hipblas) endif() endif() # NPP if (BUILD_CUDA_MODULE) - # NPP library list: https://docs.nvidia.com/cuda/npp/index.html - if(WIN32) - open3d_find_package_3rdparty_library(3rdparty_cuda_npp - REQUIRED - PACKAGE CUDAToolkit - TARGETS CUDA::nppc - CUDA::nppicc - CUDA::nppif - CUDA::nppig - CUDA::nppim - CUDA::nppial - ) - else() - open3d_find_package_3rdparty_library(3rdparty_cuda_npp - REQUIRED - PACKAGE CUDAToolkit - TARGETS CUDA::nppc_static - CUDA::nppicc_static - CUDA::nppif_static - CUDA::nppig_static - CUDA::nppim_static - CUDA::nppial_static - ) + if (WITH_CUDA) + # NPP library list: https://docs.nvidia.com/cuda/npp/index.html + if(WIN32) + open3d_find_package_3rdparty_library(3rdparty_cuda_npp + REQUIRED + PACKAGE CUDAToolkit + TARGETS CUDA::nppc + CUDA::nppicc + CUDA::nppif + CUDA::nppig + CUDA::nppim + CUDA::nppial + ) + else() + open3d_find_package_3rdparty_library(3rdparty_cuda_npp + REQUIRED + PACKAGE CUDAToolkit + TARGETS CUDA::nppc_static + CUDA::nppicc_static + CUDA::nppif_static + CUDA::nppig_static + CUDA::nppim_static + CUDA::nppial_static + ) + endif() + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cuda_npp) + elseif(WITH_ROCM) + message(WARNING "ROCM is not supported for NPP. ") endif() - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cuda_npp) endif () # IPP @@ -1944,6 +1987,50 @@ else() set(BUILD_WEBRTC_COMMENT "//") endif() +# gflags +if(USE_SYSTEM_GFLAGS) + open3d_find_package_3rdparty_library(3rdparty_gflags + PACKAGE gflags + TARGETS gflags + ) + if(NOT 3rdparty_gflags_FOUND) + set(USE_SYSTEM_GFLAGS OFF) + endif() + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_gflags) +endif() +if(NOT USE_SYSTEM_GFLAGS) + include(${Open3D_3RDPARTY_DIR}/gflags/gflags.cmake) + open3d_import_3rdparty_library(3rdparty_gflags + INCLUDE_DIRS ${GFLAGS_INCLUDE_DIRS} + LIB_DIR ${GFLAGS_LIB_DIR} + LIBRARIES ${GFLAGS_LIBRARIES} + DEPENDS ext_gflags + ) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_gflags) +endif() + +# glog +if(USE_SYSTEM_GLOG) + open3d_find_package_3rdparty_library(3rdparty_glog + PACKAGE glog + TARGETS glog + ) + if(NOT 3rdparty_glog_FOUND) + set(USE_SYSTEM_GLOG OFF) + endif() + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_glog) +endif() +if(NOT USE_SYSTEM_GLOG) + include(${Open3D_3RDPARTY_DIR}/glog/glog.cmake) + open3d_import_3rdparty_library(3rdparty_glog + INCLUDE_DIRS ${GLOG_INCLUDE_DIRS} + LIB_DIR ${GLOG_LIB_DIR} + LIBRARIES ${GLOG_LIBRARIES} + DEPENDS ext_glog + ) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_glog) +endif() + # Compactify list of external modules. # This must be called after all dependencies are processed. list(REMOVE_DUPLICATES Open3D_3RDPARTY_EXTERNAL_MODULES) diff --git a/3rdparty/gflags/LICENSE b/3rdparty/gflags/LICENSE new file mode 100644 index 000000000..55dd398cc --- /dev/null +++ b/3rdparty/gflags/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2006, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/3rdparty/gflags/gflags.cmake b/3rdparty/gflags/gflags.cmake new file mode 100644 index 000000000..460bbea87 --- /dev/null +++ b/3rdparty/gflags/gflags.cmake @@ -0,0 +1,44 @@ +include(ExternalProject) + +set(GFLAGS_LIB_NAME gflags) + +# For CMake >= 4.0.0, set policy compatibility for third-party gflags' CMake. +set(GFLAGS_POLICY_ARGS "") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message( + WARNING + "gflags: forcing CMake policy compatibility for CMake >= 4.0 (CMAKE_POLICY_VERSION_MINIMUM=3.5)" + ) + set(GFLAGS_POLICY_ARGS "-DCMAKE_POLICY_VERSION_MINIMUM=3.5") +endif() + +ExternalProject_Add( + ext_gflags + PREFIX gflags + URL https://github.com/gflags/gflags/archive/refs/tags/v2.2.2.tar.gz + URL_HASH SHA256=34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf + DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/gflags" + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + ${GFLAGS_POLICY_ARGS} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + BUILD_BYPRODUCTS + /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${GFLAGS_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX} +) + +ExternalProject_Get_Property(ext_gflags INSTALL_DIR) +set(GFLAGS_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical. +set(GFLAGS_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR}) +set(GFLAGS_LIBRARIES ${GFLAGS_LIB_NAME}) diff --git a/3rdparty/glog/LICENSE b/3rdparty/glog/LICENSE new file mode 100644 index 000000000..e2daebaec --- /dev/null +++ b/3rdparty/glog/LICENSE @@ -0,0 +1,8 @@ +Copyright © 2024, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/3rdparty/glog/glog.cmake b/3rdparty/glog/glog.cmake new file mode 100644 index 000000000..7b9b0a1b0 --- /dev/null +++ b/3rdparty/glog/glog.cmake @@ -0,0 +1,47 @@ +include(ExternalProject) + +set(GLOG_LIB_NAME glog) + +add_definitions(-DGLOG_NO_ABBREVIATED_SEVERITIES) + +# For CMake >= 4.0.0, set policy compatibility for glog's CMake. +set(GLOG_POLICY_ARGS "") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message( + WARNING + "glog: forcing CMake policy compatibility for CMake >= 4.0 (CMAKE_POLICY_VERSION_MINIMUM=3.5)" + ) + set(GLOG_POLICY_ARGS "-DCMAKE_POLICY_VERSION_MINIMUM=3.5") +endif() + + +ExternalProject_Add( + ext_glog + PREFIX glog + URL https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz + URL_HASH SHA256=f28359aeba12f30d73d9e4711ef356dc842886968112162bc73002645139c39c + DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/glog" + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + ${GLOG_POLICY_ARGS} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=OFF + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + BUILD_BYPRODUCTS + /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${GLOG_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX} +) + +ExternalProject_Get_Property(ext_glog INSTALL_DIR) +set(GLOG_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical. +set(GLOG_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR}) +set(GLOG_LIBRARIES ${GLOG_LIB_NAME}) diff --git a/3rdparty/hipify_torch/LICENSE b/3rdparty/hipify_torch/LICENSE new file mode 100644 index 000000000..bceedac5f --- /dev/null +++ b/3rdparty/hipify_torch/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/3rdparty/hipify_torch/hipify_torch.cmake b/3rdparty/hipify_torch/hipify_torch.cmake new file mode 100644 index 000000000..b87517703 --- /dev/null +++ b/3rdparty/hipify_torch/hipify_torch.cmake @@ -0,0 +1,15 @@ +include(FetchContent) + +FetchContent_Declare( + ext_hipify_torch + prefix hipify_torch + URL https://github.com/ROCm/hipify_torch/archive/ee928d80eb49a74be5d556465e04c6a40de7e3bc.tar.gz + URL_HASH SHA256=6446fc51f849c8f6fce37aa71e23f9b0a4715d15a5ab75ba69e635f89b6a9d6c + DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/hipify_torch" +) + +message(STATUS "Fetching hipify_torch") + +FetchContent_MakeAvailable(ext_hipify_torch) + +list(APPEND CMAKE_MODULE_PATH "${ext_hipify_torch_SOURCE_DIR}/cmake") \ No newline at end of file diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake index b87ef79fe..8c4fd6fa5 100644 --- a/3rdparty/mkl/tbb.cmake +++ b/3rdparty/mkl/tbb.cmake @@ -23,6 +23,11 @@ set(STATIC_TBB_LIBRARIES tbb_static tbbmalloc_static) find_package(Git QUIET REQUIRED) +if (WITH_ROCM) + # NOTE(beinggod): Set CMAKE_CXX_COMPILER_ID to GNU to avoid link libunwind and libc++. + set(ExternalProject_CMAKE_ARGS_hidden "${ExternalProject_CMAKE_ARGS_hidden} -DCMAKE_CXX_COMPILER_ID=GNU") +endif() + ExternalProject_Add( ext_tbb PREFIX tbb diff --git a/3rdparty/stdgpu/stdgpu.cmake b/3rdparty/stdgpu/stdgpu.cmake index 68d8f47a2..7d6c7582c 100644 --- a/3rdparty/stdgpu/stdgpu.cmake +++ b/3rdparty/stdgpu/stdgpu.cmake @@ -4,27 +4,56 @@ include(ExternalProject) +if (WITH_CUDA) ExternalProject_Add( ext_stdgpu PREFIX stdgpu - URL https://github.com/stotko/stdgpu/archive/2588168d226bd17229dbf58d821549580791089d.tar.gz - URL_HASH SHA256=86e50789bbe21c57f64358c6acbd4481d56c1e45ce9ba1fb5c5c8482c3973215 + URL https://github.com/stotko/stdgpu/archive/c25d4bd9d7cb61ab3c3fed179c393916372e6034.tar.gz + URL_HASH SHA256=5f2accdab5776920d33d4d3601f26e10e50fa6786ae1503e6e9d2d9280c0f4d2 DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/stdgpu" UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= - -DCUDAToolkit_ROOT=${CUDAToolkit_LIBRARY_ROOT} + -DCMAKE_BUILD_TYPE=Release -DSTDGPU_BUILD_SHARED_LIBS=OFF -DSTDGPU_BUILD_EXAMPLES=OFF -DSTDGPU_BUILD_TESTS=OFF + -DSTDGPU_BUILD_BENCHMARKS=OFF -DSTDGPU_ENABLE_CONTRACT_CHECKS=OFF - -DTHRUST_INCLUDE_DIR=${CUDAToolkit_INCLUDE_DIRS} ${ExternalProject_CMAKE_ARGS_hidden} CMAKE_CACHE_ARGS # Lists must be passed via CMAKE_CACHE_ARGS -DCMAKE_CUDA_ARCHITECTURES:STRING=${CMAKE_CUDA_ARCHITECTURES} BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}stdgpu${CMAKE_STATIC_LIBRARY_SUFFIX} ) +elseif(WITH_ROCM) +ExternalProject_Add( + ext_stdgpu + PREFIX stdgpu + # NOTE: Fix make install error on ROCm. PR: https://github.com/stotko/stdgpu/pull/473 + URL https://github.com/stotko/stdgpu/archive/6a8be3eafb485866afa488714b6120adc5140f10.tar.gz + URL_HASH SHA256=267551bb482e2971c9cb3b3a23324891fb173295ef5ecc3cc3a520d03e76bddd + DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/stdgpu" + UPDATE_COMMAND "" + CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=Release + -DSTDGPU_BUILD_SHARED_LIBS=OFF + -DSTDGPU_BUILD_EXAMPLES=OFF + -DSTDGPU_BUILD_TESTS=OFF + -DSTDGPU_BUILD_BENCHMARKS=OFF + -DSTDGPU_ENABLE_CONTRACT_CHECKS=OFF + -DSTDGPU_BACKEND=STDGPU_BACKEND_HIP + ${ExternalProject_CMAKE_ARGS_hidden} + # NOTE: Place this line before ExternalProject_CMAKE_ARGS_hidden to override the CMAKE_CXX_COMPILER. + -DCMAKE_CXX_COMPILER=$ENV{ROCM_PATH}/llvm/bin/clang++ + CMAKE_CACHE_ARGS # Lists must be passed via CMAKE_CACHE_ARGS + # FIXME(beinggod): The CMAKE_HIP_ARCHITECTURES not work because of unknown issue + -DCMAKE_HIP_ARCHITECTURES:STRING=${CMAKE_HIP_ARCHITECTURES} + BUILD_BYPRODUCTS + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}stdgpu${CMAKE_STATIC_LIBRARY_SUFFIX} +) +endif() ExternalProject_Get_Property(ext_stdgpu INSTALL_DIR) set(STDGPU_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e1c46128..b14e13e45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ option(BUILD_UNIT_TESTS "Build Open3D unit tests" OFF option(BUILD_BENCHMARKS "Build the micro benchmarks" OFF) option(BUILD_PYTHON_MODULE "Build the python module" ON ) option(BUILD_CUDA_MODULE "Build the CUDA module" OFF) +option(WITH_ROCM "Use ROCM to replace CUDA" OFF) option(BUILD_COMMON_CUDA_ARCHS "Build for common CUDA GPUs (for release)" OFF) if (WIN32) # Causes CUDA runtime error on Windows (See issue #6555) option(ENABLE_CACHED_CUDA_MANAGER "Enable cached CUDA memory manager" OFF) @@ -115,6 +116,7 @@ option(USE_SYSTEM_TINYGLTF "Use system pre-installed tinygltf" OFF option(USE_SYSTEM_TINYOBJLOADER "Use system pre-installed tinyobjloader" OFF) option(USE_SYSTEM_VTK "Use system pre-installed VTK" OFF) option(USE_SYSTEM_ZEROMQ "Use system pre-installed ZeroMQ" OFF) +option(USE_SYSTEM_GLOG "Use system pre-installed GLOG" OFF) if(LINUX_AARCH64 OR APPLE_AARCH64) option(BUILD_VTK_FROM_SOURCE "Build VTK from source" ON ) option(BUILD_FILAMENT_FROM_SOURCE "Build filament from source" ON ) @@ -296,6 +298,25 @@ if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE) message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.") endif() +# Check ROCM compatiblility +if (WITH_ROCM) + # Find ROCM path + set(ROCM_PATH $ENV{ROCM_PATH}) + if (NOT ROCM_PATH) + message(FATAL_ERROR "ROCM_PATH is not set") + endif() + + # Reuse BUILD_CUDA_MODULE variables for ROCM build + if (NOT BUILD_CUDA_MODULE) + message(FATAL_ERROR "WITH_ROCM requires BUILD_CUDA_MODULE=ON") + endif() + + # Check DL framework compatibility + if (BUILD_TENSORFLOW_OPS OR BUILD_PYTORCH_OPS) + message(FATAL_ERROR "WITH_ROCM only available on Paddle backend.") + endif() +endif() + # Global flag to set CXX standard. # This does not affect 3rd party libraries. # Tensorflow 2.9+ requires cxx_17, but MSVC 19.29 throws errors with C++17 @@ -402,19 +423,34 @@ macro(open3d_patch_findthreads_module_) endmacro() cmake_language(EVAL CODE "cmake_language(DEFER CALL open3d_patch_findthreads_module_)") -# Build CUDA module by default if CUDA is available -if(BUILD_CUDA_MODULE) - include(Open3DMakeCudaArchitectures) - open3d_make_cuda_architectures(CUDA_ARCHS) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHS}) - - message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 17) - if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.5") - message(FATAL_ERROR "CUDA 11.4 and older are not supported. Please upgrade to CUDA 11.5 or newer.") +if (BUILD_CUDA_MODULE) + if (WITH_ROCM) + # Build ROCM module + include(Open3DMakeRocmArchitectures) + open3d_make_rocm_architectures(ROCM_ARCHS) + set(CMAKE_HIP_ARCHITECTURES ${ROCM_ARCHS}) + + message(STATUS "Using ROCM architectures: ${ROCM_ARCHS}") + enable_language(HIP) + + # Disable interprocedural optimization for ROCM to avoid LLVMgold.so not found error + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) + else () + set(WITH_CUDA ON) + + # Build CUDA module by default if CUDA is available + include(Open3DMakeCudaArchitectures) + open3d_make_cuda_architectures(CUDA_ARCHS) + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHS}) + + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) + if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.5") + message(FATAL_ERROR "CUDA 11.4 and older are not supported. Please upgrade to CUDA 11.5 or newer.") + endif() endif() -endif () +endif() # ISPC language emulation support include(Open3DISPC) diff --git a/cmake/Open3DMakeRocmArchitectures.cmake b/cmake/Open3DMakeRocmArchitectures.cmake new file mode 100644 index 000000000..7810aa8a8 --- /dev/null +++ b/cmake/Open3DMakeRocmArchitectures.cmake @@ -0,0 +1,52 @@ +# open3d_make_rocm_architectures(rocm_archs) +# +# Sets up ROCM architectures based on the following precedence rules +# and stores them into the variable. +# 1. All common architectures if BUILD_COMMON_ROCM_ARCHS=ON +# 2. User-defined architectures +# 3. Architectures detected on the current machine +function(open3d_make_rocm_architectures rocm_archs) + unset(${rocm_archs}) + + find_package(HIP REQUIRED) + + if(BUILD_COMMON_ROCM_ARCHS) + # split by semicolon + set(${rocm_archs} "gfx906;gfx926;gfx928;gfx936") + else() + file(WRITE + "${CMAKE_CURRENT_BINARY_DIR}/rocm_architectures.sh" + " + #!/bin/bash + set -e + + ARCH=$(rocminfo | grep -E '^\\s*Name:\\s+gfx[0-9]+' | awk '{print $2}' | sort -u) + + if [ -z "$ARCH" ]; then + echo -n "Unknown" + exit 1 + else + echo -n "$ARCH" + exit 0 + fi + ") + + execute_process( + COMMAND bash ${CMAKE_CURRENT_BINARY_DIR}/rocm_architectures.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + RESULT_VARIABLE RET + OUTPUT_VARIABLE DETECTED_ARCHITECTURES + ERROR_VARIABLE ERROR + ) + + if(RET EQUAL 0) + message(STATUS "Building with detected architectures") + set(${rocm_archs} ${DETECTED_ARCHITECTURES}) + else() + message(FATAL_ERROR "Failed to detect ROCM architectures") + endif() + endif() + + set(${rocm_archs} ${${rocm_archs}} PARENT_SCOPE) + +endfunction() diff --git a/cmake/Open3DPrintConfigurationSummary.cmake b/cmake/Open3DPrintConfigurationSummary.cmake index c8e8a891a..ce1120147 100644 --- a/cmake/Open3DPrintConfigurationSummary.cmake +++ b/cmake/Open3DPrintConfigurationSummary.cmake @@ -40,6 +40,7 @@ function(open3d_print_configuration_summary) open3d_aligned_print("Build Jupyter Extension" "${BUILD_JUPYTER_EXTENSION}") open3d_aligned_print("Build TensorFlow Ops" "${BUILD_TENSORFLOW_OPS}") open3d_aligned_print("Build PyTorch Ops" "${BUILD_PYTORCH_OPS}") + open3d_aligned_print("Build Paddle Ops" "${BUILD_PADDLE_OPS}") open3d_aligned_print("Build Benchmarks" "${BUILD_BENCHMARKS}") open3d_aligned_print("Bundle Open3D-ML" "${BUNDLE_OPEN3D_ML}") if(GLIBCXX_USE_CXX11_ABI) diff --git a/cmake/Open3DSetGlobalProperties.cmake b/cmake/Open3DSetGlobalProperties.cmake index 251f046ca..73e519c39 100644 --- a/cmake/Open3DSetGlobalProperties.cmake +++ b/cmake/Open3DSetGlobalProperties.cmake @@ -87,6 +87,15 @@ function(open3d_set_global_properties target) # Propagate build configuration into source code if (BUILD_CUDA_MODULE) target_compile_definitions(${target} PRIVATE BUILD_CUDA_MODULE) + # Add WITH_ROCM definitions and build options for ROCM build + if (WITH_ROCM) + # Add definitions for ROCM build + list(APPEND HIP_COMPILE_DEFINITIONS __HIP_PLATFORM_AMD__=1) + list(APPEND HIP_COMPILE_DEFINITIONS ROCM_NO_WRAPPER_HEADER_WARNING) + list(APPEND HIP_COMPILE_DEFINITIONS THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + + target_compile_definitions(${target} PRIVATE ${HIP_COMPILE_DEFINITIONS}) + endif() if (ENABLE_CACHED_CUDA_MANAGER) target_compile_definitions(${target} PRIVATE ENABLE_CACHED_CUDA_MANAGER) endif() diff --git a/cmake/Open3DShowAndAbortOnWarning.cmake b/cmake/Open3DShowAndAbortOnWarning.cmake index c95bfa75a..fa9f1fa6f 100644 --- a/cmake/Open3DShowAndAbortOnWarning.cmake +++ b/cmake/Open3DShowAndAbortOnWarning.cmake @@ -25,7 +25,55 @@ function(open3d_show_and_abort_on_warning target) -Wno-unused-parameter # (many places in Open3D code) ) - if (BUILD_CUDA_MODULE) + if (WITH_ROCM) + # Add HIP compiler flags to suppress warnings + set(HIP_FLAGS "") + + string(APPEND HIP_FLAGS " -Wno-macro-redefined") + string(APPEND HIP_FLAGS " -Wno-inconsistent-missing-override") + string(APPEND HIP_FLAGS " -Wno-exceptions") + string(APPEND HIP_FLAGS " -Wno-shift-count-negative") + string(APPEND HIP_FLAGS " -Wno-shift-count-overflow") + string(APPEND HIP_FLAGS " -Wno-unused-command-line-argument") + string(APPEND HIP_FLAGS " -Wno-duplicate-decl-specifier") + string(APPEND HIP_FLAGS " -Wno-implicit-int-float-conversion") + string(APPEND HIP_FLAGS " -Wno-pass-failed") + string(APPEND HIP_FLAGS " -Wno-unused-result") + string(APPEND HIP_FLAGS " -Wno-deprecated-declarations") + string(APPEND HIP_FLAGS " -Wno-format") + string(APPEND HIP_FLAGS " -Wno-dangling-gsl") + string(APPEND HIP_FLAGS " -Wno-unused-value") + string(APPEND HIP_FLAGS " -Wno-braced-scalar-init") + string(APPEND HIP_FLAGS " -Wno-return-type") + string(APPEND HIP_FLAGS " -Wno-pragma-once-outside-header") + string(APPEND HIP_FLAGS " -Wno-deprecated-builtins") + string(APPEND HIP_FLAGS " -Wno-switch") + string(APPEND HIP_FLAGS " -Wno-literal-conversion") + string(APPEND HIP_FLAGS " -Wno-constant-conversion") + string(APPEND HIP_FLAGS " -Wno-defaulted-function-deleted") + string(APPEND HIP_FLAGS " -Wno-sign-compare") + string(APPEND HIP_FLAGS " -Wno-bitwise-instead-of-logical") + string(APPEND HIP_FLAGS " -Wno-unknown-warning-option") + string(APPEND HIP_FLAGS " -Wno-unused-lambda-capture") + string(APPEND HIP_FLAGS " -Wno-unused-variable") + string(APPEND HIP_FLAGS " -Wno-unused-but-set-variable") + string(APPEND HIP_FLAGS " -Wno-reorder-ctor") + string(APPEND HIP_FLAGS " -Wno-deprecated-copy-with-user-provided-copy") + string(APPEND HIP_FLAGS " -Wno-unused-local-typedef") + string(APPEND HIP_FLAGS " -Wno-missing-braces") + string(APPEND HIP_FLAGS " -Wno-sometimes-uninitialized") + string(APPEND HIP_FLAGS " -Wno-deprecated-copy") + string(APPEND HIP_FLAGS " -Wno-pessimizing-move") + string(APPEND HIP_FLAGS " -Wunused-command-line-argument") + + # Add CXX compiler flags to suppress warnings + list(APPEND DISABLE_GNU_CLANG_INTEL_WARNINGS -Wno-return-type) + list(APPEND DISABLE_GNU_CLANG_INTEL_WARNINGS -Wno-unused-result) + else() + set(HIP_FLAGS "") + endif() + + if (WITH_CUDA) # General NVCC flags set(DISABLE_NVCC_WARNINGS 2809 # ignoring return value from routine declared with "nodiscard" attribute (cub) @@ -63,6 +111,7 @@ function(open3d_show_and_abort_on_warning target) $<$:/W4 /WX ${DISABLE_MSVC_WARNINGS}> $<$:-Wall -Wextra -Werror ${DISABLE_GNU_CLANG_INTEL_WARNINGS}> $<$:SHELL:${CUDA_FLAGS}> + $<$:SHELL:${HIP_FLAGS}> $<$:--werror> ) endfunction() diff --git a/cpp/open3d/CMakeLists.txt b/cpp/open3d/CMakeLists.txt index 6da3581e5..3ab84bf8e 100644 --- a/cpp/open3d/CMakeLists.txt +++ b/cpp/open3d/CMakeLists.txt @@ -52,6 +52,37 @@ configure_file("${PROJECT_SOURCE_DIR}/cpp/open3d/Open3D.h.in" configure_file("${PROJECT_SOURCE_DIR}/cpp/open3d/Open3DConfig.h.in" "${PROJECT_SOURCE_DIR}/cpp/open3d/Open3DConfig.h") +# HIPify the source code +if (WITH_ROCM) + include(Hipify) + + set(HIPIFY_HEADER_INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/core + ${CMAKE_CURRENT_SOURCE_DIR}/core/kernel + ${CMAKE_CURRENT_SOURCE_DIR}/core/hashmap + ${CMAKE_CURRENT_SOURCE_DIR}/core/linalg + ${CMAKE_CURRENT_SOURCE_DIR}/core/linalg/kernel + ${CMAKE_CURRENT_SOURCE_DIR}/core/nns + ${CMAKE_CURRENT_SOURCE_DIR}/core/nns/kernel) + + # Hipify the source code + hipify(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} + HEADER_INCLUDE_DIRS ${HIPIFY_HEADER_INCLUDE_DIRS} + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/utility/*" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/MemoryManager.h" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/DLPack.h" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/CUDAUtils*" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/linalg/LinalgHeadersCUDA.h*" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/nns/kernel/PtxUtils.cuh" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/nns/kernel/DeviceDefs.cuh" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/hashmap/CUDA/*" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/hashmap/Dispatch.h" + IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/t/geometry/kernel/GeometryMacros.h" + CUSTOM_MAP_FILE "${PROJECT_SOURCE_DIR}/hipify_custom_map.json" + NO_MATH_REPLACE + ) +endif() add_library(Open3D) @@ -136,6 +167,10 @@ open3d_link_3rdparty_libraries(Open3D) target_link_options(Open3D INTERFACE $<$,STATIC_LIBRARY>:$>) +# get_target_property(SRCS Open3D SOURCES) +# message(FATAL_ERROR "sources = ${SRCS}") + + add_library(Open3D::Open3D ALIAS Open3D) include(CMakePackageConfigHelpers) diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index d42e645da..68fbf367c 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -85,7 +85,8 @@ if (BUILD_CUDA_MODULE) target_sources(core PRIVATE MemoryManagerCUDA.cpp ) - target_sources(core PRIVATE + + target_sources(core PRIVATE hashmap/CUDA/CreateCUDAHashBackend.cu hashmap/CUDA/CUDAHashBackendBuffer.cu hashmap/CUDA/SlabNodeManager.cu @@ -120,6 +121,23 @@ if (BUILD_ISPC_MODULE) ) endif() +if (WITH_ROCM) + include(Hipify) + + get_target_property(SOURCE_LIST core SOURCES) + get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST) + + foreach(src IN LISTS HIPIFIED_SOURCE_LIST) + get_filename_component(ext ${src} EXT) + if(ext STREQUAL ".cu") + set_source_files_properties(${src} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + + set_property(TARGET core PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST}) + get_target_property(SOURCE_LIST core SOURCES) +endif() + open3d_show_and_abort_on_warning(core) open3d_set_global_properties(core) open3d_set_open3d_lib_properties(core) diff --git a/cpp/open3d/core/CUDAUtils.cpp b/cpp/open3d/core/CUDAUtils.cpp index ab805bda0..696a35e16 100644 --- a/cpp/open3d/core/CUDAUtils.cpp +++ b/cpp/open3d/core/CUDAUtils.cpp @@ -20,6 +20,15 @@ namespace cuda { int DeviceCount() { #ifdef BUILD_CUDA_MODULE +#if __HIP_PLATFORM_AMD__ + try { + int num_devices; + OPEN3D_CUDA_CHECK(hipGetDeviceCount(&num_devices)); + return num_devices; + } catch (const std::runtime_error&) { + return 0; + } +#else try { int num_devices; OPEN3D_CUDA_CHECK(cudaGetDeviceCount(&num_devices)); @@ -30,6 +39,7 @@ int DeviceCount() { catch (const std::runtime_error&) { return 0; } +#endif #else return 0; #endif @@ -67,7 +77,11 @@ void Synchronize(const Device& device) { #ifdef BUILD_CUDA_MODULE if (device.IsCUDA()) { CUDAScopedDevice scoped_device(device); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipDeviceSynchronize()); +#else OPEN3D_CUDA_CHECK(cudaDeviceSynchronize()); +#endif } #endif } @@ -132,13 +146,21 @@ bool SupportsMemoryPools(const Device& device) { #ifdef BUILD_CUDA_MODULE int GetDevice() { int device; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetDevice(&device)); +#else OPEN3D_CUDA_CHECK(cudaGetDevice(&device)); +#endif return device; } static void SetDevice(int device_id) { AssertCUDADeviceAvailable(device_id); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipSetDevice(device_id)); +#else OPEN3D_CUDA_CHECK(cudaSetDevice(device_id)); +#endif } class CUDAStream { @@ -149,20 +171,37 @@ class CUDAStream { static thread_local CUDAStream instance; return instance; } +#if __HIP_PLATFORM_AMD__ + hipStream_t Get() { return stream_; } + void Set(hipStream_t stream) { stream_ = stream; } + static hipStream_t Default() { return static_cast(0); } +#else cudaStream_t Get() { return stream_; } void Set(cudaStream_t stream) { stream_ = stream; } static cudaStream_t Default() { return static_cast(0); } +#endif private: CUDAStream() = default; CUDAStream(const CUDAStream&) = delete; CUDAStream& operator=(const CUDAStream&) = delete; +#if __HIP_PLATFORM_AMD__ + hipStream_t stream_ = Default(); +#else cudaStream_t stream_ = Default(); +#endif }; +#if __HIP_PLATFORM_AMD__ +hipStream_t GetStream() { return CUDAStream::GetInstance().Get(); } + +void SetStream(hipStream_t stream) { CUDAStream::GetInstance().Set(stream); } + +hipStream_t GetDefaultStream() { return CUDAStream::Default(); } +#else cudaStream_t GetStream() { return CUDAStream::GetInstance().Get(); } static void SetStream(cudaStream_t stream) { @@ -170,6 +209,7 @@ static void SetStream(cudaStream_t stream) { } cudaStream_t GetDefaultStream() { return CUDAStream::Default(); } +#endif #endif @@ -194,11 +234,19 @@ constexpr CUDAScopedStream::CreateNewStreamTag CUDAScopedStream::CUDAScopedStream(const CreateNewStreamTag&) : prev_stream_(cuda::GetStream()), owns_new_stream_(true) { +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipStreamCreate(&new_stream_)); +#else OPEN3D_CUDA_CHECK(cudaStreamCreate(&new_stream_)); +#endif cuda::SetStream(new_stream_); } +#if __HIP_PLATFORM_AMD__ +CUDAScopedStream::CUDAScopedStream(hipStream_t stream) +#else CUDAScopedStream::CUDAScopedStream(cudaStream_t stream) +#endif : prev_stream_(cuda::GetStream()), new_stream_(stream), owns_new_stream_(false) { @@ -207,7 +255,11 @@ CUDAScopedStream::CUDAScopedStream(cudaStream_t stream) CUDAScopedStream::~CUDAScopedStream() { if (owns_new_stream_) { +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipStreamDestroy(new_stream_)); +#else OPEN3D_CUDA_CHECK(cudaStreamDestroy(new_stream_)); +#endif } cuda::SetStream(prev_stream_); } @@ -253,11 +305,26 @@ CUDAState::CUDAState() { // Check access. int can_access = 0; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK( + hipDeviceCanAccessPeer(&can_access, src_id, tar_id)); +#else OPEN3D_CUDA_CHECK( cudaDeviceCanAccessPeer(&can_access, src_id, tar_id)); +#endif // Enable access. if (can_access) { p2p_enabled_[src_id][tar_id] = true; +#if __HIP_PLATFORM_AMD__ + hipError_t err = hipDeviceEnablePeerAccess(tar_id, 0); + if (err == hipErrorPeerAccessAlreadyEnabled) { + // Ignore error since P2P is already enabled. + // Add void to suppress unused variable warning. + (void)hipGetLastError(); + } else { + OPEN3D_CUDA_CHECK(err); + } +#else cudaError_t err = cudaDeviceEnablePeerAccess(tar_id, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { // Ignore error since P2P is already enabled. @@ -265,6 +332,7 @@ CUDAState::CUDAState() { } else { OPEN3D_CUDA_CHECK(err); } +#endif } else { p2p_enabled_[src_id][tar_id] = false; } @@ -275,22 +343,36 @@ CUDAState::CUDAState() { int GetCUDACurrentDeviceTextureAlignment() { int value; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipDeviceGetAttribute( + &value, hipDeviceAttributeTextureAlignment, cuda::GetDevice())); +#else OPEN3D_CUDA_CHECK(cudaDeviceGetAttribute( &value, cudaDevAttrTextureAlignment, cuda::GetDevice())); +#endif return value; } int GetCUDACurrentWarpSize() { int value; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, + cuda::GetDevice())); +#else OPEN3D_CUDA_CHECK(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, cuda::GetDevice())); +#endif return value; } size_t GetCUDACurrentTotalMemSize() { size_t free; size_t total; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemGetInfo(&free, &total)); +#else OPEN3D_CUDA_CHECK(cudaMemGetInfo(&free, &total)); +#endif return total; } @@ -304,21 +386,38 @@ size_t GetCUDACurrentTotalMemSize() { namespace open3d { namespace core { +#if __HIP_PLATFORM_AMD__ +void __OPEN3D_CUDA_CHECK(hipError_t err, const char* file, const int line) { + if (err != hipSuccess) { + utility::LogError("{}:{} CUDA runtime error: {}", file, line, + hipGetErrorString(err)); + } +} +#else void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line) { if (err != cudaSuccess) { utility::LogError("{}:{} CUDA runtime error: {}", file, line, cudaGetErrorString(err)); } } +#endif void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message, const char* file, const int line) { +#if __HIP_PLATFORM_AMD__ + hipError_t err = hipGetLastError(); + if (err != hipSuccess) { + utility::LogError("{}:{} {}: OPEN3D_GET_LAST_CUDA_ERROR(): {}", file, + line, message, hipGetErrorString(err)); + } +#else cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { utility::LogError("{}:{} {}: OPEN3D_GET_LAST_CUDA_ERROR(): {}", file, line, message, cudaGetErrorString(err)); } +#endif } } // namespace core diff --git a/cpp/open3d/core/CUDAUtils.h b/cpp/open3d/core/CUDAUtils.h index 16a3841e2..b59f1a318 100644 --- a/cpp/open3d/core/CUDAUtils.h +++ b/cpp/open3d/core/CUDAUtils.h @@ -18,8 +18,12 @@ #ifdef BUILD_CUDA_MODULE +#if __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include #include @@ -29,9 +33,19 @@ #define OPEN3D_FORCE_INLINE __forceinline__ #define OPEN3D_HOST_DEVICE __host__ __device__ #define OPEN3D_DEVICE __device__ + +#if __HIP_PLATFORM_AMD__ +// NOTE: clang++ not support __nv_is_extended_host_device_lambda_closure_type. +// It will throw error in compile time so static_assert should be set to true +// always. +#define OPEN3D_ASSERT_HOST_DEVICE_LAMBDA(type) \ + static_assert(true, #type " must be a __host__ __device__ lambda") +#else #define OPEN3D_ASSERT_HOST_DEVICE_LAMBDA(type) \ static_assert(__nv_is_extended_host_device_lambda_closure_type(type), \ #type " must be a __host__ __device__ lambda") +#endif + #define OPEN3D_CUDA_CHECK(err) \ open3d::core::__OPEN3D_CUDA_CHECK(err, __FILE__, __LINE__) #define OPEN3D_GET_LAST_CUDA_ERROR(message) \ @@ -148,7 +162,11 @@ class CUDAScopedStream { explicit CUDAScopedStream(const CreateNewStreamTag&); +#if __HIP_PLATFORM_AMD__ + explicit CUDAScopedStream(hipStream_t stream); +#else explicit CUDAScopedStream(cudaStream_t stream); +#endif ~CUDAScopedStream(); @@ -156,8 +174,13 @@ class CUDAScopedStream { CUDAScopedStream& operator=(const CUDAScopedStream&) = delete; private: +#if __HIP_PLATFORM_AMD__ + hipStream_t prev_stream_; + hipStream_t new_stream_; +#else cudaStream_t prev_stream_; cudaStream_t new_stream_; +#endif bool owns_new_stream_ = false; }; @@ -265,8 +288,13 @@ bool SupportsMemoryPools(const Device& device); #ifdef BUILD_CUDA_MODULE int GetDevice(); +#if __HIP_PLATFORM_AMD__ +hipStream_t GetStream(); +hipStream_t GetDefaultStream(); +#else cudaStream_t GetStream(); cudaStream_t GetDefaultStream(); +#endif #endif @@ -280,7 +308,11 @@ cudaStream_t GetDefaultStream(); namespace open3d { namespace core { +#if __HIP_PLATFORM_AMD__ +void __OPEN3D_CUDA_CHECK(hipError_t err, const char* file, const int line); +#else void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line); +#endif void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message, const char* file, @@ -290,3 +322,16 @@ void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message, } // namespace open3d #endif + +#ifdef BUILD_CUDA_MODULE +#if __HIP_PLATFORM_AMD__ + +// NOTE(beinggod): Ignore the mask for ROCm. +#define __shfl_sync(mask, ...) __shfl(__VA_ARGS__) +#define __shfl_down_sync(mask, ...) __shfl_down(__VA_ARGS__) +#define __ballot_sync(mask, ...) __ballot(__VA_ARGS__) +// warp size is 64 for ROCm. +#define __activemask() 0xffffffffffffffff + +#endif +#endif diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h index 2d5bef788..45cb16c69 100644 --- a/cpp/open3d/core/ParallelFor.h +++ b/cpp/open3d/core/ParallelFor.h @@ -16,17 +16,25 @@ #include "open3d/utility/Parallel.h" #include "open3d/utility/Preprocessor.h" -#ifdef __CUDACC__ +#if defined(__CUDACC__) + #include #include #include "open3d/core/CUDAUtils.h" + +#elif defined(__HIPCC__) + +#include + +#include "open3d/core/CUDAUtils.h" + #endif namespace open3d { namespace core { -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128; static constexpr int64_t OPEN3D_PARFOR_THREAD = 4; @@ -101,7 +109,7 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) { /// kernel to be used on both CPU and CUDA, capture the variables by value. template void ParallelFor(const Device& device, int64_t n, const func_t& func) { -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) ParallelForCUDA_(device, n, func); #else ParallelForCPU_(device, n, func); @@ -161,7 +169,7 @@ void ParallelFor(const Device& device, const vec_func_t& vec_func) { #ifdef BUILD_ISPC_MODULE -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) ParallelForCUDA_(device, n, func); #else int num_threads = utility::EstimateMaxThreads(); @@ -174,7 +182,7 @@ void ParallelFor(const Device& device, #else -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) ParallelForCUDA_(device, n, func); #else ParallelForCPU_(device, n, func); diff --git a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu index 627095858..79eeebd05 100644 --- a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu +++ b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu @@ -16,7 +16,11 @@ namespace core { void CUDAResetHeap(Tensor &heap) { uint32_t *heap_ptr = heap.GetDataPtr(); thrust::sequence(thrust::device, heap_ptr, heap_ptr + heap.GetLength(), 0); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif } } // namespace core } // namespace open3d diff --git a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h index eeb74bc26..6d4724ea2 100644 --- a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h +++ b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h @@ -58,7 +58,11 @@ class CUDAHashBackendBufferAccessor { std::vector value_ptrs(n_values_); for (size_t i = 0; i < n_values_; ++i) { value_ptrs[i] = value_buffers[i].GetDataPtr(); +#if __HIP_PLATFORM_AMD__ + hipMemset(value_ptrs[i], 0, capacity_ * value_dsizes_host[i]); +#else cudaMemset(value_ptrs[i], 0, capacity_ * value_dsizes_host[i]); +#endif } values_ = static_cast( MemoryManager::Malloc(n_values_ * sizeof(uint8_t *), device)); @@ -67,7 +71,11 @@ class CUDAHashBackendBufferAccessor { heap_top_ = hashmap_buffer.GetHeapTop().cuda.GetDataPtr(); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif } __host__ void Shutdown(const Device &device) { diff --git a/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h b/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h index dbf665c79..5b7bf95f6 100644 --- a/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h +++ b/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h @@ -97,16 +97,28 @@ void SlabHashBackend::Find(const void* input_keys, CUDAScopedDevice scoped_device(this->device_); if (count == 0) return; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset(output_masks, 0, sizeof(bool) * count)); +#else OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count)); +#endif cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif const int64_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock; FindKernel<<>>( impl_, input_keys, output_buf_indices, output_masks, count); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif } template @@ -116,9 +128,17 @@ void SlabHashBackend::Erase(const void* input_keys, CUDAScopedDevice scoped_device(this->device_); if (count == 0) return; +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset(output_masks, 0, sizeof(bool) * count)); +#else OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count)); +#endif cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif auto buf_indices = static_cast( MemoryManager::Malloc(sizeof(buf_index_t) * count, this->device_)); @@ -131,7 +151,11 @@ void SlabHashBackend::Erase(const void* input_keys, core::cuda::GetStream()>>>(impl_, buf_indices, output_masks, count); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif MemoryManager::Free(buf_indices, this->device_); } @@ -142,10 +166,18 @@ int64_t SlabHashBackend::GetActiveIndices( CUDAScopedDevice scoped_device(this->device_); uint32_t* count = static_cast( MemoryManager::Malloc(sizeof(uint32_t), this->device_)); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset(count, 0, sizeof(uint32_t))); +#else OPEN3D_CUDA_CHECK(cudaMemset(count, 0, sizeof(uint32_t))); +#endif cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif const int64_t num_blocks = (impl_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) / @@ -154,7 +186,11 @@ int64_t SlabHashBackend::GetActiveIndices( core::cuda::GetStream()>>>( impl_, output_buf_indices, count); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif uint32_t ret; MemoryManager::MemcpyToHost(&ret, count, this->device_, sizeof(uint32_t)); @@ -170,10 +206,19 @@ void SlabHashBackend::Clear() { this->buffer_->ResetHeap(); // Clear the linked list heads +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset(impl_.bucket_list_head_, 0xFF, + sizeof(Slab) * this->bucket_count_)); +#else OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF, sizeof(Slab) * this->bucket_count_)); +#endif cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif // Clear the linked list nodes node_mgr_->Reset(); @@ -204,7 +249,11 @@ std::vector SlabHashBackend::BucketSizes() const { core::cuda::GetStream()>>>( impl_, thrust::raw_pointer_cast(elems_per_bucket.data())); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif std::vector result(impl_.bucket_count_); thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(), @@ -258,7 +307,11 @@ void SlabHashBackend::Insert( output_masks, count, n_values); }); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif } template @@ -279,10 +332,20 @@ void SlabHashBackend::Allocate(int64_t capacity) { // Allocate linked list heads. impl_.bucket_list_head_ = static_cast(MemoryManager::Malloc( sizeof(Slab) * this->bucket_count_, this->device_)); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset(impl_.bucket_list_head_, 0xFF, + sizeof(Slab) * this->bucket_count_)); +#else OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF, sizeof(Slab) * this->bucket_count_)); cuda::Synchronize(); +#endif + +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif impl_.Setup(this->bucket_count_, node_mgr_->impl_, buffer_accessor_); } diff --git a/cpp/open3d/core/hashmap/CUDA/SlabMacros.h b/cpp/open3d/core/hashmap/CUDA/SlabMacros.h index 21ba410ab..7e3b628cd 100644 --- a/cpp/open3d/core/hashmap/CUDA/SlabMacros.h +++ b/cpp/open3d/core/hashmap/CUDA/SlabMacros.h @@ -27,7 +27,7 @@ namespace open3d { namespace core { -// Device-specific, safe for current NVIDIA architectures. +// Device-specific static constexpr uint32_t kWarpSize = 32; ////////////////////// diff --git a/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h b/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h index 9668c3100..f3b71f7b0 100644 --- a/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h +++ b/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h @@ -233,18 +233,34 @@ class SlabNodeManager { ~SlabNodeManager() { MemoryManager::Free(impl_.super_blocks_, device_); } void Reset() { +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset( + impl_.super_blocks_, 0xFF, + kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t))); +#else OPEN3D_CUDA_CHECK(cudaMemset( impl_.super_blocks_, 0xFF, kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t))); +#endif for (uint32_t i = 0; i < kSuperBlocks; i++) { // setting bitmaps into zeros: +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipMemset( + impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00, + kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t))); +#else OPEN3D_CUDA_CHECK(cudaMemset( impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00, kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t))); +#endif } cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif } std::vector CountSlabsPerSuperblock() { @@ -262,7 +278,11 @@ class SlabNodeManager { core::cuda::GetStream()>>>( impl_, thrust::raw_pointer_cast(slabs_per_superblock.data())); cuda::Synchronize(); +#if __HIP_PLATFORM_AMD__ + OPEN3D_CUDA_CHECK(hipGetLastError()); +#else OPEN3D_CUDA_CHECK(cudaGetLastError()); +#endif std::vector result(num_super_blocks); thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(), diff --git a/cpp/open3d/core/hashmap/Dispatch.h b/cpp/open3d/core/hashmap/Dispatch.h index 7318e5898..705871fa5 100644 --- a/cpp/open3d/core/hashmap/Dispatch.h +++ b/cpp/open3d/core/hashmap/Dispatch.h @@ -59,7 +59,7 @@ } \ }() -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) // Reinterpret hash maps' void* value arrays as CUDA primitive types arrays, to // avoid slow memcpy or byte-by-byte copy in kernels. // Not used in the CPU version since memcpy is relatively fast on CPU. diff --git a/cpp/open3d/core/linalg/LinalgHeadersCUDA.h b/cpp/open3d/core/linalg/LinalgHeadersCUDA.h index 49b124057..7e2ab0187 100644 --- a/cpp/open3d/core/linalg/LinalgHeadersCUDA.h +++ b/cpp/open3d/core/linalg/LinalgHeadersCUDA.h @@ -12,8 +12,15 @@ #pragma once -#ifdef BUILD_CUDA_MODULE +#if BUILD_CUDA_MODULE + +#if __HIP_PLATFORM_AMD__ +#include +#include +#else #include #include #include #endif + +#endif diff --git a/cpp/open3d/core/nns/NeighborSearchCommon.h b/cpp/open3d/core/nns/NeighborSearchCommon.h index 97ffb810f..865cdc14e 100644 --- a/cpp/open3d/core/nns/NeighborSearchCommon.h +++ b/cpp/open3d/core/nns/NeighborSearchCommon.h @@ -18,7 +18,7 @@ namespace nns { /// Supported metrics enum Metric { L1, L2, Linf }; -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #define HOST_DEVICE __host__ __device__ #else #define HOST_DEVICE diff --git a/cpp/open3d/core/nns/kernel/DeviceDefs.cuh b/cpp/open3d/core/nns/kernel/DeviceDefs.cuh index 85b67c436..fd21d3623 100644 --- a/cpp/open3d/core/nns/kernel/DeviceDefs.cuh +++ b/cpp/open3d/core/nns/kernel/DeviceDefs.cuh @@ -31,13 +31,17 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace open3d { namespace core { // We require at least CUDA 8.0 for compilation -#if CUDA_VERSION < 8000 +#if !__HIP_PLATFORM_AMD__ && (CUDA_VERSION < 8000) #error "CUDA >= 8.0 is required" #endif @@ -46,7 +50,9 @@ constexpr int kWarpSize = 32; // This is a memory barrier for intra-warp writes to shared memory. __forceinline__ __device__ void warpFence() { -#if CUDA_VERSION >= 9000 +#if __HIP_PLATFORM_AMD__ + __builtin_amdgcn_wave_barrier(); +#elif (CUDA_VERSION >= 9000) __syncwarp(); #else // For the time being, assume synchronicity. @@ -54,7 +60,7 @@ __forceinline__ __device__ void warpFence() { #endif } -#if CUDA_VERSION > 9000 +#if (CUDA_VERSION > 9000) || __HIP_PLATFORM_AMD__ // Based on the CUDA version (we assume what version of nvcc/ptxas we were // compiled with), the register allocation algorithm is much better, so only // enable the 2048 selection code if we are above 9.0 (9.2 seems to be ok) diff --git a/cpp/open3d/core/nns/kernel/MergeNetwork.cuh b/cpp/open3d/core/nns/kernel/MergeNetwork.cuh index 5306eb7dc..ec4f2aa7c 100644 --- a/cpp/open3d/core/nns/kernel/MergeNetwork.cuh +++ b/cpp/open3d/core/nns/kernel/MergeNetwork.cuh @@ -194,7 +194,7 @@ struct BitonicMergeStep { template struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) { + static inline __device__ void merge(K* k, V* v) { static_assert(isPowerOf2(N), "must be power of 2"); static_assert(N > 1, "must be N > 1"); @@ -259,7 +259,7 @@ struct BitonicMergeStep { // Low recursion template struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) { + static inline __device__ void merge(K* k, V* v) { static_assert(!isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -337,7 +337,7 @@ struct BitonicMergeStep { // High recursion template struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) { + static inline __device__ void merge(K* k, V* v) { static_assert(!isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -422,10 +422,7 @@ template -inline __device__ void warpMergeAnyRegisters(K k1[N1], - V v1[N1], - K k2[N2], - V v2[N2]) { +inline __device__ void warpMergeAnyRegisters(K* k1, V* v1, K* k2, V* v2) { constexpr int kSmallestN = N1 < N2 ? N1 : N2; #pragma unroll @@ -479,7 +476,7 @@ inline __device__ void warpMergeAnyRegisters(K k1[N1], // bitonic sort template struct BitonicSortStep { - static inline __device__ void sort(K k[N], V v[N]) { + static inline __device__ void sort(K* k, V* v) { static_assert(N > 1, "did not hit specialized case"); // Sort recursively @@ -544,7 +541,7 @@ struct BitonicSortStep { /// Sort a list of kWarpSize * N elements in registers, where N is an /// arbitrary >= 1 template -inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) { +inline __device__ void warpSortAnyRegisters(K* k, V* v) { BitonicSortStep::sort(k, v); } diff --git a/cpp/open3d/core/nns/kernel/Pair.cuh b/cpp/open3d/core/nns/kernel/Pair.cuh index 15928ba6e..ad1a88659 100644 --- a/cpp/open3d/core/nns/kernel/Pair.cuh +++ b/cpp/open3d/core/nns/kernel/Pair.cuh @@ -31,7 +31,11 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "open3d/core/nns/kernel/PtxUtils.cuh" #include "open3d/core/nns/kernel/WarpShuffle.cuh" diff --git a/cpp/open3d/core/nns/kernel/PtxUtils.cuh b/cpp/open3d/core/nns/kernel/PtxUtils.cuh index 8c497b4c2..1f587f6c0 100644 --- a/cpp/open3d/core/nns/kernel/PtxUtils.cuh +++ b/cpp/open3d/core/nns/kernel/PtxUtils.cuh @@ -31,11 +31,104 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace open3d { namespace core { +// ROCm/HIP does not support inline PTX, but provides similar functionality via +// builtins. We'll use conditional compilation to provide ROCm/HIP-compatible +// versions for relevant functions/macros. + +#if __HIP_PLATFORM_AMD__ + +// There are no direct equivalents of `bfe.u32`, `bfe.u64`, or `bfi.b32` PTX in +// HIP C++ builtins, but bit-field extract/insert can be implemented in pure C++ +// as follows: defines to simplify the SASS assembly structure file/line in the +// profiler + +template +static inline T bfe(T val, unsigned pos, unsigned len) { + static_assert(std::is_unsigned::value, "bfe requires unsigned type"); + constexpr unsigned BITS = sizeof(T) * 8; + + if (len == 0) return 0; + if (len >= BITS) return val >> pos; + return (val >> pos) & ((T(1) << len) - 1); +} + +#define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \ + (OUT = bfe(VAL, POS, LEN)); + +#define GET_BITFIELD_U64(OUT, VAL, POS, LEN) \ + (OUT = bfe(VAL, POS, LEN)); + +__device__ __forceinline__ unsigned int getBitfield(unsigned int val, + int pos, + int len) { + // Extract `len` bits from `val`, starting at bit `pos`. + if (len == 0) return 0; + return (val >> pos) & ((1u << len) - 1); +} + +__device__ __forceinline__ uint64_t getBitfield(uint64_t val, + int pos, + int len) { + if (len == 0) return 0; + return (val >> pos) & ((1ull << len) - 1); +} + +__device__ __forceinline__ unsigned int setBitfield(unsigned int val, + unsigned int toInsert, + int pos, + int len) { + // Clear the target bitfield and insert the new bits. + if (len == 0) return val; + unsigned int mask = ((1u << len) - 1) << pos; + return (val & ~mask) | ((toInsert << pos) & mask); +} + +__device__ __forceinline__ int getLaneId() { return __lane_id(); } + +__device__ __forceinline__ unsigned getLaneMaskLt() { + // No direct HIP equivalent. Use __ballot()/__ballot_sync. + unsigned lane_id = getLaneId(); + return (1u << lane_id) - 1; +} + +__device__ __forceinline__ unsigned getLaneMaskLe() { + unsigned lane_id = getLaneId(); + return (1u << (lane_id + 1)) - 1; +} + +__device__ __forceinline__ unsigned getLaneMaskGt() { + unsigned lane_id = getLaneId(); + return 0xffffffffu << (lane_id + 1); +} + +__device__ __forceinline__ unsigned getLaneMaskGe() { + unsigned lane_id = getLaneId(); + return 0xffffffffu << lane_id; +} + +__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) { + // NOTE: hip does not support bar.sync with num threads, so we use a trap to + // indicate an error. Keep this function to compact the code. + __builtin_trap(); +} + +__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { + // NOTE: hip does not support bar.arrived with num threads, so we use a trap + // to indicate an error. Keep this function to compact the code. + __builtin_trap(); +} + +#else // __HIP_PLATFORM_AMD__ + // defines to simplify the SASS assembly structure file/line in the profiler #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \ asm("bfe.u32 %0, %1, %2, %3;" : "=r"(OUT) : "r"(VAL), "r"(POS), "r"(LEN)); @@ -111,5 +204,7 @@ __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { : "memory"); } +#endif + } // namespace core } // namespace open3d diff --git a/cpp/open3d/core/nns/kernel/Reduction.cuh b/cpp/open3d/core/nns/kernel/Reduction.cuh index a87dc0ac9..44ddb0b11 100644 --- a/cpp/open3d/core/nns/kernel/Reduction.cuh +++ b/cpp/open3d/core/nns/kernel/Reduction.cuh @@ -31,7 +31,11 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "open3d/core/nns/kernel/BlockMerge.cuh" #include "open3d/core/nns/kernel/PtxUtils.cuh" diff --git a/cpp/open3d/core/nns/kernel/ReductionOps.cuh b/cpp/open3d/core/nns/kernel/ReductionOps.cuh index da8eb4710..0eeb493a1 100644 --- a/cpp/open3d/core/nns/kernel/ReductionOps.cuh +++ b/cpp/open3d/core/nns/kernel/ReductionOps.cuh @@ -31,7 +31,11 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "open3d/core/nns/kernel/Limits.cuh" #include "open3d/core/nns/kernel/Pair.cuh" diff --git a/cpp/open3d/core/nns/kernel/Select.cuh b/cpp/open3d/core/nns/kernel/Select.cuh index cadfea819..5e7667a06 100644 --- a/cpp/open3d/core/nns/kernel/Select.cuh +++ b/cpp/open3d/core/nns/kernel/Select.cuh @@ -157,7 +157,7 @@ struct BlockSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ needSort = __any_sync(0xffffffff, needSort); #else needSort = __any(needSort); @@ -421,7 +421,7 @@ struct WarpSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ needSort = __any_sync(0xffffffff, needSort); #else needSort = __any(needSort); diff --git a/cpp/open3d/core/nns/kernel/StaticUtils.cuh b/cpp/open3d/core/nns/kernel/StaticUtils.cuh index f636ff9e4..016b6d26f 100644 --- a/cpp/open3d/core/nns/kernel/StaticUtils.cuh +++ b/cpp/open3d/core/nns/kernel/StaticUtils.cuh @@ -31,7 +31,11 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#else #include +#endif // allow usage for non-CUDA files #ifndef __host__ diff --git a/cpp/open3d/core/nns/kernel/WarpShuffle.cuh b/cpp/open3d/core/nns/kernel/WarpShuffle.cuh index bce9a4a93..ad29c8788 100644 --- a/cpp/open3d/core/nns/kernel/WarpShuffle.cuh +++ b/cpp/open3d/core/nns/kernel/WarpShuffle.cuh @@ -31,13 +31,20 @@ #pragma once +#if __HIP_PLATFORM_AMD__ +#include +#include + +using half = __half; +#else #include +#endif namespace open3d { namespace core { // defines to simplify the SASS assembly structure file/line in the profiler -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \ __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH) #else @@ -46,7 +53,7 @@ namespace core { template inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) { -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ return __shfl_sync(0xffffffff, val, srcLane, width); #else return __shfl(val, srcLane, width); @@ -66,7 +73,7 @@ template inline __device__ T shfl_up(const T val, unsigned int delta, int width = kWarpSize) { -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ return __shfl_up_sync(0xffffffff, val, delta, width); #else return __shfl_up(val, delta, width); @@ -88,7 +95,7 @@ template inline __device__ T shfl_down(const T val, unsigned int delta, int width = kWarpSize) { -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ return __shfl_down_sync(0xffffffff, val, delta, width); #else return __shfl_down(val, delta, width); @@ -107,7 +114,7 @@ inline __device__ T* shfl_down(T* const val, template inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) { -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__ return __shfl_xor_sync(0xffffffff, val, laneMask, width); #else return __shfl_xor(val, laneMask, width); @@ -125,22 +132,32 @@ inline __device__ T* shfl_xor(T* const val, } // CUDA 9.0+ has half shuffle -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 || __HIP_PLATFORM_AMD__ inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) { + half h; +#if __HIP_PLATFORM_AMD__ + unsigned int vu = *reinterpret_cast(&v); + vu = __shfl(vu, srcLane, width); + h = *reinterpret_cast(&vu); +#else unsigned int vu = v.x; vu = __shfl(vu, srcLane, width); - - half h; h.x = (unsigned short)vu; +#endif return h; } inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) { + half h; +#if __HIP_PLATFORM_AMD__ + unsigned int vu = *reinterpret_cast(&v); + vu = __shfl_xor(vu, laneMask, width); + h = *reinterpret_cast(&vu); +#else unsigned int vu = v.x; vu = __shfl_xor(vu, laneMask, width); - - half h; h.x = (unsigned short)vu; +#endif return h; } #endif // CUDA_VERSION diff --git a/cpp/open3d/ml/Helper.h b/cpp/open3d/ml/Helper.h index 4193cfac9..e0c8edac2 100644 --- a/cpp/open3d/ml/Helper.h +++ b/cpp/open3d/ml/Helper.h @@ -32,6 +32,18 @@ namespace ml { #define OPEN3D_ML_CUDA_DRIVER_CHECK(err) \ __OPEN3D_ML_CUDA_DRIVER_CHECK(err, __FILE__, __LINE__) +#if __HIP_PLATFORM_AMD__ +inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(hipError_t err, + const char *file, + const int line, + bool abort = true) { + if (err != CUDA_SUCCESS) { + const char *error_string = hipGetErrorString(err); + utility::LogError("{}:{} CUDA driver error: {}", file, line, + error_string); + } +} +#else inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(CUresult err, const char *file, const int line, @@ -48,6 +60,7 @@ inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(CUresult err, } } } +#endif inline cudaStream_t GetDefaultStream() { (cudaStream_t)0; } diff --git a/cpp/open3d/ml/contrib/CMakeLists.txt b/cpp/open3d/ml/contrib/CMakeLists.txt index bca160547..8ce20657c 100644 --- a/cpp/open3d/ml/contrib/CMakeLists.txt +++ b/cpp/open3d/ml/contrib/CMakeLists.txt @@ -12,6 +12,22 @@ if(BUILD_CUDA_MODULE) ) endif() +if (WITH_ROCM) + include(Hipify) + + get_target_property(SOURCE_LIST ml_contrib SOURCES) + get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST) + + foreach(src IN LISTS HIPIFIED_SOURCE_LIST) + get_filename_component(ext ${src} EXT) + if(ext STREQUAL ".cu") + set_source_files_properties(${src} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + + set_property(TARGET ml_contrib PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST}) +endif() + open3d_show_and_abort_on_warning(ml_contrib) open3d_set_global_properties(ml_contrib) open3d_set_open3d_lib_properties(ml_contrib) diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh index 322b38180..245c00abe 100644 --- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh +++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh @@ -9,9 +9,17 @@ #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/utility/Helper.h" @@ -191,6 +199,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream, size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix (filter) cutlass::MatrixLayout::kColumnMajor, // layout of B matrix @@ -200,6 +209,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream, GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ // this is the pointer to the patch matrix TFeat* columns = (TFeat*)mem_columns.first; @@ -236,6 +246,23 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream, float* C = out_features + (run_i * num_cols_per_run * out_channels); int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else typename Gemm::Params params; int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension @@ -257,6 +284,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream, } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh index ef03b9f05..0217ea471 100644 --- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh +++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/utility/Helper.h" @@ -190,6 +198,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream, size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kRowMajor, // layout of B matrix @@ -198,6 +207,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream, GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; @@ -217,7 +227,6 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream, filter_dims, interpolation, coordinate_mapping, align_corners, individual_extent, isotropic_extent, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -233,6 +242,24 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream, float beta = 1; float* C = filter_backprop; int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + typename Gemm::Params params; int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension @@ -254,6 +281,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream, } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu index 218e4c2b6..7863b9d4f 100644 --- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu +++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu @@ -5,6 +5,7 @@ // SPDX-License-Identifier: MIT // ---------------------------------------------------------------------------- +#include "open3d/core/CUDAUtils.h" #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/utility/Helper.h" diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh index 29d8aba31..54f9c006f 100644 --- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh +++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/utility/Helper.h" @@ -102,6 +110,7 @@ void CConvTransposeComputeFeaturesCUDA( size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kColumnMajor, // layout of B matrix @@ -110,6 +119,7 @@ void CConvTransposeComputeFeaturesCUDA( GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; @@ -130,7 +140,6 @@ void CConvTransposeComputeFeaturesCUDA( interpolation, coordinate_mapping, align_corners, individual_extent, isotropic_extent, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -145,6 +154,24 @@ void CConvTransposeComputeFeaturesCUDA( float beta = 1; float* C = out_features + (run_i * num_cols_per_run * out_channels); int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + typename Gemm::Params params; int result = params.initialize(m, // GEMM M dimension @@ -168,6 +195,7 @@ void CConvTransposeComputeFeaturesCUDA( } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } if (out_importance) { diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh index 2c1dc7a16..3558e0ba8 100644 --- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh +++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/utility/Helper.h" @@ -203,6 +211,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream, sizeof(TOut) * spatial_filter_size * in_channels * out_channels, stream); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kRowMajor, // layout of B matrix @@ -211,6 +220,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream, GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; TFeat* gradient = ((TFeat*)mem_columns.first) + @@ -245,7 +255,6 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream, interpolation, coordinate_mapping, align_corners, individual_extent, isotropic_extent, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -261,6 +270,25 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream, float* C = filter_backprop; int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + typename Gemm::Params params; + int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension @@ -282,6 +310,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream, } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h b/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h index 4f1d1b740..e8f05a77f 100644 --- a/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h +++ b/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h @@ -18,7 +18,7 @@ namespace impl { /// Supported metrics enum Metric { L1, L2, Linf }; -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #define HOST_DEVICE __host__ __device__ #else #define HOST_DEVICE diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh index 3bbc297db..6fac85e85 100644 --- a/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh +++ b/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh @@ -9,9 +9,17 @@ #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h" #include "open3d/utility/Helper.h" @@ -151,6 +159,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream, size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix (filter) cutlass::MatrixLayout::kColumnMajor, // layout of B matrix @@ -160,6 +169,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream, GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ // this is the pointer to the patch matrix TFeat* columns = (TFeat*)mem_columns.first; @@ -194,6 +204,23 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream, float* C = out_features + (run_i * num_cols_per_run * out_channels); int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else typename Gemm::Params params; int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension @@ -215,6 +242,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream, } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh index 5dc0e6973..d8b7e3cfa 100644 --- a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh +++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h" #include "open3d/utility/Helper.h" @@ -150,6 +158,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream, size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kRowMajor, // layout of B matrix @@ -158,6 +167,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream, GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; @@ -175,7 +185,6 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream, neighbors_index, neighbors_kernel_index, neighbors_importance, neighbors_row_splits, num_kernel_elements, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -192,6 +201,26 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream, float* C = filter_backprop; int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + + typename Gemm::Params params; + int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension k, // GEMM K dimension @@ -212,6 +241,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream, } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu b/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu index 87d11ba3a..c163e1f4c 100644 --- a/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu +++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu @@ -5,6 +5,7 @@ // SPDX-License-Identifier: MIT // ---------------------------------------------------------------------------- +#include "open3d/core/CUDAUtils.h" #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h" #include "open3d/utility/Helper.h" diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh index efd0a5115..c571417e1 100644 --- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh +++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h" @@ -96,6 +104,7 @@ void SparseConvTransposeComputeFeaturesCUDA( size_t num_cols_per_run = std::min(mem_columns.second / bytes_per_column, size_t(num_out)); +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kColumnMajor, // layout of B matrix @@ -104,6 +113,7 @@ void SparseConvTransposeComputeFeaturesCUDA( GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; @@ -122,7 +132,6 @@ void SparseConvTransposeComputeFeaturesCUDA( neighbors_kernel_index, neighbors_importance, neighbors_row_splits, num_kernel_elements, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -137,6 +146,24 @@ void SparseConvTransposeComputeFeaturesCUDA( float beta = 1; float* C = out_features + (run_i * num_cols_per_run * out_channels); int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + typename Gemm::Params params; int result = params.initialize(m, // GEMM M dimension @@ -160,6 +187,7 @@ void SparseConvTransposeComputeFeaturesCUDA( } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } if (out_importance) { diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh index 3c6f546a0..a7dc46e96 100644 --- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh +++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh @@ -8,9 +8,17 @@ #pragma once #define EIGEN_USE_GPU +#if __HIP_PLATFORM_AMD__ + +#include + +#else + #include #include +#endif // __HIP_PLATFORM_AMD__ + #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h" #include "open3d/ml/impl/misc/MemoryAllocation.h" #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h" @@ -164,7 +172,7 @@ void SparseConvTransposeBackpropFilterCUDA( filter_backprop, 0, sizeof(TOut) * num_kernel_elements * in_channels * out_channels, stream); - +#if !__HIP_PLATFORM_AMD__ typedef cutlass::gemm::SgemmTraits< cutlass::MatrixLayout::kColumnMajor, // layout of A matrix cutlass::MatrixLayout::kRowMajor, // layout of B matrix @@ -173,6 +181,7 @@ void SparseConvTransposeBackpropFilterCUDA( GemmTraits; typedef cutlass::gemm::Gemm Gemm; +#endif // !__HIP_PLATFORM_AMD__ TFeat* columns = (TFeat*)mem_columns.first; TFeat* gradient = ((TFeat*)mem_columns.first) + @@ -205,7 +214,6 @@ void SparseConvTransposeBackpropFilterCUDA( neighbors_kernel_index, neighbors_importance, neighbors_row_splits, num_kernel_elements, normalize); - typename Gemm::Params params; // C is MxN // B is KxN // A is MxK @@ -221,6 +229,25 @@ void SparseConvTransposeBackpropFilterCUDA( float* C = filter_backprop; int ldc = m; +#if __HIP_PLATFORM_AMD__ + hipblasHandle_t handle; + hipblasStatus_t status; + status = hipblasCreate(&handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create HIPBLAS handle."); + } + status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k, + &alpha, A, lda, B, ldb, &beta, C, ldc); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to launch HIPBLAS Gemm."); + } + status = hipblasDestroy(handle); + if (status != HIPBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to destroy HIPBLAS handle."); + } +#else + typename Gemm::Params params; + int result = params.initialize(m, // GEMM M dimension n, // GEMM N dimension @@ -242,6 +269,7 @@ void SparseConvTransposeBackpropFilterCUDA( } Gemm::launch(params, stream); +#endif // __HIP_PLATFORM_AMD__ } } diff --git a/cpp/open3d/ml/paddle/CMakeLists.txt b/cpp/open3d/ml/paddle/CMakeLists.txt index a81fd2a09..26158f7ca 100644 --- a/cpp/open3d/ml/paddle/CMakeLists.txt +++ b/cpp/open3d/ml/paddle/CMakeLists.txt @@ -102,6 +102,17 @@ if (BUILD_CUDA_MODULE) ) endif() +if (WITH_ROCM) + include(Hipify) + + get_target_property(SOURCE_LIST open3d_paddle_ops SOURCES) + get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST) + # NOTE: Set language to HIP for source files to enforce using HIP_COMPILER instead of CXX_COMPILER + # to workaround _Float16 is not defined in CXX_COMPILER + set_source_files_properties(${HIPIFIED_SOURCE_LIST} PROPERTIES LANGUAGE HIP) + set_property(TARGET open3d_paddle_ops PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST}) +endif() + open3d_show_and_abort_on_warning(open3d_paddle_ops) open3d_set_global_properties(open3d_paddle_ops) @@ -133,17 +144,31 @@ target_link_libraries(open3d_paddle_ops PRIVATE ) if (BUILD_CUDA_MODULE) - target_link_libraries(open3d_paddle_ops PRIVATE - Open3D::3rdparty_cutlass - ${PADDLE_LIBRARIES} - CUDA::cuda_driver - ) - - if (TARGET Open3D::3rdparty_cub) + if (WITH_CUDA) + target_link_libraries(open3d_paddle_ops PRIVATE + Open3D::3rdparty_cutlass + ${PADDLE_LIBRARIES} + CUDA::cuda_driver + ) + if (TARGET Open3D::3rdparty_cub) + target_link_libraries(open3d_paddle_ops PRIVATE + Open3D::3rdparty_cub + ) + endif() + elseif(WITH_ROCM) target_link_libraries(open3d_paddle_ops PRIVATE - Open3D::3rdparty_cub + Open3D::3rdparty_hipblas + ${PADDLE_LIBRARIES} + Open3D::3rdparty_gflags + Open3D::3rdparty_glog ) + if (TARGET Open3D::3rdparty_cub) + target_link_libraries(open3d_paddle_ops PRIVATE + Open3D::3rdparty_cub + ) + endif() endif() + endif() install(TARGETS open3d_paddle_ops EXPORT Open3DPaddleOps diff --git a/cpp/open3d/ml/paddle/PaddleHelper.cpp b/cpp/open3d/ml/paddle/PaddleHelper.cpp index f67baf2c4..a902f0c9e 100644 --- a/cpp/open3d/ml/paddle/PaddleHelper.cpp +++ b/cpp/open3d/ml/paddle/PaddleHelper.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: MIT // ---------------------------------------------------------------------------- -#include "PaddleHelper.h" +#include "open3d/ml/paddle/PaddleHelper.h" paddle::Tensor InitializedEmptyTensor(const phi::DataType dtype, const phi::IntArray& shape, diff --git a/cpp/open3d/ml/paddle/PaddleHelper.h b/cpp/open3d/ml/paddle/PaddleHelper.h index 17f86e21e..70764c4a8 100644 --- a/cpp/open3d/ml/paddle/PaddleHelper.h +++ b/cpp/open3d/ml/paddle/PaddleHelper.h @@ -241,7 +241,11 @@ std::tuple CheckShape(paddle::Tensor tensor, #ifdef BUILD_CUDA_MODULE static void cudaFreeWrapper(void* ptr) { +#if __HIP_PLATFORM_AMD__ + phi::gpuError_t result = hipFree(ptr); +#else phi::gpuError_t result = cudaFree(ptr); +#endif PADDLE_ENFORCE_GPU_SUCCESS(result); } #endif @@ -260,7 +264,11 @@ paddle::Tensor InitializedEmptyTensor(const phi::IntArray& shape, T* ptr = nullptr; if (phi::is_gpu_place(place)) { #ifdef BUILD_CUDA_MODULE +#if __HIP_PLATFORM_AMD__ + phi::gpuError_t result = hipMalloc(&ptr, sizeof(T) * 1); +#else phi::gpuError_t result = cudaMalloc(&ptr, sizeof(T) * 1); +#endif PADDLE_ENFORCE_GPU_SUCCESS(result); deleter = std::function(cudaFreeWrapper); #else @@ -269,7 +277,7 @@ paddle::Tensor InitializedEmptyTensor(const phi::IntArray& shape, #endif } else if (phi::is_cpu_place(place)) { ptr = (T*)malloc(sizeof(T) * 1); - deleter = std::function(free); + deleter = std::function(std::free); } else { PD_CHECK(false, "Not supported backend!"); } diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index 081d24a6b..6d536ecf5 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -32,6 +32,23 @@ if (WITH_IPPICV) ) endif() +if (WITH_ROCM) + include(Hipify) + + get_target_property(SOURCE_LIST tgeometry_kernel SOURCES) + get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST) + + foreach(src IN LISTS HIPIFIED_SOURCE_LIST) + get_filename_component(ext ${src} EXT) + if(ext STREQUAL ".cu") + set_source_files_properties(${src} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + + set_property(TARGET tgeometry_kernel PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST}) +endif() + + open3d_show_and_abort_on_warning(tgeometry_kernel) open3d_set_global_properties(tgeometry_kernel) open3d_set_open3d_lib_properties(tgeometry_kernel HIDDEN) diff --git a/cpp/open3d/t/geometry/kernel/GeometryMacros.h b/cpp/open3d/t/geometry/kernel/GeometryMacros.h index bb5eab804..d692da781 100644 --- a/cpp/open3d/t/geometry/kernel/GeometryMacros.h +++ b/cpp/open3d/t/geometry/kernel/GeometryMacros.h @@ -11,7 +11,7 @@ #include "open3d/core/CUDAUtils.h" -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ < 600 @@ -54,7 +54,7 @@ OPEN3D_HOST_DEVICE scalar_t Square(const scalar_t &x) { } // namespace open3d // https://stackoverflow.com/a/51549250 -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) __device__ inline float atomicMinf(float *addr, float value) { float old; old = (value >= 0) ? __int_as_float( diff --git a/cpp/open3d/t/geometry/kernel/ImageImpl.h b/cpp/open3d/t/geometry/kernel/ImageImpl.h index fbe2bcd1d..4743f2bac 100644 --- a/cpp/open3d/t/geometry/kernel/ImageImpl.h +++ b/cpp/open3d/t/geometry/kernel/ImageImpl.h @@ -20,12 +20,12 @@ namespace geometry { namespace kernel { namespace image { -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::isinf; using std::isnan; #endif -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void ToCUDA #else void ToCPU @@ -82,7 +82,7 @@ void ToCPU #undef LINEAR_SATURATE } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void ClipTransformCUDA #else void ClipTransformCPU @@ -118,7 +118,7 @@ void ClipTransformCPU // Reimplementation of the reference: // https://github.com/mp3guy/ICPCUDA/blob/master/Cuda/pyrdown.cu#L41 -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void PyrDownDepthCUDA #else void PyrDownDepthCPU @@ -143,7 +143,7 @@ void PyrDownDepthCPU const int gkernel_size_2 = gkernel_size / 2; const float gweights[3] = {0.375f, 0.25f, 0.0625f}; -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::abs; using std::max; using std::min; @@ -191,7 +191,7 @@ void PyrDownDepthCPU }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void CreateVertexMapCUDA #else void CreateVertexMapCPU @@ -209,7 +209,7 @@ void CreateVertexMapCPU int64_t cols = src.GetShape(1); int64_t n = rows * cols; -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::isinf; using std::isnan; #endif @@ -238,7 +238,7 @@ void CreateVertexMapCPU } }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void CreateNormalMapCUDA #else void CreateNormalMapCPU @@ -303,7 +303,7 @@ void CreateNormalMapCPU }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void ColorizeDepthCUDA #else void ColorizeDepthCPU diff --git a/cpp/open3d/t/geometry/kernel/NPPImage.cpp b/cpp/open3d/t/geometry/kernel/NPPImage.cpp index eb47ec9ba..24dc77b9d 100644 --- a/cpp/open3d/t/geometry/kernel/NPPImage.cpp +++ b/cpp/open3d/t/geometry/kernel/NPPImage.cpp @@ -7,7 +7,9 @@ #include "open3d/t/geometry/kernel/NPPImage.h" +#if !__HIP_PLATFORM_AMD__ #include +#endif #include "open3d/core/CUDAUtils.h" #include "open3d/core/Dtype.h" @@ -21,7 +23,9 @@ namespace t { namespace geometry { namespace npp { +#if !__HIP_PLATFORM_AMD__ static NppStreamContext MakeNPPContext() { + utility::LogError("npp::MakeNPPContext() is not supported on ROCM GPUs."); NppStreamContext context; context.hStream = core::cuda::GetStream(); context.nCudaDeviceId = core::cuda::GetDevice(); @@ -59,8 +63,12 @@ static NppStreamContext MakeNPPContext() { return context; } +#endif // !__HIP_PLATFORM_AMD__ void RGBToGray(const core::Tensor &src_im, core::Tensor &dst_im) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::RGBToGray() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im.GetDevice()) { utility::LogError( "src_im and dst_im are not on the same device, got {} and {}.", @@ -92,11 +100,15 @@ void RGBToGray(const core::Tensor &src_im, core::Tensor &dst_im) { dtype.ToString()); } #undef NPP_ARGS +#endif // __HIP_PLATFORM_AMD__ } void Resize(const open3d::core::Tensor &src_im, open3d::core::Tensor &dst_im, t::geometry::Image::InterpType interp_type) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::Resize() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im.GetDevice()) { utility::LogError( "src_im and dst_im are not on the same device, got {} and {}.", @@ -173,9 +185,13 @@ void Resize(const open3d::core::Tensor &src_im, dtype.ToString()); } #undef NPP_ARGS +#endif // __HIP_PLATFORM_AMD__ } void Dilate(const core::Tensor &src_im, core::Tensor &dst_im, int kernel_size) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::Dilate() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im.GetDevice()) { utility::LogError( "src_im and dst_im are not on the same device, got {} and {}.", @@ -241,11 +257,15 @@ void Dilate(const core::Tensor &src_im, core::Tensor &dst_im, int kernel_size) { dtype.ToString()); } #undef NPP_ARGS +#endif // __HIP_PLATFORM_AMD__ } void Filter(const open3d::core::Tensor &src_im, open3d::core::Tensor &dst_im, const open3d::core::Tensor &kernel) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::Filter() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im.GetDevice()) { utility::LogError( "src_im and dst_im are not on the same device, got {} and {}.", @@ -314,6 +334,7 @@ void Filter(const open3d::core::Tensor &src_im, dtype.ToString()); } #undef NPP_ARGS +#endif // __HIP_PLATFORM_AMD__ } void FilterBilateral(const core::Tensor &src_im, @@ -321,6 +342,9 @@ void FilterBilateral(const core::Tensor &src_im, int kernel_size, float value_sigma, float distance_sigma) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::FilterBilateral() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im.GetDevice()) { utility::LogError( "src_im and dst_im are not on the same device, got {} and {}.", @@ -373,6 +397,7 @@ void FilterBilateral(const core::Tensor &src_im, dtype.ToString()); } #undef NPP_ARGS +#endif // __HIP_PLATFORM_AMD__ } void FilterGaussian(const core::Tensor &src_im, @@ -407,6 +432,9 @@ void FilterSobel(const core::Tensor &src_im, core::Tensor &dst_im_dx, core::Tensor &dst_im_dy, int kernel_size) { +#if __HIP_PLATFORM_AMD__ + utility::LogError("npp::FilterSobel() is not supported on ROCM GPUs."); +#else if (src_im.GetDevice() != dst_im_dx.GetDevice() || src_im.GetDevice() != dst_im_dy.GetDevice()) { utility::LogError( @@ -479,6 +507,7 @@ void FilterSobel(const core::Tensor &src_im, if (cuda_version < 10020) { dst_im_dx.Neg_(); } +#endif // __HIP_PLATFORM_AMD__ } } // namespace npp } // namespace geometry diff --git a/cpp/open3d/t/geometry/kernel/PointCloudImpl.h b/cpp/open3d/t/geometry/kernel/PointCloudImpl.h index cf442e008..14135bba2 100644 --- a/cpp/open3d/t/geometry/kernel/PointCloudImpl.h +++ b/cpp/open3d/t/geometry/kernel/PointCloudImpl.h @@ -37,7 +37,7 @@ using std::min; using std::sqrt; #endif -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void UnprojectCUDA #else void UnprojectCPU @@ -77,7 +77,7 @@ void UnprojectCPU } // Counter -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::Tensor count(std::vector{0}, {}, core::Int32, depth.GetDevice()); int* count_ptr = count.GetDataPtr(); #else @@ -119,13 +119,13 @@ void UnprojectCPU } }); }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) int total_pts_count = count.Item(); #else int total_pts_count = (*count_ptr).load(); #endif -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) || defined(__HIPCC__) core::cuda::Synchronize(); #endif points = points.Slice(0, 0, total_pts_count); @@ -135,7 +135,7 @@ void UnprojectCPU } } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void GetPointMaskWithinAABBCUDA #else void GetPointMaskWithinAABBCPU @@ -169,7 +169,7 @@ void GetPointMaskWithinAABBCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void GetPointMaskWithinOBBCUDA #else void GetPointMaskWithinOBBCPU @@ -213,7 +213,7 @@ void GetPointMaskWithinOBBCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void NormalizeNormalsCUDA #else void NormalizeNormalsCPU @@ -244,7 +244,7 @@ void NormalizeNormalsCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void OrientNormalsToAlignWithDirectionCUDA #else void OrientNormalsToAlignWithDirectionCPU @@ -278,7 +278,7 @@ void OrientNormalsToAlignWithDirectionCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void OrientNormalsTowardsCameraLocationCUDA #else void OrientNormalsTowardsCameraLocationCPU @@ -413,7 +413,7 @@ OPEN3D_HOST_DEVICE bool IsBoundaryPoints(const scalar_t* angles, return max_diff > angle_threshold * M_PI / 180.0 ? true : false; } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ComputeBoundaryPointsCUDA #else void ComputeBoundaryPointsCPU @@ -554,7 +554,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseRobustNormalizedCovarianceKernel( covariance_ptr[7] = covariance_ptr[5]; } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateCovariancesUsingHybridSearchCUDA #else void EstimateCovariancesUsingHybridSearchCPU @@ -604,7 +604,7 @@ void EstimateCovariancesUsingHybridSearchCPU core::cuda::Synchronize(points.GetDevice()); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateCovariancesUsingRadiusSearchCUDA #else void EstimateCovariancesUsingRadiusSearchCPU @@ -653,7 +653,7 @@ void EstimateCovariancesUsingRadiusSearchCPU core::cuda::Synchronize(points.GetDevice()); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateCovariancesUsingKNNSearchCUDA #else void EstimateCovariancesUsingKNNSearchCPU @@ -971,7 +971,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseNormalsWithFastEigen3x3( } } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateNormalsFromCovariancesCUDA #else void EstimateNormalsFromCovariancesCPU @@ -1125,7 +1125,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseColorGradientKernel( } } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateColorGradientsUsingHybridSearchCUDA #else void EstimateColorGradientsUsingHybridSearchCPU @@ -1177,7 +1177,7 @@ void EstimateColorGradientsUsingHybridSearchCPU core::cuda::Synchronize(points.GetDevice()); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateColorGradientsUsingKNNSearchCUDA #else void EstimateColorGradientsUsingKNNSearchCPU @@ -1232,7 +1232,7 @@ void EstimateColorGradientsUsingKNNSearchCPU core::cuda::Synchronize(points.GetDevice()); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateColorGradientsUsingRadiusSearchCUDA #else void EstimateColorGradientsUsingRadiusSearchCPU diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h index 14c08a01e..844d5ba76 100644 --- a/cpp/open3d/t/geometry/kernel/TransformImpl.h +++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h @@ -87,7 +87,7 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel( normals_ptr[2] = x[2]; } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void TransformPointsCUDA #else void TransformPointsCPU @@ -107,7 +107,7 @@ void TransformPointsCPU }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void TransformNormalsCUDA #else void TransformNormalsCPU @@ -127,7 +127,7 @@ void TransformNormalsCPU }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void RotatePointsCUDA #else void RotatePointsCPU @@ -149,7 +149,7 @@ void RotatePointsCPU }); } -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) void RotateNormalsCUDA #else void RotateNormalsCPU diff --git a/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h b/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h index bbc032121..2e305d83c 100644 --- a/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h +++ b/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h @@ -21,11 +21,11 @@ namespace geometry { namespace kernel { namespace trianglemesh { -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::isnan; #endif -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void NormalizeNormalsCUDA #else void NormalizeNormalsCPU @@ -62,7 +62,7 @@ void NormalizeNormalsCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ComputeTriangleNormalsCUDA #else void ComputeTriangleNormalsCPU @@ -107,7 +107,7 @@ void ComputeTriangleNormalsCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ComputeTriangleAreasCUDA #else void ComputeTriangleAreasCPU diff --git a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h index bd40b18b3..1ce00b613 100644 --- a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h +++ b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h @@ -30,7 +30,7 @@ namespace voxel_grid { using index_t = int; using ArrayIndexer = TArrayIndexer; -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void GetVoxelCoordinatesAndFlattenedIndicesCUDA #else void GetVoxelCoordinatesAndFlattenedIndicesCPU @@ -141,7 +141,7 @@ template -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void IntegrateCUDA #else void IntegrateCPU @@ -288,12 +288,12 @@ void IntegrateCPU *weight_ptr = weight + 1; }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::cuda::Synchronize(); #endif } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void EstimateRangeCUDA #else void EstimateRangeCPU @@ -337,7 +337,7 @@ void EstimateRangeCPU NDArrayIndexer frag_buffer_indexer(fragment_buffer, 1); NDArrayIndexer block_keys_indexer(block_keys, 1); TransformIndexer w2c_transform_indexer(intrinsics, extrinsics); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::Tensor count(std::vector{0}, {1}, core::Int32, block_keys.GetDevice()); int* count_ptr = count.GetDataPtr(); @@ -346,7 +346,7 @@ void EstimateRangeCPU std::atomic* count_ptr = &count_atomic; #endif -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::max; using std::min; #endif @@ -434,7 +434,7 @@ void EstimateRangeCPU } } }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) int needed_frag_count = count[0].Item(); #else int needed_frag_count = (*count_ptr).load(); @@ -486,7 +486,7 @@ void EstimateRangeCPU float z_min = frag_ptr[0]; float z_max = frag_ptr[1]; float* range_ptr = range_map_indexer.GetDataPtr(u, v); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) atomicMinf(&(range_ptr[0]), z_min); atomicMaxf(&(range_ptr[1]), z_max); #else @@ -498,7 +498,7 @@ void EstimateRangeCPU #endif }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::cuda::Synchronize(); #endif @@ -533,7 +533,7 @@ struct MiniVecCache { }; template -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void RayCastCUDA #else void RayCastCPU @@ -559,7 +559,7 @@ void RayCastCPU using Eq = utility::MiniVecEq; auto device_hashmap = hashmap->GetDeviceHashBackend(); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) auto cuda_hashmap = std::dynamic_pointer_cast>( device_hashmap); @@ -675,7 +675,7 @@ void RayCastCPU index_t resolution2 = block_resolution * block_resolution; index_t resolution3 = resolution2 * block_resolution; -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::max; using std::sqrt; #endif @@ -781,7 +781,7 @@ void RayCastCPU if (mask_indexer.GetDataPtr()) { mask_ptr = mask_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -790,7 +790,7 @@ void RayCastCPU } if (index_indexer.GetDataPtr()) { index_ptr = index_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -799,7 +799,7 @@ void RayCastCPU } if (interp_ratio_indexer.GetDataPtr()) { interp_ratio_ptr = interp_ratio_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -809,7 +809,7 @@ void RayCastCPU if (interp_ratio_dx_indexer.GetDataPtr()) { interp_ratio_dx_ptr = interp_ratio_dx_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -819,7 +819,7 @@ void RayCastCPU if (interp_ratio_dy_indexer.GetDataPtr()) { interp_ratio_dy_ptr = interp_ratio_dy_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -829,7 +829,7 @@ void RayCastCPU if (interp_ratio_dz_indexer.GetDataPtr()) { interp_ratio_dz_ptr = interp_ratio_dz_indexer.GetDataPtr(x, y); -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #pragma unroll #endif for (int i = 0; i < 8; ++i) { @@ -1026,13 +1026,13 @@ void RayCastCPU } // surface-found }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::cuda::Synchronize(); #endif } template -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ExtractPointCloudCUDA #else void ExtractPointCloudCPU @@ -1085,7 +1085,7 @@ void ExtractPointCloudCPU index_t n = n_blocks * resolution3; // Output -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::Tensor count(std::vector{0}, {1}, core::Int32, block_keys.GetDevice()); index_t* count_ptr = count.GetDataPtr(); @@ -1139,7 +1139,7 @@ void ExtractPointCloudCPU } }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) valid_size = count[0].Item(); count[0] = 0; #else @@ -1274,7 +1274,7 @@ void ExtractPointCloudCPU } }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) index_t total_count = count.Item(); #else index_t total_count = (*count_ptr).load(); @@ -1289,7 +1289,7 @@ void ExtractPointCloudCPU } template -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ExtractTriangleMeshCUDA #else void ExtractTriangleMeshCPU @@ -1434,7 +1434,7 @@ void ExtractTriangleMeshCPU }); // Pass 1: determine valid number of vertices (if not preset) -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) core::Tensor count(std::vector{0}, {}, core::Int32, device); index_t* count_ptr = count.GetDataPtr(); @@ -1473,7 +1473,7 @@ void ExtractTriangleMeshCPU } }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) vertex_count = count.Item(); #else vertex_count = (*count_ptr).load(); @@ -1495,7 +1495,7 @@ void ExtractTriangleMeshCPU ArrayIndexer block_keys_indexer(block_keys, 1); ArrayIndexer vertex_indexer(vertices, 1); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) count = core::Tensor(std::vector{0}, {}, core::Int32, device); count_ptr = count.GetDataPtr(); #else @@ -1620,7 +1620,7 @@ void ExtractTriangleMeshCPU triangles = core::Tensor({triangle_count, 3}, core::Int32, device); ArrayIndexer triangle_indexer(triangles, 1); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) count = core::Tensor(std::vector{0}, {}, core::Int32, device); count_ptr = count.GetDataPtr(); #else @@ -1678,7 +1678,7 @@ void ExtractTriangleMeshCPU } }); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) triangle_count = count.Item(); #else triangle_count = (*count_ptr).load(); diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt index a766715e6..4140f0e1e 100644 --- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt +++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt @@ -22,6 +22,22 @@ if (BUILD_CUDA_MODULE) ) endif() +if (WITH_ROCM) + include(Hipify) + + get_target_property(SOURCE_LIST tpipelines_kernel SOURCES) + get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST) + + foreach(src IN LISTS HIPIFIED_SOURCE_LIST) + get_filename_component(ext ${src} EXT) + if(ext STREQUAL ".cu") + set_source_files_properties(${src} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + + set_property(TARGET tpipelines_kernel PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST}) +endif() + open3d_show_and_abort_on_warning(tpipelines_kernel) open3d_set_global_properties(tpipelines_kernel) # The kernels are used in the unit tests, so they cannot be hidden for now. diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h index bd00f29fb..14504f7ee 100644 --- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h @@ -16,7 +16,7 @@ namespace t { namespace pipelines { namespace kernel { -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::max; using std::min; #endif @@ -104,7 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature, spfh[idx * 33 + h_index3 + 22] += hist_incr; } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void ComputeFPFHFeatureCUDA #else void ComputeFPFHFeatureCPU diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h index e158809f6..aa65f5082 100644 --- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h +++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h @@ -13,7 +13,7 @@ namespace open3d { namespace t { namespace pipelines { namespace kernel { -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void FillInRigidAlignmentTermCUDA #else void FillInRigidAlignmentTermCPU @@ -127,7 +127,7 @@ void FillInRigidAlignmentTermCPU Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1})); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void FillInSLACAlignmentTermCUDA #else void FillInSLACAlignmentTermCPU @@ -247,7 +247,7 @@ void FillInSLACAlignmentTermCPU } // Not optimized; Switch to reduction if necessary. -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) for (int ki = 0; ki < 60; ++ki) { for (int kj = 0; kj < 60; ++kj) { float AtA_ij = J[ki] * J[kj]; @@ -274,7 +274,7 @@ void FillInSLACAlignmentTermCPU }); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) void FillInSLACRegularizerTermCUDA #else void FillInSLACRegularizerTermCPU @@ -413,7 +413,7 @@ void FillInSLACRegularizerTermCPU int offset_idx_i = 3 * idx_i + 6 * n_frags; int offset_idx_k = 3 * idx_k + 6 * n_frags; -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) // Update residual atomicAdd(residual_ptr, weight * (local_r[0] * local_r[0] + diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h index 0dbe0d97f..63e0e84ea 100644 --- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h @@ -20,7 +20,7 @@ namespace odometry { using t::geometry::kernel::NDArrayIndexer; using t::geometry::kernel::TransformIndexer; -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::abs; using std::isnan; using std::max; diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h index 3f08ca4c8..533ad6ab0 100644 --- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h +++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h @@ -17,7 +17,7 @@ #include "open3d/t/pipelines/kernel/TransformationConverter.h" #include "open3d/t/pipelines/registration/RobustKernel.h" -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC__) using std::abs; #endif @@ -187,21 +187,23 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane( return true; } -template bool GetJacobianPointToPlane(int64_t workload_idx, - const float *source_points_ptr, - const float *target_points_ptr, - const float *target_normals_ptr, - const int64_t *correspondence_indices, - float *J_ij, - float &r); - -template bool GetJacobianPointToPlane(int64_t workload_idx, - const double *source_points_ptr, - const double *target_points_ptr, - const double *target_normals_ptr, - const int64_t *correspondence_indices, - double *J_ij, - double &r); +template OPEN3D_HOST_DEVICE bool GetJacobianPointToPlane( + int64_t workload_idx, + const float *source_points_ptr, + const float *target_points_ptr, + const float *target_normals_ptr, + const int64_t *correspondence_indices, + float *J_ij, + float &r); + +template OPEN3D_HOST_DEVICE bool GetJacobianPointToPlane( + int64_t workload_idx, + const double *source_points_ptr, + const double *target_points_ptr, + const double *target_normals_ptr, + const int64_t *correspondence_indices, + double *J_ij, + double &r); template OPEN3D_HOST_DEVICE inline bool GetJacobianColoredICP( @@ -285,35 +287,37 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianColoredICP( return true; } -template bool GetJacobianColoredICP(const int64_t workload_idx, - const float *source_points_ptr, - const float *source_colors_ptr, - const float *target_points_ptr, - const float *target_normals_ptr, - const float *target_colors_ptr, - const float *target_color_gradients_ptr, - const int64_t *correspondence_indices, - const float &sqrt_lambda_geometric, - const float &sqrt_lambda_photometric, - float *J_G, - float *J_I, - float &r_G, - float &r_I); - -template bool GetJacobianColoredICP(const int64_t workload_idx, - const double *source_points_ptr, - const double *source_colors_ptr, - const double *target_points_ptr, - const double *target_normals_ptr, - const double *target_colors_ptr, - const double *target_color_gradients_ptr, - const int64_t *correspondence_indices, - const double &sqrt_lambda_geometric, - const double &sqrt_lambda_photometric, - double *J_G, - double *J_I, - double &r_G, - double &r_I); +template OPEN3D_HOST_DEVICE bool GetJacobianColoredICP( + const int64_t workload_idx, + const float *source_points_ptr, + const float *source_colors_ptr, + const float *target_points_ptr, + const float *target_normals_ptr, + const float *target_colors_ptr, + const float *target_color_gradients_ptr, + const int64_t *correspondence_indices, + const float &sqrt_lambda_geometric, + const float &sqrt_lambda_photometric, + float *J_G, + float *J_I, + float &r_G, + float &r_I); + +template OPEN3D_HOST_DEVICE bool GetJacobianColoredICP( + const int64_t workload_idx, + const double *source_points_ptr, + const double *source_colors_ptr, + const double *target_points_ptr, + const double *target_normals_ptr, + const double *target_colors_ptr, + const double *target_color_gradients_ptr, + const int64_t *correspondence_indices, + const double &sqrt_lambda_geometric, + const double &sqrt_lambda_photometric, + double *J_G, + double *J_I, + double &r_G, + double &r_I); template OPEN3D_HOST_DEVICE inline void PreComputeForDopplerICP( @@ -333,17 +337,19 @@ OPEN3D_HOST_DEVICE inline void PreComputeForDopplerICP( core::linalg::kernel::matmul3x3_3x1(R_S_to_V, v_s_in_V, v_s_in_S); } -template void PreComputeForDopplerICP(const float *R_S_to_V, - const float *r_v_to_s_in_V, - const float *w_v_in_V, - const float *v_v_in_V, - float *v_s_in_S); +template OPEN3D_HOST_DEVICE void PreComputeForDopplerICP( + const float *R_S_to_V, + const float *r_v_to_s_in_V, + const float *w_v_in_V, + const float *v_v_in_V, + float *v_s_in_S); -template void PreComputeForDopplerICP(const double *R_S_to_V, - const double *r_v_to_s_in_V, - const double *w_v_in_V, - const double *v_v_in_V, - double *v_s_in_S); +template OPEN3D_HOST_DEVICE void PreComputeForDopplerICP( + const double *R_S_to_V, + const double *r_v_to_s_in_V, + const double *w_v_in_V, + const double *v_v_in_V, + double *v_s_in_S); template OPEN3D_HOST_DEVICE inline bool GetJacobianDopplerICP( @@ -435,45 +441,47 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianDopplerICP( return true; } -template bool GetJacobianDopplerICP(const int64_t workload_idx, - const float *source_points_ptr, - const float *source_dopplers_ptr, - const float *source_directions_ptr, - const float *target_points_ptr, - const float *target_normals_ptr, - const int64_t *correspondence_indices, - const float *R_S_to_V, - const float *r_v_to_s_in_V, - const float *v_s_in_S, - const bool reject_dynamic_outliers, - const float doppler_outlier_threshold, - const float &sqrt_lambda_geometric, - const float &sqrt_lambda_doppler, - const float &sqrt_lambda_doppler_by_dt, - float *J_G, - float *J_D, - float &r_G, - float &r_D); - -template bool GetJacobianDopplerICP(const int64_t workload_idx, - const double *source_points_ptr, - const double *source_dopplers_ptr, - const double *source_directions_ptr, - const double *target_points_ptr, - const double *target_normals_ptr, - const int64_t *correspondence_indices, - const double *R_S_to_V, - const double *r_v_to_s_in_V, - const double *v_s_in_S, - const bool reject_dynamic_outliers, - const double doppler_outlier_threshold, - const double &sqrt_lambda_geometric, - const double &sqrt_lambda_doppler, - const double &sqrt_lambda_doppler_by_dt, - double *J_G, - double *J_D, - double &r_G, - double &r_D); +template OPEN3D_HOST_DEVICE bool GetJacobianDopplerICP( + const int64_t workload_idx, + const float *source_points_ptr, + const float *source_dopplers_ptr, + const float *source_directions_ptr, + const float *target_points_ptr, + const float *target_normals_ptr, + const int64_t *correspondence_indices, + const float *R_S_to_V, + const float *r_v_to_s_in_V, + const float *v_s_in_S, + const bool reject_dynamic_outliers, + const float doppler_outlier_threshold, + const float &sqrt_lambda_geometric, + const float &sqrt_lambda_doppler, + const float &sqrt_lambda_doppler_by_dt, + float *J_G, + float *J_D, + float &r_G, + float &r_D); + +template OPEN3D_HOST_DEVICE bool GetJacobianDopplerICP( + const int64_t workload_idx, + const double *source_points_ptr, + const double *source_dopplers_ptr, + const double *source_directions_ptr, + const double *target_points_ptr, + const double *target_normals_ptr, + const int64_t *correspondence_indices, + const double *R_S_to_V, + const double *r_v_to_s_in_V, + const double *v_s_in_S, + const bool reject_dynamic_outliers, + const double doppler_outlier_threshold, + const double &sqrt_lambda_geometric, + const double &sqrt_lambda_doppler, + const double &sqrt_lambda_doppler_by_dt, + double *J_G, + double *J_D, + double &r_G, + double &r_D); template OPEN3D_HOST_DEVICE inline bool GetInformationJacobians( @@ -507,19 +515,21 @@ OPEN3D_HOST_DEVICE inline bool GetInformationJacobians( return true; } -template bool GetInformationJacobians(int64_t workload_idx, - const float *target_points_ptr, - const int64_t *correspondence_indices, - float *jacobian_x, - float *jacobian_y, - float *jacobian_z); - -template bool GetInformationJacobians(int64_t workload_idx, - const double *target_points_ptr, - const int64_t *correspondence_indices, - double *jacobian_x, - double *jacobian_y, - double *jacobian_z); +template OPEN3D_HOST_DEVICE bool GetInformationJacobians( + int64_t workload_idx, + const float *target_points_ptr, + const int64_t *correspondence_indices, + float *jacobian_x, + float *jacobian_y, + float *jacobian_z); + +template OPEN3D_HOST_DEVICE bool GetInformationJacobians( + int64_t workload_idx, + const double *target_points_ptr, + const int64_t *correspondence_indices, + double *jacobian_x, + double *jacobian_y, + double *jacobian_z); } // namespace kernel } // namespace pipelines diff --git a/cpp/open3d/utility/Logging.h b/cpp/open3d/utility/Logging.h index fb4a21c86..c881dd9a0 100644 --- a/cpp/open3d/utility/Logging.h +++ b/cpp/open3d/utility/Logging.h @@ -12,7 +12,8 @@ #include // NVCC does not support deprecated attribute on Windows prior to v11. -#if defined(__CUDACC__) && defined(_MSC_VER) && __CUDACC_VER_MAJOR__ < 11 +#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(_MSC_VER) && \ + __CUDACC_VER_MAJOR__ < 11 #ifndef FMT_DEPRECATED #define FMT_DEPRECATED #endif diff --git a/cpp/open3d/utility/MiniVec.h b/cpp/open3d/utility/MiniVec.h index 17eca20b1..e766b6163 100644 --- a/cpp/open3d/utility/MiniVec.h +++ b/cpp/open3d/utility/MiniVec.h @@ -11,6 +11,8 @@ #ifdef __CUDACC__ #define FN_SPECIFIERS inline __host__ __device__ +#elif __HIPCC__ +#define FN_SPECIFIERS inline __host__ __device__ #else #define FN_SPECIFIERS inline #endif diff --git a/cpp/pybind/_build_config.py.in b/cpp/pybind/_build_config.py.in index 7ea14135f..12ea722d9 100644 --- a/cpp/pybind/_build_config.py.in +++ b/cpp/pybind/_build_config.py.in @@ -18,5 +18,6 @@ _build_config = { "Tensorflow_VERSION" : "@Tensorflow_VERSION@", "Pytorch_VERSION" : "@Pytorch_VERSION@", "Paddle_VERSION" : "@Paddle_VERSION@", - "WITH_OPENMP" : $,True,False> + "WITH_OPENMP" : $,True,False>, + "WITH_ROCM" : $,True,False> } diff --git a/cpp/pybind/t/geometry/geometry.h b/cpp/pybind/t/geometry/geometry.h index c8719bcdf..8888043af 100644 --- a/cpp/pybind/t/geometry/geometry.h +++ b/cpp/pybind/t/geometry/geometry.h @@ -38,7 +38,6 @@ void pybind_geometry(py::module& m); void pybind_geometry_class(py::module& m); void pybind_drawable_geometry_class(py::module& m); void pybind_tensormap(py::module& m); -void pybind_image(py::module& m); void pybind_pointcloud(py::module& m); void pybind_lineset(py::module& m); void pybind_trianglemesh(py::module& m); diff --git a/hipify_custom_map.json b/hipify_custom_map.json new file mode 100644 index 000000000..5ba507997 --- /dev/null +++ b/hipify_custom_map.json @@ -0,0 +1,43 @@ +{ + "custom_map": { + "cub::" : "hipcub::", + "cuStreamGetCtx": "hipStreamGetCtx", + + "cusolverStatus_t": "hipsolverStatus_t", + "cusolverDnHandle_t": "hipsolverDnHandle_t", + "CUSOLVER_STATUS_SUCCESS": "HIPSOLVER_STATUS_SUCCESS", + "CUSOLVER_STATUS_INTERNAL_ERROR": "HIPSOLVER_STATUS_INTERNAL_ERROR", + + "open3d/core/linalg/BlasWrapper.h": "open3d/core/linalg/BlasWrapper_hip.h", + "open3d/core/linalg/LinalgUtils.h": "open3d/core/linalg/LinalgUtils_hip.h", + "open3d/core/linalg/LapackWrapper.h": "open3d/core/linalg/LapackWrapper_hip.h", + "open3d/ml/Helper.h": "open3d/ml/Helper_hip.h", + + "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h": "open3d/ml/impl/continuous_conv/ContinuousConvHIPKernels.h", + "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h": "open3d/ml/impl/sparse_conv/SparseConvHIPKernels.h", + + "open3d/ml/impl/continuous_conv/ContinuousConv.cuh": "open3d/ml/impl/continuous_conv/ContinuousConv_hip.cuh", + "open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter_hip.cuh", + "open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvTranspose_hip.cuh", + "open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter_hip.cuh", + "open3d/ml/impl/sparse_conv/SparseConv.cuh": "open3d/ml/impl/sparse_conv/SparseConv_hip.cuh", + "open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh": "open3d/ml/impl/sparse_conv/SparseConvBackpropFilter_hip.cuh", + "open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh": "open3d/ml/impl/sparse_conv/SparseConvTranspose_hip.cuh", + "open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh": "open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter_hip.cuh", + + "open3d/ml/impl/misc/ReduceSubarraysSum.cuh": "open3d/ml/impl/misc/ReduceSubarraysSum_hip.cuh", + "open3d/ml/impl/misc/RaggedToDense.cuh": "open3d/ml/impl/misc/RaggedToDense_hip.cuh", + "open3d/ml/impl/misc/Voxelize.cuh": "open3d/ml/impl/misc/Voxelize_hip.cuh", + "open3d/ml/impl/misc/InvertNeighborsList.cuh": "open3d/ml/impl/misc/InvertNeighborsList_hip.cuh", + "open3d/ml/contrib/PointSampling.cuh": "open3d/ml/contrib/PointSampling_hip.cuh", + "open3d/ml/contrib/TrilinearDevoxelize.cuh": "open3d/ml/contrib/TrilinearDevoxelize_hip.cuh", + "open3d/ml/contrib/BallQuery.cuh": "open3d/ml/contrib/BallQuery_hip.cuh", + "open3d/ml/contrib/InterpolatePoints.cuh": "open3d/ml/contrib/InterpolatePoints_hip.cuh", + "open3d/core/nns/FixedRadiusSearchImpl.cuh": "open3d/core/nns/FixedRadiusSearchImpl_hip.cuh", + "open3d/core/nns/kernel/BlockSelectImpl.cuh": "open3d/core/nns/kernel/BlockSelectImpl_hip.cuh", + "open3d/core/nns/kernel/BlockSelect.cuh": "open3d/core/nns/kernel/BlockSelect_hip.cuh", + "open3d/core/nns/kernel/DistancesUtils.cuh": "open3d/core/nns/kernel/DistancesUtils_hip.cuh", + "open3d/core/nns/kernel/L2Select.cuh": "open3d/core/nns/kernel/L2Select_hip.cuh", + "open3d/core/nns/KnnSearchImpl.cuh": "open3d/core/nns/KnnSearchImpl_hip.cuh" + } +} \ No newline at end of file diff --git a/python/open3d/ml/paddle/ops/__init__.py b/python/open3d/ml/paddle/ops/__init__.py index 2e9336fd7..0760225c6 100644 --- a/python/open3d/ml/paddle/ops/__init__.py +++ b/python/open3d/ml/paddle/ops/__init__.py @@ -32,14 +32,25 @@ _lib_arch = ('cpu',) if _build_config["BUILD_CUDA_MODULE"] and _paddle.device.cuda.device_count( ) >= 1: - if _paddle.version.cuda() == _build_config["CUDA_VERSION"]: - _lib_arch = ('cuda', 'cpu') + if _build_config["WITH_ROCM"]: + if _paddle.is_compiled_with_rocm(): + # NOTE(beinggod): Skip rocm version check. + _lib_arch = ('cuda', 'cpu') + else: + print("Warning: Open3D was built with ROCm but " + "Paddle was not built with ROCm. Falling back to CPU for now." + "Otherwise, install Paddle with ROCm{}.") else: - print("Warning: Open3D was built with CUDA {} but" - "Paddle was built with CUDA {}. Falling back to CPU for now." - "Otherwise, install Paddle with CUDA {}.".format( - _build_config["CUDA_VERSION"], _paddle.version.cuda(), - _build_config["CUDA_VERSION"])) + # CUDA + if _paddle.version.cuda() == _build_config["CUDA_VERSION"]: + _lib_arch = ('cuda', 'cpu') + else: + print("Warning: Open3D was built with CUDA {} but " + "Paddle was built with CUDA {}. Falling back to CPU for now." + "Otherwise, install Paddle with CUDA {}.".format( + _build_config["CUDA_VERSION"], _paddle.version.cuda(), + _build_config["CUDA_VERSION"])) + _lib_path.extend([ _os.path.join(_package_root, la, 'open3d_paddle_ops' + _lib_suffix + _lib_ext) diff --git a/python/requirements_style.txt b/python/requirements_style.txt index f20e372b9..9e2a1639b 100644 --- a/python/requirements_style.txt +++ b/python/requirements_style.txt @@ -1,3 +1,4 @@ clang-format==10.0.1.1 yapf==0.30.0 nbformat==5.7.0 +pre-commit==4.5.1 diff --git a/python/test/ml_ops/test_cublas.py b/python/test/ml_ops/test_cublas.py index 075823ec9..daf5ed399 100644 --- a/python/test/ml_ops/test_cublas.py +++ b/python/test/ml_ops/test_cublas.py @@ -19,7 +19,7 @@ @mltest.parametrize.ml_gpu_only def test_cublas_matmul(ml): # This test checks if calling cublas functionality from open3d and the ml framework works. - os.environ["NVIDIA_TF32_OVERRIDE"] = 0 + os.environ["NVIDIA_TF32_OVERRIDE"] = "0" rng = np.random.RandomState(123) From 720ada4b9f3af83d33ae09ce51ac617f15e6af94 Mon Sep 17 00:00:00 2001 From: beinggod Date: Tue, 3 Feb 2026 16:11:03 +0800 Subject: [PATCH 2/3] fix: test_cublas.py and update requirements_style.txt Signed-off-by: beinggod --- python/requirements_style.txt | 1 + python/test/ml_ops/test_cublas.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/requirements_style.txt b/python/requirements_style.txt index f20e372b9..7b7f5fa74 100644 --- a/python/requirements_style.txt +++ b/python/requirements_style.txt @@ -1,3 +1,4 @@ clang-format==10.0.1.1 yapf==0.30.0 nbformat==5.7.0 +pre-commit=4.5.1 diff --git a/python/test/ml_ops/test_cublas.py b/python/test/ml_ops/test_cublas.py index 075823ec9..daf5ed399 100644 --- a/python/test/ml_ops/test_cublas.py +++ b/python/test/ml_ops/test_cublas.py @@ -19,7 +19,7 @@ @mltest.parametrize.ml_gpu_only def test_cublas_matmul(ml): # This test checks if calling cublas functionality from open3d and the ml framework works. - os.environ["NVIDIA_TF32_OVERRIDE"] = 0 + os.environ["NVIDIA_TF32_OVERRIDE"] = "0" rng = np.random.RandomState(123) From bad288d59bf0ba09f7ca736443b061157c43bb7f Mon Sep 17 00:00:00 2001 From: beinggod Date: Wed, 4 Feb 2026 14:03:34 +0800 Subject: [PATCH 3/3] fix tbb cmake Signed-off-by: beinggod --- 3rdparty/mkl/tbb.cmake | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake index 8c4fd6fa5..12705be7e 100644 --- a/3rdparty/mkl/tbb.cmake +++ b/3rdparty/mkl/tbb.cmake @@ -23,9 +23,21 @@ set(STATIC_TBB_LIBRARIES tbb_static tbbmalloc_static) find_package(Git QUIET REQUIRED) +set(TBB_CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX=${MKL_INSTALL_PREFIX} + -DSTATIC_WINDOWS_RUNTIME=${STATIC_WINDOWS_RUNTIME} + -DTBB_BUILD_TBBMALLOC=ON + -DTBB_BUILD_TBBMALLOC_PROXYC=OFF + -DTBB_BUILD_SHARED=OFF + -DTBB_BUILD_STATIC=ON + -DTBB_BUILD_TESTS=OFF + -DTBB_INSTALL_ARCHIVE_DIR=${Open3D_INSTALL_LIB_DIR} + -DTBB_CMAKE_PACKAGE_INSTALL_DIR=${Open3D_INSTALL_LIB_DIR}/cmake/tbb + ${ExternalProject_CMAKE_ARGS_hidden} +) if (WITH_ROCM) # NOTE(beinggod): Set CMAKE_CXX_COMPILER_ID to GNU to avoid link libunwind and libc++. - set(ExternalProject_CMAKE_ARGS_hidden "${ExternalProject_CMAKE_ARGS_hidden} -DCMAKE_CXX_COMPILER_ID=GNU") + list(APPEND TBB_CMAKE_ARGS -DCMAKE_CXX_COMPILER_ID=GNU) endif() ExternalProject_Add( @@ -39,16 +51,7 @@ ExternalProject_Add( COMMAND ${GIT_EXECUTABLE} apply --ignore-space-change --ignore-whitespace ${CMAKE_CURRENT_LIST_DIR}/0001-Allow-selecttion-of-static-dynamic-MSVC-runtime.patch CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${MKL_INSTALL_PREFIX} - -DSTATIC_WINDOWS_RUNTIME=${STATIC_WINDOWS_RUNTIME} - -DTBB_BUILD_TBBMALLOC=ON - -DTBB_BUILD_TBBMALLOC_PROXYC=OFF - -DTBB_BUILD_SHARED=OFF - -DTBB_BUILD_STATIC=ON - -DTBB_BUILD_TESTS=OFF - -DTBB_INSTALL_ARCHIVE_DIR=${Open3D_INSTALL_LIB_DIR} - -DTBB_CMAKE_PACKAGE_INSTALL_DIR=${Open3D_INSTALL_LIB_DIR}/cmake/tbb - ${ExternalProject_CMAKE_ARGS_hidden} + ${TBB_CMAKE_ARGS} BUILD_BYPRODUCTS ${STATIC_TBB_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tbb_static${CMAKE_STATIC_LIBRARY_SUFFIX} ${STATIC_TBB_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tbbmalloc_static${CMAKE_STATIC_LIBRARY_SUFFIX}