From 6953e92d00be85b312f266d36a00a3a9935f842e Mon Sep 17 00:00:00 2001
From: beinggod <beinggod@foxmail.com>
Date: Tue, 3 Feb 2026 15:43:17 +0800
Subject: [PATCH 1/3] [ROCm] feat: support rocm backend for Hygon DCU

---
 3rdparty/cmake/FindPaddle.cmake               |  23 +-
 3rdparty/find_dependencies.cmake              | 263 ++++++++++++------
 3rdparty/gflags/LICENSE                       |  28 ++
 3rdparty/gflags/gflags.cmake                  |  44 +++
 3rdparty/glog/LICENSE                         |   8 +
 3rdparty/glog/glog.cmake                      |  47 ++++
 3rdparty/hipify_torch/LICENSE                 |  21 ++
 3rdparty/hipify_torch/hipify_torch.cmake      |  15 +
 3rdparty/mkl/tbb.cmake                        |   5 +
 3rdparty/stdgpu/stdgpu.cmake                  |  37 ++-
 CMakeLists.txt                                |  60 +++-
 cmake/Open3DMakeRocmArchitectures.cmake       |  52 ++++
 cmake/Open3DPrintConfigurationSummary.cmake   |   1 +
 cmake/Open3DSetGlobalProperties.cmake         |   9 +
 cmake/Open3DShowAndAbortOnWarning.cmake       |  51 +++-
 cpp/open3d/CMakeLists.txt                     |  35 +++
 cpp/open3d/core/CMakeLists.txt                |  20 +-
 cpp/open3d/core/CUDAUtils.cpp                 |  99 +++++++
 cpp/open3d/core/CUDAUtils.h                   |  45 +++
 cpp/open3d/core/ParallelFor.h                 |  18 +-
 .../hashmap/CUDA/CUDAHashBackendBuffer.cu     |   4 +
 .../CUDA/CUDAHashBackendBufferAccessor.h      |   8 +
 .../core/hashmap/CUDA/SlabHashBackend.h       |  63 +++++
 cpp/open3d/core/hashmap/CUDA/SlabMacros.h     |   2 +-
 .../core/hashmap/CUDA/SlabNodeManager.h       |  20 ++
 cpp/open3d/core/hashmap/Dispatch.h            |   2 +-
 cpp/open3d/core/linalg/LinalgHeadersCUDA.h    |   9 +-
 cpp/open3d/core/nns/NeighborSearchCommon.h    |   2 +-
 cpp/open3d/core/nns/kernel/DeviceDefs.cuh     |  12 +-
 cpp/open3d/core/nns/kernel/MergeNetwork.cuh   |  15 +-
 cpp/open3d/core/nns/kernel/Pair.cuh           |   4 +
 cpp/open3d/core/nns/kernel/PtxUtils.cuh       |  95 +++++++
 cpp/open3d/core/nns/kernel/Reduction.cuh      |   4 +
 cpp/open3d/core/nns/kernel/ReductionOps.cuh   |   4 +
 cpp/open3d/core/nns/kernel/Select.cuh         |   4 +-
 cpp/open3d/core/nns/kernel/StaticUtils.cuh    |   4 +
 cpp/open3d/core/nns/kernel/WarpShuffle.cuh    |  37 ++-
 cpp/open3d/ml/Helper.h                        |  13 +
 cpp/open3d/ml/contrib/CMakeLists.txt          |  16 ++
 .../impl/continuous_conv/ContinuousConv.cuh   |  28 ++
 .../ContinuousConvBackpropFilter.cuh          |  30 +-
 .../ContinuousConvCUDAKernels.cu              |   1 +
 .../ContinuousConvTranspose.cuh               |  30 +-
 .../ContinuousConvTransposeBackpropFilter.cuh |  31 ++-
 .../ml/impl/misc/NeighborSearchCommon.h       |   2 +-
 cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh |  28 ++
 .../sparse_conv/SparseConvBackpropFilter.cuh  |  32 ++-
 .../impl/sparse_conv/SparseConvCUDAKernels.cu |   1 +
 .../impl/sparse_conv/SparseConvTranspose.cuh  |  30 +-
 .../SparseConvTransposeBackpropFilter.cuh     |  32 ++-
 cpp/open3d/ml/paddle/CMakeLists.txt           |  41 ++-
 cpp/open3d/ml/paddle/PaddleHelper.cpp         |   2 +-
 cpp/open3d/ml/paddle/PaddleHelper.h           |  10 +-
 cpp/open3d/t/geometry/kernel/CMakeLists.txt   |  17 ++
 cpp/open3d/t/geometry/kernel/GeometryMacros.h |   4 +-
 cpp/open3d/t/geometry/kernel/ImageImpl.h      |  18 +-
 cpp/open3d/t/geometry/kernel/NPPImage.cpp     |  29 ++
 cpp/open3d/t/geometry/kernel/PointCloudImpl.h |  34 +--
 cpp/open3d/t/geometry/kernel/TransformImpl.h  |   8 +-
 .../t/geometry/kernel/TriangleMeshImpl.h      |   8 +-
 .../t/geometry/kernel/VoxelBlockGridImpl.h    |  58 ++--
 cpp/open3d/t/pipelines/kernel/CMakeLists.txt  |  16 ++
 cpp/open3d/t/pipelines/kernel/FeatureImpl.h   |   4 +-
 .../pipelines/kernel/FillInLinearSystemImpl.h |  10 +-
 .../kernel/RGBDOdometryJacobianImpl.h         |   2 +-
 .../t/pipelines/kernel/RegistrationImpl.h     | 224 ++++++++-------
 cpp/open3d/utility/Logging.h                  |   3 +-
 cpp/open3d/utility/MiniVec.h                  |   2 +
 cpp/pybind/_build_config.py.in                |   3 +-
 cpp/pybind/t/geometry/geometry.h              |   1 -
 hipify_custom_map.json                        |  43 +++
 python/open3d/ml/paddle/ops/__init__.py       |  25 +-
 python/requirements_style.txt                 |   1 +
 python/test/ml_ops/test_cublas.py             |   2 +-
 74 files changed, 1656 insertions(+), 353 deletions(-)
 create mode 100644 3rdparty/gflags/LICENSE
 create mode 100644 3rdparty/gflags/gflags.cmake
 create mode 100644 3rdparty/glog/LICENSE
 create mode 100644 3rdparty/glog/glog.cmake
 create mode 100644 3rdparty/hipify_torch/LICENSE
 create mode 100644 3rdparty/hipify_torch/hipify_torch.cmake
 create mode 100644 cmake/Open3DMakeRocmArchitectures.cmake
 create mode 100644 hipify_custom_map.json

diff --git a/3rdparty/cmake/FindPaddle.cmake b/3rdparty/cmake/FindPaddle.cmake
index fe139cf14..dd1a32eed 100644
--- a/3rdparty/cmake/FindPaddle.cmake
+++ b/3rdparty/cmake/FindPaddle.cmake
@@ -20,8 +20,15 @@ if(NOT Paddle_FOUND)
     endif()
 
     if(BUILD_CUDA_MODULE)
-        find_package(CUDAToolkit REQUIRED)
-        string(SUBSTRING ${CUDAToolkit_VERSION} 0 4 CUDA_VERSION)
+        if (WITH_CUDA)
+            find_package(CUDAToolkit REQUIRED)
+            string(SUBSTRING ${CUDAToolkit_VERSION} 0 4 CUDA_VERSION)
+        elseif(WITH_ROCM)
+            find_package(HIP REQUIRED)
+            # Reuse CUDAToolkit variables for HIP
+            set(CUDAToolkit_INCLUDE_DIRS ${ROCM_PATH}/include)
+            set(CUDAToolkit_LIBRARY_DIR ${ROCM_PATH}/lib)
+        endif()
     endif()
 
     message(STATUS "Getting Paddle properties ...")
@@ -81,14 +88,20 @@ if(NOT Paddle_FOUND)
     list(APPEND PADDLE_LIBRARY_DIRS "${PADDLE_LIB}")
 
     if(BUILD_CUDA_MODULE)
-        find_library(CUDART_LIB NAMES cudart PATHS "${CUDAToolkit_LIBRARY_DIR}")
-        list(APPEND PADDLE_LIBRARY_DIRS "${CUDART_LIB}")
+        if (WITH_CUDA)
+            find_library(CUDART_LIB NAMES cudart PATHS "${CUDAToolkit_LIBRARY_DIR}")
+            list(APPEND PADDLE_LIBRARY_DIRS "${CUDART_LIB}")
+        endif()
     endif()
 
     # handle compile flags
     set(PADDLE_CXX_FLAGS)
     if(BUILD_CUDA_MODULE)
-        set(PADDLE_CXX_FLAGS "-DPADDLE_WITH_CUDA ${PADDLE_CXX_FLAGS}")
+        if (WITH_CUDA)
+            set(PADDLE_CXX_FLAGS -DPADDLE_WITH_CUDA ${PADDLE_CXX_FLAGS})
+        elseif(WITH_ROCM)
+            set(PADDLE_CXX_FLAGS -DPADDLE_WITH_HIP -DPADDLE_WITH_CUSTOM_KERNEL ${PADDLE_CXX_FLAGS})
+        endif()
     endif()
 
     set_target_properties(paddle PROPERTIES
diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake
index 169bfbe1a..b8dbb64a7 100644
--- a/3rdparty/find_dependencies.cmake
+++ b/3rdparty/find_dependencies.cmake
@@ -473,7 +473,19 @@ ProcessorCount(NPROC)
 
 # CUDAToolkit (required at this point for subsequent checks and targets)
 if(BUILD_CUDA_MODULE)
-    find_package(CUDAToolkit REQUIRED)
+    if (WITH_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    elseif (WITH_ROCM)
+        find_package(HIP REQUIRED)
+        
+        find_package(hipblaslt REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(hipsparse REQUIRED)
+        find_package(hipsolver REQUIRED)
+
+        # hipify_torch will automatically become available.
+        include(${Open3D_3RDPARTY_DIR}/hipify_torch/hipify_torch.cmake)
+    endif()
 endif()
 
 # Threads
@@ -532,37 +544,49 @@ endif()
 
 # CUB (already included in CUDA 11.0+)
 if(BUILD_CUDA_MODULE AND CUDAToolkit_VERSION VERSION_LESS "11.0")
-    include(${Open3D_3RDPARTY_DIR}/cub/cub.cmake)
-    open3d_import_3rdparty_library(3rdparty_cub
-        INCLUDE_DIRS ${CUB_INCLUDE_DIRS}
-        DEPENDS      ext_cub
-    )
-    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub)
+    if (WITH_CUDA)
+        include(${Open3D_3RDPARTY_DIR}/cub/cub.cmake)
+        open3d_import_3rdparty_library(3rdparty_cub
+            INCLUDE_DIRS ${CUB_INCLUDE_DIRS}
+            DEPENDS      ext_cub
+        )
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub)
+    elseif (WITH_ROCM)
+        set(CUB_INCLUDE_DIRS ${ROCM_PATH}/include)
+        open3d_import_3rdparty_library(3rdparty_cub
+            INCLUDE_DIRS ${CUB_INCLUDE_DIRS}
+        )
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cub)
+    endif()
 endif()
 
 # cutlass
 if(BUILD_CUDA_MODULE)
-    if(USE_SYSTEM_CUTLASS)
-        find_path(3rdparty_cutlass_INCLUDE_DIR NAMES cutlass/cutlass.h)
-        if(3rdparty_cutlass_INCLUDE_DIR)
-            add_library(3rdparty_cutlass INTERFACE)
-            target_include_directories(3rdparty_cutlass INTERFACE ${3rdparty_cutlass_INCLUDE_DIR})
-            add_library(Open3D::3rdparty_cutlass ALIAS 3rdparty_cutlass)
-            if(NOT BUILD_SHARED_LIBS)
-                install(TARGETS 3rdparty_cutlass EXPORT ${PROJECT_NAME}Targets)
+    if (WITH_CUDA)
+        if(USE_SYSTEM_CUTLASS)
+            find_path(3rdparty_cutlass_INCLUDE_DIR NAMES cutlass/cutlass.h)
+            if(3rdparty_cutlass_INCLUDE_DIR)
+                add_library(3rdparty_cutlass INTERFACE)
+                target_include_directories(3rdparty_cutlass INTERFACE ${3rdparty_cutlass_INCLUDE_DIR})
+                add_library(Open3D::3rdparty_cutlass ALIAS 3rdparty_cutlass)
+                if(NOT BUILD_SHARED_LIBS)
+                    install(TARGETS 3rdparty_cutlass EXPORT ${PROJECT_NAME}Targets)
+                endif()
+            else()
+                set(USE_SYSTEM_CUTLASS OFF)
             endif()
-        else()
-            set(USE_SYSTEM_CUTLASS OFF)
         endif()
+        if(NOT USE_SYSTEM_CUTLASS)
+            include(${Open3D_3RDPARTY_DIR}/cutlass/cutlass.cmake)
+            open3d_import_3rdparty_library(3rdparty_cutlass
+                INCLUDE_DIRS ${CUTLASS_INCLUDE_DIRS}
+                DEPENDS      ext_cutlass
+            )
+        endif()
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cutlass)
+    elseif (WITH_ROCM)
+        message(WARNING "ROCM is not supported for cutlass.")
     endif()
-    if(NOT USE_SYSTEM_CUTLASS)
-        include(${Open3D_3RDPARTY_DIR}/cutlass/cutlass.cmake)
-        open3d_import_3rdparty_library(3rdparty_cutlass
-            INCLUDE_DIRS ${CUTLASS_INCLUDE_DIRS}
-            DEPENDS      ext_cutlass
-        )
-    endif()
-    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_cutlass)
 endif()
 
 # Dirent
@@ -1756,80 +1780,99 @@ endif() # if(OPEN3D_USE_ONEAPI_PACKAGES)
 
 # cuBLAS
 if(BUILD_CUDA_MODULE)
-    if(WIN32)
-        # Nvidia does not provide static libraries for Windows. We don't release
-        # pip wheels for Windows with CUDA support at the moment. For the pip
-        # wheels to support CUDA on Windows out-of-the-box, we need to either
-        # ship the CUDA toolkit with the wheel (e.g. PyTorch can make use of the
-        # cudatoolkit conda package), or have a mechanism to locate the CUDA
-        # toolkit from the system.
-        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM CUDA::cusolver CUDA::cublas)
-    else()
-        # CMake docs   : https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html
-        # cusolver 11.0: https://docs.nvidia.com/cuda/archive/11.0/cusolver/index.html#static-link-lapack
-        # cublas   11.0: https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library
-        # The link order below is important. Theoretically we should use
-        # open3d_find_package_3rdparty_library, but we have to insert
-        # liblapack_static.a in the middle of the targets.
-        add_library(3rdparty_cublas INTERFACE)
-        if(CUDAToolkit_VERSION VERSION_LESS "12.0")
-            target_link_libraries(3rdparty_cublas INTERFACE
-                CUDA::cusolver_static
-                ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a
-                CUDA::cusparse_static
-                CUDA::cublas_static
-                CUDA::cublasLt_static
-                CUDA::culibos
-            )
+    if (WITH_CUDA)
+        if(WIN32)
+            # Nvidia does not provide static libraries for Windows. We don't release
+            # pip wheels for Windows with CUDA support at the moment. For the pip
+            # wheels to support CUDA on Windows out-of-the-box, we need to either
+            # ship the CUDA toolkit with the wheel (e.g. PyTorch can make use of the
+            # cudatoolkit conda package), or have a mechanism to locate the CUDA
+            # toolkit from the system.
+            list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM CUDA::cusolver CUDA::cublas)
         else()
-            # In CUDA12.0 the liblapack_static.a is deprecated and removed.
-            # Use the libcusolver_lapack_static.a instead.
-            target_link_libraries(3rdparty_cublas INTERFACE
-                CUDA::cusolver_static
-                ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a
-                CUDA::cusparse_static
-                CUDA::cublas_static
-                CUDA::cublasLt_static
-                CUDA::culibos
-            )
+            # CMake docs   : https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html
+            # cusolver 11.0: https://docs.nvidia.com/cuda/archive/11.0/cusolver/index.html#static-link-lapack
+            # cublas   11.0: https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library
+            # The link order below is important. Theoretically we should use
+            # open3d_find_package_3rdparty_library, but we have to insert
+            # liblapack_static.a in the middle of the targets.
+            add_library(3rdparty_cublas INTERFACE)
+            if(CUDAToolkit_VERSION VERSION_LESS "12.0")
+                target_link_libraries(3rdparty_cublas INTERFACE
+                    CUDA::cusolver_static
+                    ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a
+                    CUDA::cusparse_static
+                    CUDA::cublas_static
+                    CUDA::cublasLt_static
+                    CUDA::culibos
+                )
+            else()
+                # In CUDA12.0 the liblapack_static.a is deprecated and removed.
+                # Use the libcusolver_lapack_static.a instead.
+                target_link_libraries(3rdparty_cublas INTERFACE
+                    CUDA::cusolver_static
+                    ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a
+                    CUDA::cusparse_static
+                    CUDA::cublas_static
+                    CUDA::cublasLt_static
+                    CUDA::culibos
+                )
+            endif()
+            if(NOT BUILD_SHARED_LIBS)
+                # Listed in ${CMAKE_INSTALL_PREFIX}/lib/cmake/Open3D/Open3DTargets.cmake.
+                install(TARGETS 3rdparty_cublas EXPORT Open3DTargets)
+                list(APPEND Open3D_3RDPARTY_EXTERNAL_MODULES "CUDAToolkit")
+            endif()
+            add_library(Open3D::3rdparty_cublas ALIAS 3rdparty_cublas)
+            list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cublas)
         endif()
+    elseif (WITH_ROCM)
+        add_library(3rdparty_hipblas INTERFACE)
+        target_link_libraries(3rdparty_hipblas INTERFACE
+            roc::hipblas
+            roc::hipsparse
+            roc::hipsolver
+        )
         if(NOT BUILD_SHARED_LIBS)
             # Listed in ${CMAKE_INSTALL_PREFIX}/lib/cmake/Open3D/Open3DTargets.cmake.
-            install(TARGETS 3rdparty_cublas EXPORT Open3DTargets)
-            list(APPEND Open3D_3RDPARTY_EXTERNAL_MODULES "CUDAToolkit")
+            install(TARGETS 3rdparty_hipblas EXPORT Open3DTargets)
         endif()
-        add_library(Open3D::3rdparty_cublas ALIAS 3rdparty_cublas)
-        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cublas)
+        add_library(Open3D::3rdparty_hipblas ALIAS 3rdparty_hipblas)
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_hipblas)
     endif()
 endif()
 
 # NPP
 if (BUILD_CUDA_MODULE)
-    # NPP library list: https://docs.nvidia.com/cuda/npp/index.html
-    if(WIN32)
-        open3d_find_package_3rdparty_library(3rdparty_cuda_npp
-            REQUIRED
-            PACKAGE CUDAToolkit
-            TARGETS CUDA::nppc
-                    CUDA::nppicc
-                    CUDA::nppif
-                    CUDA::nppig
-                    CUDA::nppim
-                    CUDA::nppial
-        )
-    else()
-        open3d_find_package_3rdparty_library(3rdparty_cuda_npp
-            REQUIRED
-            PACKAGE CUDAToolkit
-            TARGETS CUDA::nppc_static
-                    CUDA::nppicc_static
-                    CUDA::nppif_static
-                    CUDA::nppig_static
-                    CUDA::nppim_static
-                    CUDA::nppial_static
-        )
+    if (WITH_CUDA)
+        # NPP library list: https://docs.nvidia.com/cuda/npp/index.html
+        if(WIN32)
+            open3d_find_package_3rdparty_library(3rdparty_cuda_npp
+                REQUIRED
+                PACKAGE CUDAToolkit
+                TARGETS CUDA::nppc
+                        CUDA::nppicc
+                        CUDA::nppif
+                        CUDA::nppig
+                        CUDA::nppim
+                        CUDA::nppial
+            )
+        else()
+            open3d_find_package_3rdparty_library(3rdparty_cuda_npp
+                REQUIRED
+                PACKAGE CUDAToolkit
+                TARGETS CUDA::nppc_static
+                        CUDA::nppicc_static
+                        CUDA::nppif_static
+                        CUDA::nppig_static
+                        CUDA::nppim_static
+                        CUDA::nppial_static
+            )
+        endif()
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cuda_npp)
+    elseif(WITH_ROCM)
+        message(WARNING "ROCM is not supported for NPP. ")
     endif()
-    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_cuda_npp)
 endif ()
 
 # IPP
@@ -1944,6 +1987,50 @@ else()
     set(BUILD_WEBRTC_COMMENT "//")
 endif()
 
+# gflags
+if(USE_SYSTEM_GFLAGS)
+    open3d_find_package_3rdparty_library(3rdparty_gflags
+        PACKAGE gflags
+        TARGETS gflags
+    )
+    if(NOT 3rdparty_gflags_FOUND)
+        set(USE_SYSTEM_GFLAGS OFF)
+    endif()
+    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_gflags)
+endif()
+if(NOT USE_SYSTEM_GFLAGS)
+    include(${Open3D_3RDPARTY_DIR}/gflags/gflags.cmake)
+    open3d_import_3rdparty_library(3rdparty_gflags
+        INCLUDE_DIRS ${GFLAGS_INCLUDE_DIRS}
+        LIB_DIR      ${GFLAGS_LIB_DIR}
+        LIBRARIES    ${GFLAGS_LIBRARIES}
+        DEPENDS      ext_gflags
+    )
+    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_gflags)
+endif()
+
+# glog 
+if(USE_SYSTEM_GLOG)
+    open3d_find_package_3rdparty_library(3rdparty_glog
+        PACKAGE glog
+        TARGETS glog
+    )
+    if(NOT 3rdparty_glog_FOUND)
+        set(USE_SYSTEM_GLOG OFF)
+    endif()
+    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_glog)
+endif()
+if(NOT USE_SYSTEM_GLOG)
+    include(${Open3D_3RDPARTY_DIR}/glog/glog.cmake)
+    open3d_import_3rdparty_library(3rdparty_glog
+        INCLUDE_DIRS ${GLOG_INCLUDE_DIRS}
+        LIB_DIR      ${GLOG_LIB_DIR}
+        LIBRARIES    ${GLOG_LIBRARIES}
+        DEPENDS      ext_glog
+    )
+    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_glog)
+endif()
+
 # Compactify list of external modules.
 # This must be called after all dependencies are processed.
 list(REMOVE_DUPLICATES Open3D_3RDPARTY_EXTERNAL_MODULES)
diff --git a/3rdparty/gflags/LICENSE b/3rdparty/gflags/LICENSE
new file mode 100644
index 000000000..55dd398cc
--- /dev/null
+++ b/3rdparty/gflags/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/3rdparty/gflags/gflags.cmake b/3rdparty/gflags/gflags.cmake
new file mode 100644
index 000000000..460bbea87
--- /dev/null
+++ b/3rdparty/gflags/gflags.cmake
@@ -0,0 +1,44 @@
+include(ExternalProject)
+
+set(GFLAGS_LIB_NAME gflags)
+
+# For CMake >= 4.0.0, set policy compatibility for third-party gflags' CMake.
+set(GFLAGS_POLICY_ARGS "")
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
+  message(
+    WARNING
+      "gflags: forcing CMake policy compatibility for CMake >= 4.0 (CMAKE_POLICY_VERSION_MINIMUM=3.5)"
+  )
+  set(GFLAGS_POLICY_ARGS "-DCMAKE_POLICY_VERSION_MINIMUM=3.5")
+endif()
+
+ExternalProject_Add(
+  ext_gflags
+  PREFIX gflags
+  URL https://github.com/gflags/gflags/archive/refs/tags/v2.2.2.tar.gz
+  URL_HASH SHA256=34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf
+  DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/gflags"
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+             ${GFLAGS_POLICY_ARGS}
+             -DBUILD_STATIC_LIBS=ON
+             -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DBUILD_TESTING=OFF
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  BUILD_BYPRODUCTS
+    <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${GFLAGS_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}
+)
+
+ExternalProject_Get_Property(ext_gflags INSTALL_DIR)
+set(GFLAGS_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical.
+set(GFLAGS_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR})
+set(GFLAGS_LIBRARIES ${GFLAGS_LIB_NAME})
diff --git a/3rdparty/glog/LICENSE b/3rdparty/glog/LICENSE
new file mode 100644
index 000000000..e2daebaec
--- /dev/null
+++ b/3rdparty/glog/LICENSE
@@ -0,0 +1,8 @@
+Copyright © 2024, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/3rdparty/glog/glog.cmake b/3rdparty/glog/glog.cmake
new file mode 100644
index 000000000..7b9b0a1b0
--- /dev/null
+++ b/3rdparty/glog/glog.cmake
@@ -0,0 +1,47 @@
+include(ExternalProject)
+
+set(GLOG_LIB_NAME glog)
+
+add_definitions(-DGLOG_NO_ABBREVIATED_SEVERITIES)
+
+# For CMake >= 4.0.0, set policy compatibility for glog's CMake.
+set(GLOG_POLICY_ARGS "")
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
+  message(
+    WARNING
+      "glog: forcing CMake policy compatibility for CMake >= 4.0 (CMAKE_POLICY_VERSION_MINIMUM=3.5)"
+  )
+  set(GLOG_POLICY_ARGS "-DCMAKE_POLICY_VERSION_MINIMUM=3.5")
+endif()
+
+
+ExternalProject_Add(
+  ext_glog
+  PREFIX glog
+  URL https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz
+  URL_HASH SHA256=f28359aeba12f30d73d9e4711ef356dc842886968112162bc73002645139c39c
+  DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/glog"
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+             ${GLOG_POLICY_ARGS}
+             -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DWITH_GFLAGS=OFF
+             -DBUILD_TESTING=OFF
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  BUILD_BYPRODUCTS
+    <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${GLOG_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}
+)
+
+ExternalProject_Get_Property(ext_glog INSTALL_DIR)
+set(GLOG_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical.
+set(GLOG_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR})
+set(GLOG_LIBRARIES ${GLOG_LIB_NAME})
diff --git a/3rdparty/hipify_torch/LICENSE b/3rdparty/hipify_torch/LICENSE
new file mode 100644
index 000000000..bceedac5f
--- /dev/null
+++ b/3rdparty/hipify_torch/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/3rdparty/hipify_torch/hipify_torch.cmake b/3rdparty/hipify_torch/hipify_torch.cmake
new file mode 100644
index 000000000..b87517703
--- /dev/null
+++ b/3rdparty/hipify_torch/hipify_torch.cmake
@@ -0,0 +1,15 @@
+include(FetchContent)
+
+FetchContent_Declare(
+    ext_hipify_torch
+    prefix hipify_torch
+    URL https://github.com/ROCm/hipify_torch/archive/ee928d80eb49a74be5d556465e04c6a40de7e3bc.tar.gz
+    URL_HASH SHA256=6446fc51f849c8f6fce37aa71e23f9b0a4715d15a5ab75ba69e635f89b6a9d6c
+    DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/hipify_torch"
+)
+
+message(STATUS "Fetching hipify_torch")
+
+FetchContent_MakeAvailable(ext_hipify_torch)
+
+list(APPEND CMAKE_MODULE_PATH "${ext_hipify_torch_SOURCE_DIR}/cmake")
\ No newline at end of file
diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake
index b87ef79fe..8c4fd6fa5 100644
--- a/3rdparty/mkl/tbb.cmake
+++ b/3rdparty/mkl/tbb.cmake
@@ -23,6 +23,11 @@ set(STATIC_TBB_LIBRARIES tbb_static tbbmalloc_static)
 
 find_package(Git QUIET REQUIRED)
 
+if (WITH_ROCM)
+    # NOTE(beinggod): Set CMAKE_CXX_COMPILER_ID to GNU to avoid link libunwind and libc++.
+    set(ExternalProject_CMAKE_ARGS_hidden "${ExternalProject_CMAKE_ARGS_hidden} -DCMAKE_CXX_COMPILER_ID=GNU")
+endif()
+
 ExternalProject_Add(
     ext_tbb
     PREFIX tbb
diff --git a/3rdparty/stdgpu/stdgpu.cmake b/3rdparty/stdgpu/stdgpu.cmake
index 68d8f47a2..7d6c7582c 100644
--- a/3rdparty/stdgpu/stdgpu.cmake
+++ b/3rdparty/stdgpu/stdgpu.cmake
@@ -4,27 +4,56 @@
 
 include(ExternalProject)
 
+if (WITH_CUDA)
 ExternalProject_Add(
     ext_stdgpu
     PREFIX stdgpu
-    URL https://github.com/stotko/stdgpu/archive/2588168d226bd17229dbf58d821549580791089d.tar.gz
-    URL_HASH SHA256=86e50789bbe21c57f64358c6acbd4481d56c1e45ce9ba1fb5c5c8482c3973215
+    URL https://github.com/stotko/stdgpu/archive/c25d4bd9d7cb61ab3c3fed179c393916372e6034.tar.gz
+    URL_HASH SHA256=5f2accdab5776920d33d4d3601f26e10e50fa6786ae1503e6e9d2d9280c0f4d2
     DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/stdgpu"
     UPDATE_COMMAND ""
     CMAKE_ARGS
         -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-        -DCUDAToolkit_ROOT=${CUDAToolkit_LIBRARY_ROOT}
+        -DCMAKE_BUILD_TYPE=Release
         -DSTDGPU_BUILD_SHARED_LIBS=OFF
         -DSTDGPU_BUILD_EXAMPLES=OFF
         -DSTDGPU_BUILD_TESTS=OFF
+        -DSTDGPU_BUILD_BENCHMARKS=OFF
         -DSTDGPU_ENABLE_CONTRACT_CHECKS=OFF
-        -DTHRUST_INCLUDE_DIR=${CUDAToolkit_INCLUDE_DIRS}
         ${ExternalProject_CMAKE_ARGS_hidden}
     CMAKE_CACHE_ARGS    # Lists must be passed via CMAKE_CACHE_ARGS
         -DCMAKE_CUDA_ARCHITECTURES:STRING=${CMAKE_CUDA_ARCHITECTURES}
     BUILD_BYPRODUCTS
         <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}stdgpu${CMAKE_STATIC_LIBRARY_SUFFIX}
 )
+elseif(WITH_ROCM)
+ExternalProject_Add(
+    ext_stdgpu
+    PREFIX stdgpu
+    # NOTE: Fix make install error on ROCm. PR: https://github.com/stotko/stdgpu/pull/473
+    URL https://github.com/stotko/stdgpu/archive/6a8be3eafb485866afa488714b6120adc5140f10.tar.gz
+    URL_HASH SHA256=267551bb482e2971c9cb3b3a23324891fb173295ef5ecc3cc3a520d03e76bddd
+    DOWNLOAD_DIR "${OPEN3D_THIRD_PARTY_DOWNLOAD_DIR}/stdgpu"
+    UPDATE_COMMAND ""
+    CMAKE_ARGS
+        -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
+        -DCMAKE_BUILD_TYPE=Release
+        -DSTDGPU_BUILD_SHARED_LIBS=OFF
+        -DSTDGPU_BUILD_EXAMPLES=OFF
+        -DSTDGPU_BUILD_TESTS=OFF
+        -DSTDGPU_BUILD_BENCHMARKS=OFF
+        -DSTDGPU_ENABLE_CONTRACT_CHECKS=OFF
+        -DSTDGPU_BACKEND=STDGPU_BACKEND_HIP
+        ${ExternalProject_CMAKE_ARGS_hidden}
+        # NOTE: Place this line before ExternalProject_CMAKE_ARGS_hidden to override the CMAKE_CXX_COMPILER.
+        -DCMAKE_CXX_COMPILER=$ENV{ROCM_PATH}/llvm/bin/clang++
+    CMAKE_CACHE_ARGS    # Lists must be passed via CMAKE_CACHE_ARGS
+        # FIXME(beinggod): The CMAKE_HIP_ARCHITECTURES not work because of unknown issue
+        -DCMAKE_HIP_ARCHITECTURES:STRING=${CMAKE_HIP_ARCHITECTURES}
+    BUILD_BYPRODUCTS
+        <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}stdgpu${CMAKE_STATIC_LIBRARY_SUFFIX}
+)
+endif()
 
 ExternalProject_Get_Property(ext_stdgpu INSTALL_DIR)
 set(STDGPU_INCLUDE_DIRS ${INSTALL_DIR}/include/) # "/" is critical.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e1c46128..b14e13e45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(BUILD_UNIT_TESTS           "Build Open3D unit tests"                  OFF
 option(BUILD_BENCHMARKS           "Build the micro benchmarks"               OFF)
 option(BUILD_PYTHON_MODULE        "Build the python module"                  ON )
 option(BUILD_CUDA_MODULE          "Build the CUDA module"                    OFF)
+option(WITH_ROCM                    "Use ROCM to replace CUDA"                  OFF)
 option(BUILD_COMMON_CUDA_ARCHS    "Build for common CUDA GPUs (for release)" OFF)
 if (WIN32)   # Causes CUDA runtime error on Windows (See issue #6555)
     option(ENABLE_CACHED_CUDA_MANAGER "Enable cached CUDA memory manager"    OFF)
@@ -115,6 +116,7 @@ option(USE_SYSTEM_TINYGLTF        "Use system pre-installed tinygltf"        OFF
 option(USE_SYSTEM_TINYOBJLOADER   "Use system pre-installed tinyobjloader"   OFF)
 option(USE_SYSTEM_VTK             "Use system pre-installed VTK"             OFF)
 option(USE_SYSTEM_ZEROMQ          "Use system pre-installed ZeroMQ"          OFF)
+option(USE_SYSTEM_GLOG            "Use system pre-installed GLOG"            OFF)
 if(LINUX_AARCH64 OR APPLE_AARCH64)
     option(BUILD_VTK_FROM_SOURCE      "Build VTK from source"                ON )
     option(BUILD_FILAMENT_FROM_SOURCE "Build filament from source"           ON )
@@ -296,6 +298,25 @@ if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE)
     message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.")
 endif()
 
+# Check ROCM compatiblility
+if (WITH_ROCM)
+    # Find ROCM path
+    set(ROCM_PATH $ENV{ROCM_PATH})
+    if (NOT ROCM_PATH)
+        message(FATAL_ERROR "ROCM_PATH is not set")
+    endif()
+
+    # Reuse BUILD_CUDA_MODULE variables for ROCM build
+    if (NOT BUILD_CUDA_MODULE)
+        message(FATAL_ERROR "WITH_ROCM requires BUILD_CUDA_MODULE=ON")
+    endif()
+
+    # Check DL framework compatibility
+    if (BUILD_TENSORFLOW_OPS OR BUILD_PYTORCH_OPS)
+        message(FATAL_ERROR "WITH_ROCM only available on Paddle backend.")
+    endif()
+endif()
+
 # Global flag to set CXX standard.
 # This does not affect 3rd party libraries.
 # Tensorflow 2.9+ requires cxx_17, but MSVC 19.29 throws errors with C++17
@@ -402,19 +423,34 @@ macro(open3d_patch_findthreads_module_)
 endmacro()
 cmake_language(EVAL CODE "cmake_language(DEFER CALL open3d_patch_findthreads_module_)")
 
-# Build CUDA module by default if CUDA is available
-if(BUILD_CUDA_MODULE)
-    include(Open3DMakeCudaArchitectures)
-    open3d_make_cuda_architectures(CUDA_ARCHS)
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHS})
-
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-    enable_language(CUDA)
-    set(CMAKE_CUDA_STANDARD 17)
-    if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.5")
-        message(FATAL_ERROR "CUDA 11.4 and older are not supported. Please upgrade to CUDA 11.5 or newer.")
+if (BUILD_CUDA_MODULE)
+    if (WITH_ROCM)
+        # Build ROCM module
+        include(Open3DMakeRocmArchitectures)
+        open3d_make_rocm_architectures(ROCM_ARCHS)
+        set(CMAKE_HIP_ARCHITECTURES ${ROCM_ARCHS})
+
+        message(STATUS "Using ROCM architectures: ${ROCM_ARCHS}")
+        enable_language(HIP)
+
+        # Disable interprocedural optimization for ROCM to avoid LLVMgold.so not found error
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
+    else ()
+        set(WITH_CUDA ON)
+        
+        # Build CUDA module by default if CUDA is available
+        include(Open3DMakeCudaArchitectures)
+        open3d_make_cuda_architectures(CUDA_ARCHS)
+        set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHS})
+
+        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+        enable_language(CUDA)
+        set(CMAKE_CUDA_STANDARD 17)
+        if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.5")
+            message(FATAL_ERROR "CUDA 11.4 and older are not supported. Please upgrade to CUDA 11.5 or newer.")
+        endif()
     endif()
-endif ()
+endif()
 
 # ISPC language emulation support
 include(Open3DISPC)
diff --git a/cmake/Open3DMakeRocmArchitectures.cmake b/cmake/Open3DMakeRocmArchitectures.cmake
new file mode 100644
index 000000000..7810aa8a8
--- /dev/null
+++ b/cmake/Open3DMakeRocmArchitectures.cmake
@@ -0,0 +1,52 @@
+# open3d_make_rocm_architectures(rocm_archs)
+#
+# Sets up ROCM architectures based on the following precedence rules
+# and stores them into the <rocm_archs> variable.
+#   1. All common architectures if BUILD_COMMON_ROCM_ARCHS=ON
+#   2. User-defined architectures
+#   3. Architectures detected on the current machine
+function(open3d_make_rocm_architectures rocm_archs)
+    unset(${rocm_archs})
+
+    find_package(HIP REQUIRED)
+
+    if(BUILD_COMMON_ROCM_ARCHS)
+        # split by semicolon
+        set(${rocm_archs} "gfx906;gfx926;gfx928;gfx936")
+    else()
+        file(WRITE
+            "${CMAKE_CURRENT_BINARY_DIR}/rocm_architectures.sh"
+            "
+            #!/bin/bash
+            set -e
+
+            ARCH=$(rocminfo | grep -E '^\\s*Name:\\s+gfx[0-9]+' | awk '{print $2}' | sort -u)
+
+            if [ -z "$ARCH" ]; then
+                echo -n "Unknown"
+                exit 1
+            else
+                echo -n "$ARCH"
+                exit 0
+            fi
+            ")
+
+        execute_process(
+            COMMAND bash ${CMAKE_CURRENT_BINARY_DIR}/rocm_architectures.sh
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+            RESULT_VARIABLE RET
+            OUTPUT_VARIABLE DETECTED_ARCHITECTURES
+            ERROR_VARIABLE ERROR
+        )
+
+        if(RET EQUAL 0)
+            message(STATUS "Building with detected architectures")
+            set(${rocm_archs} ${DETECTED_ARCHITECTURES})
+        else()
+            message(FATAL_ERROR "Failed to detect ROCM architectures")
+        endif()
+    endif()
+
+    set(${rocm_archs} ${${rocm_archs}} PARENT_SCOPE)
+
+endfunction()
diff --git a/cmake/Open3DPrintConfigurationSummary.cmake b/cmake/Open3DPrintConfigurationSummary.cmake
index c8e8a891a..ce1120147 100644
--- a/cmake/Open3DPrintConfigurationSummary.cmake
+++ b/cmake/Open3DPrintConfigurationSummary.cmake
@@ -40,6 +40,7 @@ function(open3d_print_configuration_summary)
     open3d_aligned_print("Build Jupyter Extension" "${BUILD_JUPYTER_EXTENSION}")
     open3d_aligned_print("Build TensorFlow Ops" "${BUILD_TENSORFLOW_OPS}")
     open3d_aligned_print("Build PyTorch Ops" "${BUILD_PYTORCH_OPS}")
+    open3d_aligned_print("Build Paddle Ops" "${BUILD_PADDLE_OPS}")
     open3d_aligned_print("Build Benchmarks" "${BUILD_BENCHMARKS}")
     open3d_aligned_print("Bundle Open3D-ML" "${BUNDLE_OPEN3D_ML}")
     if(GLIBCXX_USE_CXX11_ABI)
diff --git a/cmake/Open3DSetGlobalProperties.cmake b/cmake/Open3DSetGlobalProperties.cmake
index 251f046ca..73e519c39 100644
--- a/cmake/Open3DSetGlobalProperties.cmake
+++ b/cmake/Open3DSetGlobalProperties.cmake
@@ -87,6 +87,15 @@ function(open3d_set_global_properties target)
     # Propagate build configuration into source code
     if (BUILD_CUDA_MODULE)
         target_compile_definitions(${target} PRIVATE BUILD_CUDA_MODULE)
+        # Add WITH_ROCM definitions and build options for ROCM build
+        if (WITH_ROCM)
+            # Add definitions for ROCM build
+            list(APPEND HIP_COMPILE_DEFINITIONS __HIP_PLATFORM_AMD__=1)
+            list(APPEND HIP_COMPILE_DEFINITIONS ROCM_NO_WRAPPER_HEADER_WARNING)
+            list(APPEND HIP_COMPILE_DEFINITIONS THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
+
+            target_compile_definitions(${target} PRIVATE ${HIP_COMPILE_DEFINITIONS}) 
+        endif()
         if (ENABLE_CACHED_CUDA_MANAGER)
             target_compile_definitions(${target} PRIVATE ENABLE_CACHED_CUDA_MANAGER)
         endif()
diff --git a/cmake/Open3DShowAndAbortOnWarning.cmake b/cmake/Open3DShowAndAbortOnWarning.cmake
index c95bfa75a..fa9f1fa6f 100644
--- a/cmake/Open3DShowAndAbortOnWarning.cmake
+++ b/cmake/Open3DShowAndAbortOnWarning.cmake
@@ -25,7 +25,55 @@ function(open3d_show_and_abort_on_warning target)
         -Wno-unused-parameter               # (many places in Open3D code)
     )
 
-    if (BUILD_CUDA_MODULE)
+    if (WITH_ROCM)
+        # Add HIP compiler flags to suppress warnings
+        set(HIP_FLAGS "")
+
+        string(APPEND HIP_FLAGS " -Wno-macro-redefined")
+        string(APPEND HIP_FLAGS " -Wno-inconsistent-missing-override")
+        string(APPEND HIP_FLAGS " -Wno-exceptions")
+        string(APPEND HIP_FLAGS " -Wno-shift-count-negative")
+        string(APPEND HIP_FLAGS " -Wno-shift-count-overflow")
+        string(APPEND HIP_FLAGS " -Wno-unused-command-line-argument")
+        string(APPEND HIP_FLAGS " -Wno-duplicate-decl-specifier")
+        string(APPEND HIP_FLAGS " -Wno-implicit-int-float-conversion")
+        string(APPEND HIP_FLAGS " -Wno-pass-failed")
+        string(APPEND HIP_FLAGS " -Wno-unused-result")
+        string(APPEND HIP_FLAGS " -Wno-deprecated-declarations")
+        string(APPEND HIP_FLAGS " -Wno-format")
+        string(APPEND HIP_FLAGS " -Wno-dangling-gsl")
+        string(APPEND HIP_FLAGS " -Wno-unused-value")
+        string(APPEND HIP_FLAGS " -Wno-braced-scalar-init")
+        string(APPEND HIP_FLAGS " -Wno-return-type")
+        string(APPEND HIP_FLAGS " -Wno-pragma-once-outside-header")
+        string(APPEND HIP_FLAGS " -Wno-deprecated-builtins")
+        string(APPEND HIP_FLAGS " -Wno-switch")
+        string(APPEND HIP_FLAGS " -Wno-literal-conversion")
+        string(APPEND HIP_FLAGS " -Wno-constant-conversion")
+        string(APPEND HIP_FLAGS " -Wno-defaulted-function-deleted")
+        string(APPEND HIP_FLAGS " -Wno-sign-compare")
+        string(APPEND HIP_FLAGS " -Wno-bitwise-instead-of-logical")
+        string(APPEND HIP_FLAGS " -Wno-unknown-warning-option")
+        string(APPEND HIP_FLAGS " -Wno-unused-lambda-capture")
+        string(APPEND HIP_FLAGS " -Wno-unused-variable")
+        string(APPEND HIP_FLAGS " -Wno-unused-but-set-variable")
+        string(APPEND HIP_FLAGS " -Wno-reorder-ctor")
+        string(APPEND HIP_FLAGS " -Wno-deprecated-copy-with-user-provided-copy")
+        string(APPEND HIP_FLAGS " -Wno-unused-local-typedef")
+        string(APPEND HIP_FLAGS " -Wno-missing-braces")
+        string(APPEND HIP_FLAGS " -Wno-sometimes-uninitialized")
+        string(APPEND HIP_FLAGS " -Wno-deprecated-copy")
+        string(APPEND HIP_FLAGS " -Wno-pessimizing-move")
+        string(APPEND HIP_FLAGS " -Wunused-command-line-argument")
+
+        # Add CXX compiler flags to suppress warnings
+        list(APPEND DISABLE_GNU_CLANG_INTEL_WARNINGS -Wno-return-type)
+        list(APPEND DISABLE_GNU_CLANG_INTEL_WARNINGS -Wno-unused-result)
+    else()
+        set(HIP_FLAGS "")
+    endif()
+
+    if (WITH_CUDA)
         # General NVCC flags
         set(DISABLE_NVCC_WARNINGS
             2809           # ignoring return value from routine declared with "nodiscard" attribute (cub)
@@ -63,6 +111,7 @@ function(open3d_show_and_abort_on_warning target)
         $<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/W4 /WX ${DISABLE_MSVC_WARNINGS}>
         $<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang,AppleClang,Intel>:-Wall -Wextra -Werror ${DISABLE_GNU_CLANG_INTEL_WARNINGS}>
         $<$<COMPILE_LANGUAGE:CUDA>:SHELL:${CUDA_FLAGS}>
+        $<$<COMPILE_LANGUAGE:HIP>:SHELL:${HIP_FLAGS}>
         $<$<COMPILE_LANGUAGE:ISPC>:--werror>
     )
 endfunction()
diff --git a/cpp/open3d/CMakeLists.txt b/cpp/open3d/CMakeLists.txt
index 6da3581e5..3ab84bf8e 100644
--- a/cpp/open3d/CMakeLists.txt
+++ b/cpp/open3d/CMakeLists.txt
@@ -52,6 +52,37 @@ configure_file("${PROJECT_SOURCE_DIR}/cpp/open3d/Open3D.h.in"
 configure_file("${PROJECT_SOURCE_DIR}/cpp/open3d/Open3DConfig.h.in"
                "${PROJECT_SOURCE_DIR}/cpp/open3d/Open3DConfig.h")
 
+# HIPify the source code
+if (WITH_ROCM)
+    include(Hipify)
+
+    set(HIPIFY_HEADER_INCLUDE_DIRS
+      ${CMAKE_CURRENT_SOURCE_DIR}/core
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/kernel
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/hashmap
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/linalg
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/linalg/kernel
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/nns
+      ${CMAKE_CURRENT_SOURCE_DIR}/core/nns/kernel)
+
+    # Hipify the source code
+    hipify(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} 
+        HEADER_INCLUDE_DIRS ${HIPIFY_HEADER_INCLUDE_DIRS}
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/utility/*"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/MemoryManager.h"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/DLPack.h"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/CUDAUtils*"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/linalg/LinalgHeadersCUDA.h*"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/nns/kernel/PtxUtils.cuh"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/nns/kernel/DeviceDefs.cuh"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/hashmap/CUDA/*"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/core/hashmap/Dispatch.h"
+        IGNORES "${PROJECT_SOURCE_DIR}/cpp/open3d/t/geometry/kernel/GeometryMacros.h"
+        CUSTOM_MAP_FILE "${PROJECT_SOURCE_DIR}/hipify_custom_map.json"
+        NO_MATH_REPLACE
+    )
+endif()
 
 add_library(Open3D)
 
@@ -136,6 +167,10 @@ open3d_link_3rdparty_libraries(Open3D)
 target_link_options(Open3D INTERFACE
     $<$<STREQUAL:$<TARGET_PROPERTY:Open3D,TYPE>,STATIC_LIBRARY>:$<LINK_ONLY:${OPEN3D_HIDDEN_3RDPARTY_LINK_OPTIONS}>>)
 
+# get_target_property(SRCS Open3D SOURCES)
+# message(FATAL_ERROR "sources = ${SRCS}")
+
+
 add_library(Open3D::Open3D ALIAS Open3D)
 
 include(CMakePackageConfigHelpers)
diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt
index d42e645da..68fbf367c 100644
--- a/cpp/open3d/core/CMakeLists.txt
+++ b/cpp/open3d/core/CMakeLists.txt
@@ -85,7 +85,8 @@ if (BUILD_CUDA_MODULE)
     target_sources(core PRIVATE
         MemoryManagerCUDA.cpp
     )
-    target_sources(core PRIVATE
+    
+    target_sources(core PRIVATE 
         hashmap/CUDA/CreateCUDAHashBackend.cu
         hashmap/CUDA/CUDAHashBackendBuffer.cu
         hashmap/CUDA/SlabNodeManager.cu
@@ -120,6 +121,23 @@ if (BUILD_ISPC_MODULE)
     )
 endif()
 
+if (WITH_ROCM)
+    include(Hipify)
+
+    get_target_property(SOURCE_LIST core SOURCES)
+    get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST)
+
+    foreach(src IN LISTS HIPIFIED_SOURCE_LIST)
+        get_filename_component(ext ${src} EXT)
+        if(ext STREQUAL ".cu")
+            set_source_files_properties(${src} PROPERTIES LANGUAGE HIP)
+        endif()
+    endforeach()
+
+    set_property(TARGET core PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST})
+    get_target_property(SOURCE_LIST core SOURCES)
+endif()
+
 open3d_show_and_abort_on_warning(core)
 open3d_set_global_properties(core)
 open3d_set_open3d_lib_properties(core)
diff --git a/cpp/open3d/core/CUDAUtils.cpp b/cpp/open3d/core/CUDAUtils.cpp
index ab805bda0..696a35e16 100644
--- a/cpp/open3d/core/CUDAUtils.cpp
+++ b/cpp/open3d/core/CUDAUtils.cpp
@@ -20,6 +20,15 @@ namespace cuda {
 
 int DeviceCount() {
 #ifdef BUILD_CUDA_MODULE
+#if __HIP_PLATFORM_AMD__
+    try {
+        int num_devices;
+        OPEN3D_CUDA_CHECK(hipGetDeviceCount(&num_devices));
+        return num_devices;
+    } catch (const std::runtime_error&) {
+        return 0;
+    }
+#else
     try {
         int num_devices;
         OPEN3D_CUDA_CHECK(cudaGetDeviceCount(&num_devices));
@@ -30,6 +39,7 @@ int DeviceCount() {
     catch (const std::runtime_error&) {
         return 0;
     }
+#endif
 #else
     return 0;
 #endif
@@ -67,7 +77,11 @@ void Synchronize(const Device& device) {
 #ifdef BUILD_CUDA_MODULE
     if (device.IsCUDA()) {
         CUDAScopedDevice scoped_device(device);
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipDeviceSynchronize());
+#else
         OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
+#endif
     }
 #endif
 }
@@ -132,13 +146,21 @@ bool SupportsMemoryPools(const Device& device) {
 #ifdef BUILD_CUDA_MODULE
 int GetDevice() {
     int device;
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetDevice(&device));
+#else
     OPEN3D_CUDA_CHECK(cudaGetDevice(&device));
+#endif
     return device;
 }
 
 static void SetDevice(int device_id) {
     AssertCUDADeviceAvailable(device_id);
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipSetDevice(device_id));
+#else
     OPEN3D_CUDA_CHECK(cudaSetDevice(device_id));
+#endif
 }
 
 class CUDAStream {
@@ -149,20 +171,37 @@ class CUDAStream {
         static thread_local CUDAStream instance;
         return instance;
     }
+#if __HIP_PLATFORM_AMD__
+    hipStream_t Get() { return stream_; }
+    void Set(hipStream_t stream) { stream_ = stream; }
 
+    static hipStream_t Default() { return static_cast<hipStream_t>(0); }
+#else
     cudaStream_t Get() { return stream_; }
     void Set(cudaStream_t stream) { stream_ = stream; }
 
     static cudaStream_t Default() { return static_cast<cudaStream_t>(0); }
+#endif
 
 private:
     CUDAStream() = default;
     CUDAStream(const CUDAStream&) = delete;
     CUDAStream& operator=(const CUDAStream&) = delete;
 
+#if __HIP_PLATFORM_AMD__
+    hipStream_t stream_ = Default();
+#else
     cudaStream_t stream_ = Default();
+#endif
 };
 
+#if __HIP_PLATFORM_AMD__
+hipStream_t GetStream() { return CUDAStream::GetInstance().Get(); }
+
+void SetStream(hipStream_t stream) { CUDAStream::GetInstance().Set(stream); }
+
+hipStream_t GetDefaultStream() { return CUDAStream::Default(); }
+#else
 cudaStream_t GetStream() { return CUDAStream::GetInstance().Get(); }
 
 static void SetStream(cudaStream_t stream) {
@@ -170,6 +209,7 @@ static void SetStream(cudaStream_t stream) {
 }
 
 cudaStream_t GetDefaultStream() { return CUDAStream::Default(); }
+#endif
 
 #endif
 
@@ -194,11 +234,19 @@ constexpr CUDAScopedStream::CreateNewStreamTag
 
 CUDAScopedStream::CUDAScopedStream(const CreateNewStreamTag&)
     : prev_stream_(cuda::GetStream()), owns_new_stream_(true) {
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipStreamCreate(&new_stream_));
+#else
     OPEN3D_CUDA_CHECK(cudaStreamCreate(&new_stream_));
+#endif
     cuda::SetStream(new_stream_);
 }
 
+#if __HIP_PLATFORM_AMD__
+CUDAScopedStream::CUDAScopedStream(hipStream_t stream)
+#else
 CUDAScopedStream::CUDAScopedStream(cudaStream_t stream)
+#endif
     : prev_stream_(cuda::GetStream()),
       new_stream_(stream),
       owns_new_stream_(false) {
@@ -207,7 +255,11 @@ CUDAScopedStream::CUDAScopedStream(cudaStream_t stream)
 
 CUDAScopedStream::~CUDAScopedStream() {
     if (owns_new_stream_) {
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipStreamDestroy(new_stream_));
+#else
         OPEN3D_CUDA_CHECK(cudaStreamDestroy(new_stream_));
+#endif
     }
     cuda::SetStream(prev_stream_);
 }
@@ -253,11 +305,26 @@ CUDAState::CUDAState() {
 
                 // Check access.
                 int can_access = 0;
+#if __HIP_PLATFORM_AMD__
+                OPEN3D_CUDA_CHECK(
+                        hipDeviceCanAccessPeer(&can_access, src_id, tar_id));
+#else
                 OPEN3D_CUDA_CHECK(
                         cudaDeviceCanAccessPeer(&can_access, src_id, tar_id));
+#endif
                 // Enable access.
                 if (can_access) {
                     p2p_enabled_[src_id][tar_id] = true;
+#if __HIP_PLATFORM_AMD__
+                    hipError_t err = hipDeviceEnablePeerAccess(tar_id, 0);
+                    if (err == hipErrorPeerAccessAlreadyEnabled) {
+                        // Ignore error since P2P is already enabled.
+                        // Add void to suppress unused variable warning.
+                        (void)hipGetLastError();
+                    } else {
+                        OPEN3D_CUDA_CHECK(err);
+                    }
+#else
                     cudaError_t err = cudaDeviceEnablePeerAccess(tar_id, 0);
                     if (err == cudaErrorPeerAccessAlreadyEnabled) {
                         // Ignore error since P2P is already enabled.
@@ -265,6 +332,7 @@ CUDAState::CUDAState() {
                     } else {
                         OPEN3D_CUDA_CHECK(err);
                     }
+#endif
                 } else {
                     p2p_enabled_[src_id][tar_id] = false;
                 }
@@ -275,22 +343,36 @@ CUDAState::CUDAState() {
 
 int GetCUDACurrentDeviceTextureAlignment() {
     int value;
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipDeviceGetAttribute(
+            &value, hipDeviceAttributeTextureAlignment, cuda::GetDevice()));
+#else
     OPEN3D_CUDA_CHECK(cudaDeviceGetAttribute(
             &value, cudaDevAttrTextureAlignment, cuda::GetDevice()));
+#endif
     return value;
 }
 
 int GetCUDACurrentWarpSize() {
     int value;
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize,
+                                            cuda::GetDevice()));
+#else
     OPEN3D_CUDA_CHECK(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize,
                                              cuda::GetDevice()));
+#endif
     return value;
 }
 
 size_t GetCUDACurrentTotalMemSize() {
     size_t free;
     size_t total;
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemGetInfo(&free, &total));
+#else
     OPEN3D_CUDA_CHECK(cudaMemGetInfo(&free, &total));
+#endif
     return total;
 }
 
@@ -304,21 +386,38 @@ size_t GetCUDACurrentTotalMemSize() {
 namespace open3d {
 namespace core {
 
+#if __HIP_PLATFORM_AMD__
+void __OPEN3D_CUDA_CHECK(hipError_t err, const char* file, const int line) {
+    if (err != hipSuccess) {
+        utility::LogError("{}:{} CUDA runtime error: {}", file, line,
+                          hipGetErrorString(err));
+    }
+}
+#else
 void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line) {
     if (err != cudaSuccess) {
         utility::LogError("{}:{} CUDA runtime error: {}", file, line,
                           cudaGetErrorString(err));
     }
 }
+#endif
 
 void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message,
                                   const char* file,
                                   const int line) {
+#if __HIP_PLATFORM_AMD__
+    hipError_t err = hipGetLastError();
+    if (err != hipSuccess) {
+        utility::LogError("{}:{} {}: OPEN3D_GET_LAST_CUDA_ERROR(): {}", file,
+                          line, message, hipGetErrorString(err));
+    }
+#else
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         utility::LogError("{}:{} {}: OPEN3D_GET_LAST_CUDA_ERROR(): {}", file,
                           line, message, cudaGetErrorString(err));
     }
+#endif
 }
 
 }  // namespace core
diff --git a/cpp/open3d/core/CUDAUtils.h b/cpp/open3d/core/CUDAUtils.h
index 16a3841e2..b59f1a318 100644
--- a/cpp/open3d/core/CUDAUtils.h
+++ b/cpp/open3d/core/CUDAUtils.h
@@ -18,8 +18,12 @@
 
 #ifdef BUILD_CUDA_MODULE
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 
 #include <memory>
 #include <vector>
@@ -29,9 +33,19 @@
 #define OPEN3D_FORCE_INLINE __forceinline__
 #define OPEN3D_HOST_DEVICE __host__ __device__
 #define OPEN3D_DEVICE __device__
+
+#if __HIP_PLATFORM_AMD__
+// NOTE: clang++ not support __nv_is_extended_host_device_lambda_closure_type.
+// It will throw error in compile time so static_assert should be set to true
+// always.
+#define OPEN3D_ASSERT_HOST_DEVICE_LAMBDA(type) \
+    static_assert(true, #type " must be a __host__ __device__ lambda")
+#else
 #define OPEN3D_ASSERT_HOST_DEVICE_LAMBDA(type)                            \
     static_assert(__nv_is_extended_host_device_lambda_closure_type(type), \
                   #type " must be a __host__ __device__ lambda")
+#endif
+
 #define OPEN3D_CUDA_CHECK(err) \
     open3d::core::__OPEN3D_CUDA_CHECK(err, __FILE__, __LINE__)
 #define OPEN3D_GET_LAST_CUDA_ERROR(message) \
@@ -148,7 +162,11 @@ class CUDAScopedStream {
 
     explicit CUDAScopedStream(const CreateNewStreamTag&);
 
+#if __HIP_PLATFORM_AMD__
+    explicit CUDAScopedStream(hipStream_t stream);
+#else
     explicit CUDAScopedStream(cudaStream_t stream);
+#endif
 
     ~CUDAScopedStream();
 
@@ -156,8 +174,13 @@ class CUDAScopedStream {
     CUDAScopedStream& operator=(const CUDAScopedStream&) = delete;
 
 private:
+#if __HIP_PLATFORM_AMD__
+    hipStream_t prev_stream_;
+    hipStream_t new_stream_;
+#else
     cudaStream_t prev_stream_;
     cudaStream_t new_stream_;
+#endif
     bool owns_new_stream_ = false;
 };
 
@@ -265,8 +288,13 @@ bool SupportsMemoryPools(const Device& device);
 #ifdef BUILD_CUDA_MODULE
 
 int GetDevice();
+#if __HIP_PLATFORM_AMD__
+hipStream_t GetStream();
+hipStream_t GetDefaultStream();
+#else
 cudaStream_t GetStream();
 cudaStream_t GetDefaultStream();
+#endif
 
 #endif
 
@@ -280,7 +308,11 @@ cudaStream_t GetDefaultStream();
 namespace open3d {
 namespace core {
 
+#if __HIP_PLATFORM_AMD__
+void __OPEN3D_CUDA_CHECK(hipError_t err, const char* file, const int line);
+#else
 void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line);
+#endif
 
 void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message,
                                   const char* file,
@@ -290,3 +322,16 @@ void __OPEN3D_GET_LAST_CUDA_ERROR(const char* message,
 }  // namespace open3d
 
 #endif
+
+#ifdef BUILD_CUDA_MODULE
+#if __HIP_PLATFORM_AMD__
+
+// NOTE(beinggod): Ignore the mask for ROCm.
+#define __shfl_sync(mask, ...) __shfl(__VA_ARGS__)
+#define __shfl_down_sync(mask, ...) __shfl_down(__VA_ARGS__)
+#define __ballot_sync(mask, ...) __ballot(__VA_ARGS__)
+// warp size is 64 for ROCm.
+#define __activemask() 0xffffffffffffffff
+
+#endif
+#endif
diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h
index 2d5bef788..45cb16c69 100644
--- a/cpp/open3d/core/ParallelFor.h
+++ b/cpp/open3d/core/ParallelFor.h
@@ -16,17 +16,25 @@
 #include "open3d/utility/Parallel.h"
 #include "open3d/utility/Preprocessor.h"
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__)
+
 #include <cuda.h>
 #include <cuda_runtime.h>
 
 #include "open3d/core/CUDAUtils.h"
+
+#elif defined(__HIPCC__)
+
+#include <hip/hip_runtime.h>
+
+#include "open3d/core/CUDAUtils.h"
+
 #endif
 
 namespace open3d {
 namespace core {
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 
 static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
 static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;
@@ -101,7 +109,7 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
 /// kernel to be used on both CPU and CUDA, capture the variables by value.
 template <typename func_t>
 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
     ParallelForCUDA_(device, n, func);
 #else
     ParallelForCPU_(device, n, func);
@@ -161,7 +169,7 @@ void ParallelFor(const Device& device,
                  const vec_func_t& vec_func) {
 #ifdef BUILD_ISPC_MODULE
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
     ParallelForCUDA_(device, n, func);
 #else
     int num_threads = utility::EstimateMaxThreads();
@@ -174,7 +182,7 @@ void ParallelFor(const Device& device,
 
 #else
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
     ParallelForCUDA_(device, n, func);
 #else
     ParallelForCPU_(device, n, func);
diff --git a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu
index 627095858..79eeebd05 100644
--- a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu
+++ b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu
@@ -16,7 +16,11 @@ namespace core {
 void CUDAResetHeap(Tensor &heap) {
     uint32_t *heap_ptr = heap.GetDataPtr<uint32_t>();
     thrust::sequence(thrust::device, heap_ptr, heap_ptr + heap.GetLength(), 0);
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 }
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h
index eeb74bc26..6d4724ea2 100644
--- a/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h
+++ b/cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h
@@ -58,7 +58,11 @@ class CUDAHashBackendBufferAccessor {
         std::vector<uint8_t *> value_ptrs(n_values_);
         for (size_t i = 0; i < n_values_; ++i) {
             value_ptrs[i] = value_buffers[i].GetDataPtr<uint8_t>();
+#if __HIP_PLATFORM_AMD__
+            hipMemset(value_ptrs[i], 0, capacity_ * value_dsizes_host[i]);
+#else
             cudaMemset(value_ptrs[i], 0, capacity_ * value_dsizes_host[i]);
+#endif
         }
         values_ = static_cast<uint8_t **>(
                 MemoryManager::Malloc(n_values_ * sizeof(uint8_t *), device));
@@ -67,7 +71,11 @@ class CUDAHashBackendBufferAccessor {
 
         heap_top_ = hashmap_buffer.GetHeapTop().cuda.GetDataPtr<int>();
         cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
         OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
     }
 
     __host__ void Shutdown(const Device &device) {
diff --git a/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h b/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h
index dbf665c79..5b7bf95f6 100644
--- a/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h
+++ b/cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h
@@ -97,16 +97,28 @@ void SlabHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
     CUDAScopedDevice scoped_device(this->device_);
     if (count == 0) return;
 
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemset(output_masks, 0, sizeof(bool) * count));
+#else
     OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
+#endif
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     const int64_t num_blocks =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
     FindKernel<<<num_blocks, kThreadsPerBlock, 0, core::cuda::GetStream()>>>(
             impl_, input_keys, output_buf_indices, output_masks, count);
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 }
 
 template <typename Key, typename Hash, typename Eq>
@@ -116,9 +128,17 @@ void SlabHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
     CUDAScopedDevice scoped_device(this->device_);
     if (count == 0) return;
 
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemset(output_masks, 0, sizeof(bool) * count));
+#else
     OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
+#endif
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
     auto buf_indices = static_cast<buf_index_t*>(
             MemoryManager::Malloc(sizeof(buf_index_t) * count, this->device_));
 
@@ -131,7 +151,11 @@ void SlabHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
                        core::cuda::GetStream()>>>(impl_, buf_indices,
                                                   output_masks, count);
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     MemoryManager::Free(buf_indices, this->device_);
 }
@@ -142,10 +166,18 @@ int64_t SlabHashBackend<Key, Hash, Eq>::GetActiveIndices(
     CUDAScopedDevice scoped_device(this->device_);
     uint32_t* count = static_cast<uint32_t*>(
             MemoryManager::Malloc(sizeof(uint32_t), this->device_));
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemset(count, 0, sizeof(uint32_t)));
+#else
     OPEN3D_CUDA_CHECK(cudaMemset(count, 0, sizeof(uint32_t)));
+#endif
 
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     const int64_t num_blocks =
             (impl_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
@@ -154,7 +186,11 @@ int64_t SlabHashBackend<Key, Hash, Eq>::GetActiveIndices(
                              core::cuda::GetStream()>>>(
             impl_, output_buf_indices, count);
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     uint32_t ret;
     MemoryManager::MemcpyToHost(&ret, count, this->device_, sizeof(uint32_t));
@@ -170,10 +206,19 @@ void SlabHashBackend<Key, Hash, Eq>::Clear() {
     this->buffer_->ResetHeap();
 
     // Clear the linked list heads
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemset(impl_.bucket_list_head_, 0xFF,
+                                sizeof(Slab) * this->bucket_count_));
+#else
     OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
                                  sizeof(Slab) * this->bucket_count_));
+#endif
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     // Clear the linked list nodes
     node_mgr_->Reset();
@@ -204,7 +249,11 @@ std::vector<int64_t> SlabHashBackend<Key, Hash, Eq>::BucketSizes() const {
                                 core::cuda::GetStream()>>>(
             impl_, thrust::raw_pointer_cast(elems_per_bucket.data()));
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     std::vector<int64_t> result(impl_.bucket_count_);
     thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
@@ -258,7 +307,11 @@ void SlabHashBackend<Key, Hash, Eq>::Insert(
                                 output_masks, count, n_values);
             });
     cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 }
 
 template <typename Key, typename Hash, typename Eq>
@@ -279,10 +332,20 @@ void SlabHashBackend<Key, Hash, Eq>::Allocate(int64_t capacity) {
     // Allocate linked list heads.
     impl_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
             sizeof(Slab) * this->bucket_count_, this->device_));
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipMemset(impl_.bucket_list_head_, 0xFF,
+                                sizeof(Slab) * this->bucket_count_));
+#else
     OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
                                  sizeof(Slab) * this->bucket_count_));
     cuda::Synchronize();
+#endif
+
+#if __HIP_PLATFORM_AMD__
+    OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
     impl_.Setup(this->bucket_count_, node_mgr_->impl_, buffer_accessor_);
 }
diff --git a/cpp/open3d/core/hashmap/CUDA/SlabMacros.h b/cpp/open3d/core/hashmap/CUDA/SlabMacros.h
index 21ba410ab..7e3b628cd 100644
--- a/cpp/open3d/core/hashmap/CUDA/SlabMacros.h
+++ b/cpp/open3d/core/hashmap/CUDA/SlabMacros.h
@@ -27,7 +27,7 @@
 namespace open3d {
 namespace core {
 
-// Device-specific, safe for current NVIDIA architectures.
+// Device-specific
 static constexpr uint32_t kWarpSize = 32;
 
 //////////////////////
diff --git a/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h b/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h
index 9668c3100..f3b71f7b0 100644
--- a/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h
+++ b/cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h
@@ -233,18 +233,34 @@ class SlabNodeManager {
     ~SlabNodeManager() { MemoryManager::Free(impl_.super_blocks_, device_); }
 
     void Reset() {
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipMemset(
+                impl_.super_blocks_, 0xFF,
+                kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
+#else
         OPEN3D_CUDA_CHECK(cudaMemset(
                 impl_.super_blocks_, 0xFF,
                 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
+#endif
 
         for (uint32_t i = 0; i < kSuperBlocks; i++) {
             // setting bitmaps into zeros:
+#if __HIP_PLATFORM_AMD__
+            OPEN3D_CUDA_CHECK(hipMemset(
+                    impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
+                    kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
+#else
             OPEN3D_CUDA_CHECK(cudaMemset(
                     impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
                     kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
+#endif
         }
         cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
         OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
     }
 
     std::vector<int> CountSlabsPerSuperblock() {
@@ -262,7 +278,11 @@ class SlabNodeManager {
                                         core::cuda::GetStream()>>>(
                 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
         cuda::Synchronize();
+#if __HIP_PLATFORM_AMD__
+        OPEN3D_CUDA_CHECK(hipGetLastError());
+#else
         OPEN3D_CUDA_CHECK(cudaGetLastError());
+#endif
 
         std::vector<int> result(num_super_blocks);
         thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
diff --git a/cpp/open3d/core/hashmap/Dispatch.h b/cpp/open3d/core/hashmap/Dispatch.h
index 7318e5898..705871fa5 100644
--- a/cpp/open3d/core/hashmap/Dispatch.h
+++ b/cpp/open3d/core/hashmap/Dispatch.h
@@ -59,7 +59,7 @@
         }                                                                     \
     }()
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 // Reinterpret hash maps' void* value arrays as CUDA primitive types arrays, to
 // avoid slow memcpy or byte-by-byte copy in kernels.
 // Not used in the CPU version since memcpy is relatively fast on CPU.
diff --git a/cpp/open3d/core/linalg/LinalgHeadersCUDA.h b/cpp/open3d/core/linalg/LinalgHeadersCUDA.h
index 49b124057..7e2ab0187 100644
--- a/cpp/open3d/core/linalg/LinalgHeadersCUDA.h
+++ b/cpp/open3d/core/linalg/LinalgHeadersCUDA.h
@@ -12,8 +12,15 @@
 
 #pragma once
 
-#ifdef BUILD_CUDA_MODULE
+#if BUILD_CUDA_MODULE
+
+#if __HIP_PLATFORM_AMD__
+#include <hipblas/hipblas.h>
+#include <hipsolver/hipsolver.h>
+#else
 #include <cublas_v2.h>
 #include <cusolverDn.h>
 #include <cusolver_common.h>
 #endif
+
+#endif
diff --git a/cpp/open3d/core/nns/NeighborSearchCommon.h b/cpp/open3d/core/nns/NeighborSearchCommon.h
index 97ffb810f..865cdc14e 100644
--- a/cpp/open3d/core/nns/NeighborSearchCommon.h
+++ b/cpp/open3d/core/nns/NeighborSearchCommon.h
@@ -18,7 +18,7 @@ namespace nns {
 /// Supported metrics
 enum Metric { L1, L2, Linf };
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #define HOST_DEVICE __host__ __device__
 #else
 #define HOST_DEVICE
diff --git a/cpp/open3d/core/nns/kernel/DeviceDefs.cuh b/cpp/open3d/core/nns/kernel/DeviceDefs.cuh
index 85b67c436..fd21d3623 100644
--- a/cpp/open3d/core/nns/kernel/DeviceDefs.cuh
+++ b/cpp/open3d/core/nns/kernel/DeviceDefs.cuh
@@ -31,13 +31,17 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 namespace open3d {
 namespace core {
 
 // We require at least CUDA 8.0 for compilation
-#if CUDA_VERSION < 8000
+#if !__HIP_PLATFORM_AMD__ && (CUDA_VERSION < 8000)
 #error "CUDA >= 8.0 is required"
 #endif
 
@@ -46,7 +50,9 @@ constexpr int kWarpSize = 32;
 
 // This is a memory barrier for intra-warp writes to shared memory.
 __forceinline__ __device__ void warpFence() {
-#if CUDA_VERSION >= 9000
+#if __HIP_PLATFORM_AMD__
+    __builtin_amdgcn_wave_barrier();
+#elif (CUDA_VERSION >= 9000)
     __syncwarp();
 #else
     // For the time being, assume synchronicity.
@@ -54,7 +60,7 @@ __forceinline__ __device__ void warpFence() {
 #endif
 }
 
-#if CUDA_VERSION > 9000
+#if (CUDA_VERSION > 9000) || __HIP_PLATFORM_AMD__
 // Based on the CUDA version (we assume what version of nvcc/ptxas we were
 // compiled with), the register allocation algorithm is much better, so only
 // enable the 2048 selection code if we are above 9.0 (9.2 seems to be ok)
diff --git a/cpp/open3d/core/nns/kernel/MergeNetwork.cuh b/cpp/open3d/core/nns/kernel/MergeNetwork.cuh
index 5306eb7dc..ec4f2aa7c 100644
--- a/cpp/open3d/core/nns/kernel/MergeNetwork.cuh
+++ b/cpp/open3d/core/nns/kernel/MergeNetwork.cuh
@@ -194,7 +194,7 @@ struct BitonicMergeStep<K, V, 1, Dir, Low, true> {
 
 template <typename K, typename V, int N, bool Dir, bool Low>
 struct BitonicMergeStep<K, V, N, Dir, Low, true> {
-    static inline __device__ void merge(K k[N], V v[N]) {
+    static inline __device__ void merge(K* k, V* v) {
         static_assert(isPowerOf2(N), "must be power of 2");
         static_assert(N > 1, "must be N > 1");
 
@@ -259,7 +259,7 @@ struct BitonicMergeStep<K, V, N, Dir, Low, true> {
 // Low recursion
 template <typename K, typename V, int N, bool Dir>
 struct BitonicMergeStep<K, V, N, Dir, true, false> {
-    static inline __device__ void merge(K k[N], V v[N]) {
+    static inline __device__ void merge(K* k, V* v) {
         static_assert(!isPowerOf2(N), "must be non-power-of-2");
         static_assert(N >= 3, "must be N >= 3");
 
@@ -337,7 +337,7 @@ struct BitonicMergeStep<K, V, N, Dir, true, false> {
 // High recursion
 template <typename K, typename V, int N, bool Dir>
 struct BitonicMergeStep<K, V, N, Dir, false, false> {
-    static inline __device__ void merge(K k[N], V v[N]) {
+    static inline __device__ void merge(K* k, V* v) {
         static_assert(!isPowerOf2(N), "must be non-power-of-2");
         static_assert(N >= 3, "must be N >= 3");
 
@@ -422,10 +422,7 @@ template <typename K,
           int N2,
           bool Dir,
           bool FullMerge = true>
-inline __device__ void warpMergeAnyRegisters(K k1[N1],
-                                             V v1[N1],
-                                             K k2[N2],
-                                             V v2[N2]) {
+inline __device__ void warpMergeAnyRegisters(K* k1, V* v1, K* k2, V* v2) {
     constexpr int kSmallestN = N1 < N2 ? N1 : N2;
 
 #pragma unroll
@@ -479,7 +476,7 @@ inline __device__ void warpMergeAnyRegisters(K k1[N1],
 // bitonic sort
 template <typename K, typename V, int N, bool Dir>
 struct BitonicSortStep {
-    static inline __device__ void sort(K k[N], V v[N]) {
+    static inline __device__ void sort(K* k, V* v) {
         static_assert(N > 1, "did not hit specialized case");
 
         // Sort recursively
@@ -544,7 +541,7 @@ struct BitonicSortStep<K, V, 1, Dir> {
 /// Sort a list of kWarpSize * N elements in registers, where N is an
 /// arbitrary >= 1
 template <typename K, typename V, int N, bool Dir>
-inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) {
+inline __device__ void warpSortAnyRegisters(K* k, V* v) {
     BitonicSortStep<K, V, N, Dir>::sort(k, v);
 }
 
diff --git a/cpp/open3d/core/nns/kernel/Pair.cuh b/cpp/open3d/core/nns/kernel/Pair.cuh
index 15928ba6e..ad1a88659 100644
--- a/cpp/open3d/core/nns/kernel/Pair.cuh
+++ b/cpp/open3d/core/nns/kernel/Pair.cuh
@@ -31,7 +31,11 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 #include "open3d/core/nns/kernel/PtxUtils.cuh"
 #include "open3d/core/nns/kernel/WarpShuffle.cuh"
diff --git a/cpp/open3d/core/nns/kernel/PtxUtils.cuh b/cpp/open3d/core/nns/kernel/PtxUtils.cuh
index 8c497b4c2..1f587f6c0 100644
--- a/cpp/open3d/core/nns/kernel/PtxUtils.cuh
+++ b/cpp/open3d/core/nns/kernel/PtxUtils.cuh
@@ -31,11 +31,104 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 namespace open3d {
 namespace core {
 
+// ROCm/HIP does not support inline PTX, but provides similar functionality via
+// builtins. We'll use conditional compilation to provide ROCm/HIP-compatible
+// versions for relevant functions/macros.
+
+#if __HIP_PLATFORM_AMD__
+
+// There are no direct equivalents of `bfe.u32`, `bfe.u64`, or `bfi.b32` PTX in
+// HIP C++ builtins, but bit-field extract/insert can be implemented in pure C++
+// as follows: defines to simplify the SASS assembly structure file/line in the
+// profiler
+
+template <typename T>
+static inline T bfe(T val, unsigned pos, unsigned len) {
+    static_assert(std::is_unsigned<T>::value, "bfe requires unsigned type");
+    constexpr unsigned BITS = sizeof(T) * 8;
+
+    if (len == 0) return 0;
+    if (len >= BITS) return val >> pos;
+    return (val >> pos) & ((T(1) << len) - 1);
+}
+
+#define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \
+    (OUT = bfe<uint32_t>(VAL, POS, LEN));
+
+#define GET_BITFIELD_U64(OUT, VAL, POS, LEN) \
+    (OUT = bfe<uint64_t>(VAL, POS, LEN));
+
+__device__ __forceinline__ unsigned int getBitfield(unsigned int val,
+                                                    int pos,
+                                                    int len) {
+    // Extract `len` bits from `val`, starting at bit `pos`.
+    if (len == 0) return 0;
+    return (val >> pos) & ((1u << len) - 1);
+}
+
+__device__ __forceinline__ uint64_t getBitfield(uint64_t val,
+                                                int pos,
+                                                int len) {
+    if (len == 0) return 0;
+    return (val >> pos) & ((1ull << len) - 1);
+}
+
+__device__ __forceinline__ unsigned int setBitfield(unsigned int val,
+                                                    unsigned int toInsert,
+                                                    int pos,
+                                                    int len) {
+    // Clear the target bitfield and insert the new bits.
+    if (len == 0) return val;
+    unsigned int mask = ((1u << len) - 1) << pos;
+    return (val & ~mask) | ((toInsert << pos) & mask);
+}
+
+__device__ __forceinline__ int getLaneId() { return __lane_id(); }
+
+__device__ __forceinline__ unsigned getLaneMaskLt() {
+    // No direct HIP equivalent. Use __ballot()/__ballot_sync.
+    unsigned lane_id = getLaneId();
+    return (1u << lane_id) - 1;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskLe() {
+    unsigned lane_id = getLaneId();
+    return (1u << (lane_id + 1)) - 1;
+}
+
+__device__ __forceinline__ unsigned getLaneMaskGt() {
+    unsigned lane_id = getLaneId();
+    return 0xffffffffu << (lane_id + 1);
+}
+
+__device__ __forceinline__ unsigned getLaneMaskGe() {
+    unsigned lane_id = getLaneId();
+    return 0xffffffffu << lane_id;
+}
+
+__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
+    // NOTE: hip does not support bar.sync with num threads, so we use a trap to
+    // indicate an error. Keep this function to compact the code.
+    __builtin_trap();
+}
+
+__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
+    // NOTE: hip does not support bar.arrived with num threads, so we use a trap
+    // to indicate an error. Keep this function to compact the code.
+    __builtin_trap();
+}
+
+#else  // __HIP_PLATFORM_AMD__
+
 // defines to simplify the SASS assembly structure file/line in the profiler
 #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \
     asm("bfe.u32 %0, %1, %2, %3;" : "=r"(OUT) : "r"(VAL), "r"(POS), "r"(LEN));
@@ -111,5 +204,7 @@ __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
                  : "memory");
 }
 
+#endif
+
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/nns/kernel/Reduction.cuh b/cpp/open3d/core/nns/kernel/Reduction.cuh
index a87dc0ac9..44ddb0b11 100644
--- a/cpp/open3d/core/nns/kernel/Reduction.cuh
+++ b/cpp/open3d/core/nns/kernel/Reduction.cuh
@@ -31,7 +31,11 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 #include "open3d/core/nns/kernel/BlockMerge.cuh"
 #include "open3d/core/nns/kernel/PtxUtils.cuh"
diff --git a/cpp/open3d/core/nns/kernel/ReductionOps.cuh b/cpp/open3d/core/nns/kernel/ReductionOps.cuh
index da8eb4710..0eeb493a1 100644
--- a/cpp/open3d/core/nns/kernel/ReductionOps.cuh
+++ b/cpp/open3d/core/nns/kernel/ReductionOps.cuh
@@ -31,7 +31,11 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 #include "open3d/core/nns/kernel/Limits.cuh"
 #include "open3d/core/nns/kernel/Pair.cuh"
diff --git a/cpp/open3d/core/nns/kernel/Select.cuh b/cpp/open3d/core/nns/kernel/Select.cuh
index cadfea819..5e7667a06 100644
--- a/cpp/open3d/core/nns/kernel/Select.cuh
+++ b/cpp/open3d/core/nns/kernel/Select.cuh
@@ -157,7 +157,7 @@ struct BlockSelect {
     __device__ inline void checkThreadQ() {
         bool needSort = (numVals == NumThreadQ);
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
         needSort = __any_sync(0xffffffff, needSort);
 #else
         needSort = __any(needSort);
@@ -421,7 +421,7 @@ struct WarpSelect {
     __device__ inline void checkThreadQ() {
         bool needSort = (numVals == NumThreadQ);
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
         needSort = __any_sync(0xffffffff, needSort);
 #else
         needSort = __any(needSort);
diff --git a/cpp/open3d/core/nns/kernel/StaticUtils.cuh b/cpp/open3d/core/nns/kernel/StaticUtils.cuh
index f636ff9e4..016b6d26f 100644
--- a/cpp/open3d/core/nns/kernel/StaticUtils.cuh
+++ b/cpp/open3d/core/nns/kernel/StaticUtils.cuh
@@ -31,7 +31,11 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
+#endif
 
 // allow usage for non-CUDA files
 #ifndef __host__
diff --git a/cpp/open3d/core/nns/kernel/WarpShuffle.cuh b/cpp/open3d/core/nns/kernel/WarpShuffle.cuh
index bce9a4a93..ad29c8788 100644
--- a/cpp/open3d/core/nns/kernel/WarpShuffle.cuh
+++ b/cpp/open3d/core/nns/kernel/WarpShuffle.cuh
@@ -31,13 +31,20 @@
 
 #pragma once
 
+#if __HIP_PLATFORM_AMD__
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+using half = __half;
+#else
 #include <cuda.h>
+#endif
 
 namespace open3d {
 namespace core {
 
 // defines to simplify the SASS assembly structure file/line in the profiler
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
 #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \
     __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH)
 #else
@@ -46,7 +53,7 @@ namespace core {
 
 template <typename T>
 inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
     return __shfl_sync(0xffffffff, val, srcLane, width);
 #else
     return __shfl(val, srcLane, width);
@@ -66,7 +73,7 @@ template <typename T>
 inline __device__ T shfl_up(const T val,
                             unsigned int delta,
                             int width = kWarpSize) {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
     return __shfl_up_sync(0xffffffff, val, delta, width);
 #else
     return __shfl_up(val, delta, width);
@@ -88,7 +95,7 @@ template <typename T>
 inline __device__ T shfl_down(const T val,
                               unsigned int delta,
                               int width = kWarpSize) {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
     return __shfl_down_sync(0xffffffff, val, delta, width);
 #else
     return __shfl_down(val, delta, width);
@@ -107,7 +114,7 @@ inline __device__ T* shfl_down(T* const val,
 
 template <typename T>
 inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || !__HIP_PLATFORM_AMD__
     return __shfl_xor_sync(0xffffffff, val, laneMask, width);
 #else
     return __shfl_xor(val, laneMask, width);
@@ -125,22 +132,32 @@ inline __device__ T* shfl_xor(T* const val,
 }
 
 // CUDA 9.0+ has half shuffle
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 || __HIP_PLATFORM_AMD__
 inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
+    half h;
+#if __HIP_PLATFORM_AMD__
+    unsigned int vu = *reinterpret_cast<unsigned int*>(&v);
+    vu = __shfl(vu, srcLane, width);
+    h = *reinterpret_cast<half*>(&vu);
+#else
     unsigned int vu = v.x;
     vu = __shfl(vu, srcLane, width);
-
-    half h;
     h.x = (unsigned short)vu;
+#endif
     return h;
 }
 
 inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
+    half h;
+#if __HIP_PLATFORM_AMD__
+    unsigned int vu = *reinterpret_cast<unsigned int*>(&v);
+    vu = __shfl_xor(vu, laneMask, width);
+    h = *reinterpret_cast<half*>(&vu);
+#else
     unsigned int vu = v.x;
     vu = __shfl_xor(vu, laneMask, width);
-
-    half h;
     h.x = (unsigned short)vu;
+#endif
     return h;
 }
 #endif  // CUDA_VERSION
diff --git a/cpp/open3d/ml/Helper.h b/cpp/open3d/ml/Helper.h
index 4193cfac9..e0c8edac2 100644
--- a/cpp/open3d/ml/Helper.h
+++ b/cpp/open3d/ml/Helper.h
@@ -32,6 +32,18 @@ namespace ml {
 #define OPEN3D_ML_CUDA_DRIVER_CHECK(err) \
     __OPEN3D_ML_CUDA_DRIVER_CHECK(err, __FILE__, __LINE__)
 
+#if __HIP_PLATFORM_AMD__
+inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(hipError_t err,
+                                          const char *file,
+                                          const int line,
+                                          bool abort = true) {
+    if (err != CUDA_SUCCESS) {
+        const char *error_string = hipGetErrorString(err);
+        utility::LogError("{}:{} CUDA driver error: {}", file, line,
+                          error_string);
+    }
+}
+#else
 inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(CUresult err,
                                           const char *file,
                                           const int line,
@@ -48,6 +60,7 @@ inline void __OPEN3D_ML_CUDA_DRIVER_CHECK(CUresult err,
         }
     }
 }
+#endif
 
 inline cudaStream_t GetDefaultStream() { (cudaStream_t)0; }
 
diff --git a/cpp/open3d/ml/contrib/CMakeLists.txt b/cpp/open3d/ml/contrib/CMakeLists.txt
index bca160547..8ce20657c 100644
--- a/cpp/open3d/ml/contrib/CMakeLists.txt
+++ b/cpp/open3d/ml/contrib/CMakeLists.txt
@@ -12,6 +12,22 @@ if(BUILD_CUDA_MODULE)
     )
 endif()
 
+if (WITH_ROCM)
+    include(Hipify)
+
+    get_target_property(SOURCE_LIST ml_contrib SOURCES)
+    get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST)
+
+    foreach(src IN LISTS HIPIFIED_SOURCE_LIST)
+        get_filename_component(ext ${src} EXT)
+        if(ext STREQUAL ".cu")
+            set_source_files_properties(${src} PROPERTIES LANGUAGE HIP)
+        endif()
+    endforeach()
+
+    set_property(TARGET ml_contrib PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST})
+endif()
+
 open3d_show_and_abort_on_warning(ml_contrib)
 open3d_set_global_properties(ml_contrib)
 open3d_set_open3d_lib_properties(ml_contrib)
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh
index 322b38180..245c00abe 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConv.cuh
@@ -9,9 +9,17 @@
 
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/utility/Helper.h"
@@ -191,6 +199,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream,
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix (filter)
             cutlass::MatrixLayout::kColumnMajor,  // layout of B matrix
@@ -200,6 +209,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream,
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     // this is the pointer to the patch matrix
     TFeat* columns = (TFeat*)mem_columns.first;
@@ -236,6 +246,23 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream,
         float* C = out_features + (run_i * num_cols_per_run * out_channels);
         int ldc = m;
 
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
         typename Gemm::Params params;
         int result = params.initialize(m,      // GEMM M dimension
                                        n,      // GEMM N dimension
@@ -257,6 +284,7 @@ void CConvComputeFeaturesCUDA(const cudaStream_t& stream,
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh
index ef03b9f05..0217ea471 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/utility/Helper.h"
@@ -190,6 +198,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream,
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kRowMajor,     // layout of B matrix
@@ -198,6 +207,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream,
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
 
@@ -217,7 +227,6 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream,
                 filter_dims, interpolation, coordinate_mapping, align_corners,
                 individual_extent, isotropic_extent, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -233,6 +242,24 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream,
         float beta = 1;
         float* C = filter_backprop;
         int ldc = m;
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+        typename Gemm::Params params;
 
         int result = params.initialize(m,      // GEMM M dimension
                                        n,      // GEMM N dimension
@@ -254,6 +281,7 @@ void CConvBackpropFilterCUDA(const cudaStream_t& stream,
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu
index 218e4c2b6..7863b9d4f 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.cu
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include "open3d/core/CUDAUtils.h"
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/utility/Helper.h"
 
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh
index 29d8aba31..54f9c006f 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/utility/Helper.h"
@@ -102,6 +110,7 @@ void CConvTransposeComputeFeaturesCUDA(
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kColumnMajor,  // layout of B matrix
@@ -110,6 +119,7 @@ void CConvTransposeComputeFeaturesCUDA(
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
 
@@ -130,7 +140,6 @@ void CConvTransposeComputeFeaturesCUDA(
                 interpolation, coordinate_mapping, align_corners,
                 individual_extent, isotropic_extent, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -145,6 +154,24 @@ void CConvTransposeComputeFeaturesCUDA(
         float beta = 1;
         float* C = out_features + (run_i * num_cols_per_run * out_channels);
         int ldc = m;
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+        typename Gemm::Params params;
 
         int result =
                 params.initialize(m,      // GEMM M dimension
@@ -168,6 +195,7 @@ void CConvTransposeComputeFeaturesCUDA(
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 
     if (out_importance) {
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh
index 2c1dc7a16..3558e0ba8 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/utility/Helper.h"
@@ -203,6 +211,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream,
             sizeof(TOut) * spatial_filter_size * in_channels * out_channels,
             stream);
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kRowMajor,     // layout of B matrix
@@ -211,6 +220,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream,
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
     TFeat* gradient = ((TFeat*)mem_columns.first) +
@@ -245,7 +255,6 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream,
                 interpolation, coordinate_mapping, align_corners,
                 individual_extent, isotropic_extent, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -261,6 +270,25 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream,
         float* C = filter_backprop;
         int ldc = m;
 
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+        typename Gemm::Params params;
+
         int result =
                 params.initialize(m,      // GEMM M dimension
                                   n,      // GEMM N dimension
@@ -282,6 +310,7 @@ void CConvTransposeBackpropFilterCUDA(const cudaStream_t& stream,
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h b/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h
index 4f1d1b740..e8f05a77f 100644
--- a/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h
+++ b/cpp/open3d/ml/impl/misc/NeighborSearchCommon.h
@@ -18,7 +18,7 @@ namespace impl {
 /// Supported metrics
 enum Metric { L1, L2, Linf };
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #define HOST_DEVICE __host__ __device__
 #else
 #define HOST_DEVICE
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh
index 3bbc297db..6fac85e85 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConv.cuh
@@ -9,9 +9,17 @@
 
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h"
 #include "open3d/utility/Helper.h"
@@ -151,6 +159,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream,
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix (filter)
             cutlass::MatrixLayout::kColumnMajor,  // layout of B matrix
@@ -160,6 +169,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream,
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     // this is the pointer to the patch matrix
     TFeat* columns = (TFeat*)mem_columns.first;
@@ -194,6 +204,23 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream,
         float* C = out_features + (run_i * num_cols_per_run * out_channels);
         int ldc = m;
 
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
         typename Gemm::Params params;
         int result = params.initialize(m,      // GEMM M dimension
                                        n,      // GEMM N dimension
@@ -215,6 +242,7 @@ void SparseConvComputeFeaturesCUDA(const cudaStream_t& stream,
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh
index 5dc0e6973..d8b7e3cfa 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h"
 #include "open3d/utility/Helper.h"
@@ -150,6 +158,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream,
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kRowMajor,     // layout of B matrix
@@ -158,6 +167,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream,
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
 
@@ -175,7 +185,6 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream,
                 neighbors_index, neighbors_kernel_index, neighbors_importance,
                 neighbors_row_splits, num_kernel_elements, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -192,6 +201,26 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream,
         float* C = filter_backprop;
         int ldc = m;
 
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+
+        typename Gemm::Params params;
+
         int result = params.initialize(m,      // GEMM M dimension
                                        n,      // GEMM N dimension
                                        k,      // GEMM K dimension
@@ -212,6 +241,7 @@ void SparseConvBackpropFilterCUDA(const cudaStream_t& stream,
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu b/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu
index 87d11ba3a..c163e1f4c 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.cu
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include "open3d/core/CUDAUtils.h"
 #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h"
 #include "open3d/utility/Helper.h"
 
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh
index efd0a5115..c571417e1 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h"
@@ -96,6 +104,7 @@ void SparseConvTransposeComputeFeaturesCUDA(
     size_t num_cols_per_run =
             std::min(mem_columns.second / bytes_per_column, size_t(num_out));
 
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kColumnMajor,  // layout of B matrix
@@ -104,6 +113,7 @@ void SparseConvTransposeComputeFeaturesCUDA(
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
 
@@ -122,7 +132,6 @@ void SparseConvTransposeComputeFeaturesCUDA(
                 neighbors_kernel_index, neighbors_importance,
                 neighbors_row_splits, num_kernel_elements, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -137,6 +146,24 @@ void SparseConvTransposeComputeFeaturesCUDA(
         float beta = 1;
         float* C = out_features + (run_i * num_cols_per_run * out_channels);
         int ldc = m;
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+        typename Gemm::Params params;
 
         int result =
                 params.initialize(m,      // GEMM M dimension
@@ -160,6 +187,7 @@ void SparseConvTransposeComputeFeaturesCUDA(
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 
     if (out_importance) {
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh
index 3c6f546a0..a7dc46e96 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh
@@ -8,9 +8,17 @@
 #pragma once
 #define EIGEN_USE_GPU
 
+#if __HIP_PLATFORM_AMD__
+
+#include <hipblas/hipblas.h>
+
+#else
+
 #include <cutlass/gemm/gemm.h>
 #include <cutlass/gemm/sgemm_traits.h>
 
+#endif  // __HIP_PLATFORM_AMD__
+
 #include "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
 #include "open3d/ml/impl/misc/MemoryAllocation.h"
 #include "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h"
@@ -164,7 +172,7 @@ void SparseConvTransposeBackpropFilterCUDA(
             filter_backprop, 0,
             sizeof(TOut) * num_kernel_elements * in_channels * out_channels,
             stream);
-
+#if !__HIP_PLATFORM_AMD__
     typedef cutlass::gemm::SgemmTraits<
             cutlass::MatrixLayout::kColumnMajor,  // layout of A matrix
             cutlass::MatrixLayout::kRowMajor,     // layout of B matrix
@@ -173,6 +181,7 @@ void SparseConvTransposeBackpropFilterCUDA(
             GemmTraits;
 
     typedef cutlass::gemm::Gemm<GemmTraits> Gemm;
+#endif  // !__HIP_PLATFORM_AMD__
 
     TFeat* columns = (TFeat*)mem_columns.first;
     TFeat* gradient = ((TFeat*)mem_columns.first) +
@@ -205,7 +214,6 @@ void SparseConvTransposeBackpropFilterCUDA(
                 neighbors_kernel_index, neighbors_importance,
                 neighbors_row_splits, num_kernel_elements, normalize);
 
-        typename Gemm::Params params;
         // C is MxN
         // B is KxN
         // A is MxK
@@ -221,6 +229,25 @@ void SparseConvTransposeBackpropFilterCUDA(
         float* C = filter_backprop;
         int ldc = m;
 
+#if __HIP_PLATFORM_AMD__
+        hipblasHandle_t handle;
+        hipblasStatus_t status;
+        status = hipblasCreate(&handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to create HIPBLAS handle.");
+        }
+        status = hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_T, m, n, k,
+                              &alpha, A, lda, B, ldb, &beta, C, ldc);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to launch HIPBLAS Gemm.");
+        }
+        status = hipblasDestroy(handle);
+        if (status != HIPBLAS_STATUS_SUCCESS) {
+            throw std::runtime_error("Failed to destroy HIPBLAS handle.");
+        }
+#else
+        typename Gemm::Params params;
+
         int result =
                 params.initialize(m,      // GEMM M dimension
                                   n,      // GEMM N dimension
@@ -242,6 +269,7 @@ void SparseConvTransposeBackpropFilterCUDA(
         }
 
         Gemm::launch(params, stream);
+#endif  // __HIP_PLATFORM_AMD__
     }
 }
 
diff --git a/cpp/open3d/ml/paddle/CMakeLists.txt b/cpp/open3d/ml/paddle/CMakeLists.txt
index a81fd2a09..26158f7ca 100644
--- a/cpp/open3d/ml/paddle/CMakeLists.txt
+++ b/cpp/open3d/ml/paddle/CMakeLists.txt
@@ -102,6 +102,17 @@ if (BUILD_CUDA_MODULE)
     )
 endif()
 
+if (WITH_ROCM)
+    include(Hipify)
+
+    get_target_property(SOURCE_LIST open3d_paddle_ops SOURCES)
+    get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST)
+    # NOTE: Set language to HIP for source files to enforce using HIP_COMPILER instead of CXX_COMPILER
+    # to workaround _Float16 is not defined in CXX_COMPILER
+    set_source_files_properties(${HIPIFIED_SOURCE_LIST} PROPERTIES LANGUAGE HIP)
+    set_property(TARGET open3d_paddle_ops PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST})
+endif()
+
 open3d_show_and_abort_on_warning(open3d_paddle_ops)
 open3d_set_global_properties(open3d_paddle_ops)
 
@@ -133,17 +144,31 @@ target_link_libraries(open3d_paddle_ops PRIVATE
 )
 
 if (BUILD_CUDA_MODULE)
-    target_link_libraries(open3d_paddle_ops PRIVATE
-        Open3D::3rdparty_cutlass
-        ${PADDLE_LIBRARIES}
-        CUDA::cuda_driver
-    )
-
-    if (TARGET Open3D::3rdparty_cub)
+    if (WITH_CUDA)
+        target_link_libraries(open3d_paddle_ops PRIVATE
+            Open3D::3rdparty_cutlass
+            ${PADDLE_LIBRARIES}
+            CUDA::cuda_driver
+        )
+        if (TARGET Open3D::3rdparty_cub)
+            target_link_libraries(open3d_paddle_ops PRIVATE
+                Open3D::3rdparty_cub
+            )
+        endif()
+    elseif(WITH_ROCM)
         target_link_libraries(open3d_paddle_ops PRIVATE
-            Open3D::3rdparty_cub
+            Open3D::3rdparty_hipblas
+            ${PADDLE_LIBRARIES}
+            Open3D::3rdparty_gflags
+            Open3D::3rdparty_glog
         )
+        if (TARGET Open3D::3rdparty_cub)
+            target_link_libraries(open3d_paddle_ops PRIVATE
+                Open3D::3rdparty_cub
+            )
+        endif()
     endif()
+    
 endif()
 
 install(TARGETS open3d_paddle_ops EXPORT Open3DPaddleOps
diff --git a/cpp/open3d/ml/paddle/PaddleHelper.cpp b/cpp/open3d/ml/paddle/PaddleHelper.cpp
index f67baf2c4..a902f0c9e 100644
--- a/cpp/open3d/ml/paddle/PaddleHelper.cpp
+++ b/cpp/open3d/ml/paddle/PaddleHelper.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
-#include "PaddleHelper.h"
+#include "open3d/ml/paddle/PaddleHelper.h"
 
 paddle::Tensor InitializedEmptyTensor(const phi::DataType dtype,
                                       const phi::IntArray& shape,
diff --git a/cpp/open3d/ml/paddle/PaddleHelper.h b/cpp/open3d/ml/paddle/PaddleHelper.h
index 17f86e21e..70764c4a8 100644
--- a/cpp/open3d/ml/paddle/PaddleHelper.h
+++ b/cpp/open3d/ml/paddle/PaddleHelper.h
@@ -241,7 +241,11 @@ std::tuple<bool, std::string> CheckShape(paddle::Tensor tensor,
 
 #ifdef BUILD_CUDA_MODULE
 static void cudaFreeWrapper(void* ptr) {
+#if __HIP_PLATFORM_AMD__
+    phi::gpuError_t result = hipFree(ptr);
+#else
     phi::gpuError_t result = cudaFree(ptr);
+#endif
     PADDLE_ENFORCE_GPU_SUCCESS(result);
 }
 #endif
@@ -260,7 +264,11 @@ paddle::Tensor InitializedEmptyTensor(const phi::IntArray& shape,
     T* ptr = nullptr;
     if (phi::is_gpu_place(place)) {
 #ifdef BUILD_CUDA_MODULE
+#if __HIP_PLATFORM_AMD__
+        phi::gpuError_t result = hipMalloc(&ptr, sizeof(T) * 1);
+#else
         phi::gpuError_t result = cudaMalloc(&ptr, sizeof(T) * 1);
+#endif
         PADDLE_ENFORCE_GPU_SUCCESS(result);
         deleter = std::function<void(void*)>(cudaFreeWrapper);
 #else
@@ -269,7 +277,7 @@ paddle::Tensor InitializedEmptyTensor(const phi::IntArray& shape,
 #endif
     } else if (phi::is_cpu_place(place)) {
         ptr = (T*)malloc(sizeof(T) * 1);
-        deleter = std::function<void(void*)>(free);
+        deleter = std::function<void(void*)>(std::free);
     } else {
         PD_CHECK(false, "Not supported backend!");
     }
diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt
index 081d24a6b..6d536ecf5 100644
--- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt
+++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt
@@ -32,6 +32,23 @@ if (WITH_IPPICV)
     )
 endif()
 
+if (WITH_ROCM)
+    include(Hipify)
+
+    get_target_property(SOURCE_LIST tgeometry_kernel SOURCES)
+    get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST)
+
+    foreach(src IN LISTS HIPIFIED_SOURCE_LIST)
+        get_filename_component(ext ${src} EXT)
+        if(ext STREQUAL ".cu")
+            set_source_files_properties(${src} PROPERTIES LANGUAGE HIP)
+        endif()
+    endforeach()
+
+    set_property(TARGET tgeometry_kernel PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST})
+endif()
+
+
 open3d_show_and_abort_on_warning(tgeometry_kernel)
 open3d_set_global_properties(tgeometry_kernel)
 open3d_set_open3d_lib_properties(tgeometry_kernel HIDDEN)
diff --git a/cpp/open3d/t/geometry/kernel/GeometryMacros.h b/cpp/open3d/t/geometry/kernel/GeometryMacros.h
index bb5eab804..d692da781 100644
--- a/cpp/open3d/t/geometry/kernel/GeometryMacros.h
+++ b/cpp/open3d/t/geometry/kernel/GeometryMacros.h
@@ -11,7 +11,7 @@
 
 #include "open3d/core/CUDAUtils.h"
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 
 #if defined(__CUDA_ARCH__)
 #if __CUDA_ARCH__ < 600
@@ -54,7 +54,7 @@ OPEN3D_HOST_DEVICE scalar_t Square(const scalar_t &x) {
 }  // namespace open3d
 
 // https://stackoverflow.com/a/51549250
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 __device__ inline float atomicMinf(float *addr, float value) {
     float old;
     old = (value >= 0) ? __int_as_float(
diff --git a/cpp/open3d/t/geometry/kernel/ImageImpl.h b/cpp/open3d/t/geometry/kernel/ImageImpl.h
index fbe2bcd1d..4743f2bac 100644
--- a/cpp/open3d/t/geometry/kernel/ImageImpl.h
+++ b/cpp/open3d/t/geometry/kernel/ImageImpl.h
@@ -20,12 +20,12 @@ namespace geometry {
 namespace kernel {
 namespace image {
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 using std::isinf;
 using std::isnan;
 #endif
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ToCUDA
 #else
 void ToCPU
@@ -82,7 +82,7 @@ void ToCPU
 #undef LINEAR_SATURATE
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ClipTransformCUDA
 #else
 void ClipTransformCPU
@@ -118,7 +118,7 @@ void ClipTransformCPU
 
 // Reimplementation of the reference:
 // https://github.com/mp3guy/ICPCUDA/blob/master/Cuda/pyrdown.cu#L41
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void PyrDownDepthCUDA
 #else
 void PyrDownDepthCPU
@@ -143,7 +143,7 @@ void PyrDownDepthCPU
     const int gkernel_size_2 = gkernel_size / 2;
     const float gweights[3] = {0.375f, 0.25f, 0.0625f};
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
     using std::abs;
     using std::max;
     using std::min;
@@ -191,7 +191,7 @@ void PyrDownDepthCPU
             });
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void CreateVertexMapCUDA
 #else
 void CreateVertexMapCPU
@@ -209,7 +209,7 @@ void CreateVertexMapCPU
     int64_t cols = src.GetShape(1);
     int64_t n = rows * cols;
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
     using std::isinf;
     using std::isnan;
 #endif
@@ -238,7 +238,7 @@ void CreateVertexMapCPU
                 }
             });
 }
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void CreateNormalMapCUDA
 #else
 void CreateNormalMapCPU
@@ -303,7 +303,7 @@ void CreateNormalMapCPU
             });
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ColorizeDepthCUDA
 #else
 void ColorizeDepthCPU
diff --git a/cpp/open3d/t/geometry/kernel/NPPImage.cpp b/cpp/open3d/t/geometry/kernel/NPPImage.cpp
index eb47ec9ba..24dc77b9d 100644
--- a/cpp/open3d/t/geometry/kernel/NPPImage.cpp
+++ b/cpp/open3d/t/geometry/kernel/NPPImage.cpp
@@ -7,7 +7,9 @@
 
 #include "open3d/t/geometry/kernel/NPPImage.h"
 
+#if !__HIP_PLATFORM_AMD__
 #include <npp.h>
+#endif
 
 #include "open3d/core/CUDAUtils.h"
 #include "open3d/core/Dtype.h"
@@ -21,7 +23,9 @@ namespace t {
 namespace geometry {
 namespace npp {
 
+#if !__HIP_PLATFORM_AMD__
 static NppStreamContext MakeNPPContext() {
+    utility::LogError("npp::MakeNPPContext() is not supported on ROCM GPUs.");
     NppStreamContext context;
     context.hStream = core::cuda::GetStream();
     context.nCudaDeviceId = core::cuda::GetDevice();
@@ -59,8 +63,12 @@ static NppStreamContext MakeNPPContext() {
 
     return context;
 }
+#endif  // !__HIP_PLATFORM_AMD__
 
 void RGBToGray(const core::Tensor &src_im, core::Tensor &dst_im) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::RGBToGray() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im.GetDevice()) {
         utility::LogError(
                 "src_im and dst_im are not on the same device, got {} and {}.",
@@ -92,11 +100,15 @@ void RGBToGray(const core::Tensor &src_im, core::Tensor &dst_im) {
                           dtype.ToString());
     }
 #undef NPP_ARGS
+#endif  // __HIP_PLATFORM_AMD__
 }
 
 void Resize(const open3d::core::Tensor &src_im,
             open3d::core::Tensor &dst_im,
             t::geometry::Image::InterpType interp_type) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::Resize() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im.GetDevice()) {
         utility::LogError(
                 "src_im and dst_im are not on the same device, got {} and {}.",
@@ -173,9 +185,13 @@ void Resize(const open3d::core::Tensor &src_im,
                           dtype.ToString());
     }
 #undef NPP_ARGS
+#endif  // __HIP_PLATFORM_AMD__
 }
 
 void Dilate(const core::Tensor &src_im, core::Tensor &dst_im, int kernel_size) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::Dilate() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im.GetDevice()) {
         utility::LogError(
                 "src_im and dst_im are not on the same device, got {} and {}.",
@@ -241,11 +257,15 @@ void Dilate(const core::Tensor &src_im, core::Tensor &dst_im, int kernel_size) {
                           dtype.ToString());
     }
 #undef NPP_ARGS
+#endif  // __HIP_PLATFORM_AMD__
 }
 
 void Filter(const open3d::core::Tensor &src_im,
             open3d::core::Tensor &dst_im,
             const open3d::core::Tensor &kernel) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::Filter() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im.GetDevice()) {
         utility::LogError(
                 "src_im and dst_im are not on the same device, got {} and {}.",
@@ -314,6 +334,7 @@ void Filter(const open3d::core::Tensor &src_im,
                           dtype.ToString());
     }
 #undef NPP_ARGS
+#endif  // __HIP_PLATFORM_AMD__
 }
 
 void FilterBilateral(const core::Tensor &src_im,
@@ -321,6 +342,9 @@ void FilterBilateral(const core::Tensor &src_im,
                      int kernel_size,
                      float value_sigma,
                      float distance_sigma) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::FilterBilateral() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im.GetDevice()) {
         utility::LogError(
                 "src_im and dst_im are not on the same device, got {} and {}.",
@@ -373,6 +397,7 @@ void FilterBilateral(const core::Tensor &src_im,
                           dtype.ToString());
     }
 #undef NPP_ARGS
+#endif  // __HIP_PLATFORM_AMD__
 }
 
 void FilterGaussian(const core::Tensor &src_im,
@@ -407,6 +432,9 @@ void FilterSobel(const core::Tensor &src_im,
                  core::Tensor &dst_im_dx,
                  core::Tensor &dst_im_dy,
                  int kernel_size) {
+#if __HIP_PLATFORM_AMD__
+    utility::LogError("npp::FilterSobel() is not supported on ROCM GPUs.");
+#else
     if (src_im.GetDevice() != dst_im_dx.GetDevice() ||
         src_im.GetDevice() != dst_im_dy.GetDevice()) {
         utility::LogError(
@@ -479,6 +507,7 @@ void FilterSobel(const core::Tensor &src_im,
     if (cuda_version < 10020) {
         dst_im_dx.Neg_();
     }
+#endif  // __HIP_PLATFORM_AMD__
 }
 }  // namespace npp
 }  // namespace geometry
diff --git a/cpp/open3d/t/geometry/kernel/PointCloudImpl.h b/cpp/open3d/t/geometry/kernel/PointCloudImpl.h
index cf442e008..14135bba2 100644
--- a/cpp/open3d/t/geometry/kernel/PointCloudImpl.h
+++ b/cpp/open3d/t/geometry/kernel/PointCloudImpl.h
@@ -37,7 +37,7 @@ using std::min;
 using std::sqrt;
 #endif
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void UnprojectCUDA
 #else
 void UnprojectCPU
@@ -77,7 +77,7 @@ void UnprojectCPU
     }
 
     // Counter
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::Tensor count(std::vector<int>{0}, {}, core::Int32, depth.GetDevice());
     int* count_ptr = count.GetDataPtr<int>();
 #else
@@ -119,13 +119,13 @@ void UnprojectCPU
                     }
                 });
     });
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     int total_pts_count = count.Item<int>();
 #else
     int total_pts_count = (*count_ptr).load();
 #endif
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__) || defined(__HIPCC__)
     core::cuda::Synchronize();
 #endif
     points = points.Slice(0, 0, total_pts_count);
@@ -135,7 +135,7 @@ void UnprojectCPU
     }
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void GetPointMaskWithinAABBCUDA
 #else
 void GetPointMaskWithinAABBCPU
@@ -169,7 +169,7 @@ void GetPointMaskWithinAABBCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void GetPointMaskWithinOBBCUDA
 #else
 void GetPointMaskWithinOBBCPU
@@ -213,7 +213,7 @@ void GetPointMaskWithinOBBCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void NormalizeNormalsCUDA
 #else
 void NormalizeNormalsCPU
@@ -244,7 +244,7 @@ void NormalizeNormalsCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void OrientNormalsToAlignWithDirectionCUDA
 #else
 void OrientNormalsToAlignWithDirectionCPU
@@ -278,7 +278,7 @@ void OrientNormalsToAlignWithDirectionCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void OrientNormalsTowardsCameraLocationCUDA
 #else
 void OrientNormalsTowardsCameraLocationCPU
@@ -413,7 +413,7 @@ OPEN3D_HOST_DEVICE bool IsBoundaryPoints(const scalar_t* angles,
     return max_diff > angle_threshold * M_PI / 180.0 ? true : false;
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ComputeBoundaryPointsCUDA
 #else
 void ComputeBoundaryPointsCPU
@@ -554,7 +554,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseRobustNormalizedCovarianceKernel(
     covariance_ptr[7] = covariance_ptr[5];
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateCovariancesUsingHybridSearchCUDA
 #else
 void EstimateCovariancesUsingHybridSearchCPU
@@ -604,7 +604,7 @@ void EstimateCovariancesUsingHybridSearchCPU
     core::cuda::Synchronize(points.GetDevice());
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateCovariancesUsingRadiusSearchCUDA
 #else
 void EstimateCovariancesUsingRadiusSearchCPU
@@ -653,7 +653,7 @@ void EstimateCovariancesUsingRadiusSearchCPU
     core::cuda::Synchronize(points.GetDevice());
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateCovariancesUsingKNNSearchCUDA
 #else
 void EstimateCovariancesUsingKNNSearchCPU
@@ -971,7 +971,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseNormalsWithFastEigen3x3(
     }
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateNormalsFromCovariancesCUDA
 #else
 void EstimateNormalsFromCovariancesCPU
@@ -1125,7 +1125,7 @@ OPEN3D_HOST_DEVICE void EstimatePointWiseColorGradientKernel(
     }
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateColorGradientsUsingHybridSearchCUDA
 #else
 void EstimateColorGradientsUsingHybridSearchCPU
@@ -1177,7 +1177,7 @@ void EstimateColorGradientsUsingHybridSearchCPU
     core::cuda::Synchronize(points.GetDevice());
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateColorGradientsUsingKNNSearchCUDA
 #else
 void EstimateColorGradientsUsingKNNSearchCPU
@@ -1232,7 +1232,7 @@ void EstimateColorGradientsUsingKNNSearchCPU
     core::cuda::Synchronize(points.GetDevice());
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateColorGradientsUsingRadiusSearchCUDA
 #else
 void EstimateColorGradientsUsingRadiusSearchCPU
diff --git a/cpp/open3d/t/geometry/kernel/TransformImpl.h b/cpp/open3d/t/geometry/kernel/TransformImpl.h
index 14c08a01e..844d5ba76 100644
--- a/cpp/open3d/t/geometry/kernel/TransformImpl.h
+++ b/cpp/open3d/t/geometry/kernel/TransformImpl.h
@@ -87,7 +87,7 @@ OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void RotateNormalsKernel(
     normals_ptr[2] = x[2];
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void TransformPointsCUDA
 #else
 void TransformPointsCPU
@@ -107,7 +107,7 @@ void TransformPointsCPU
     });
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void TransformNormalsCUDA
 #else
 void TransformNormalsCPU
@@ -127,7 +127,7 @@ void TransformNormalsCPU
     });
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void RotatePointsCUDA
 #else
 void RotatePointsCPU
@@ -149,7 +149,7 @@ void RotatePointsCPU
     });
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void RotateNormalsCUDA
 #else
 void RotateNormalsCPU
diff --git a/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h b/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h
index bbc032121..2e305d83c 100644
--- a/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h
+++ b/cpp/open3d/t/geometry/kernel/TriangleMeshImpl.h
@@ -21,11 +21,11 @@ namespace geometry {
 namespace kernel {
 namespace trianglemesh {
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 using std::isnan;
 #endif
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void NormalizeNormalsCUDA
 #else
 void NormalizeNormalsCPU
@@ -62,7 +62,7 @@ void NormalizeNormalsCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ComputeTriangleNormalsCUDA
 #else
 void ComputeTriangleNormalsCPU
@@ -107,7 +107,7 @@ void ComputeTriangleNormalsCPU
     });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ComputeTriangleAreasCUDA
 #else
 void ComputeTriangleAreasCPU
diff --git a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
index bd40b18b3..1ce00b613 100644
--- a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
+++ b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
@@ -30,7 +30,7 @@ namespace voxel_grid {
 using index_t = int;
 using ArrayIndexer = TArrayIndexer<index_t>;
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void GetVoxelCoordinatesAndFlattenedIndicesCUDA
 #else
 void GetVoxelCoordinatesAndFlattenedIndicesCPU
@@ -141,7 +141,7 @@ template <typename input_depth_t,
           typename tsdf_t,
           typename weight_t,
           typename color_t>
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void IntegrateCUDA
 #else
 void IntegrateCPU
@@ -288,12 +288,12 @@ void IntegrateCPU
         *weight_ptr = weight + 1;
     });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::cuda::Synchronize();
 #endif
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void EstimateRangeCUDA
 #else
 void EstimateRangeCPU
@@ -337,7 +337,7 @@ void EstimateRangeCPU
     NDArrayIndexer frag_buffer_indexer(fragment_buffer, 1);
     NDArrayIndexer block_keys_indexer(block_keys, 1);
     TransformIndexer w2c_transform_indexer(intrinsics, extrinsics);
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::Tensor count(std::vector<int>{0}, {1}, core::Int32,
                        block_keys.GetDevice());
     int* count_ptr = count.GetDataPtr<int>();
@@ -346,7 +346,7 @@ void EstimateRangeCPU
     std::atomic<int>* count_ptr = &count_atomic;
 #endif
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
     using std::max;
     using std::min;
 #endif
@@ -434,7 +434,7 @@ void EstimateRangeCPU
                     }
                 }
             });
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     int needed_frag_count = count[0].Item<int>();
 #else
     int needed_frag_count = (*count_ptr).load();
@@ -486,7 +486,7 @@ void EstimateRangeCPU
                 float z_min = frag_ptr[0];
                 float z_max = frag_ptr[1];
                 float* range_ptr = range_map_indexer.GetDataPtr<float>(u, v);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
                 atomicMinf(&(range_ptr[0]), z_min);
                 atomicMaxf(&(range_ptr[1]), z_max);
 #else
@@ -498,7 +498,7 @@ void EstimateRangeCPU
 #endif
             });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::cuda::Synchronize();
 #endif
 
@@ -533,7 +533,7 @@ struct MiniVecCache {
 };
 
 template <typename tsdf_t, typename weight_t, typename color_t>
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void RayCastCUDA
 #else
 void RayCastCPU
@@ -559,7 +559,7 @@ void RayCastCPU
     using Eq = utility::MiniVecEq<index_t, 3>;
 
     auto device_hashmap = hashmap->GetDeviceHashBackend();
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     auto cuda_hashmap =
             std::dynamic_pointer_cast<core::StdGPUHashBackend<Key, Hash, Eq>>(
                     device_hashmap);
@@ -675,7 +675,7 @@ void RayCastCPU
     index_t resolution2 = block_resolution * block_resolution;
     index_t resolution3 = resolution2 * block_resolution;
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
     using std::max;
     using std::sqrt;
 #endif
@@ -781,7 +781,7 @@ void RayCastCPU
 
         if (mask_indexer.GetDataPtr()) {
             mask_ptr = mask_indexer.GetDataPtr<bool>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -790,7 +790,7 @@ void RayCastCPU
         }
         if (index_indexer.GetDataPtr()) {
             index_ptr = index_indexer.GetDataPtr<int64_t>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -799,7 +799,7 @@ void RayCastCPU
         }
         if (interp_ratio_indexer.GetDataPtr()) {
             interp_ratio_ptr = interp_ratio_indexer.GetDataPtr<float>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -809,7 +809,7 @@ void RayCastCPU
         if (interp_ratio_dx_indexer.GetDataPtr()) {
             interp_ratio_dx_ptr =
                     interp_ratio_dx_indexer.GetDataPtr<float>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -819,7 +819,7 @@ void RayCastCPU
         if (interp_ratio_dy_indexer.GetDataPtr()) {
             interp_ratio_dy_ptr =
                     interp_ratio_dy_indexer.GetDataPtr<float>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -829,7 +829,7 @@ void RayCastCPU
         if (interp_ratio_dz_indexer.GetDataPtr()) {
             interp_ratio_dz_ptr =
                     interp_ratio_dz_indexer.GetDataPtr<float>(x, y);
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #pragma unroll
 #endif
             for (int i = 0; i < 8; ++i) {
@@ -1026,13 +1026,13 @@ void RayCastCPU
         }  // surface-found
     });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::cuda::Synchronize();
 #endif
 }
 
 template <typename tsdf_t, typename weight_t, typename color_t>
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ExtractPointCloudCUDA
 #else
 void ExtractPointCloudCPU
@@ -1085,7 +1085,7 @@ void ExtractPointCloudCPU
     index_t n = n_blocks * resolution3;
 
     // Output
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::Tensor count(std::vector<index_t>{0}, {1}, core::Int32,
                        block_keys.GetDevice());
     index_t* count_ptr = count.GetDataPtr<index_t>();
@@ -1139,7 +1139,7 @@ void ExtractPointCloudCPU
             }
         });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
         valid_size = count[0].Item<index_t>();
         count[0] = 0;
 #else
@@ -1274,7 +1274,7 @@ void ExtractPointCloudCPU
         }
     });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     index_t total_count = count.Item<index_t>();
 #else
     index_t total_count = (*count_ptr).load();
@@ -1289,7 +1289,7 @@ void ExtractPointCloudCPU
 }
 
 template <typename tsdf_t, typename weight_t, typename color_t>
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ExtractTriangleMeshCUDA
 #else
 void ExtractTriangleMeshCPU
@@ -1434,7 +1434,7 @@ void ExtractTriangleMeshCPU
     });
 
     // Pass 1: determine valid number of vertices (if not preset)
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     core::Tensor count(std::vector<index_t>{0}, {}, core::Int32, device);
 
     index_t* count_ptr = count.GetDataPtr<index_t>();
@@ -1473,7 +1473,7 @@ void ExtractTriangleMeshCPU
             }
         });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
         vertex_count = count.Item<index_t>();
 #else
         vertex_count = (*count_ptr).load();
@@ -1495,7 +1495,7 @@ void ExtractTriangleMeshCPU
     ArrayIndexer block_keys_indexer(block_keys, 1);
     ArrayIndexer vertex_indexer(vertices, 1);
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     count = core::Tensor(std::vector<index_t>{0}, {}, core::Int32, device);
     count_ptr = count.GetDataPtr<index_t>();
 #else
@@ -1620,7 +1620,7 @@ void ExtractTriangleMeshCPU
     triangles = core::Tensor({triangle_count, 3}, core::Int32, device);
     ArrayIndexer triangle_indexer(triangles, 1);
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     count = core::Tensor(std::vector<index_t>{0}, {}, core::Int32, device);
     count_ptr = count.GetDataPtr<index_t>();
 #else
@@ -1678,7 +1678,7 @@ void ExtractTriangleMeshCPU
         }
     });
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     triangle_count = count.Item<index_t>();
 #else
     triangle_count = (*count_ptr).load();
diff --git a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt
index a766715e6..4140f0e1e 100644
--- a/cpp/open3d/t/pipelines/kernel/CMakeLists.txt
+++ b/cpp/open3d/t/pipelines/kernel/CMakeLists.txt
@@ -22,6 +22,22 @@ if (BUILD_CUDA_MODULE)
     )
 endif()
 
+if (WITH_ROCM)
+    include(Hipify)
+
+    get_target_property(SOURCE_LIST tpipelines_kernel SOURCES)
+    get_hipified_list("${SOURCE_LIST}" HIPIFIED_SOURCE_LIST)
+
+    foreach(src IN LISTS HIPIFIED_SOURCE_LIST)
+        get_filename_component(ext ${src} EXT)
+        if(ext STREQUAL ".cu")
+            set_source_files_properties(${src} PROPERTIES LANGUAGE HIP)
+        endif()
+    endforeach()
+
+    set_property(TARGET tpipelines_kernel PROPERTY SOURCES ${HIPIFIED_SOURCE_LIST})
+endif()
+
 open3d_show_and_abort_on_warning(tpipelines_kernel)
 open3d_set_global_properties(tpipelines_kernel)
 # The kernels are used in the unit tests, so they cannot be hidden for now.
diff --git a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h
index bd00f29fb..14504f7ee 100644
--- a/cpp/open3d/t/pipelines/kernel/FeatureImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/FeatureImpl.h
@@ -16,7 +16,7 @@ namespace t {
 namespace pipelines {
 namespace kernel {
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 using std::max;
 using std::min;
 #endif
@@ -104,7 +104,7 @@ OPEN3D_HOST_DEVICE void UpdateSPFHFeature(const scalar_t *feature,
     spfh[idx * 33 + h_index3 + 22] += hist_incr;
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void ComputeFPFHFeatureCUDA
 #else
 void ComputeFPFHFeatureCPU
diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
index e158809f6..aa65f5082 100644
--- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
@@ -13,7 +13,7 @@ namespace open3d {
 namespace t {
 namespace pipelines {
 namespace kernel {
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void FillInRigidAlignmentTermCUDA
 #else
 void FillInRigidAlignmentTermCPU
@@ -127,7 +127,7 @@ void FillInRigidAlignmentTermCPU
     Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void FillInSLACAlignmentTermCUDA
 #else
 void FillInSLACAlignmentTermCPU
@@ -247,7 +247,7 @@ void FillInSLACAlignmentTermCPU
                 }
 
         // Not optimized; Switch to reduction if necessary.
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
                 for (int ki = 0; ki < 60; ++ki) {
                     for (int kj = 0; kj < 60; ++kj) {
                         float AtA_ij = J[ki] * J[kj];
@@ -274,7 +274,7 @@ void FillInSLACAlignmentTermCPU
             });
 }
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 void FillInSLACRegularizerTermCUDA
 #else
 void FillInSLACRegularizerTermCPU
@@ -413,7 +413,7 @@ void FillInSLACRegularizerTermCPU
                         int offset_idx_i = 3 * idx_i + 6 * n_frags;
                         int offset_idx_k = 3 * idx_k + 6 * n_frags;
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
                         // Update residual
                         atomicAdd(residual_ptr,
                                   weight * (local_r[0] * local_r[0] +
diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h
index 0dbe0d97f..63e0e84ea 100644
--- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryJacobianImpl.h
@@ -20,7 +20,7 @@ namespace odometry {
 using t::geometry::kernel::NDArrayIndexer;
 using t::geometry::kernel::TransformIndexer;
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 using std::abs;
 using std::isnan;
 using std::max;
diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h
index 3f08ca4c8..533ad6ab0 100644
--- a/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/RegistrationImpl.h
@@ -17,7 +17,7 @@
 #include "open3d/t/pipelines/kernel/TransformationConverter.h"
 #include "open3d/t/pipelines/registration/RobustKernel.h"
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 using std::abs;
 #endif
 
@@ -187,21 +187,23 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianPointToPlane(
     return true;
 }
 
-template bool GetJacobianPointToPlane(int64_t workload_idx,
-                                      const float *source_points_ptr,
-                                      const float *target_points_ptr,
-                                      const float *target_normals_ptr,
-                                      const int64_t *correspondence_indices,
-                                      float *J_ij,
-                                      float &r);
-
-template bool GetJacobianPointToPlane(int64_t workload_idx,
-                                      const double *source_points_ptr,
-                                      const double *target_points_ptr,
-                                      const double *target_normals_ptr,
-                                      const int64_t *correspondence_indices,
-                                      double *J_ij,
-                                      double &r);
+template OPEN3D_HOST_DEVICE bool GetJacobianPointToPlane(
+        int64_t workload_idx,
+        const float *source_points_ptr,
+        const float *target_points_ptr,
+        const float *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        float *J_ij,
+        float &r);
+
+template OPEN3D_HOST_DEVICE bool GetJacobianPointToPlane(
+        int64_t workload_idx,
+        const double *source_points_ptr,
+        const double *target_points_ptr,
+        const double *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        double *J_ij,
+        double &r);
 
 template <typename scalar_t>
 OPEN3D_HOST_DEVICE inline bool GetJacobianColoredICP(
@@ -285,35 +287,37 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianColoredICP(
     return true;
 }
 
-template bool GetJacobianColoredICP(const int64_t workload_idx,
-                                    const float *source_points_ptr,
-                                    const float *source_colors_ptr,
-                                    const float *target_points_ptr,
-                                    const float *target_normals_ptr,
-                                    const float *target_colors_ptr,
-                                    const float *target_color_gradients_ptr,
-                                    const int64_t *correspondence_indices,
-                                    const float &sqrt_lambda_geometric,
-                                    const float &sqrt_lambda_photometric,
-                                    float *J_G,
-                                    float *J_I,
-                                    float &r_G,
-                                    float &r_I);
-
-template bool GetJacobianColoredICP(const int64_t workload_idx,
-                                    const double *source_points_ptr,
-                                    const double *source_colors_ptr,
-                                    const double *target_points_ptr,
-                                    const double *target_normals_ptr,
-                                    const double *target_colors_ptr,
-                                    const double *target_color_gradients_ptr,
-                                    const int64_t *correspondence_indices,
-                                    const double &sqrt_lambda_geometric,
-                                    const double &sqrt_lambda_photometric,
-                                    double *J_G,
-                                    double *J_I,
-                                    double &r_G,
-                                    double &r_I);
+template OPEN3D_HOST_DEVICE bool GetJacobianColoredICP(
+        const int64_t workload_idx,
+        const float *source_points_ptr,
+        const float *source_colors_ptr,
+        const float *target_points_ptr,
+        const float *target_normals_ptr,
+        const float *target_colors_ptr,
+        const float *target_color_gradients_ptr,
+        const int64_t *correspondence_indices,
+        const float &sqrt_lambda_geometric,
+        const float &sqrt_lambda_photometric,
+        float *J_G,
+        float *J_I,
+        float &r_G,
+        float &r_I);
+
+template OPEN3D_HOST_DEVICE bool GetJacobianColoredICP(
+        const int64_t workload_idx,
+        const double *source_points_ptr,
+        const double *source_colors_ptr,
+        const double *target_points_ptr,
+        const double *target_normals_ptr,
+        const double *target_colors_ptr,
+        const double *target_color_gradients_ptr,
+        const int64_t *correspondence_indices,
+        const double &sqrt_lambda_geometric,
+        const double &sqrt_lambda_photometric,
+        double *J_G,
+        double *J_I,
+        double &r_G,
+        double &r_I);
 
 template <typename scalar_t>
 OPEN3D_HOST_DEVICE inline void PreComputeForDopplerICP(
@@ -333,17 +337,19 @@ OPEN3D_HOST_DEVICE inline void PreComputeForDopplerICP(
     core::linalg::kernel::matmul3x3_3x1(R_S_to_V, v_s_in_V, v_s_in_S);
 }
 
-template void PreComputeForDopplerICP(const float *R_S_to_V,
-                                      const float *r_v_to_s_in_V,
-                                      const float *w_v_in_V,
-                                      const float *v_v_in_V,
-                                      float *v_s_in_S);
+template OPEN3D_HOST_DEVICE void PreComputeForDopplerICP(
+        const float *R_S_to_V,
+        const float *r_v_to_s_in_V,
+        const float *w_v_in_V,
+        const float *v_v_in_V,
+        float *v_s_in_S);
 
-template void PreComputeForDopplerICP(const double *R_S_to_V,
-                                      const double *r_v_to_s_in_V,
-                                      const double *w_v_in_V,
-                                      const double *v_v_in_V,
-                                      double *v_s_in_S);
+template OPEN3D_HOST_DEVICE void PreComputeForDopplerICP(
+        const double *R_S_to_V,
+        const double *r_v_to_s_in_V,
+        const double *w_v_in_V,
+        const double *v_v_in_V,
+        double *v_s_in_S);
 
 template <typename scalar_t>
 OPEN3D_HOST_DEVICE inline bool GetJacobianDopplerICP(
@@ -435,45 +441,47 @@ OPEN3D_HOST_DEVICE inline bool GetJacobianDopplerICP(
     return true;
 }
 
-template bool GetJacobianDopplerICP(const int64_t workload_idx,
-                                    const float *source_points_ptr,
-                                    const float *source_dopplers_ptr,
-                                    const float *source_directions_ptr,
-                                    const float *target_points_ptr,
-                                    const float *target_normals_ptr,
-                                    const int64_t *correspondence_indices,
-                                    const float *R_S_to_V,
-                                    const float *r_v_to_s_in_V,
-                                    const float *v_s_in_S,
-                                    const bool reject_dynamic_outliers,
-                                    const float doppler_outlier_threshold,
-                                    const float &sqrt_lambda_geometric,
-                                    const float &sqrt_lambda_doppler,
-                                    const float &sqrt_lambda_doppler_by_dt,
-                                    float *J_G,
-                                    float *J_D,
-                                    float &r_G,
-                                    float &r_D);
-
-template bool GetJacobianDopplerICP(const int64_t workload_idx,
-                                    const double *source_points_ptr,
-                                    const double *source_dopplers_ptr,
-                                    const double *source_directions_ptr,
-                                    const double *target_points_ptr,
-                                    const double *target_normals_ptr,
-                                    const int64_t *correspondence_indices,
-                                    const double *R_S_to_V,
-                                    const double *r_v_to_s_in_V,
-                                    const double *v_s_in_S,
-                                    const bool reject_dynamic_outliers,
-                                    const double doppler_outlier_threshold,
-                                    const double &sqrt_lambda_geometric,
-                                    const double &sqrt_lambda_doppler,
-                                    const double &sqrt_lambda_doppler_by_dt,
-                                    double *J_G,
-                                    double *J_D,
-                                    double &r_G,
-                                    double &r_D);
+template OPEN3D_HOST_DEVICE bool GetJacobianDopplerICP(
+        const int64_t workload_idx,
+        const float *source_points_ptr,
+        const float *source_dopplers_ptr,
+        const float *source_directions_ptr,
+        const float *target_points_ptr,
+        const float *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        const float *R_S_to_V,
+        const float *r_v_to_s_in_V,
+        const float *v_s_in_S,
+        const bool reject_dynamic_outliers,
+        const float doppler_outlier_threshold,
+        const float &sqrt_lambda_geometric,
+        const float &sqrt_lambda_doppler,
+        const float &sqrt_lambda_doppler_by_dt,
+        float *J_G,
+        float *J_D,
+        float &r_G,
+        float &r_D);
+
+template OPEN3D_HOST_DEVICE bool GetJacobianDopplerICP(
+        const int64_t workload_idx,
+        const double *source_points_ptr,
+        const double *source_dopplers_ptr,
+        const double *source_directions_ptr,
+        const double *target_points_ptr,
+        const double *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        const double *R_S_to_V,
+        const double *r_v_to_s_in_V,
+        const double *v_s_in_S,
+        const bool reject_dynamic_outliers,
+        const double doppler_outlier_threshold,
+        const double &sqrt_lambda_geometric,
+        const double &sqrt_lambda_doppler,
+        const double &sqrt_lambda_doppler_by_dt,
+        double *J_G,
+        double *J_D,
+        double &r_G,
+        double &r_D);
 
 template <typename scalar_t>
 OPEN3D_HOST_DEVICE inline bool GetInformationJacobians(
@@ -507,19 +515,21 @@ OPEN3D_HOST_DEVICE inline bool GetInformationJacobians(
     return true;
 }
 
-template bool GetInformationJacobians(int64_t workload_idx,
-                                      const float *target_points_ptr,
-                                      const int64_t *correspondence_indices,
-                                      float *jacobian_x,
-                                      float *jacobian_y,
-                                      float *jacobian_z);
-
-template bool GetInformationJacobians(int64_t workload_idx,
-                                      const double *target_points_ptr,
-                                      const int64_t *correspondence_indices,
-                                      double *jacobian_x,
-                                      double *jacobian_y,
-                                      double *jacobian_z);
+template OPEN3D_HOST_DEVICE bool GetInformationJacobians(
+        int64_t workload_idx,
+        const float *target_points_ptr,
+        const int64_t *correspondence_indices,
+        float *jacobian_x,
+        float *jacobian_y,
+        float *jacobian_z);
+
+template OPEN3D_HOST_DEVICE bool GetInformationJacobians(
+        int64_t workload_idx,
+        const double *target_points_ptr,
+        const int64_t *correspondence_indices,
+        double *jacobian_x,
+        double *jacobian_y,
+        double *jacobian_z);
 
 }  // namespace kernel
 }  // namespace pipelines
diff --git a/cpp/open3d/utility/Logging.h b/cpp/open3d/utility/Logging.h
index fb4a21c86..c881dd9a0 100644
--- a/cpp/open3d/utility/Logging.h
+++ b/cpp/open3d/utility/Logging.h
@@ -12,7 +12,8 @@
 #include <string>
 
 // NVCC does not support deprecated attribute on Windows prior to v11.
-#if defined(__CUDACC__) && defined(_MSC_VER) && __CUDACC_VER_MAJOR__ < 11
+#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(_MSC_VER) && \
+        __CUDACC_VER_MAJOR__ < 11
 #ifndef FMT_DEPRECATED
 #define FMT_DEPRECATED
 #endif
diff --git a/cpp/open3d/utility/MiniVec.h b/cpp/open3d/utility/MiniVec.h
index 17eca20b1..e766b6163 100644
--- a/cpp/open3d/utility/MiniVec.h
+++ b/cpp/open3d/utility/MiniVec.h
@@ -11,6 +11,8 @@
 
 #ifdef __CUDACC__
 #define FN_SPECIFIERS inline __host__ __device__
+#elif __HIPCC__
+#define FN_SPECIFIERS inline __host__ __device__
 #else
 #define FN_SPECIFIERS inline
 #endif
diff --git a/cpp/pybind/_build_config.py.in b/cpp/pybind/_build_config.py.in
index 7ea14135f..12ea722d9 100644
--- a/cpp/pybind/_build_config.py.in
+++ b/cpp/pybind/_build_config.py.in
@@ -18,5 +18,6 @@ _build_config = {
     "Tensorflow_VERSION" : "@Tensorflow_VERSION@",
     "Pytorch_VERSION" : "@Pytorch_VERSION@",
     "Paddle_VERSION" : "@Paddle_VERSION@",
-    "WITH_OPENMP" : $<IF:$<BOOL:@WITH_OPENMP@>,True,False>
+    "WITH_OPENMP" : $<IF:$<BOOL:@WITH_OPENMP@>,True,False>,
+    "WITH_ROCM" : $<IF:$<BOOL:@WITH_ROCM@>,True,False>
 }
diff --git a/cpp/pybind/t/geometry/geometry.h b/cpp/pybind/t/geometry/geometry.h
index c8719bcdf..8888043af 100644
--- a/cpp/pybind/t/geometry/geometry.h
+++ b/cpp/pybind/t/geometry/geometry.h
@@ -38,7 +38,6 @@ void pybind_geometry(py::module& m);
 void pybind_geometry_class(py::module& m);
 void pybind_drawable_geometry_class(py::module& m);
 void pybind_tensormap(py::module& m);
-void pybind_image(py::module& m);
 void pybind_pointcloud(py::module& m);
 void pybind_lineset(py::module& m);
 void pybind_trianglemesh(py::module& m);
diff --git a/hipify_custom_map.json b/hipify_custom_map.json
new file mode 100644
index 000000000..5ba507997
--- /dev/null
+++ b/hipify_custom_map.json
@@ -0,0 +1,43 @@
+{
+    "custom_map": {
+        "cub::" : "hipcub::",
+        "cuStreamGetCtx": "hipStreamGetCtx",
+
+        "cusolverStatus_t": "hipsolverStatus_t",
+        "cusolverDnHandle_t": "hipsolverDnHandle_t",
+        "CUSOLVER_STATUS_SUCCESS": "HIPSOLVER_STATUS_SUCCESS",
+        "CUSOLVER_STATUS_INTERNAL_ERROR": "HIPSOLVER_STATUS_INTERNAL_ERROR",
+
+        "open3d/core/linalg/BlasWrapper.h": "open3d/core/linalg/BlasWrapper_hip.h",
+        "open3d/core/linalg/LinalgUtils.h": "open3d/core/linalg/LinalgUtils_hip.h",
+        "open3d/core/linalg/LapackWrapper.h": "open3d/core/linalg/LapackWrapper_hip.h",
+        "open3d/ml/Helper.h": "open3d/ml/Helper_hip.h",
+
+        "open3d/ml/impl/continuous_conv/ContinuousConvCUDAKernels.h": "open3d/ml/impl/continuous_conv/ContinuousConvHIPKernels.h",
+        "open3d/ml/impl/sparse_conv/SparseConvCUDAKernels.h": "open3d/ml/impl/sparse_conv/SparseConvHIPKernels.h",
+
+        "open3d/ml/impl/continuous_conv/ContinuousConv.cuh": "open3d/ml/impl/continuous_conv/ContinuousConv_hip.cuh",
+        "open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter_hip.cuh",
+        "open3d/ml/impl/continuous_conv/ContinuousConvTranspose.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvTranspose_hip.cuh",
+        "open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.cuh": "open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter_hip.cuh",
+        "open3d/ml/impl/sparse_conv/SparseConv.cuh": "open3d/ml/impl/sparse_conv/SparseConv_hip.cuh",
+        "open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.cuh": "open3d/ml/impl/sparse_conv/SparseConvBackpropFilter_hip.cuh",
+        "open3d/ml/impl/sparse_conv/SparseConvTranspose.cuh": "open3d/ml/impl/sparse_conv/SparseConvTranspose_hip.cuh",
+        "open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.cuh": "open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter_hip.cuh",
+        
+        "open3d/ml/impl/misc/ReduceSubarraysSum.cuh": "open3d/ml/impl/misc/ReduceSubarraysSum_hip.cuh",
+        "open3d/ml/impl/misc/RaggedToDense.cuh": "open3d/ml/impl/misc/RaggedToDense_hip.cuh",
+        "open3d/ml/impl/misc/Voxelize.cuh": "open3d/ml/impl/misc/Voxelize_hip.cuh",
+        "open3d/ml/impl/misc/InvertNeighborsList.cuh": "open3d/ml/impl/misc/InvertNeighborsList_hip.cuh",
+        "open3d/ml/contrib/PointSampling.cuh": "open3d/ml/contrib/PointSampling_hip.cuh",
+        "open3d/ml/contrib/TrilinearDevoxelize.cuh": "open3d/ml/contrib/TrilinearDevoxelize_hip.cuh",
+        "open3d/ml/contrib/BallQuery.cuh": "open3d/ml/contrib/BallQuery_hip.cuh",
+        "open3d/ml/contrib/InterpolatePoints.cuh": "open3d/ml/contrib/InterpolatePoints_hip.cuh",
+        "open3d/core/nns/FixedRadiusSearchImpl.cuh": "open3d/core/nns/FixedRadiusSearchImpl_hip.cuh",
+        "open3d/core/nns/kernel/BlockSelectImpl.cuh": "open3d/core/nns/kernel/BlockSelectImpl_hip.cuh",
+        "open3d/core/nns/kernel/BlockSelect.cuh": "open3d/core/nns/kernel/BlockSelect_hip.cuh",
+        "open3d/core/nns/kernel/DistancesUtils.cuh": "open3d/core/nns/kernel/DistancesUtils_hip.cuh",
+        "open3d/core/nns/kernel/L2Select.cuh": "open3d/core/nns/kernel/L2Select_hip.cuh",
+        "open3d/core/nns/KnnSearchImpl.cuh": "open3d/core/nns/KnnSearchImpl_hip.cuh"
+    }
+}
\ No newline at end of file
diff --git a/python/open3d/ml/paddle/ops/__init__.py b/python/open3d/ml/paddle/ops/__init__.py
index 2e9336fd7..0760225c6 100644
--- a/python/open3d/ml/paddle/ops/__init__.py
+++ b/python/open3d/ml/paddle/ops/__init__.py
@@ -32,14 +32,25 @@
 _lib_arch = ('cpu',)
 if _build_config["BUILD_CUDA_MODULE"] and _paddle.device.cuda.device_count(
 ) >= 1:
-    if _paddle.version.cuda() == _build_config["CUDA_VERSION"]:
-        _lib_arch = ('cuda', 'cpu')
+    if _build_config["WITH_ROCM"]:
+        if _paddle.is_compiled_with_rocm():
+            # NOTE(beinggod): Skip rocm version check.
+            _lib_arch = ('cuda', 'cpu')
+        else:
+            print("Warning: Open3D was built with ROCm but "
+                  "Paddle was not built with ROCm. Falling back to CPU for now."
+                  "Otherwise, install Paddle with ROCm{}.")
     else:
-        print("Warning: Open3D was built with CUDA {} but"
-              "Paddle was built with CUDA {}. Falling back to CPU for now."
-              "Otherwise, install Paddle with CUDA {}.".format(
-                  _build_config["CUDA_VERSION"], _paddle.version.cuda(),
-                  _build_config["CUDA_VERSION"]))
+        # CUDA
+        if _paddle.version.cuda() == _build_config["CUDA_VERSION"]:
+            _lib_arch = ('cuda', 'cpu')
+        else:
+            print("Warning: Open3D was built with CUDA {} but "
+                  "Paddle was built with CUDA {}. Falling back to CPU for now."
+                  "Otherwise, install Paddle with CUDA {}.".format(
+                      _build_config["CUDA_VERSION"], _paddle.version.cuda(),
+                      _build_config["CUDA_VERSION"]))
+
 _lib_path.extend([
     _os.path.join(_package_root, la,
                   'open3d_paddle_ops' + _lib_suffix + _lib_ext)
diff --git a/python/requirements_style.txt b/python/requirements_style.txt
index f20e372b9..9e2a1639b 100644
--- a/python/requirements_style.txt
+++ b/python/requirements_style.txt
@@ -1,3 +1,4 @@
 clang-format==10.0.1.1
 yapf==0.30.0
 nbformat==5.7.0
+pre-commit==4.5.1
diff --git a/python/test/ml_ops/test_cublas.py b/python/test/ml_ops/test_cublas.py
index 075823ec9..daf5ed399 100644
--- a/python/test/ml_ops/test_cublas.py
+++ b/python/test/ml_ops/test_cublas.py
@@ -19,7 +19,7 @@
 @mltest.parametrize.ml_gpu_only
 def test_cublas_matmul(ml):
     # This test checks if calling cublas functionality from open3d and the ml framework works.
-    os.environ["NVIDIA_TF32_OVERRIDE"] = 0
+    os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 
     rng = np.random.RandomState(123)
 

From 720ada4b9f3af83d33ae09ce51ac617f15e6af94 Mon Sep 17 00:00:00 2001
From: beinggod <beinggod@foxmail.com>
Date: Tue, 3 Feb 2026 16:11:03 +0800
Subject: [PATCH 2/3] fix: test_cublas.py and update requirements_style.txt

Signed-off-by: beinggod <beinggod@foxmail.com>
---
 python/requirements_style.txt     | 1 +
 python/test/ml_ops/test_cublas.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/requirements_style.txt b/python/requirements_style.txt
index f20e372b9..7b7f5fa74 100644
--- a/python/requirements_style.txt
+++ b/python/requirements_style.txt
@@ -1,3 +1,4 @@
 clang-format==10.0.1.1
 yapf==0.30.0
 nbformat==5.7.0
+pre-commit=4.5.1
diff --git a/python/test/ml_ops/test_cublas.py b/python/test/ml_ops/test_cublas.py
index 075823ec9..daf5ed399 100644
--- a/python/test/ml_ops/test_cublas.py
+++ b/python/test/ml_ops/test_cublas.py
@@ -19,7 +19,7 @@
 @mltest.parametrize.ml_gpu_only
 def test_cublas_matmul(ml):
     # This test checks if calling cublas functionality from open3d and the ml framework works.
-    os.environ["NVIDIA_TF32_OVERRIDE"] = 0
+    os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 
     rng = np.random.RandomState(123)
 

From bad288d59bf0ba09f7ca736443b061157c43bb7f Mon Sep 17 00:00:00 2001
From: beinggod <beinggod@foxmail.com>
Date: Wed, 4 Feb 2026 14:03:34 +0800
Subject: [PATCH 3/3] fix tbb cmake

Signed-off-by: beinggod <beinggod@foxmail.com>
---
 3rdparty/mkl/tbb.cmake | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake
index 8c4fd6fa5..12705be7e 100644
--- a/3rdparty/mkl/tbb.cmake
+++ b/3rdparty/mkl/tbb.cmake
@@ -23,9 +23,21 @@ set(STATIC_TBB_LIBRARIES tbb_static tbbmalloc_static)
 
 find_package(Git QUIET REQUIRED)
 
+set(TBB_CMAKE_ARGS 
+    -DCMAKE_INSTALL_PREFIX=${MKL_INSTALL_PREFIX}
+    -DSTATIC_WINDOWS_RUNTIME=${STATIC_WINDOWS_RUNTIME}
+    -DTBB_BUILD_TBBMALLOC=ON
+    -DTBB_BUILD_TBBMALLOC_PROXYC=OFF
+    -DTBB_BUILD_SHARED=OFF
+    -DTBB_BUILD_STATIC=ON
+    -DTBB_BUILD_TESTS=OFF
+    -DTBB_INSTALL_ARCHIVE_DIR=${Open3D_INSTALL_LIB_DIR}
+    -DTBB_CMAKE_PACKAGE_INSTALL_DIR=${Open3D_INSTALL_LIB_DIR}/cmake/tbb
+    ${ExternalProject_CMAKE_ARGS_hidden}
+)
 if (WITH_ROCM)
     # NOTE(beinggod): Set CMAKE_CXX_COMPILER_ID to GNU to avoid link libunwind and libc++.
-    set(ExternalProject_CMAKE_ARGS_hidden "${ExternalProject_CMAKE_ARGS_hidden} -DCMAKE_CXX_COMPILER_ID=GNU")
+    list(APPEND TBB_CMAKE_ARGS -DCMAKE_CXX_COMPILER_ID=GNU)
 endif()
 
 ExternalProject_Add(
@@ -39,16 +51,7 @@ ExternalProject_Add(
     COMMAND ${GIT_EXECUTABLE} apply --ignore-space-change --ignore-whitespace
         ${CMAKE_CURRENT_LIST_DIR}/0001-Allow-selecttion-of-static-dynamic-MSVC-runtime.patch
     CMAKE_ARGS
-        -DCMAKE_INSTALL_PREFIX=${MKL_INSTALL_PREFIX}
-        -DSTATIC_WINDOWS_RUNTIME=${STATIC_WINDOWS_RUNTIME}
-        -DTBB_BUILD_TBBMALLOC=ON
-        -DTBB_BUILD_TBBMALLOC_PROXYC=OFF
-        -DTBB_BUILD_SHARED=OFF
-        -DTBB_BUILD_STATIC=ON
-        -DTBB_BUILD_TESTS=OFF
-        -DTBB_INSTALL_ARCHIVE_DIR=${Open3D_INSTALL_LIB_DIR}
-        -DTBB_CMAKE_PACKAGE_INSTALL_DIR=${Open3D_INSTALL_LIB_DIR}/cmake/tbb
-        ${ExternalProject_CMAKE_ARGS_hidden}
+        ${TBB_CMAKE_ARGS}
     BUILD_BYPRODUCTS
         ${STATIC_TBB_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tbb_static${CMAKE_STATIC_LIBRARY_SUFFIX}
         ${STATIC_TBB_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tbbmalloc_static${CMAKE_STATIC_LIBRARY_SUFFIX}