diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 4cf949ba5ab..351b920e805 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -17,13 +17,17 @@ namespace api { PackedDimInfo::PackedDimInfo( const int32_t dim, const int32_t dim_block_size, + const int32_t dim_align, const int32_t outer_dim, const int32_t outer_dim_block_size, + const int32_t outer_dim_align, const bool is_block_transposed) : packed_dim(dim), packed_dim_block_size(dim_block_size), + packed_dim_align(dim_align), outer_packed_dim(outer_dim), outer_packed_dim_block_size(outer_dim_block_size), + outer_packed_dim_align(outer_dim_align), block_transposed(is_block_transposed), block_numel(packed_dim_block_size * outer_packed_dim_block_size) { // Packed dims must be different @@ -33,19 +37,97 @@ PackedDimInfo::PackedDimInfo( PackedDimInfo calculate_packed_dim_info( const utils::GPUMemoryLayout memory_layout, const utils::StorageType storage_type) { - const int32_t packed_dim = utils::to_packed_dim(memory_layout); - const int32_t outer_packed_dim = - utils::to_outer_packed_dim(memory_layout); - const int32_t packed_dim_block_size = - utils::to_packed_dim_block_size(memory_layout, storage_type); - const int32_t outer_packed_dim_block_size = - utils::to_outer_packed_dim_block_size(memory_layout); - const bool is_block_transposed = - utils::is_block_transposed_layout(memory_layout); - - const int32_t block_numel = - packed_dim_block_size * outer_packed_dim_block_size; - if (storage_type != utils::kBuffer) { + const bool is_buffer = storage_type == utils::kBuffer; + + PackedDimInfo packed_dim_info(0, 1, 1, 1, 1, 1, false); + switch (memory_layout) { + case utils::kWidthPacked: + packed_dim_info = PackedDimInfo( + /*dim=*/0, + /*dim_block_size=*/is_buffer ? 1 : 4, + /*dim_align=*/is_buffer ? 1 : 4, + /*outer_dim=*/1, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/false); + break; + case utils::kHeightPacked: + packed_dim_info = PackedDimInfo( + /*dim=*/1, + /*dim_block_size=*/is_buffer ? 1 : 4, + /*dim_align=*/is_buffer ? 1 : 4, + /*outer_dim=*/0, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/false); + break; + case utils::kChannelsPacked: + packed_dim_info = PackedDimInfo( + /*dim=*/2, + /*dim_block_size=*/is_buffer ? 1 : 4, + /*dim_align=*/is_buffer ? 1 : 4, + /*outer_dim=*/0, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/false); + break; + case utils::kPackedInt8_4W: + packed_dim_info = PackedDimInfo( + /*dim=*/0, + /*dim_block_size=*/is_buffer ? 4 : 16, + /*dim_align=*/is_buffer ? 4 : 16, + /*outer_dim=*/1, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/false); + break; + case utils::kPackedInt8_4C: + packed_dim_info = PackedDimInfo( + /*dim=*/2, + /*dim_block_size=*/is_buffer ? 4 : 16, + /*dim_align=*/is_buffer ? 4 : 16, + /*outer_dim=*/0, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/false); + break; + case utils::kPackedInt8_4W4C: + packed_dim_info = PackedDimInfo( + /*dim=*/2, + /*dim_block_size=*/4, + /*dim_align=*/4, + /*outer_dim=*/0, + /*outer_dim_block_size=*/4, + /*outer_dim_align=*/4, + /*is_block_transposed=*/false); + break; + case utils::kPackedInt8_4H4W: + packed_dim_info = PackedDimInfo( + /*dim=*/0, + /*dim_block_size=*/4, + /*dim_align=*/4, + /*outer_dim=*/1, + /*outer_dim_block_size=*/4, + /*outer_dim_align=*/4, + /*is_block_transposed=*/false); + break; + case utils::kPackedInt8_4C1W: + packed_dim_info = PackedDimInfo( + /*dim=*/2, + /*dim_block_size=*/is_buffer ? 4 : 16, + /*dim_align=*/is_buffer ? 4 : 16, + /*outer_dim=*/0, + /*outer_dim_block_size=*/1, + /*outer_dim_align=*/1, + /*is_block_transposed=*/true); + break; + default: + VK_THROW("Unknown GPUMemoryLayout"); + } + + if (!is_buffer) { + const int32_t block_numel = packed_dim_info.packed_dim_block_size * + packed_dim_info.outer_packed_dim_block_size; if (is_packed_int8_layout(memory_layout)) { VK_CHECK_COND(block_numel == 16); } else { @@ -53,12 +135,7 @@ PackedDimInfo calculate_packed_dim_info( } } - return PackedDimInfo( - packed_dim, - packed_dim_block_size, - outer_packed_dim, - outer_packed_dim_block_size, - is_block_transposed); + return packed_dim_info; } /* @@ -297,7 +374,8 @@ utils::ivec4 flip_and_unsqueeze_ivec4( * for GPU storage in the following ways: * * 1. The dimensionality of the tensor will be padded to a multiple of 4. - * 2. The size of the packed dimension will be padded to a multiple of 4. + * 2. The size of the packed dimension will be padded to a multiple of the + * packed dimension's alignment value. * * The "packed dimension" is determined based on the utils::GPUMemoryLayout * argument. @@ -317,23 +395,23 @@ std::vector calculate_padded_sizes( padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); } - // Pad the packed dim to the block size - if (packed_dim_info.packed_dim_block_size > 1) { + // Pad the packed dim to the alignment + if (packed_dim_info.packed_dim_align > 1) { const int64_t dim_offset = packed_dim_info.packed_dim + 1; const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up( padded_dim_size, - static_cast(packed_dim_info.packed_dim_block_size)); + static_cast(packed_dim_info.packed_dim_align)); } - // Also pad the outer packed dimension if it's different from the inner packed - // dimension and is marked as padded. - if (packed_dim_info.outer_packed_dim_block_size > 1) { + // Also pad the outer packed dimension if it has alignment > 1. + if (packed_dim_info.outer_packed_dim_align > 1) { const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1; const int64_t outer_padded_dim_size = utils::val_at(-outer_dim_offset, sizes); - padded_sizes.at(ndim_up4 - outer_dim_offset) = - utils::align_up_4(outer_padded_dim_size); + padded_sizes.at(ndim_up4 - outer_dim_offset) = utils::align_up( + outer_padded_dim_size, + static_cast(packed_dim_info.outer_packed_dim_align)); } return padded_sizes; diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 8341097b490..301666f45c6 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -67,6 +67,12 @@ struct PackedDimInfo { // In physical memory, the size of the packed dim is aligned to this size to // ensure that data for the packed dim aligns with texel/block boundaries. int32_t packed_dim_block_size; + // In physical memory, the size of the packed dimension will be aligned to be + // a multiple of this value. This value must be a multiple of the packed_dim's + // block size, and is selected for performance reasons i.e. to ensure loads + // along the packed dim are aligned to cache lines, or to enable performance + // optimizations in shaders, i.e. remove the need for bounds checking. + int32_t packed_dim_align; // For block-packed layouts, represents the second tensor dimension that forms // the "width" dimension of the MxN square that is kept contiguous in memory. // For non block-packed layouts, represent the dimension with the next lowest @@ -77,6 +83,8 @@ struct PackedDimInfo { // 4H4W, represents the "height" of the square block that is kept contiguous // in memory. int32_t outer_packed_dim_block_size; + // See packed_dim_align + int32_t outer_packed_dim_align; // Typically the blocks of the tensor will be arranged such that the inner // dim of the block (i.e. the packed dim) has the lowest stride, and the // outer dim of the block (i.e. the outer packed dim) has the next lowest @@ -94,8 +102,10 @@ struct PackedDimInfo { PackedDimInfo( const int32_t dim, const int32_t dim_block_size, + const int32_t dim_align, const int32_t outer_dim, const int32_t outer_dim_block_size, + const int32_t outer_dim_align, const bool is_block_transposed); }; diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index d2997019a8b..d2978f1d662 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -139,104 +139,6 @@ static constexpr GPUMemoryLayout kPackedInt8_4H4W = static constexpr GPUMemoryLayout kPackedInt8_4C1W = GPUMemoryLayout::TENSOR_PACKED_INT8_4C1W; -template -T to_packed_dim(const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - return 0; - case kHeightPacked: - return 1; - case kChannelsPacked: - return 2; - case kPackedInt8_4W: - return 0; - case kPackedInt8_4C: - return 2; - case kPackedInt8_4W4C: - return 2; - case kPackedInt8_4H4W: - return 0; - case kPackedInt8_4C1W: - return 2; - }; - // Should be unreachable - return 0; -} - -template -T to_outer_packed_dim(const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - return 1; - case kHeightPacked: - return 0; - case kChannelsPacked: - return 0; - case kPackedInt8_4W: - return 1; - case kPackedInt8_4C: - return 0; - case kPackedInt8_4W4C: - return 0; - case kPackedInt8_4H4W: - return 1; - case kPackedInt8_4C1W: - return 0; - }; - // Should be unreachable - return 1; -} - -template -T to_packed_dim_block_size( - const GPUMemoryLayout layout, - const StorageType storage) { - switch (layout) { - case kWidthPacked: - return storage == kBuffer ? 1 : 4; - case kHeightPacked: - return storage == kBuffer ? 1 : 4; - case kChannelsPacked: - return storage == kBuffer ? 1 : 4; - case kPackedInt8_4W: - return storage == kBuffer ? 4 : 16; - case kPackedInt8_4C: - return storage == kBuffer ? 4 : 16; - case kPackedInt8_4W4C: - return 4; - case kPackedInt8_4H4W: - return 4; - case kPackedInt8_4C1W: - return storage == kBuffer ? 4 : 16; - }; - // Should be unreachable - return 1; -} - -template -T to_outer_packed_dim_block_size(const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - return 1; - case kHeightPacked: - return 1; - case kChannelsPacked: - return 1; - case kPackedInt8_4W: - return 1; - case kPackedInt8_4C: - return 1; - case kPackedInt8_4W4C: - return 4; - case kPackedInt8_4H4W: - return 4; - case kPackedInt8_4C1W: - return 1; - }; - // Should be unreachable - return 1; -} - bool is_block_transposed_layout(const GPUMemoryLayout layout); bool is_packed_int8_layout(const GPUMemoryLayout layout);