From 7696369a1b18fb8cb122fb5eebba84d372142bcd Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Tue, 3 Feb 2026 13:27:12 -0800
Subject: [PATCH] [ET-VK] Add alignment fields to PackedDimInfo for padded size
 calculation

This change introduces separate alignment fields to PackedDimInfo, decoupling
the alignment used for padding tensor dimensions from the block size used for
packing.

Previously, `calculate_padded_sizes` used `packed_dim_block_size` and
`outer_packed_dim_block_size` directly to determine how much to pad tensor
dimensions. This works but limits flexibility - there are scenarios where we
want to pad dimensions to a larger alignment than the block size for
performance reasons, such as ensuring loads are aligned to cache lines or
removing the need for bounds checking in shaders.

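The new fields `packed_dim_align` and `outer_packed_dim_align` allow specifying
the alignment independently. For now, these are initialized to match the
corresponding block sizes, preserving existing behavior. Future changes can
set larger alignment values when beneficial for performance.

As part of this change, the per-layout lookup helpers in `StorageUtils.h`
(`to_packed_dim`, `to_outer_packed_dim`, `to_packed_dim_block_size` and
`to_outer_packed_dim_block_size`) are removed. `calculate_packed_dim_info` now
constructs the `PackedDimInfo` for each `GPUMemoryLayout` in a single switch
statement, and throws if the layout is not recognized.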

Authored with Claude.

Differential Revision: [D92196649](https://our.internmc.facebook.com/intern/diff/D92196649/)

[ghstack-poisoned]
---
 .../vulkan/runtime/api/containers/Tensor.cpp  | 134 ++++++++++++++----
 .../vulkan/runtime/api/containers/Tensor.h    |  10 ++
 backends/vulkan/runtime/utils/StorageUtils.h  |  98 -------------
 3 files changed, 116 insertions(+), 126 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 4cf949ba5ab..351b920e805 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -17,13 +17,17 @@ namespace api {
 PackedDimInfo::PackedDimInfo(
     const int32_t dim,
     const int32_t dim_block_size,
+    const int32_t dim_align,
     const int32_t outer_dim,
     const int32_t outer_dim_block_size,
+    const int32_t outer_dim_align,
     const bool is_block_transposed)
     : packed_dim(dim),
       packed_dim_block_size(dim_block_size),
+      packed_dim_align(dim_align),
       outer_packed_dim(outer_dim),
       outer_packed_dim_block_size(outer_dim_block_size),
+      outer_packed_dim_align(outer_dim_align),
       block_transposed(is_block_transposed),
       block_numel(packed_dim_block_size * outer_packed_dim_block_size) {
   // Packed dims must be different
@@ -33,19 +37,97 @@ PackedDimInfo::PackedDimInfo(
 PackedDimInfo calculate_packed_dim_info(
     const utils::GPUMemoryLayout memory_layout,
     const utils::StorageType storage_type) {
-  const int32_t packed_dim = utils::to_packed_dim<int32_t>(memory_layout);
-  const int32_t outer_packed_dim =
-      utils::to_outer_packed_dim<int32_t>(memory_layout);
-  const int32_t packed_dim_block_size =
-      utils::to_packed_dim_block_size<int32_t>(memory_layout, storage_type);
-  const int32_t outer_packed_dim_block_size =
-      utils::to_outer_packed_dim_block_size<int32_t>(memory_layout);
-  const bool is_block_transposed =
-      utils::is_block_transposed_layout(memory_layout);
-
-  const int32_t block_numel =
-      packed_dim_block_size * outer_packed_dim_block_size;
-  if (storage_type != utils::kBuffer) {
+  const bool is_buffer = storage_type == utils::kBuffer;
+
+  PackedDimInfo packed_dim_info(0, 1, 1, 1, 1, 1, false);
+  switch (memory_layout) {
+    case utils::kWidthPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kHeightPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/1,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kChannelsPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4C:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4W4C:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/4,
+          /*dim_align=*/4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/4,
+          /*outer_dim_align=*/4,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4H4W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/4,
+          /*dim_align=*/4,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/4,
+          /*outer_dim_align=*/4,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4C1W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/true);
+      break;
+    default:
+      VK_THROW("Unknown GPUMemoryLayout");
+  }
+
+  if (!is_buffer) {
+    const int32_t block_numel = packed_dim_info.packed_dim_block_size *
+        packed_dim_info.outer_packed_dim_block_size;
     if (is_packed_int8_layout(memory_layout)) {
       VK_CHECK_COND(block_numel == 16);
     } else {
@@ -53,12 +135,7 @@ PackedDimInfo calculate_packed_dim_info(
     }
   }
 
-  return PackedDimInfo(
-      packed_dim,
-      packed_dim_block_size,
-      outer_packed_dim,
-      outer_packed_dim_block_size,
-      is_block_transposed);
+  return packed_dim_info;
 }
 
 /*
@@ -297,7 +374,8 @@ utils::ivec4 flip_and_unsqueeze_ivec4(
  * for GPU storage in the following ways:
  *
  *   1. The dimensionality of the tensor will be padded to a multiple of 4.
- *   2. The size of the packed dimension will be padded to a multiple of 4.
+ *   2. The sizes of the packed dim and outer packed dim will be padded to a
+ *      multiple of their respective alignment values.
  *
  * The "packed dimension" is determined based on the utils::GPUMemoryLayout
  * argument.
@@ -317,23 +395,23 @@ std::vector<int64_t> calculate_padded_sizes(
     padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes);
   }
 
-  // Pad the packed dim to the block size
-  if (packed_dim_info.packed_dim_block_size > 1) {
+  // Pad the packed dim to the alignment
+  if (packed_dim_info.packed_dim_align > 1) {
     const int64_t dim_offset = packed_dim_info.packed_dim + 1;
     const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
     padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up(
         padded_dim_size,
-        static_cast<int64_t>(packed_dim_info.packed_dim_block_size));
+        static_cast<int64_t>(packed_dim_info.packed_dim_align));
   }
 
-  // Also pad the outer packed dimension if it's different from the inner packed
-  // dimension and is marked as padded.
-  if (packed_dim_info.outer_packed_dim_block_size > 1) {
+  // Also pad the outer packed dimension if it has alignment > 1.
+  if (packed_dim_info.outer_packed_dim_align > 1) {
     const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1;
     const int64_t outer_padded_dim_size =
         utils::val_at(-outer_dim_offset, sizes);
-    padded_sizes.at(ndim_up4 - outer_dim_offset) =
-        utils::align_up_4(outer_padded_dim_size);
+    padded_sizes.at(ndim_up4 - outer_dim_offset) = utils::align_up(
+        outer_padded_dim_size,
+        static_cast<int64_t>(packed_dim_info.outer_packed_dim_align));
   }
 
   return padded_sizes;
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index 8341097b490..301666f45c6 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -67,6 +67,12 @@ struct PackedDimInfo {
   // In physical memory, the size of the packed dim is aligned to this size to
   // ensure that data for the packed dim aligns with texel/block boundaries.
   int32_t packed_dim_block_size;
+  // In physical memory, the size of the packed dimension will be aligned to be
+  // a multiple of this value. This value must be a multiple of the packed_dim's
+  // block size, and is selected for performance reasons, e.g. to ensure loads
+  // along the packed dim are aligned to cache lines, or to enable performance
+  // optimizations in shaders, e.g. removing the need for bounds checking.
+  int32_t packed_dim_align;
   // For block-packed layouts, represents the second tensor dimension that forms
   // the "width" dimension of the MxN square that is kept contiguous in memory.
   // For non block-packed layouts, represent the dimension with the next lowest
@@ -77,6 +83,8 @@ struct PackedDimInfo {
   // 4H4W, represents the "height" of the square block that is kept contiguous
   // in memory.
   int32_t outer_packed_dim_block_size;
+  // See packed_dim_align
+  int32_t outer_packed_dim_align;
   // Typically the blocks of the tensor will be arranged such that the inner
   // dim of the block (i.e. the packed dim) has the lowest stride, and the
   // outer dim of the block (i.e. the outer packed dim) has the next lowest
@@ -94,8 +102,10 @@ struct PackedDimInfo {
   PackedDimInfo(
       const int32_t dim,
       const int32_t dim_block_size,
+      const int32_t dim_align,
       const int32_t outer_dim,
       const int32_t outer_dim_block_size,
+      const int32_t outer_dim_align,
       const bool is_block_transposed);
 };
 
diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h
index d2997019a8b..d2978f1d662 100644
--- a/backends/vulkan/runtime/utils/StorageUtils.h
+++ b/backends/vulkan/runtime/utils/StorageUtils.h
@@ -139,104 +139,6 @@ static constexpr GPUMemoryLayout kPackedInt8_4H4W =
 static constexpr GPUMemoryLayout kPackedInt8_4C1W =
     GPUMemoryLayout::TENSOR_PACKED_INT8_4C1W;
 
-template <typename T>
-T to_packed_dim(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 0;
-    case kHeightPacked:
-      return 1;
-    case kChannelsPacked:
-      return 2;
-    case kPackedInt8_4W:
-      return 0;
-    case kPackedInt8_4C:
-      return 2;
-    case kPackedInt8_4W4C:
-      return 2;
-    case kPackedInt8_4H4W:
-      return 0;
-    case kPackedInt8_4C1W:
-      return 2;
-  };
-  // Should be unreachable
-  return 0;
-}
-
-template <typename T>
-T to_outer_packed_dim(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 1;
-    case kHeightPacked:
-      return 0;
-    case kChannelsPacked:
-      return 0;
-    case kPackedInt8_4W:
-      return 1;
-    case kPackedInt8_4C:
-      return 0;
-    case kPackedInt8_4W4C:
-      return 0;
-    case kPackedInt8_4H4W:
-      return 1;
-    case kPackedInt8_4C1W:
-      return 0;
-  };
-  // Should be unreachable
-  return 1;
-}
-
-template <typename T>
-T to_packed_dim_block_size(
-    const GPUMemoryLayout layout,
-    const StorageType storage) {
-  switch (layout) {
-    case kWidthPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kHeightPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kChannelsPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kPackedInt8_4W:
-      return storage == kBuffer ? 4 : 16;
-    case kPackedInt8_4C:
-      return storage == kBuffer ? 4 : 16;
-    case kPackedInt8_4W4C:
-      return 4;
-    case kPackedInt8_4H4W:
-      return 4;
-    case kPackedInt8_4C1W:
-      return storage == kBuffer ? 4 : 16;
-  };
-  // Should be unreachable
-  return 1;
-}
-
-template <typename T>
-T to_outer_packed_dim_block_size(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 1;
-    case kHeightPacked:
-      return 1;
-    case kChannelsPacked:
-      return 1;
-    case kPackedInt8_4W:
-      return 1;
-    case kPackedInt8_4C:
-      return 1;
-    case kPackedInt8_4W4C:
-      return 4;
-    case kPackedInt8_4H4W:
-      return 4;
-    case kPackedInt8_4C1W:
-      return 1;
-  };
-  // Should be unreachable
-  return 1;
-}
-
 bool is_block_transposed_layout(const GPUMemoryLayout layout);
 
 bool is_packed_int8_layout(const GPUMemoryLayout layout);