diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2645be6478e..874b2a65168 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1137,7 +1137,7 @@ jobs: ./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row - ./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations + ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add # "Classic" Operator tests diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh new file mode 100644 index 00000000000..e7e64a601ef --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef BLOCK_GLSLH +#define BLOCK_GLSLH + +#include "block_int8x4_utils.glslh" +#include "common.glslh" + +// +// Block Layout Int Utils +// + +// These macros extract fields from the packed int returned by +// BlockConfig::as_packed_int(). See Common.h for the bit layout. +// +// Bit layout matches hashed layout format: +// bits 0- 3: block_dim_order[0] (inner_dim if not transposed, outer_dim if transposed) +// bits 4- 7: block_dim_order[1] (outer_dim if not transposed, inner_dim if transposed) +// bits 8-11: block_dim_order[2] (first_nonblock_dim) +// bits 12-15: block_dim_order[3] (second_nonblock_dim) +// bits 16-19: inner_dim +// bits 20-23: outer_dim +// bits 24-27: inner_dim_block_size +// bits 28-31: outer_dim_block_size + +// Extract block_dim_order elements (bits 0-15) +#define get_block_dim_order_0(x) ((x) & 0xF) +#define get_block_dim_order_1(x) (((x) >> 4) & 0xF) +#define get_block_dim_order_2(x) (((x) >> 8) & 0xF) +#define get_block_dim_order_3(x) (((x) >> 12) & 0xF) + +// Extract packed_dim_info (bits 16-31) +#define get_block_inner_dim(x) (((x) >> 16) & 0xF) +#define get_block_outer_dim(x) (((x) >> 20) & 0xF) +#define get_block_inner_dim_block_size(x) (((x) >> 24) & 0xF) +#define get_block_outer_dim_block_size(x) (((x) >> 28) & 0xF) + +/* + * Block-based programming utilities for compute shaders. + * + * A "block" is a 4x4 tile of tensor elements. The two dimensions of the block + * are called the "inner" dimension and the "outer" dimension. The inner dim is + * the one that is kept contiguous in memory (i.e. the packed dimension of the + * tensor), and the outer dim forms the other axis of the 2D block. + * + * For texel-packed tensors (single level of packing), a block is effectively a + * single texel repeated 4 times along the outer dimension. For block-packed + * tensors (two levels of packing), a block corresponds exactly to the 4x4 + * packed unit. 
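For illustration (this sketch is not part of the change), the packed block-config bit layout documented above can be exercised from host-side C++; the encode below mirrors the documented bit fields and the decode mirrors the get_block_* macros, using a hypothetical configuration value.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical config: inner_dim = 2 (C), outer_dim = 0 (W), 4x4 block,
      // not transposed, so block_dim_order = {2, 0, 1, 3}.
      const int32_t packed = 2 | (0 << 4) | (1 << 8) | (3 << 12) | (2 << 16) |
          (0 << 20) | (4 << 24) | (4 << 28);

      // Decode using the same bit positions as the macros above.
      const int inner_dim = (packed >> 16) & 0xF;   // 2
      const int outer_dim = (packed >> 20) & 0xF;   // 0
      const int inner_block = (packed >> 24) & 0xF; // 4
      const int outer_block = (packed >> 28) & 0xF; // 4
      std::printf(
          "inner=%d outer=%d block=%dx%d\n",
          inner_dim, outer_dim, inner_block, outer_block);
      return 0;
    }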
+ * + * When dispatching a block-based shader: + * - gl_GlobalInvocationID.x = block index along inner dimension + * - gl_GlobalInvocationID.y = block index along outer dimension + * - gl_GlobalInvocationID.z = plane index (remaining dimensions flattened) + */ + +// +// Index Conversion Utilities (TensorIndex4D versions) +// + +TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config( + const BufferMetadata meta, + const uint block_idx, + const int block_config) { + TensorIndex4D tidx; + + uint block_strides[4]; + + uint stride = 1; + // Inner block dim + const int packed_dim_1 = get_block_inner_dim(block_config); + block_strides[packed_dim_1] = 1; + const uint block_size_1 = uint(get_block_inner_dim_block_size(block_config)); + stride = div_up(meta.sizes[0][packed_dim_1], block_size_1); + // Outer block dim + const int packed_dim_2 = get_block_outer_dim(block_config); + block_strides[packed_dim_2] = stride; + const uint block_size_2 = + uint(get_block_outer_dim_block_size(block_config)); + stride *= div_up(meta.sizes[0][packed_dim_2], block_size_2); + // First non-block dim + const int outer_dim_1 = get_block_dim_order_2(block_config); + block_strides[outer_dim_1] = stride; + stride *= meta.sizes[0][outer_dim_1]; + // Second non-block dim + const int outer_dim_2 = get_block_dim_order_3(block_config); + block_strides[outer_dim_2] = stride; + + uint contig_idx = block_idx; + // Second non-block dim + tidx.data[outer_dim_2] = int(contig_idx / block_strides[outer_dim_2]); + contig_idx %= block_strides[outer_dim_2]; + // First non-block dim (1; height) + tidx.data[outer_dim_1] = int(contig_idx / block_strides[outer_dim_1]); + contig_idx %= block_strides[outer_dim_1]; + // Outer block dim (0; width) + tidx.data[packed_dim_2] = + int(mul_4(contig_idx / block_strides[packed_dim_2])); + contig_idx %= block_strides[packed_dim_2]; + // Inner block dim (2; channels) + tidx.data[packed_dim_1] = int(mul_4(contig_idx)); + + return tidx; +} + +// +// TextureMetadata variants of block indexing +// + +TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config( + const TextureMetadata meta, + const uint block_idx, + const int block_config) { + TensorIndex4D tidx; + + uint block_strides[4]; + + uint stride = 1; + // Inner block dim + const int packed_dim_1 = get_block_inner_dim(block_config); + block_strides[packed_dim_1] = 1; + const uint block_size_1 = uint(get_block_inner_dim_block_size(block_config)); + stride = div_up(meta.sizes[packed_dim_1], block_size_1); + // Outer block dim + const int packed_dim_2 = get_block_outer_dim(block_config); + block_strides[packed_dim_2] = stride; + const uint block_size_2 = + uint(get_block_outer_dim_block_size(block_config)); + stride *= div_up(meta.sizes[packed_dim_2], block_size_2); + // First non-block dim + const int outer_dim_1 = get_block_dim_order_2(block_config); + block_strides[outer_dim_1] = stride; + stride *= meta.sizes[outer_dim_1]; + // Second non-block dim + const int outer_dim_2 = get_block_dim_order_3(block_config); + block_strides[outer_dim_2] = stride; + + uint contig_idx = block_idx; + // Second non-block dim + tidx.data[outer_dim_2] = int(contig_idx / block_strides[outer_dim_2]); + contig_idx %= block_strides[outer_dim_2]; + // First non-block dim + tidx.data[outer_dim_1] = int(contig_idx / block_strides[outer_dim_1]); + contig_idx %= block_strides[outer_dim_1]; + // Outer block dim + tidx.data[packed_dim_2] = + int(mul_4(contig_idx / block_strides[packed_dim_2])); + contig_idx %= block_strides[packed_dim_2]; + // Inner block 
dim + tidx.data[packed_dim_1] = int(mul_4(contig_idx)); + + return tidx; +} + +// +// 3D Block Index Conversion Utilities (WHCN Dispatch) +// +// These functions convert a 3D thread index (gl_GlobalInvocationID) to a +// TensorIndex4D using a dispatch pattern: +// - thread_idx.x = W threads (divided by 4 if W is part of block) +// - thread_idx.y = H threads (divided by 4 if H is part of block) +// - thread_idx.z = C * N threads (C divided by 4 if C is part of block) +// +// Note: GLSL tensor metadata is in WHCN order (sizes[0]=W, sizes[1]=H, +// sizes[2]=C, sizes[3]=N), while C++ uses NCHW order. +// + +/* + * Convert a 3D block index to a TensorIndex4D using WHCN dispatch. + * + * Parameters: + * meta: BufferMetadata with tensor sizes in WHCN order + * thread_idx: 3D thread index (x=W, y=H, z=C*N) + * block_config: Packed block configuration from BlockConfig::as_packed_int() + * + * Returns: TensorIndex4D with logical tensor coordinates + */ +TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config( + const BufferMetadata meta, + const uvec3 thread_idx, + const int block_config) { + TensorIndex4D tidx; + const int inner_dim = get_block_inner_dim(block_config); + const int outer_dim = get_block_outer_dim(block_config); + + // GLSL metadata is in WHCN order: sizes[0]=W, sizes[1]=H, sizes[2]=C, sizes[3]=N + + // Compute C threads for decomposing thread_idx.z + // C is blocked (divided by 4) only if it's part of the block + uint C_size = uint(meta.sizes[0][2]); + uint num_C; + if (inner_dim == 2 || outer_dim == 2) { + num_C = div_up_4(C_size); + } else { + num_C = C_size; + } + + // W (dim 0): blocked if inner or outer + if (inner_dim == 0 || outer_dim == 0) { + tidx.data[0] = int(thread_idx.x) * 4; // Block-aligned + } else { + tidx.data[0] = int(thread_idx.x); // Single value + } + + // H (dim 1): blocked if inner or outer + if (inner_dim == 1 || outer_dim == 1) { + tidx.data[1] = int(thread_idx.y) * 4; // Block-aligned + } else { + tidx.data[1] = int(thread_idx.y); // Single value + } + + // C (dim 2): blocked if inner or outer + if (inner_dim == 2 || outer_dim == 2) { + tidx.data[2] = int(thread_idx.z % num_C) * 4; // Block-aligned + } else { + tidx.data[2] = int(thread_idx.z % num_C); // Single value + } + + // N (dim 3): never blocked + tidx.data[3] = int(thread_idx.z / num_C); + + return tidx; +} + +/* + * Convert a 3D block index to a TensorIndex4D (TextureMetadata variant). 
+ */ +TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config( + const TextureMetadata meta, + const uvec3 thread_idx, + const int block_config) { + TensorIndex4D tidx; + const int inner_dim = get_block_inner_dim(block_config); + const int outer_dim = get_block_outer_dim(block_config); + + // GLSL metadata is in WHCN order: sizes[0]=W, sizes[1]=H, sizes[2]=C, sizes[3]=N + + // Compute C threads for decomposing thread_idx.z + uint C_size = uint(meta.sizes[2]); + uint num_C; + if (inner_dim == 2 || outer_dim == 2) { + num_C = div_up_4(C_size); + } else { + num_C = C_size; + } + + // W (dim 0): blocked if inner or outer + if (inner_dim == 0 || outer_dim == 0) { + tidx.data[0] = int(thread_idx.x) * 4; + } else { + tidx.data[0] = int(thread_idx.x); + } + + // H (dim 1): blocked if inner or outer + if (inner_dim == 1 || outer_dim == 1) { + tidx.data[1] = int(thread_idx.y) * 4; + } else { + tidx.data[1] = int(thread_idx.y); + } + + // C (dim 2): blocked if inner or outer + if (inner_dim == 2 || outer_dim == 2) { + tidx.data[2] = int(thread_idx.z % num_C) * 4; + } else { + tidx.data[2] = int(thread_idx.z % num_C); + } + + // N (dim 3): never blocked + tidx.data[3] = int(thread_idx.z / num_C); + + return tidx; +} + +#endif // BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh new file mode 100644 index 00000000000..6ea636a0a17 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro to generate int8x4 block loading functions for a specific buffer. + * + * Usage: + * define_load_int8x4_buffer_fns(t_inp) + * + * This generates: + * - load_int8x4_block_from_t_inp(meta, tidx_base, layout, block_outer_dim) + * + * IMPORTANT: block_outer_dim must be such that the inner dimension (packed_dim) + * contains 4 contiguous int8 elements packed into one int32. If the loaded + * block needs to be transposed to match a different output layout, that + * transposition must be done by the caller. 
+ */ + +#ifndef BLOCK_INT8X4_LOAD_GLSLH +#define BLOCK_INT8X4_LOAD_GLSLH + +#define define_load_int8x4_buffer_fns(buffer_name) \ + \ + ivec4 load_int8x4_block_from_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + const int outer_packed_dim = get_outer_packed_dim(hashed_layout); \ + const int outer_block_size = \ + get_outer_packed_dim_block_size(hashed_layout); \ + \ + /* Compute base packed index using block-based indexing */ \ + const uint block_idx = \ + tensor4d_idx_to_block_idx(meta, tidx_base, hashed_layout); \ + const uint texels_per_block = div_4(get_block_numel(hashed_layout)); \ + uint buf_idx = block_idx * texels_per_block; \ + \ + /* Fast path: contiguous texels when iterating along outer_packed_dim */ \ + if (outer_block_size == 4) { \ + if (block_outer_dim == outer_packed_dim) { \ + return ivec4( \ + buffer_name[buf_idx], \ + buffer_name[buf_idx + 1], \ + buffer_name[buf_idx + 2], \ + buffer_name[buf_idx + 3]); \ + } \ + else { \ + buf_idx += mod_4(tidx_base.data[outer_packed_dim]); \ + } \ + } \ + \ + /* General path: use stride for non-contiguous access */ \ + const uint outer_stride = \ + stride_at(meta, block_outer_dim) * texels_per_block; \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + ivec4 block = ivec4(0); \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + block[block_y] = buffer_name[buf_idx]; \ + } \ + buf_idx += outer_stride; \ + } \ + return block; \ + } + +#endif // BLOCK_INT8X4_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh new file mode 100644 index 00000000000..2a0e037c291 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro to generate int8x4 block storing functions for a specific buffer. + * + * Usage: + * define_store_int8x4_buffer_fns(t_out) + * + * This generates: + * - store_int8x4_block_to_t_out(meta, tidx_base, layout, block_outer_dim, + * block) + * + * IMPORTANT: block_outer_dim must be such that the inner dimension (packed_dim) + * contains 4 contiguous int8 elements packed into one int32. If the block needs + * to be transposed to match the output layout, that transposition must be done + * by the caller before storing. 
+ */ + +#ifndef BLOCK_INT8X4_STORE_GLSLH +#define BLOCK_INT8X4_STORE_GLSLH + +#define define_store_int8x4_buffer_fns(buffer_name) \ + \ + void store_int8x4_block_to_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const ivec4 block) { \ + const int outer_packed_dim = get_outer_packed_dim(hashed_layout); \ + const int outer_block_size = \ + get_outer_packed_dim_block_size(hashed_layout); \ + \ + /* Compute base packed index using block-based indexing */ \ + const uint block_idx = \ + tensor4d_idx_to_block_idx(meta, tidx_base, hashed_layout); \ + const uint texels_per_block = div_4(get_block_numel(hashed_layout)); \ + uint buf_idx = block_idx * texels_per_block; \ + \ + /* Fast path: contiguous texels when iterating along outer_packed_dim */ \ + if (outer_block_size == 4) { \ + if (block_outer_dim == outer_packed_dim) { \ + buffer_name[buf_idx] = block[0]; \ + buffer_name[buf_idx + 1] = block[1]; \ + buffer_name[buf_idx + 2] = block[2]; \ + buffer_name[buf_idx + 3] = block[3]; \ + return; \ + } \ + else { \ + buf_idx += mod_4(tidx_base.data[outer_packed_dim]); \ + } \ + } \ + \ + /* General path: use stride for non-contiguous access */ \ + const uint outer_stride = \ + stride_at(meta, block_outer_dim) * texels_per_block; \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + buffer_name[buf_idx] = block[block_y]; \ + } \ + buf_idx += outer_stride; \ + } \ + } + +#endif // BLOCK_INT8X4_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh new file mode 100644 index 00000000000..fea87428d0d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh @@ -0,0 +1,109 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Utility functions for working with int8x4 blocks. + * + * An int8x4 block is a 4x4 tile of int8 values stored as an ivec4, + * where each ivec4 element contains 4 packed int8 values (one row of the + * block). + */ + +#ifndef BLOCK_INT8X4_UTILS_GLSLH +#define BLOCK_INT8X4_UTILS_GLSLH + +/* + * Write a single int8 value to a specific position in an int8x4 block. + * + * Parameters: + * block: The block to modify (ivec4 where each element has 4 packed int8) + * y: Row index (0-3) + * x: Column index (0-3) + * val: The int8 value to write + */ +void write_int8x4_block_element( + inout ivec4 block, + const int y, + const int x, + const int val) { + int texel = block[y]; + // Create a mask to clear the byte at position x, then insert val + int shift = x * 8; + int mask = ~(0xFF << shift); + block[y] = (texel & mask) | ((val & 0xFF) << shift); +} + +/* + * Transpose a 4x4 int8 block. + * + * Given block[y][x], produces result[x][y]. + * Each ivec4 element contains 4 packed int8 values. 
+ */ +ivec4 transpose_int8x4_block(const ivec4 block) { + ivec4 result; + [[unroll]] for (int y = 0; y < 4; ++y) { + int packed = 0; + [[unroll]] for (int x = 0; x < 4; ++x) { + // Extract byte y from block[x] + int val = (block[x] >> (8 * y)) & 0xFF; + // Pack into position x + packed |= (val << (8 * x)); + } + result[y] = packed; + } + return result; +} + +// +// Debug print functions +// + +#ifdef DEBUG_MODE +/* + * Debug print function for Int8x4Block. + * + * Prints all 16 int8 values as a 4x4 matrix. + */ +void printInt8x4Block(const ivec4 block) { + // Unpack all 16 int8 values into an array + int v[16]; + [[unroll]] for (int i = 0; i < 4; ++i) { + int packed = block[i]; + v[i * 4 + 0] = (packed >> 0) & 0xFF; + v[i * 4 + 1] = (packed >> 8) & 0xFF; + v[i * 4 + 2] = (packed >> 16) & 0xFF; + v[i * 4 + 3] = (packed >> 24) & 0xFF; + } + // Sign extend from 8-bit to print as signed + [[unroll]] for (int i = 0; i < 16; ++i) { + if (v[i] > 127) + v[i] -= 256; + } + // Print as a 4x4 square in a single call to avoid interleaving + debugPrintfEXT( + "Int8x4Block:\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n", + v[0], + v[1], + v[2], + v[3], + v[4], + v[5], + v[6], + v[7], + v[8], + v[9], + v[10], + v[11], + v[12], + v[13], + v[14], + v[15]); +} +#endif // DEBUG_MODE + +#endif // BLOCK_INT8X4_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh new file mode 100644 index 00000000000..d72a176aa0e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macros to generate block load functions for buffers and textures. + * + * Buffer usage: + * define_load_buffer_fns(t_inp) + * + * Texture usage: + * define_load_texture_fns(t_inp) + * + * Both generate functions with the same signature: + * - load_fp_block_from_t_inp(meta, tidx_base, layout, block_outer_dim) + * + * The block_inner_dim is derived from the packed_dim of the hashed_layout. + * If the loaded block needs to be transposed to match a different output + * layout, that transposition must be done by the caller. 
+ * + * Parameters: + * buffer_name/texture_name: The name of the input buffer/texture (e.g., t_inp) + */ + +#ifndef BLOCK_LOAD_GLSLH +#define BLOCK_LOAD_GLSLH + +// +// Buffer load functions +// + +#define define_load_buffer_fns(buffer_name) \ + \ + mat4 load_fp_block_from_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + const int block_inner_dim = get_packed_dim(hashed_layout); \ + \ + /* Compute base buffer index once and use strides for iteration */ \ + const uint base_idx = \ + tensor4d_idx_to_buf_idx(meta, tidx_base, hashed_layout); \ + const uint outer_stride = stride_at(meta, block_outer_dim); \ + /* Inner stride is 1 since packed_dim == block_inner_dim */ \ + \ + /* Pre-compute bounds for efficient checking */ \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const uint inner_size = size_at(meta, block_inner_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + const int base_inner_idx = tidx_base.data[block_inner_dim]; \ + \ + mat4 block; \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + const uint row_idx = base_idx + block_y * outer_stride; \ + [[unroll]] for (int block_x = 0; block_x < 4; ++block_x) { \ + if (base_inner_idx + block_x < int(inner_size)) { \ + block[block_y][block_x] = float(buffer_name[row_idx + block_x]); \ + } else { \ + block[block_y][block_x] = 0.0; \ + } \ + } \ + } else { \ + block[block_y] = vec4(0.0); \ + } \ + } \ + return block; \ + } + +// +// Texture load functions +// + +#define define_load_texture_fns(texture_name) \ + \ + mat4 load_fp_block_from_##texture_name( \ + const TextureMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + /* Convert tensor index to texture position */ \ + /* Use tensor4d_idx_to_texel_pos_simple to properly map the packed dim */ \ + ivec3 tex_pos = tensor4d_idx_to_texel_pos_simple(meta, tidx_base); \ + const int tex_outer_dim = mod_4(block_outer_dim); \ + const int outer_size = meta.sizes[block_outer_dim]; \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + mat4 block; \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < outer_size) { \ + block[block_y] = vec4(texelFetch(texture_name, tex_pos, 0)); \ + } else { \ + block[block_y] = vec4(0.0); \ + } \ + tex_pos[tex_outer_dim]++; \ + } \ + return block; \ + } + +#endif // BLOCK_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh new file mode 100644 index 00000000000..66e9ab9fa2b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macros to generate block store functions for buffers and textures. + * + * Buffer usage: + * define_store_buffer_fns(t_outp, T) + * + * Texture usage: + * define_store_texture_fns(t_outp, VEC4_T) + * + * Both generate functions with the same signature: + * - store_fp_block_to_t_outp(meta, tidx_base, layout, block_outer_dim, block) + * + * The block_inner_dim is derived from the packed_dim of the hashed_layout. 
+ * + * Parameters: + * buffer_name/texture_name: The name of the output buffer/texture (e.g., t_outp) + * scalar_type/vec4_type: The type for casting (e.g., float/vec4) + */ + +#ifndef BLOCK_STORE_GLSLH +#define BLOCK_STORE_GLSLH + +// +// Buffer store functions +// + +#define define_store_buffer_fns(buffer_name, scalar_type) \ + \ + void store_fp_block_to_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const mat4 block) { \ + const int block_inner_dim = get_packed_dim(hashed_layout); \ + \ + /* Compute base buffer index once and use strides for iteration */ \ + const uint base_idx = \ + tensor4d_idx_to_buf_idx(meta, tidx_base, hashed_layout); \ + const uint outer_stride = stride_at(meta, block_outer_dim); \ + /* Inner stride is 1 since packed_dim == block_inner_dim */ \ + \ + /* Pre-compute bounds for efficient checking */ \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const uint inner_size = size_at(meta, block_inner_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + const int base_inner_idx = tidx_base.data[block_inner_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + const uint row_idx = base_idx + block_y * outer_stride; \ + [[unroll]] for (int block_x = 0; block_x < 4; ++block_x) { \ + if (base_inner_idx + block_x < int(inner_size)) { \ + buffer_name[row_idx + block_x] = \ + scalar_type(block[block_y][block_x]); \ + } \ + } \ + } \ + } \ + } + +// +// Texture store functions +// + +#define define_store_texture_fns(texture_name, vec4_type) \ + \ + void store_fp_block_to_##texture_name( \ + const TextureMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const mat4 block) { \ + /* Convert tensor index to texture position */ \ + /* Use tensor4d_idx_to_texel_pos_simple to properly map the packed dim */ \ + ivec3 tex_pos = tensor4d_idx_to_texel_pos_simple(meta, tidx_base); \ + const int tex_outer_dim = mod_4(block_outer_dim); \ + const int outer_size = meta.sizes[block_outer_dim]; \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < outer_size) { \ + imageStore(texture_name, tex_pos, vec4_type(block[block_y])); \ + } \ + tex_pos[tex_outer_dim]++; \ + } \ + } + +#endif // BLOCK_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 9ade64910f2..3b2010c7963 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -25,6 +25,8 @@ #define div_up_4(x) (((x) + 3) >> 2) #define div_up_8(x) (((x) + 7) >> 3) +#define div_up(x, y) (((x) + (y) - 1) / (y)) + #define align_up_2(x) ((x + 1) & -2) #define align_up_4(x) ((x + 3) & -4) #define align_up_8(x) ((x + 7) & -8) @@ -33,6 +35,7 @@ #define mod_4(x) ((x) & 3) #define mod_8(x) ((x) & 7) + int sign_extend_8bit(const int val) { if ((val & 0x80) != 0) { return val | (~0xFF); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index deb6f0a9e30..24f050b694e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -46,6 +46,7 @@ int extract_4b(const int packed, const int pos) { 
return (packed >> (pos * 4)) & 0xF; } +#define extract_buffer_packed_dim(layout) int(layout & 0xF) // Corresponds to dim_order[:4] = [0, 1, 2, 3] #define CONTIGUOUS_BUFFER_LAYOUT_ID 0x3210 @@ -69,6 +70,13 @@ bool is_channels_last(const int hashed_layout) { return layout_id(hashed_layout) == CHANNELS_LAST_BUFFER_LAYOUT_ID; } +// Extract packed dim info from hashed_layout (bits 16-31) +// These match the format created by create_hashed_layout() in Tensor.cpp +#define get_packed_dim(layout) (((layout) >> 16) & 0xF) +#define get_outer_packed_dim(layout) (((layout) >> 20) & 0xF) +#define get_packed_dim_block_size(layout) (((layout) >> 24) & 0xF) +#define get_outer_packed_dim_block_size(layout) (((layout) >> 28) & 0xF) + // // BufferMetadata // @@ -185,6 +193,11 @@ uint x(const TensorIndex tidx) { return tidx.data[0][0]; } +bool out_of_bounds(const TensorIndex tidx, const BufferMetadata meta) { + return any(greaterThanEqual(tidx.data[0], meta.sizes[0])) || + any(greaterThanEqual(tidx.data[1], meta.sizes[1])); +} + // // TensorIndex4D (useful for texture backed tensors) // @@ -406,6 +419,109 @@ uint tensor4d_idx_to_linear_idx( return lin_idx; } +// +// Block-packed tensor indexing +// + +/* + * Get the number of elements per block from hashed_layout. + */ +int get_block_numel(const int hashed_layout) { + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + return inner_block_size * outer_block_size; +} + +/* + * Compute the intra-block index (position within a block). + * + * Within a block, elements are stored with inner dimension varying fastest: + * intra_block_idx = outer_offset * inner_block_size + inner_offset + * + * Parameters: + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Intra-block index (0 to block_numel-1) + */ +int tensor4d_idx_to_intra_block_idx( + const TensorIndex4D tidx, + const int hashed_layout) { + const int inner_dim = get_packed_dim(hashed_layout); + const int outer_dim = get_outer_packed_dim(hashed_layout); + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + + const int inner_offset = tidx.data[inner_dim] % inner_block_size; + const int outer_offset = tidx.data[outer_dim] % outer_block_size; + + return outer_offset * inner_block_size + inner_offset; +} + +/* + * Convert a tensor index to a block-space linear index. + * + * The tensor index is converted to block-space coordinates by dividing + * the packed dimensions by their block sizes, then the linear index is + * computed using the block-space strides from BufferMetadata. 
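For illustration (not part of the change), the block-packed indexing described above reduces to a few lines of standalone arithmetic; the sizes, block-space strides, and coordinate below are hypothetical, with a 4x4 block over W (inner) and H (outer) and the strides assumed to already be in block space.

    #include <cstdio>

    int main() {
      const int inner_dim = 0, outer_dim = 1;     // W inner, H outer (WHCN)
      const int inner_bs = 4, outer_bs = 4;       // 4x4 block
      const int block_strides[4] = {1, 2, 4, 12}; // block-space strides for a
                                                  // hypothetical 8x8x3x1 tensor
      const int tidx[4] = {5, 6, 2, 0};           // logical element coordinate

      // Position within the block: inner dimension varies fastest.
      const int intra = (tidx[outer_dim] % outer_bs) * inner_bs +
          (tidx[inner_dim] % inner_bs);

      // Block-space coordinate: divide the two blocked dims by their block size.
      int bidx[4] = {tidx[0] / inner_bs, tidx[1] / outer_bs, tidx[2], tidx[3]};
      int block_idx = 0;
      for (int d = 0; d < 4; ++d) {
        block_idx += block_strides[d] * bidx[d];
      }

      // Buffer index = block index * elements per block + intra-block offset.
      const int buf_idx = block_idx * (inner_bs * outer_bs) + intra;
      std::printf("block=%d intra=%d buf=%d\n", block_idx, intra, buf_idx); // 11 9 185
      return 0;
    }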
+ * + * Parameters: + * meta: BufferMetadata with block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Linear index in block space + */ +int tensor4d_idx_to_block_idx( + const BufferMetadata meta, + TensorIndex4D tidx, + const int hashed_layout) { + // Extract packed dim info + const int inner_dim = get_packed_dim(hashed_layout); + const int outer_dim = get_outer_packed_dim(hashed_layout); + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + + // Convert to block-space coordinates + tidx.data[inner_dim] = tidx.data[inner_dim] / inner_block_size; + tidx.data[outer_dim] = tidx.data[outer_dim] / outer_block_size; + + // Compute block-space linear index + int block_idx = 0; + [[unroll]] for (int d = 0; d < 4; ++d) { + block_idx += int(meta.strides[0][d]) * tidx.data[d]; + } + return block_idx; +} + +/* + * Convert a tensor index to a linear buffer index for block-packed layouts. + * + * For block-packed tensors: + * - Elements are grouped into blocks of (inner_block_size × outer_block_size) + * - Strides in BufferMetadata are in "block space" + * - Final index = block_index × block_numel + intra_block_index + * + * Parameters: + * meta: BufferMetadata containing sizes and block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info from create_hashed_layout() (see + * indexing.glslh) + * + * Returns: Linear buffer index for the element + */ +int tensor4d_idx_to_buf_idx( + const BufferMetadata meta, + const TensorIndex4D tidx, + const int hashed_layout) { + const int block_idx = tensor4d_idx_to_block_idx(meta, tidx, hashed_layout); + const int intra_block_idx = + tensor4d_idx_to_intra_block_idx(tidx, hashed_layout); + const int block_numel = get_block_numel(hashed_layout); + + return block_idx * block_numel + intra_block_idx; +} + // // Debug utilities // diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl new file mode 100644 index 00000000000..6989dc2d87d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +${define_required_extensions(OUTPUT_STORAGE, DTYPE)} + +#define PRECISION ${PRECISION} +#define T ${buffer_scalar_type(DTYPE)} +$if OUTPUT_STORAGE == "texture3d": + #define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +$if OUTPUT_STORAGE == "buffer": + ${define_active_storage_type("buffer")} +$else: + ${define_active_storage_type("texture3d")} + +layout(std430) buffer; + +#include "indexing.glslh" + +// Output: dequantized floating point values (buffer or texture) +${layout_declare_tensor(B, "w", "t_outp", DTYPE, OUTPUT_STORAGE)} +// Input buffer: quantized int32 values (each int32 contains 4 packed int8s) +${layout_declare_tensor(B, "r", "t_inp", "int", "buffer")} + +// Metadata for output tensor (floating point) - buffer or texture +$if OUTPUT_STORAGE == "buffer": + ${layout_declare_ubo(B, "BufferMetadata", "outp")} +$else: + ${layout_declare_ubo(B, "TextureMetadata", "outp")} +// Metadata for input tensor (quantized int8x4) - always buffer +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(push_constant) uniform restrict Block { + float scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_block_config", "0")} +${layout_declare_spec_const(C, "int", "inp_block_config", "0")} + +#include "block_indexing.glslh" +#include "block_int8x4_load.glslh" +#include "block_store.glslh" + +// Generate loading functions for t_inp buffer +define_load_int8x4_buffer_fns(t_inp) +// Generate storing functions for t_outp +$if OUTPUT_STORAGE == "buffer": + define_store_buffer_fns(t_outp, T) +$else: + define_store_texture_fns(t_outp, VEC4_T) + +mat4 dequantize_int8x4_block( + const ivec4 block, const float scale, const int zp) { + mat4 result; + [[unroll]] for (int i = 0; i < 4; ++i) { + // Unpack 4 int8 values from packed int32 + int packed = block[i]; + ivec4 unpacked = ivec4( + (packed >> 0) & 0xFF, + (packed >> 8) & 0xFF, + (packed >> 16) & 0xFF, + (packed >> 24) & 0xFF); + // Sign extend from 8-bit + unpacked = (unpacked ^ 0x80) - 0x80; + // Dequantize: (q - zp) * scale + result[i] = vec4(unpacked - zp) * scale; + } + return result; +} + +void main() { + TensorIndex4D tidx; + +#ifdef USING_BUFFER + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + inp, contig_block_idx, inp_block_config); +#else + // Texture storage: use 3D extents dispatch + const uvec3 thread_idx = gl_GlobalInvocationID; + tidx = block_idx_3d_to_tensor4d_idx_with_block_config( + inp, thread_idx, inp_block_config); +#endif + + if (out_of_bounds(tidx, inp)) { + return; + } + + // Load int8 block from input using the thread's block index + const int inp_block_outer_dim = get_block_outer_dim(inp_block_config); + ivec4 int8_block = load_int8x4_block_from_t_inp( + inp, tidx, inp_layout, inp_block_outer_dim); + + // If input and output have different block configs (different packed dims), + // transpose the block to match output's layout + if (inp_block_config != outp_block_config) { + int8_block = transpose_int8x4_block(int8_block); + } + + // Dequantize the int8 block to float values + mat4 fp_block = dequantize_int8x4_block(int8_block, scale, zp); + + // Store dequantized values to output buffer using output's block config + const int 
outp_block_outer_dim = get_block_outer_dim(outp_block_config); + store_fp_block_to_t_outp( + outp, tidx, outp_layout, outp_block_outer_dim, fp_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml new file mode 100644 index 00000000000..d3a531d6385 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_dequantize: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: buffer + generate_variant_forall: + OUTPUT_STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_dequantize diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl new file mode 100644 index 00000000000..c0c2a3a914a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl @@ -0,0 +1,118 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions(INPUT_STORAGE, DTYPE)} + +#define PRECISION ${PRECISION} +#define T ${buffer_scalar_type(DTYPE)} +$if INPUT_STORAGE == "texture3d": + #define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +$if INPUT_STORAGE == "buffer": + ${define_active_storage_type("buffer")} +$else: + ${define_active_storage_type("texture3d")} + +layout(std430) buffer; + +#include "indexing.glslh" + +// Output buffer: quantized int32 values (each int32 contains 4 packed int8s) +${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")} +// Input: floating point values (buffer or texture) +${layout_declare_tensor(B, "r", "t_inp", DTYPE, INPUT_STORAGE)} + +// Metadata for output tensor (quantized int8x4) - always buffer +${layout_declare_ubo(B, "BufferMetadata", "outp")} +// Metadata for input tensor (floating point) - buffer or texture +$if INPUT_STORAGE == "buffer": + ${layout_declare_ubo(B, "BufferMetadata", "inp")} +$else: + ${layout_declare_ubo(B, "TextureMetadata", "inp")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "inp_block_config", "0")} +${layout_declare_spec_const(C, "int", "outp_block_config", "0")} + +#include "block_indexing.glslh" +#include "block_load.glslh" +$if INPUT_STORAGE == "buffer": + // Generate loading functions for t_inp buffer + define_load_buffer_fns(t_inp) +$else: + // Generate loading functions for t_inp texture + define_load_texture_fns(t_inp) +#include "block_int8x4_store.glslh" + +// Generate storing functions for t_outp buffer +define_store_int8x4_buffer_fns(t_outp) + +ivec4 quantize_fp_block( + const mat4 block, const float inv_scale, const int zp) { + ivec4 result; + [[unroll]] for (int i = 0; i < 4; ++i) { + // Quantize: round(val * inv_scale) + zp, clamped to [-128, 127] + ivec4 quantized = ivec4(round(block[i] * inv_scale)) + zp; + quantized 
= clamp(quantized, -128, 127); + // Pack 4 int8 values into one int32 + result[i] = ((quantized[0] & 0xFF) << 0) | + ((quantized[1] & 0xFF) << 8) | + ((quantized[2] & 0xFF) << 16) | + ((quantized[3] & 0xFF) << 24); + } + return result; +} + +void main() { + TensorIndex4D tidx; + +#ifdef USING_BUFFER + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + inp, contig_block_idx, inp_block_config); +#else + // Texture storage: use 3D extents dispatch + const uvec3 thread_idx = gl_GlobalInvocationID; + tidx = block_idx_3d_to_tensor4d_idx_with_block_config( + inp, thread_idx, inp_block_config); +#endif + + if (out_of_bounds(tidx, inp)) { + return; + } + + // Load FP block from input using the thread's block index + const int inp_block_outer_dim = get_block_outer_dim(inp_block_config); + mat4 fp_block = load_fp_block_from_t_inp( + inp, tidx, inp_layout, inp_block_outer_dim); + + // If input and output have different block configs (different packed dims), + // transpose the block to match output's layout + if (inp_block_config != outp_block_config) { + fp_block = transpose(fp_block); + } + + // Quantize the float block to int8 values + ivec4 int8_block = quantize_fp_block(fp_block, inv_scale, zp); + + // Store quantized values to output buffer using output's block config + const int outp_block_outer_dim = get_block_outer_dim(outp_block_config); + store_int8x4_block_to_t_outp( + outp, tidx, outp_layout, outp_block_outer_dim, int8_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml new file mode 100644 index 00000000000..b3124fb6d75 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_quantize: + parameter_names_with_default_values: + DTYPE: float + INPUT_STORAGE: buffer + generate_variant_forall: + INPUT_STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_quantize diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp index 71690ffc604..0286889de5c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -6,10 +6,93 @@ * LICENSE file in the root directory of this source tree. 
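For illustration (not part of the change), the per-element arithmetic used by the q8ta_quantize and q8ta_dequantize shaders above can be checked with a small host-side round trip; the scale and zero point are arbitrary, and a uint32_t stands in for the packed int8x4 texel.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const float scale = 0.05f, inv_scale = 1.0f / scale;
      const int zp = 3;
      const float vals[4] = {-1.0f, -0.2f, 0.0f, 1.4f};

      // Quantize: round(v * inv_scale) + zp, clamp to [-128, 127], then pack
      // four int8 lanes into one 32-bit word (lane 0 in the low byte).
      uint32_t packed = 0;
      for (int i = 0; i < 4; ++i) {
        int q = static_cast<int>(std::round(vals[i] * inv_scale)) + zp;
        q = std::min(std::max(q, -128), 127);
        packed |= static_cast<uint32_t>(q & 0xFF) << (8 * i);
      }

      // Unpack and dequantize: extract a byte, sign-extend via (x ^ 0x80) - 0x80,
      // then apply (q - zp) * scale.
      for (int i = 0; i < 4; ++i) {
        int q = static_cast<int>((packed >> (8 * i)) & 0xFF);
        q = (q ^ 0x80) - 0x80;
        std::printf("%f -> %d -> %f\n", vals[i], q, (q - zp) * scale);
      }
      return 0;
    }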
*/ +#include #include namespace vkcompute { +// +// BlockConfig implementation +// + +BlockConfig::BlockConfig( + int32_t inner, + int32_t inner_block_size, + int32_t outer, + int32_t outer_block_size, + bool transposed) + : inner_dim(inner), + inner_dim_block_size(inner_block_size), + outer_dim(outer), + outer_dim_block_size(outer_block_size), + block_transposed(transposed), + block_dim_order{0, 0, 0, 0} { + // Block dims must be different + VK_CHECK_COND(outer_dim != inner_dim); + + // Find the two lowest dim indices that are not inner_dim or outer_dim + int32_t first_nonblock_dim = -1; + int32_t second_nonblock_dim = -1; + int32_t other_idx = 0; + for (int32_t d = 0; other_idx < 2; ++d) { + if (d != inner_dim && d != outer_dim) { + if (other_idx == 0) { + first_nonblock_dim = d; + } else { + second_nonblock_dim = d; + } + ++other_idx; + } + } + + // Set block_dim_order based on block_transposed + if (block_transposed) { + // Transposed: {outer_dim, inner_dim, first_nonblock_dim, + // second_nonblock_dim} + block_dim_order[0] = outer_dim; + block_dim_order[1] = inner_dim; + } else { + // Normal: {inner_dim, outer_dim, first_nonblock_dim, second_nonblock_dim} + block_dim_order[0] = inner_dim; + block_dim_order[1] = outer_dim; + } + block_dim_order[2] = first_nonblock_dim; + block_dim_order[3] = second_nonblock_dim; + + // Validate all dims are in valid range [0, 3] + for (int i = 0; i < 4; ++i) { + VK_CHECK_COND(block_dim_order[i] >= 0 && block_dim_order[i] < 4); + } +} + +int32_t BlockConfig::as_packed_int() const { + int32_t packed = 0; + // Pack block_dim_order in bits 0-15 (matches hashed layout format) + packed |= (block_dim_order[0] & 0xF); // bits 0-3 + packed |= (block_dim_order[1] & 0xF) << 4; // bits 4-7 + packed |= (block_dim_order[2] & 0xF) << 8; // bits 8-11 + packed |= (block_dim_order[3] & 0xF) << 12; // bits 12-15 + // Pack packed_dim_info in bits 16-31 (matches hashed layout format) + packed |= (inner_dim & 0xF) << 16; // bits 16-19 + packed |= (outer_dim & 0xF) << 20; // bits 20-23 + packed |= (inner_dim_block_size & 0xF) << 24; // bits 24-27 + packed |= (outer_dim_block_size & 0xF) << 28; // bits 28-31 + + return packed; +} + +int32_t BlockConfig::inner_dim_from_packed_int(int32_t packed_int) { + return (packed_int >> 16) & 0xF; // bits 16-19 +} + +int32_t BlockConfig::outer_dim_from_packed_int(int32_t packed_int) { + return (packed_int >> 20) & 0xF; // bits 20-23 +} + +// +// Default workgroup size functions +// + utils::uvec3 default_pick_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -79,4 +162,202 @@ utils::uvec3 pick_wc_square_wg_size( return {16u, 1u, 4u}; } +BlockConfig create_block_config_from_io_packed_dims( + ComputeGraph& graph, + const ValueRef output, + const ValueRef input) { + const int32_t block_inner_dim = graph.packed_dim_of(output); + int32_t block_outer_dim = graph.packed_dim_of(input); + + // If inner and outer dims are the same, pick a different outer dim + if (block_outer_dim == block_inner_dim) { + if (block_inner_dim == 0) { + block_outer_dim = 1; + } else { + block_outer_dim = 0; + } + } + + // Create a BlockConfig with block sizes of 4 for both dimensions + return BlockConfig{block_inner_dim, 4, block_outer_dim, 4}; +} + +BlockConfig create_block_config_for_tensor( + ComputeGraph& graph, + const ValueRef tensor) { + const int32_t packed_dim = graph.packed_dim_of(tensor); + + // Pick an outer dimension that differs from the packed dimension + const int32_t outer_dim = (packed_dim == 0) ? 
1 : 0; + + // Create a BlockConfig with block sizes of 4 for both dimensions + return BlockConfig{packed_dim, 4, outer_dim, 4}; +} + +BlockConfig create_block_config_from_other( + ComputeGraph& graph, + const ValueRef tensor, + const BlockConfig& other) { + const int32_t packed_dim = graph.packed_dim_of(tensor); + + // If tensor's packed dim matches other's inner dim, use same config + if (packed_dim == other.inner_dim) { + return other; + } + + // Otherwise, transpose: swap inner and outer dimensions + return BlockConfig{ + other.outer_dim, + other.outer_dim_block_size, + other.inner_dim, + other.inner_dim_block_size}; +} + +utils::uvec3 pick_linear_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args) { + (void)shader; + + const ValueRef output = args.at(0).refs.at(0); + // extra_args contains the packed block config directly as a ValueRef + // (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + const std::vector& sizes = graph->sizes_of(output); + const size_t ndim = sizes.size(); + + // Compute number of blocks along inner and outer dimensions + const int64_t inner_size = sizes[ndim - 1 - inner_dim]; + const int64_t outer_size = sizes[ndim - 1 - outer_dim]; + + const uint32_t num_inner_blocks = + utils::safe_downcast(utils::div_up(inner_size, int64_t(4))); + const uint32_t num_outer_blocks = + utils::safe_downcast(utils::div_up(outer_size, int64_t(4))); + + // Compute number of planes (product of dimensions not in the block) + uint32_t num_planes = 1; + for (size_t i = 0; i < ndim; ++i) { + const int32_t whcn_dim = ndim - 1 - i; + if (whcn_dim != inner_dim && whcn_dim != outer_dim) { + num_planes *= utils::safe_downcast(sizes[i]); + } + } + + // Return linear workgroup size: {total_blocks, 1u, 1u} + const uint32_t total_blocks = + num_inner_blocks * num_outer_blocks * num_planes; + return {total_blocks, 1u, 1u}; +} + +utils::uvec3 pick_extents_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args) { + (void)shader; + + const ValueRef output = args.at(0).refs.at(0); + // extra_args contains the packed block config directly as a ValueRef + // (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + // Note: inner_dim and outer_dim use WHCN order (0=W, 1=H, 2=C, 3=N) + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + const std::vector& sizes = graph->sizes_of(output); + + // C++ sizes are in NCHW order: sizes[0]=N, sizes[1]=C, sizes[2]=H, sizes[3]=W + // Access dimensions from the end for tensors with fewer than 4 dims + const int64_t W = utils::val_at(-1, sizes); + const int64_t H = utils::val_at(-2, sizes); + const int64_t C = utils::val_at(-3, sizes); + const int64_t N = utils::val_at(-4, sizes); + + // Dispatch structure: {x_threads, y_threads, z_threads} + // - x corresponds to W dimension + // - y corresponds to H dimension + // - z corresponds to C * N (combined) + // + // Block dimensions (inner_dim and outer_dim) are divided 
by 4, + // non-block dimensions are not divided. + + uint32_t x_threads, y_threads; + int64_t C_for_z; + + // X dimension (W, WHCN dim 0) + if (inner_dim == 0 || outer_dim == 0) { + x_threads = utils::safe_downcast(utils::div_up(W, int64_t(4))); + } else { + x_threads = utils::safe_downcast(W); + } + + // Y dimension (H, WHCN dim 1) + if (inner_dim == 1 || outer_dim == 1) { + y_threads = utils::safe_downcast(utils::div_up(H, int64_t(4))); + } else { + y_threads = utils::safe_downcast(H); + } + + // Z dimension: C * N where C is blocked if it's part of the block + if (inner_dim == 2 || outer_dim == 2) { + C_for_z = utils::div_up(C, int64_t(4)); + } else { + C_for_z = C; + } + const uint32_t z_threads = utils::safe_downcast(C_for_z * N); + + return {x_threads, y_threads, z_threads}; +} + +utils::uvec3 pick_square_local_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& extra_args) { + (void)graph; + (void)shader; + (void)args; + + // Detect linear dispatch pattern: global_wg = {total_blocks, 1, 1} + if (global_workgroup_size[1u] == 1u && global_workgroup_size[2u] == 1u) { + return {64u, 1u, 1u}; + } + + // Extents dispatch: use 8x8 square on inner_dim and outer_dim axes + // extra_args contains the packed block config as a ValueRef (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + // inner_dim and outer_dim use WHCN order (0=W, 1=H, 2=C, 3=N) + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + // Build local workgroup size: + // - x corresponds to W (WHCN dim 0) + // - y corresponds to H (WHCN dim 1) + // - z corresponds to C*N (WHCN dim 2 for C) + // Set axes in the block (inner_dim, outer_dim) to 8, others to 1 + uint32_t local_x = (inner_dim == 0 || outer_dim == 0) ? 8u : 1u; + uint32_t local_y = (inner_dim == 1 || outer_dim == 1) ? 8u : 1u; + uint32_t local_z = (inner_dim == 2 || outer_dim == 2) ? 8u : 1u; + + return {local_x, local_y, local_z}; +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h index b412f737c13..84cacc8e4f7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ b/backends/vulkan/runtime/graph/ops/impl/Common.h @@ -13,6 +13,73 @@ namespace vkcompute { +/** + * BlockConfig describes how a tensor is partitioned into blocks for the purpose + * of thread mapping in GPU compute shaders. Each thread processes one block + * of elements. + * + * This is distinct from PackedDimInfo in Tensor.h which describes memory + * layout. BlockConfig is used solely for operator implementations to define 4x4 + * block partitioning schemes. 
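For illustration (not part of the change), the way the BlockConfig constructor in Common.cpp above derives block_dim_order can be restated as a few standalone lines; the helper below only mirrors that logic for a worked example and is not part of the API.

    #include <cstdio>

    // The two block dims come first (swapped when transposed), followed by the
    // two remaining WHCN dims in ascending order.
    static void dim_order(int inner, int outer, bool transposed, int out[4]) {
      out[0] = transposed ? outer : inner;
      out[1] = transposed ? inner : outer;
      int k = 2;
      for (int d = 0; d < 4; ++d) {
        if (d != inner && d != outer) {
          out[k++] = d;
        }
      }
    }

    int main() {
      int order[4];
      // Channels-packed tensor (inner = C = 2) blocked against W (outer = 0).
      dim_order(2, 0, false, order);
      std::printf("%d %d %d %d\n", order[0], order[1], order[2], order[3]); // 2 0 1 3
      return 0;
    }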
+ * + * The block configuration has two dimensions: + * - inner_dim: The dimension where 4 consecutive elements are processed + * together within a single thread + * - outer_dim: A second dimension where 4 elements are grouped, resulting + * in a 4x4 block of 16 elements per thread + */ +struct BlockConfig { + // The inner block dimension (WHCN index: 0=W, 1=H, 2=C, 3=N) + // 4 consecutive elements along this dimension form the inner part of a block + int32_t inner_dim; + // Block size along the inner dimension (typically 4) + int32_t inner_dim_block_size; + // The outer block dimension (WHCN index: 0=W, 1=H, 2=C, 3=N) + // 4 elements along this dimension form the outer part of a block + int32_t outer_dim; + // Block size along the outer dimension (typically 4) + int32_t outer_dim_block_size; + // Whether the block is transposed (swaps stride ordering of inner/outer dim) + bool block_transposed; + // Dimension order for the block: + // - If block_transposed = false: {inner_dim, outer_dim, first_nonblock_dim, + // second_nonblock_dim} + // - If block_transposed = true: {outer_dim, inner_dim, first_nonblock_dim, + // second_nonblock_dim} + int32_t block_dim_order[4]; + + BlockConfig( + int32_t inner, + int32_t inner_block_size, + int32_t outer, + int32_t outer_block_size, + bool transposed = false); + + /** + * Returns a packed int32_t encoding the block configuration. The structure + * matches the hashed layout int format used in shaders: + * bits 0- 3: block_dim_order[0] + * bits 4- 7: block_dim_order[1] + * bits 8-11: block_dim_order[2] + * bits 12-15: block_dim_order[3] + * bits 16-19: inner_dim + * bits 20-23: outer_dim + * bits 24-27: inner_dim_block_size + * bits 28-31: outer_dim_block_size + */ + int32_t as_packed_int() const; + + /** + * Extracts inner_dim from a packed int32_t representation. + */ + static int32_t inner_dim_from_packed_int(int32_t packed_int); + + /** + * Extracts outer_dim from a packed int32_t representation. + */ + static int32_t outer_dim_from_packed_int(int32_t packed_int); +}; + /** * Creates a global workgroup size based on the first output tensor in the args. * This is a utility function that extracts the output tensor from @@ -61,4 +128,133 @@ utils::uvec3 pick_wc_square_wg_size( const std::vector& args, const std::vector& resize_args); +/** + * Creates a BlockConfig based on the packed dimensions of an output and input + * tensor pair. This is useful for operations like dequantize where the block + * configuration depends on both tensors. + * + * The inner dimension is determined by the output tensor's packed dimension, + * and the outer dimension is determined by the input tensor's packed dimension. + * If they are the same, the outer dimension is adjusted to avoid conflict. + * + * @param graph The compute graph + * @param output The output tensor reference + * @param input The input tensor reference + * @return A BlockConfig configured for block-based operations + */ +BlockConfig create_block_config_from_io_packed_dims( + ComputeGraph& graph, + const ValueRef output, + const ValueRef input); + +/** + * Creates a BlockConfig based on the packed dimension of a single tensor. + * This is useful when you need separate block configs for input and output + * tensors. + * + * The inner dimension is determined by the tensor's packed dimension. + * The outer dimension is set to an adjacent dimension that differs from + * the packed dimension. 
+ * + * @param graph The compute graph + * @param tensor The tensor reference + * @return A BlockConfig configured for block-based operations + */ +BlockConfig create_block_config_for_tensor( + ComputeGraph& graph, + const ValueRef tensor); + +/** + * Creates a BlockConfig for a tensor based on another block config, ensuring + * the inner dimension matches the tensor's packed dimension. + * + * This is useful when you need block configs for both input and output tensors + * that share the same block axes but may need to be transposed if the tensors + * have different packed dimensions. + * + * If the tensor's packed dim matches the other config's inner dim, returns + * the same config. Otherwise, returns a transposed config (inner/outer + * swapped). + * + * @param graph The compute graph + * @param tensor The tensor to create a block config for + * @param other The reference block config to base the new config on + * @return A BlockConfig with inner_dim = tensor's packed_dim + */ +BlockConfig create_block_config_from_other( + ComputeGraph& graph, + const ValueRef tensor, + const BlockConfig& other); + +/** + * Picks a global workgroup size for block-based dispatching using a linear + * (1D flattened) dispatch pattern. This is optimized for buffer storage. + * + * This function expects: + * - args.at(0).refs.at(0): Output tensor reference + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * The global workgroup size is computed as: + * - x = total_blocks = num_inner_blocks * num_outer_blocks * num_planes + * - y = 1 + * - z = 1 + * + * @return Global workgroup size as {total_blocks, 1, 1} + */ +utils::uvec3 pick_linear_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args); + +/** + * Picks a global workgroup size for block-based dispatching using a 3D + * extents-style dispatch pattern. This is optimized for texture storage. + * + * This function expects: + * - args.at(0).refs.at(0): Output tensor reference + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * The global workgroup size is computed as a WHCN-based 3D dispatch: + * - x = W threads (divided by 4 if W is inner or outer dim) + * - y = H threads (divided by 4 if H is inner or outer dim) + * - z = C * N threads (C divided by 4 if C is inner or outer dim) + * + * @return Global workgroup size as {x_threads, y_threads, z_threads} + */ +utils::uvec3 pick_extents_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args); + +/** + * Picks a local workgroup size for block-based dispatching that is optimized + * for the dispatch pattern in use. 
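For illustration (not part of the change), the local workgroup selection rule implemented earlier in Common.cpp reduces to a small pure function; the helper and values below are only a restatement for the example.

    #include <cstdio>

    struct Wg { unsigned x, y, z; };

    // A {N, 1, 1} global size indicates a linear (buffer) dispatch and gets a
    // flat 64-wide local group; otherwise the WHCN axes carried by the block
    // (0=W->x, 1=H->y, 2=C->z) get 8 threads each and the remaining axis gets 1.
    static Wg local_wg(Wg global, int inner_dim, int outer_dim) {
      if (global.y == 1u && global.z == 1u) {
        return {64u, 1u, 1u};
      }
      auto blocked = [&](int d) { return inner_dim == d || outer_dim == d; };
      return {blocked(0) ? 8u : 1u, blocked(1) ? 8u : 1u, blocked(2) ? 8u : 1u};
    }

    int main() {
      const Wg a = local_wg({4096u, 1u, 1u}, 0, 2); // linear -> {64, 1, 1}
      const Wg b = local_wg({32u, 16u, 8u}, 0, 2);  // W and C blocked -> {8, 1, 8}
      std::printf("{%u,%u,%u} {%u,%u,%u}\n", a.x, a.y, a.z, b.x, b.y, b.z);
      return 0;
    }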
+ * + * This function expects: + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * For linear dispatch (buffer storage, global_wg = {total_blocks, 1, 1}): + * - Returns {64, 1, 1} + * + * For extents dispatch (texture storage, global_wg = {x, y, z}): + * - Returns an 8x8 square configuration where: + * - Axes corresponding to inner_dim and outer_dim are set to 8 + * - The remaining axis is set to 1 + * - For example: inner_dim=W, outer_dim=H -> {8, 8, 1} + * inner_dim=W, outer_dim=C -> {8, 1, 8} + * + * @return Local workgroup size optimized for the dispatch pattern + */ +utils::uvec3 pick_square_local_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& extra_args); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp new file mode 100644 index 00000000000..bca36444725 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +void add_q8ta_quantize_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_output) { + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + // Detect input storage type to select appropriate shader variant + utils::StorageType inp_storage = graph.storage_type_of(fp_input); + + // Build shader name: q8ta_quantize_{buffer|texture3d}_{dtype} + std::string kernel_name = "q8ta_quantize"; + add_storage_type_suffix(kernel_name, inp_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_input)); + + // Pass metadata for both output and input tensors + // Output is always buffer, input can be buffer or texture + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.buffer_meta_ubo(packed_int8_output)); + param_buffers.append(graph.meta_ubo(fp_input)); + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + // Create block config for output tensor: inner_dim = output's packed_dim + const BlockConfig outp_block_config = create_block_config_from_io_packed_dims( + graph, packed_int8_output, fp_input); + + // Create block config for input tensor: based on outp_block_config but with + // inner_dim = input's packed_dim. If input and output have different packed + // dims, the block axes are transposed. 
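+  // For example (illustrative): if outp_block_config has inner_dim = W and
+  // outer_dim = C while fp_input is channels packed (packed_dim = C), the
+  // returned config is the transposed block, i.e. inner_dim = C and
+  // outer_dim = W.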
+ const BlockConfig inp_block_config = + create_block_config_from_other(graph, fp_input, outp_block_config); + + // Cast block config to ValueRef for pick_*_global_wg_with_block_config + // Use inp_block_config since shader uses inp_block_config for indexing + const ValueRef block_config_ref = + static_cast(inp_block_config.as_packed_int()); + + // Choose dispatch function based on FP input storage type: + // - Buffer: use linear dispatch (better performance) + // - Texture: use extents-style 3D dispatch (better performance) + auto pick_global_wg_size = (inp_storage == utils::kBuffer) + ? pick_linear_global_wg_with_block_config + : pick_extents_global_wg_with_block_config; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_global_wg_size, + pick_square_local_wg_with_block_config, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, {fp_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {graph.hashed_layout_of(fp_input), + graph.hashed_layout_of(packed_int8_output), + inp_block_config.as_packed_int(), + outp_block_config.as_packed_int()}, + // Resize args + {block_config_ref})); +} + +void add_q8ta_dequantize_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output) { + float scale = graph.extract_scalar(output_scale); + int32_t zp = graph.extract_scalar(output_zp); + + // Detect output storage type to select appropriate shader variant + utils::StorageType outp_storage = graph.storage_type_of(fp_output); + + // Build shader name: q8ta_dequantize_{buffer|texture3d}_{dtype} + std::string kernel_name = "q8ta_dequantize"; + add_storage_type_suffix(kernel_name, outp_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_output)); + + // Pass metadata for both output and input tensors + // Output can be buffer or texture, input is always buffer + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.meta_ubo(fp_output)); + param_buffers.append(graph.buffer_meta_ubo(packed_int8_input)); + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + // Create block config for output tensor: inner_dim = output's packed_dim + const BlockConfig outp_block_config = create_block_config_from_io_packed_dims( + graph, fp_output, packed_int8_input); + + // Create block config for input tensor: based on outp_block_config but with + // inner_dim = input's packed_dim. If input and output have different packed + // dims, the block axes are transposed. + const BlockConfig inp_block_config = create_block_config_from_other( + graph, packed_int8_input, outp_block_config); + + // Cast block config to ValueRef for pick_*_global_wg_with_block_config + // Use inp_block_config since shader uses inp_block_config for indexing + const ValueRef block_config_ref = + static_cast(inp_block_config.as_packed_int()); + + // Choose dispatch function based on FP output storage type: + // - Buffer: use linear dispatch (better performance) + // - Texture: use extents-style 3D dispatch (better performance) + auto pick_global_wg_size = (outp_storage == utils::kBuffer) + ? 
pick_linear_global_wg_with_block_config + : pick_extents_global_wg_with_block_config; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_global_wg_size, + pick_square_local_wg_with_block_config, + // Inputs and Outputs + {{fp_output, vkapi::kWrite}, {packed_int8_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {graph.hashed_layout_of(fp_output), + graph.hashed_layout_of(packed_int8_input), + outp_block_config.as_packed_int(), + inp_block_config.as_packed_int()}, + // Resize args + {block_config_ref})); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h new file mode 100644 index 00000000000..e2c6a2c51c3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +// +// Generic Quantize, Dequantize (memory layout agnostic) +// + +void add_q8ta_quantize_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_output); + +void add_q8ta_dequantize_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp index 8ebbf6dcb99..e02a42f60e1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp @@ -9,9 +9,12 @@ #include #include +#include #include #include +#include + namespace vkcompute { // @@ -383,11 +386,10 @@ void quantize_per_tensor_impl( const ValueRef int8_output = args[last_arg_idx]; - VK_CHECK_COND( - graph.estimate_memory_layout_of(int8_output) == utils::kPackedInt8_4W4C); + VK_CHECK_COND(graph.dtype_of(int8_output) == vkapi::kInt8x4); - add_quantize_and_pack_4w4c_node( - graph, fp_input, scale, zero_point, int8_output); + // Use unified block-based dispatch for all layouts + add_q8ta_quantize_node(graph, fp_input, scale, zero_point, int8_output); } void dequantize_per_tensor_impl( @@ -409,34 +411,10 @@ void dequantize_per_tensor_impl( const ValueRef fp_output = args[last_arg_idx]; - VK_CHECK_COND( - graph.estimate_memory_layout_of(int8_input) == utils::kPackedInt8_4W4C); - - add_unpack_4w4c_and_dequantize_node( - graph, int8_input, scale, zero_point, fp_output); -} + VK_CHECK_COND(graph.dtype_of(int8_input) == vkapi::kInt8x4); -void qdq8ta_conv2d_input( - ComputeGraph& graph, - const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef scale = args.at(idx++); - const ValueRef zero_point = args.at(idx++); - const ValueRef fp_output = args.at(idx++); - - TmpTensor packed_int8_input( - &graph, - graph.sizes_of(fp_input), - vkapi::kInt8x4, - utils::kBuffer, - utils::kPackedInt8_4W4C); - - add_quantize_and_pack_4w4c_node( - graph, fp_input, scale, zero_point, packed_int8_input); - - add_unpack_4w4c_and_dequantize_node( - graph, 
packed_int8_input, scale, zero_point, fp_output); + // Use unified block-based dispatch for all layouts + add_q8ta_dequantize_node(graph, int8_input, scale, zero_point, fp_output); } REGISTER_OPERATORS { @@ -446,7 +424,6 @@ REGISTER_OPERATORS { VK_REGISTER_OP( quantized_decomposed.dequantize_per_tensor.default, dequantize_per_tensor_impl); - VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index ff00215efbc..2adb32d8c77 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include @@ -1579,7 +1579,7 @@ void conv2d_q8ta_q8csw_q8to_test( io_storage_type, utils::kPackedInt8_4W4C); - add_quantize_and_pack_4w4c_node( + add_q8ta_quantize_node( graph, fp_input, input_scale, input_zp, packed_int8_input); std::vector conv2d_args = { @@ -1601,7 +1601,7 @@ void conv2d_q8ta_q8csw_q8to_test( conv2d_q8ta_q8csw_q8to(graph, conv2d_args); - add_unpack_4w4c_and_dequantize_node( + add_q8ta_dequantize_node( graph, packed_int8_output, output_scale, output_zp, fp_output); } diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 6db814815fb..781d69c10fe 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -97,7 +97,7 @@ if(TARGET vulkan_backend) add_operator_prototype(q8csw_conv2d) add_operator_prototype(q4gsw_linear) add_operator_prototype(choose_qparams_per_row) - add_operator_prototype(qdq8ta_conv2d_activations) + add_operator_prototype(test_q8ta_qdq) add_operator_prototype(q8ta_q8csw_q8to_conv2d) add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp new file mode 100644 index 00000000000..24f19b4d309 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace vkcompute { + +void q_dq_8bit_per_tensor( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Use legacy 4W4C implementation if requested and layout matches + if (impl_selector == "legacy_4w4c" && layout == utils::kPackedInt8_4W4C) { + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_4w4c_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_unpack_4w4c_and_dequantize_node( + graph, packed_int8_input, scale, zero_point, fp_output); + } else { + // Create temporary tensor with the specified layout + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Use unified block-based dispatch + add_q8ta_quantize_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_q8ta_dequantize_node( + graph, packed_int8_input, scale, zero_point, fp_output); + } +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.q_dq_8bit_per_tensor.default, q_dq_8bit_per_tensor); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp deleted file mode 100644 index b8b33f30623..00000000000 --- a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp +++ /dev/null @@ -1,255 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 512; - -// QDQ8TA Conv2D configuration struct for 4D tensor quantize-dequantize testing -struct QDQ8TAConv2DConfig { - int64_t batch_size; // N dimension - int64_t in_channels; // C dimension - int64_t height; // H dimension - int64_t width; // W dimension - std::string test_case_name = "placeholder"; - std::string op_name = "qdq8ta_conv2d_input"; -}; - -// Utility function to create a test case from a QDQ8TAConv2DConfig -TestCase create_test_case_from_config( - const QDQ8TAConv2DConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "etvk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Input tensor (float) - [N, C, H, W] - std::vector input_size = { - config.batch_size, config.in_channels, config.height, config.width}; - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, // Use channels packed for conv2d tensors - DataGenType::RANDOM); - - float scale_val = 0.007112; - ValueSpec scale(scale_val); - - // Generate random zero point within quantization range - int32_t zero_point_val = -2; - ValueSpec zero_point(zero_point_val); - - // Output tensor (float) - same shape as input [N, C, H, W] - ValueSpec output_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(scale); - test_case.add_input_spec(zero_point); - test_case.add_output_spec(output_tensor); - - test_case.set_abs_tolerance(scale_val + 1e-4); - - // Use layout-only filter for this test since quantize/dequantize ARE the - // operations being tested, not overhead - test_case.set_shader_filter(kLayoutOnlyShaderFilter); - - return test_case; -} - -// Generate easy test cases for qdq8ta_conv2d operation (for debugging) -std::vector generate_qdq8ta_conv2d_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - QDQ8TAConv2DConfig config = { - 1, // batch_size - 3, // in_channels - 4, // height - 4, // width - "simple", // test_case_name - }; - - // Test with both storage types - std::vector storage_types = {utils::kTexture3D}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for qdq8ta_conv2d operation -std::vector generate_qdq8ta_conv2d_test_cases() { - std::vector test_cases; - - std::vector configs = { - // Small test cases for correctness - {1, 3, 16, 16}, - {1, 8, 32, 32}, - {1, 16, 24, 24}, - {1, 32, 12, 12}, - {1, 1, 64, 64}, - {1, 3, 64, 64}, - {1, 4, 16, 16}, - - // Different tensor sizes - {1, 8, 20, 20}, - {1, 16, 14, 14}, - {1, 8, 28, 28}, - - // Odd tensor sizes - {1, 3, 15, 15}, - {1, 13, 31, 31}, - {1, 17, 23, 23}, - - // Performance test cases (larger tensors) - {1, 64, 128, 128}, - {1, 32, 64, 64}, - {1, 128, 56, 56}, - }; - - // Test with different storage types - std::vector storage_types = {utils::kTexture3D}; - - for (auto config : configs) { - std::string prefix = - (config.batch_size < kRefDimSizeLimit && - config.in_channels < kRefDimSizeLimit && - config.height < kRefDimSizeLimit && config.width < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + - std::to_string(config.batch_size) + "_" + - std::to_string(config.in_channels) + "_" + - std::to_string(config.height) + "_" + std::to_string(config.width); - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for qdq8ta_conv2d operation -void qdq8ta_conv2d_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& scale_spec = test_case.inputs()[idx++]; - const ValueSpec& zero_point_spec = test_case.inputs()[idx++]; - - // Extract output specification - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C, H, W] - int64_t N = input_sizes[0]; - int64_t C = input_sizes[1]; - int64_t H = input_sizes[2]; - int64_t W = input_sizes[3]; - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C > kRefDimSizeLimit || H > kRefDimSizeLimit || - W > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (N, C, H, W) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - - // Extract the randomized scale and zero point values (following - // q8csw_conv2d.cpp pattern) - float scale = scale_spec.get_float_value(); - int32_t zero_point = zero_point_spec.get_int_value(); - int32_t quant_min = -128; - int32_t quant_max = 127; - - // Prepare output data - auto& ref_data = output_spec.get_ref_float_data(); - int64_t num_elements = N * C * H * W; - ref_data.resize(num_elements); - - // Perform quantize-dequantize operation on each element - for (int64_t i = 0; i < num_elements; ++i) { - float input_val = input_data[i]; - - // Quantize: quantized = round(input / scale + zero_point) - float quantized_float = std::round(input_val / scale) + zero_point; - - // Clamp to quantization range - quantized_float = std::max(quantized_float, static_cast(quant_min)); - quantized_float = std::min(quantized_float, static_cast(quant_max)); - - int32_t quantized_int = static_cast(quantized_float); - - // Dequantize: output = (quantized - zero_point) * scale - float dequantized = (quantized_int - zero_point) * scale; - - ref_data[i] = dequantized; - } -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "QDQ8TA Conv2D Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = qdq8ta_conv2d_reference_impl; - - auto results = execute_test_cases( - generate_qdq8ta_conv2d_test_cases, "QDQ8TAConv2D", 0, 1, ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 6f35444570b..6633365838c 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -91,7 +91,7 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q8csw_conv2d") define_custom_op_test_binary("choose_qparams_per_row") 
define_custom_op_test_binary("q4gsw_linear") - define_custom_op_test_binary("qdq8ta_conv2d_activations") + define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp new file mode 100644 index 00000000000..e0efd6ea85d --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp @@ -0,0 +1,348 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#include + +// #define DEBUG_MODE + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 512; + +// Configuration struct for tensor quantize-dequantize testing +struct QDQ8BitConfig { + std::vector shape; // Tensor shape (can be any dimensionality) + std::string test_case_name = "placeholder"; + std::string op_name = "q_dq_8bit_per_tensor"; +}; + +// Utility function to create a test case from a QDQ8BitConfig +TestCase create_test_case_from_config( + const QDQ8BitConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype, + utils::GPUMemoryLayout fp_memory_layout, + utils::GPUMemoryLayout quantized_memory_layout, + const std::string& impl_selector = "") { + TestCase test_case; + + // Create a descriptive name for the test case + // Format: ACCU/PERF I=N,C,H,W Tex_FP->Buf_Quant + std::string shape_str = shape_string(config.shape); + std::string test_name = config.test_case_name + " I=" + shape_str + " " + + repr_str(storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, quantized_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "test_etvk." 
+ config.op_name + ".default"; + test_case.set_operator_name(operator_name); + + // Input tensor (float) - any dimensionality + ValueSpec input_tensor( + config.shape, + input_dtype, + storage_type, + fp_memory_layout, + DataGenType::RANDOM); + + float scale_val = 0.007112; + ValueSpec scale(scale_val); + + // Generate random zero point within quantization range + int32_t zero_point_val = 0; + ValueSpec zero_point(zero_point_val); + + // GPUMemoryLayout as integer (will be cast in the operator) + int32_t layout_int = static_cast(quantized_memory_layout); + ValueSpec layout_spec(layout_int); + + // impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + + // Output tensor (float) - same shape as input + ValueSpec output_tensor( + config.shape, + input_dtype, + storage_type, + fp_memory_layout, + DataGenType::ZEROS); + + // Add all specs to test case + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(scale); + test_case.add_input_spec(zero_point); + test_case.add_input_spec(layout_spec); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output_tensor); + + test_case.set_abs_tolerance(scale_val + 1e-4); + + // Use layout-only filter for this test since quantize/dequantize ARE the + // operations being tested, not overhead + test_case.set_shader_filter(kLayoutOnlyShaderFilter); + + return test_case; +} + +// Generate easy test cases for q_dq_8bit operation (for debugging) +std::vector generate_q_dq_8bit_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + QDQ8BitConfig config = { + {1, 16, 16, 16}, // shape: [N, C, H, W] + "ACCU", // test_case_name + }; + + // FP memory layouts to test + std::vector fp_layouts = { + utils::kWidthPacked, + utils::kChannelsPacked, + }; + + // Quantized memory layouts to test + std::vector quant_layouts = { + utils::kPackedInt8_4W, + utils::kPackedInt8_4C, + utils::kPackedInt8_4W4C, + utils::kPackedInt8_4H4W, + utils::kPackedInt8_4C1W, + }; + + std::vector storage_types = {utils::kBuffer}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& fp_layout : fp_layouts) { + for (const auto& quant_layout : quant_layouts) { + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back(create_test_case_from_config( + config, storage_type, input_dtype, fp_layout, quant_layout)); + // For 4W4C layout, also test with legacy implementation + if (quant_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + storage_type, + input_dtype, + fp_layout, + quant_layout, + /*impl_selector=*/"legacy_4w4c")); + } + } + } + } + } + + return test_cases; +} + +// Generate test cases for q_dq_8bit operation +std::vector generate_q_dq_8bit_test_cases() { + std::vector test_cases; + + // Shapes to test (no layout specified - will be combined with all layouts) + std::vector> shapes = { + // Small test cases for correctness + {1, 3, 16, 16}, + {1, 8, 32, 32}, + {1, 16, 24, 24}, + {1, 32, 12, 12}, + {1, 1, 64, 64}, + {1, 3, 64, 64}, + {1, 4, 16, 16}, + + // Different tensor sizes + {1, 8, 20, 20}, + {1, 16, 14, 14}, + {1, 8, 28, 28}, + + // Odd tensor sizes + {1, 3, 15, 15}, + {1, 13, 31, 31}, + {1, 17, 23, 23}, + + // Performance test cases (larger tensors) + {1, 64, 128, 128}, + {1, 32, 64, 64}, + {1, 128, 56, 56}, + }; + + // FP memory layouts to test + std::vector fp_layouts = { + utils::kWidthPacked, + 
utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test both buffer and texture storage for the floating-point tensors; the
+  // unified block-based shaders select a buffer or texture variant based on
+  // the storage type of the fp tensor.
+  std::vector<utils::StorageType> storage_types = {
+      utils::kBuffer, utils::kTexture3D};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          QDQ8BitConfig config;
+          config.shape = shape;
+          config.test_case_name = prefix;
+
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, vkapi::kFloat, fp_layout, quant_layout));
+          // For 4W4C layout, also test with legacy implementation
+          if (fp_layout == utils::kChannelsPacked &&
+              quant_layout == utils::kPackedInt8_4W4C) {
+            test_cases.push_back(create_test_case_from_config(
+                config,
+                storage_type,
+                vkapi::kFloat,
+                fp_layout,
+                quant_layout,
+                /*impl_selector=*/"legacy_4w4c"));
+          }
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for q_dq_8bit operation
+void q_dq_8bit_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_spec = test_case.inputs()[idx++];
+  const ValueSpec& scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& layout_spec = test_case.inputs()[idx++];
+  (void)layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions (arbitrary dimensionality)
+  auto input_sizes = input_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference implementation.");
+    }
+  }
+
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_data = input_spec.get_float_data();
+
+  // Extract the scale and zero point values
+  float scale = scale_spec.get_float_value();
+  int32_t zero_point = zero_point_spec.get_int_value();
+  int32_t quant_min = -128;
+  int32_t quant_max = 127;
+
+  // Prepare output data
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantize-dequantize operation on each element
+  for (int64_t i = 0; i < num_elements; ++i) {
+    float input_val = input_data[i];
+
+    // Quantize: quantized = round(input / scale) + zero_point
+    float quantized_float = std::round(input_val / scale) + zero_point;
+
+    // Clamp to quantization range
+    quantized_float = std::max(quantized_float, static_cast<float>(quant_min));
+    quantized_float = std::min(quantized_float, static_cast<float>(quant_max));
+
+    int32_t quantized_int = static_cast<int32_t>(quantized_float);
+
+    // Dequantize: output = (quantized - zero_point) * scale
+    float dequantized = (quantized_int - zero_point) * scale;
+
+    ref_data[i] = dequantized;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q/DQ 8-bit Per Tensor Operation Prototyping Framework"
+            << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q_dq_8bit_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q_dq_8bit_easy_cases,
+#else
+      generate_q_dq_8bit_test_cases,
+#endif
+      "QDQ8Bit",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}
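For reference, below is a minimal standalone sketch of the per-tensor quantize-dequantize round trip that the reference implementation above computes. It is plain C++, independent of the ExecuTorch Vulkan prototyping framework, and the sample values are illustrative only; it is not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// 8-bit affine quantize followed by dequantize, matching the reference
// implementation above: q = clamp(round(x / scale) + zp, -128, 127) and
// x' = (q - zp) * scale.
float q_dq_8bit(float x, float scale, int32_t zp) {
  float q = std::round(x / scale) + static_cast<float>(zp);
  q = std::max(q, -128.0f);
  q = std::min(q, 127.0f);
  const int32_t q_int = static_cast<int32_t>(q);
  return static_cast<float>(q_int - zp) * scale;
}

int main() {
  const float scale = 0.007112f; // same scale as the test cases above
  const int32_t zp = 0;
  for (float x : {-0.5f, -0.0071f, 0.0f, 0.25f, 0.9f}) {
    const float x_qdq = q_dq_8bit(x, scale, zp);
    // For values inside the representable range the round-trip error is
    // bounded by scale / 2, which is why the test cases above use
    // scale + 1e-4 as the absolute tolerance.
    std::printf(
        "x = %+.4f  ->  %+.4f  (err %.5f)\n", x, x_qdq, std::abs(x - x_qdq));
  }
  return 0;
}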