diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 3b2010c7963..a752f23d3ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -98,6 +98,18 @@ void printVec4(vec4 texel) { "texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w); } +void printIVec4(ivec4 texel) { + debugPrintfEXT( + "texel: %d, %d, %d, %d\\n", texel.x, texel.y, texel.z, texel.w); +} + +void printPackedInt(const int packed) { + ivec4 unpacked = unpack_int8x4(packed); + debugPrintfEXT( + "packed: 0x%08x -> [%d, %d, %d, %d]\\n", + packed, unpacked.x, unpacked.y, unpacked.z, unpacked.w); +} + #endif // DEBUG_MODE #endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 16c4112547c..37c47795214 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -331,6 +331,52 @@ TensorIndex linear_idx_to_tensor_idx( return linear_idx_to_tensor_idx(meta, linear_idx); } +/* + * Convert a linear texel index to a TensorIndex4D. + * + * This function is used for texel-based dispatch where each thread handles + * one packed texel (4 elements along the packed dimension). The texel index + * is decomposed using the dim_order and strides from the tensor's layout. + * + * The strides in BufferMetadata should already be in texel space (with packed + * dimension size divided by 4). + * + * Parameters: + * meta: BufferMetadata with tensor sizes and texel-space strides + * texel_idx: Linear index into packed texels (0 to num_texels-1) + * hashed_layout: Packed layout info containing dim_order and packed_dim + * + * Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block) + */ +TensorIndex4D texel_idx_to_tensor4d_idx( + const BufferMetadata meta, + uint texel_idx, + const int hashed_layout) { + TensorIndex4D tidx; + + const int packed_dim = get_packed_dim(hashed_layout); + + // Decompose texel_idx using dim_order from hashed_layout and strides from meta + // Iterate from slowest-varying dimension (d=3) to fastest (d=0) + // This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh + [[unroll]] for (int d = 3; d >= 0; d--) { + // Get dim index from hashed_layout's dim_order (bits 0-15) + int dim_idx = extract_4b(hashed_layout, d); + + // Get stride for this dimension from BufferMetadata + uint dim_stride = meta.strides[0][dim_idx]; + + // Compute coordinate for this dimension + tidx.data[dim_idx] = int(texel_idx / dim_stride); + texel_idx = texel_idx % dim_stride; + } + + // Convert packed dimension from texel index to element index + tidx.data[packed_dim] *= 4; + + return tidx; +} + uint tensor_idx_to_linear_idx( const BufferMetadata meta, const TensorIndex tidx) { @@ -524,6 +570,39 @@ int tensor4d_idx_to_buf_idx( return block_idx * block_numel + intra_block_idx; } +/* + * Convert a tensor index to a texel index for block-packed layouts. 
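+ * As a quick illustrative example (hypothetical values, consistent with the
+ * formulas below): for a texel-packed layout each block is one texel, so the
+ * result is simply the block index; for a 4x4 block-packed layout,
+ * block_idx = 2 with intra_block_idx = 9 maps to texel 2 * 4 + 9 / 4 = 10.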
+ * + * For texel-packed tensors (outer_block_size == 1): + * - Each block corresponds to one texel + * - Returns block_idx directly + * + * For block-packed tensors (outer_block_size > 1, e.g., 4x4 blocks): + * - Each block contains 4 texels (16 elements / 4 elements per texel) + * - texel_idx = block_idx * 4 + (intra_block_idx / 4) + * + * Parameters: + * meta: BufferMetadata containing sizes and block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Linear texel index + */ +int tensor4d_idx_to_texel_idx( + const BufferMetadata meta, + const TensorIndex4D tidx, + const int hashed_layout) { + const int block_idx = tensor4d_idx_to_block_idx(meta, tidx, hashed_layout); + + if (get_outer_packed_dim_block_size(hashed_layout) == 4) { + const int intra_block_idx = + tensor4d_idx_to_intra_block_idx(tidx, hashed_layout); + return block_idx * 4 + div_4(intra_block_idx); + } + + return block_idx; +} + // // Debug utilities // @@ -540,7 +619,7 @@ void printTensorIndex(const TensorIndex tidx) { void printTensorIndex4D(const TensorIndex4D tidx) { debugPrintfEXT( - "TensorIndex4D: [%u, %u, %u, %u]\\n", + "TensorIndex4: [%d, %d, %d, %d]\\n", tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3] ); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl new file mode 100644 index 00000000000..e6be92e7ba1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl @@ -0,0 +1,227 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", "texture2d", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +// Metadata for input/output tensors (memory layout agnostic) +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +// Layout specialization constants +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} + +#include "block_indexing.glslh" + +// Load a 4xint8 block of weights. 
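+// Illustrative addressing example (hypothetical 3x3 kernel, so KW4 = 1): the
+// weight at (kw = 2, kh = 1, c4 = 0) is fetched from the texel at
+// x = c4 = 0, y = kh * KW4 + kw / 4 = 1, as component kw % 4 = 2.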
+// Weights are stored in 4W4C format: [kH, kW/4, C/4, 4, 4] where the first 4 is +// the outer (kW) dimension and the second 4 is the inner (channel) dimension. +// Returns packed int32 containing 4 int8 values for channels c to c+3. +int load_weight(int kw, int kh, int c4, int KW4, int C4) { + // Find the packed block index (4W4C tiling) + const int kw4 = kw / 4; // W block + const int block_x_offset = kw % 4; + // Texture layout: x = c4, y = kh * KW4 + kw4 + return texelFetch(t_packed_int8_weight, ivec2(c4, kh * KW4 + kw4), 0)[block_x_offset]; +} + +ivec4 quantize(const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +void main() { + const int c4 = int(gl_GlobalInvocationID.z); + + // Initialize output tensor index (WHCN order) + // Each thread handles 4 adjacent widths starting at base_out_w + TensorIndex4D outp_tidx; + outp_tidx.data[0] = int(gl_GlobalInvocationID.x) * 4; + outp_tidx.data[1] = int(gl_GlobalInvocationID.y); + outp_tidx.data[2] = c4 * 4; + outp_tidx.data[3] = 0; + + const int W = int(outp.sizes[0][0]); + const int C4 = int(div_up_4(outp.sizes[0][2])); + + // Bounds check + if (any(greaterThanEqual(outp_tidx.data, ivec4(outp.sizes[0])))) { + return; + } + + // Compute weight addressing constants + const int KW4 = int(div_up_4(conv2d_params.kernel_size.x)); + + // Get strides for width and height dimensions (in texel space) + const int w_stride = int(inp.strides[0][0]); + const int h_stride = int(inp.strides[0][1]); + + // Pre-compute step sizes for efficient indexing + const int w_texel_step = conv2d_params.dilation.x * w_stride; + const int h_texel_step = conv2d_params.dilation.y * h_stride; + // Step between adjacent output width positions in input texel space + const int subtile_w_step = conv2d_params.stride.x * w_stride; + + // Compute base input position for subtile_w=0 + TensorIndex4D inp_tidx; + inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; + inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[2] = outp_tidx.data[2]; + inp_tidx.data[3] = 0; // batch = 0 since N == 1 + + int base_inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + base_inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + } + + // Store the base width position to reset the index position at the beginning + // of each loop + const int base_inp_w = inp_tidx.data[0]; + + // Initialize accumulators for 4 width positions × 4 channels each + ivec4 acc[4]; + [[unroll]] for (int i = 0; i < 4; ++i) { + acc[i] = ivec4(0); + } + + // Input dimensions for bounds checking + const int inp_W = int(inp.sizes[0][0]); + const int inp_H = int(inp.sizes[0][1]); + + // Perform depthwise convolution + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); + + // Reset width coordinate at start of each kernel row + inp_tidx.data[0] = base_inp_w; + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + // Load weight once, reuse for all 4 width positions + const int packed_weight = load_weight(kx, ky, c4, KW4, C4); + const ivec4 weight_4c = unpack_int8x4(packed_weight); + + // Process 4 adjacent width positions using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + ivec4 input_4c = ivec4(input_zp); + if (h_in_bounds && inp_tidx.data[0] >= 0 && inp_tidx.data[0] < 
inp_W) { + // Compute texel index: base + kernel offset + subtile offset + int inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step; + } else { + // const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x; + // inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset); + // inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + const int w4 = div_4(inp_tidx.data[0]); + inp_texel_idx = (inp_tidx.data[1] * h_stride + w4 * w_stride + c4) * 4 + mod_4(inp_tidx.data[0]); + } + const int packed_input = t_packed_int8_input[inp_texel_idx]; + input_4c = unpack_int8x4(packed_input); + } + + // Accumulate: element-wise multiply for depthwise conv + acc[subtile_w] += weight_4c * input_4c; + + // Advance to next output position's input coordinate + inp_tidx.data[0] += conv2d_params.stride.x; + } + + // We advanced by 4*stride.x during subtile loop; adjust for net dilation step + inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + } + + // Advance height by dilation for next kernel row + inp_tidx.data[1] += conv2d_params.dilation.y; + + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + // Advance base index by height step for next kernel row + base_inp_texel_idx += h_texel_step; + } + } + + // Apply input zero point as weight_sum * input_zp + const vec4 weight_sums = vec4(t_weight_sums[c4]); + const vec4 weight_scales = vec4(t_weight_scales[c4]); + + // Convert to float, apply dequantization, and optionally add bias + vec4 facc[4]; + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] = vec4(acc[subtile_w]); + facc[subtile_w] -= weight_sums * input_zp; + facc[subtile_w] *= weight_scales * input_scale; + } + + // Apply bias if enabled + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[c4]); + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] += bias; + } + } + + // Compute base output texel index (for subtile_w=0) + const int base_outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + const int out_w_stride = int(outp.strides[0][0]); + + // Quantize and store outputs using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Skip out-of-bounds width positions + if (outp_tidx.data[0] >= W) { + continue; + } + + const ivec4 quantized_out = quantize(facc[subtile_w], output_inv_scale, output_zp); + const int packed_out = pack_into_int32(quantized_out); + + // Store using stride offset from base + int outp_texel_idx; + if (get_outer_packed_dim_block_size(outp_layout) == 1) { + outp_texel_idx = base_outp_texel_idx + subtile_w * out_w_stride; + } else { + // outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + outp_texel_idx = base_outp_texel_idx + subtile_w; + } + + t_packed_int8_output[outp_texel_idx] = packed_out; + + outp_tidx.data[0] += 1; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml new file mode 100644 index 00000000000..5b671e1e8d5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +q8ta_conv2d_dw: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_conv2d_dw diff --git a/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h b/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h new file mode 100644 index 00000000000..76c17cc80b3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +struct Conv2DParams { + utils::ivec2 kernel_size; + utils::ivec2 stride; + utils::ivec2 padding; + utils::ivec2 dilation; + int32_t groups; + int32_t out_channels_per_group; + int32_t in_channels_per_group; + int32_t logical_K_per_group; + int32_t K_per_group; + int32_t K4_per_group; + int32_t logical_K; + int32_t K; + int32_t K4; +}; + +Conv2DParams create_conv2d_params( + ComputeGraph& graph, + const ValueRef& conv_input, + const ValueRef& conv_output, + const ValueRef& kernel_size, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation, + const ValueRef& groups); + +vkapi::SpecVarList GenerateSpecConstants( + ComputeGraph& graph, + Conv2DParams& conv_params, + const ValueRef& groups, + uint32_t apply_bias = 1); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h new file mode 100644 index 00000000000..5f028caec12 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace vkcompute { + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size); + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size); + +void add_q8ta_conv2d_dw_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +void add_q8ta_conv2d_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp new file mode 100644 index 00000000000..5e1e8aab599 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +utils::uvec3 pick_q8ta_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, output); + const uint32_t H = graph->size_at(-2, output); + const uint32_t C = graph->size_at(-3, output); + + // Each thread processes 4 adjacent width positions and 4 channels (4Wx4C + // tile) + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +utils::uvec3 pick_q8ta_conv2d_dw_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)args; + (void)resize_args; + + // Some inactive invocations are okay; set 6 as the threshold to use the + // a square wg size. + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + // If channels dim is sufficiently small, then bias towards width dim to + // reduce the number of inactive invocations. + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + return {16u, 1u, 4u}; +} + +utils::uvec3 int8_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {C4 * W4 * H, 1, 1}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + std::vector weight_orig_sizes = graph.sizes_of(weight_data); + const int64_t ndim = graph.dim_of(weight_data); + + // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] + VK_CHECK_COND(ndim == 3); + int64_t K_h = weight_orig_sizes.at(0); + int64_t K_w = weight_orig_sizes.at(1); + int64_t aligned_K_w = utils::align_up_4(K_w); + int64_t OC = weight_orig_sizes.at(2); + + // The packing format packs the weight tensor into blocks of 4 output channels + // (OC) and 4 kernel elements (K_h * aligned_K_w) + int64_t OC_per_block = 4; + int64_t K_per_block = 4; + + // To figure out the size of the output tensor, determine the number of blocks + // along each dimension. + const int64_t total_K_elements = K_h * aligned_K_w; + const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); + const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); + + // The blocks are arranged in a transposed manner, such that the transposed + // weight block is indexed like packed_weights[k4][oc4] - this is to allow for + // optimal memory coalescing when computing the depthwise convolution. 
+ int64_t output_height = num_blocks_K; + // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit + // values) and each block is represented as a ivec4. Therefore the width dim + // of the packed tensor is multiplied by 4. + int64_t output_width = num_blocks_OC * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec3 orig_sizes = { + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(OC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_OC), + utils::safe_downcast(num_blocks_K), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_dw_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); + + return packed_weight; +} + +// +// Dispatch nodes +// + +void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + vkapi::ParamsBindList param_buffers = { + 
graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input)}; + + vkapi::SpecVarList spec_constants = + GenerateSpecConstants(graph, conv_params, groups, apply_bias); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + int8_conv2d_dw_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_q8ta_conv2d_dw_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + std::string kernel_name = "q8ta_conv2d_dw"; + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + // Pass metadata for both output and input tensors + vkapi::ParamsBindList param_buffers = { + graph.buffer_meta_ubo(packed_int8_output), + graph.buffer_meta_ubo(packed_int8_input), + graph.create_params_buffer(conv_params)}; + + // Build spec constants: apply_bias + layout constants + vkapi::SpecVarList spec_constants = { + apply_bias, + // Layout specialization constants + graph.hashed_layout_of(packed_int8_input), + graph.hashed_layout_of(packed_int8_output), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_conv2d_dw_global_wg_size, + pick_q8ta_conv2d_dw_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {})); +} + +// +// High level operator impl +// + +void q8ta_conv2d_dw(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 
0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + + // Prepack weight using depthwise-specific packing + ValueRef packed_weight = prepack_quantized_conv2d_dw_weight( + graph, weight_quant_config, weight_data, kernel_size); + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // Create a dummy tensor to fill the binding slot of the bias tensor if it is + // not provided. This helps simplify dispatch logic and makes it so that + // fewer shader variants need to be generated. + TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + add_q8ta_conv2d_dw_node( + graph, + packed_int8_input, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.q8ta_conv2d_dw.default, q8ta_conv2d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 2adb32d8c77..a4d959aec41 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,7 +9,10 @@ #include #include +#include +#include #include +#include #include #include #include @@ -101,22 +104,6 @@ bool should_use_im2col( return graph->get_int(groups) == 1; } -struct Conv2DParams { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; - int32_t groups; - int32_t out_channels_per_group; - int32_t in_channels_per_group; - int32_t logical_K_per_group; - int32_t K_per_group; - int32_t K4_per_group; - int32_t logical_K; - int32_t K; - int32_t K4; -}; - Conv2DParams create_conv2d_params( ComputeGraph& graph, const ValueRef& conv_input, @@ -377,23 +364,6 @@ utils::uvec3 pick_static_quantized_conv2d_local_wg_size( graph, shader, global_workgroup_size, args, resize_args); } -utils::uvec3 int8_conv2d_dw_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef packed_int8_output = args.at(0).refs.at(0); - - const uint32_t W = graph->size_at(-1, packed_int8_output); - const uint32_t H = graph->size_at(-2, packed_int8_output); - const uint32_t C = 
graph->size_at(-3, packed_int8_output); - - const uint32_t W4 = utils::div_up_4(W); - const uint32_t C4 = utils::div_up_4(C); - - return {C4 * W4 * H, 1, 1}; -} - // // Prepack nodes // @@ -481,91 +451,6 @@ ValueRef prepack_quantized_conv2d_weight( return packed_weight; } -ValueRef prepack_quantized_conv2d_dw_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef weight_data, - const ValueRef kernel_size) { - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - std::vector weight_orig_sizes = graph.sizes_of(weight_data); - const int64_t ndim = graph.dim_of(weight_data); - - // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] - VK_CHECK_COND(ndim == 3); - int64_t K_h = weight_orig_sizes.at(0); - int64_t K_w = weight_orig_sizes.at(1); - int64_t aligned_K_w = utils::align_up_4(K_w); - int64_t OC = weight_orig_sizes.at(2); - - // The packing format packs the weight tensor into blocks of 4 output channels - // (OC) and 4 kernel elements (K_h * aligned_K_w) - int64_t OC_per_block = 4; - int64_t K_per_block = 4; - - // To figure out the size of the output tensor, determine the number of blocks - // along each dimension. - const int64_t total_K_elements = K_h * aligned_K_w; - const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); - const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); - - // The blocks are arranged in a transposed manner, such that the transposed - // weight block is indexed like packed_weights[k4][oc4] - this is to allow for - // optimal memory coalescing when computing the depthwise convolution. - int64_t output_height = num_blocks_K; - // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit - // values) and each block is represented as a ivec4. Therefore the width dim - // of the packed tensor is multiplied by 4. 
- int64_t output_width = num_blocks_OC * 4; - - // Store the original sizes of the weight data to pass to the shader - utils::ivec3 orig_sizes = { - utils::safe_downcast(K_h), - utils::safe_downcast(K_w), - utils::safe_downcast(OC)}; - - std::vector packed_weight_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef packed_weight = graph.add_tensor( - packed_weight_sizes, - vkcompute::vkapi::kInt, - storage_type, - utils::kWidthPacked); - - utils::uvec3 global_wg_size = { - utils::safe_downcast(num_blocks_OC), - utils::safe_downcast(num_blocks_K), - 1u}; - - std::string kernel_name = "pack_q8_conv2d_dw_weights"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - weight_data, - packed_weight, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(packed_weight), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); - - return packed_weight; -} - // // Dispatch nodes // @@ -573,7 +458,7 @@ vkapi::SpecVarList GenerateSpecConstants( ComputeGraph& graph, Conv2DParams& conv_params, const ValueRef& groups, - uint32_t apply_bias = 1) { + uint32_t apply_bias) { uint32_t conv2d_params_stride_x = conv_params.stride[0]; uint32_t conv2d_params_stride_y = conv_params.stride[1]; uint32_t conv2d_params_padding_x = conv_params.padding[0]; @@ -1028,95 +913,6 @@ void add_conv2d_q8ta_q8csw_q8to_node( nullptr)); } -void add_conv2d_dw_q8ta_q8csw_q8to_node( - ComputeGraph& graph, - const ValueRef packed_int8_input, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef output_scale, - const ValueRef output_zp, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef packed_int8_output) { - Conv2DParams conv_params = create_conv2d_params( - graph, - packed_int8_input, - packed_int8_output, - kernel_size, - stride, - padding, - dilation, - groups); - - // Verify this is actually a depthwise convolution - const int64_t groups_val = graph.extract_scalar(groups); - const int64_t in_channels = graph.size_at(-3, packed_int8_input); - VK_CHECK_COND(groups_val == in_channels); - - float input_scale_val = graph.extract_scalar(input_scale); - int32_t input_zp_val = graph.extract_scalar(input_zp); - - float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); - int32_t output_zp_val = graph.extract_scalar(output_zp); - - std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; - add_storage_type_suffix( - kernel_name, graph.storage_type_of(packed_int8_output)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&input_scale_val, 
sizeof(input_scale_val)), - PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), - PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), - PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - vkapi::SpecVarList spec_constants = - GenerateSpecConstants(graph, conv_params, groups, apply_bias); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - int8_conv2d_dw_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{packed_int8_output, vkapi::kWrite}, - {{packed_int8_input, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_constants, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - // // High level operator impl // @@ -1423,7 +1219,7 @@ void static_quantized_conv2d_impl( // Depthwise conv path if (is_depthwise) { - add_conv2d_dw_q8ta_q8csw_q8to_node( + add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( graph, packed_int8_input, input_scale, @@ -1544,10 +1340,9 @@ void conv2d_q8ta_q8csw_q8to( // Test operators // -void conv2d_q8ta_q8csw_q8to_test( +void test_conv2d_q8ta_q8csw_q8to( ComputeGraph& graph, - const std::vector& args, - utils::StorageType io_storage_type) { + const std::vector& args) { int32_t idx = 0; const ValueRef fp_input = args.at(idx++); const ValueRef input_scale = args.at(idx++); @@ -1563,21 +1358,23 @@ void conv2d_q8ta_q8csw_q8to_test( const ValueRef padding = args.at(idx++); const ValueRef dilation = args.at(idx++); const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); const ValueRef fp_output = args.at(idx++); + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + TmpTensor packed_int8_input( - &graph, - graph.sizes_of(fp_input), - vkapi::kInt8x4, - io_storage_type, - utils::kPackedInt8_4W4C); + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); TmpTensor packed_int8_output( &graph, graph.sizes_of(fp_output), vkapi::kInt8x4, - io_storage_type, - utils::kPackedInt8_4W4C); + utils::kBuffer, + layout); add_q8ta_quantize_node( graph, fp_input, input_scale, input_zp, packed_int8_input); @@ -1605,27 +1402,11 @@ void conv2d_q8ta_q8csw_q8to_test( graph, packed_int8_output, output_scale, output_zp, fp_output); } -void conv2d_q8ta_q8csw_q8to_test_buffer( - ComputeGraph& graph, - const std::vector& args) { - conv2d_q8ta_q8csw_q8to_test(graph, args, utils::kBuffer); -} - -void conv2d_q8ta_q8csw_q8to_test_texture( - ComputeGraph& graph, - const std::vector& args) { - conv2d_q8ta_q8csw_q8to_test(graph, args, utils::kBuffer); -} - REGISTER_OPERATORS { VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); VK_REGISTER_OP( - etvk.conv2d_q8ta_q8csw_q8to.test_texture, - conv2d_q8ta_q8csw_q8to_test_texture); - VK_REGISTER_OP( - etvk.conv2d_q8ta_q8csw_q8to.test_buffer, - conv2d_q8ta_q8csw_q8to_test_buffer); + etvk.test_conv2d_q8ta_q8csw_q8to.default, test_conv2d_q8ta_q8csw_q8to); VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw_q8to.default, conv2d_q8ta_q8csw_q8to); VK_REGISTER_OP( et_vk.conv2d_q8ta_q8csw_q8to_dw.default, conv2d_q8ta_q8csw_q8to); diff --git 
a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h index c3ea15bc318..8913e639f0e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h @@ -8,11 +8,10 @@ #pragma once -#include +#include namespace vkcompute { -// This header is intentionally empty as all quantize/dequantize functions -// have been moved to QuantizeDequantize.h +// This header re-exports ConvolutionUtils.h for backward compatibility } // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 781d69c10fe..3108565361e 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -99,6 +99,6 @@ if(TARGET vulkan_backend) add_operator_prototype(choose_qparams_per_row) add_operator_prototype(test_q8ta_qdq) add_operator_prototype(q8ta_q8csw_q8to_conv2d) - add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) + add_operator_prototype(test_q8ta_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp new file mode 100644 index 00000000000..861f25c0606 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +void test_q8ta_conv2d_dw( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Create temporary packed int8 tensors for input and output + TmpTensor packed_int8_input( + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Quantize floating point input to packed int8 + add_q8ta_quantize_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + // Build args for conv operator + std::vector conv_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + 
bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + if (impl_selector == "legacy_4w4c") { + // Use the general quantized conv2d operator for legacy path + VK_GET_OP_FN("et_vk.conv2d_q8ta_q8csw_q8to.default")(graph, conv_args); + } else { + // Use the dedicated depthwise conv2d operator + VK_GET_OP_FN("etvk.q8ta_conv2d_dw.default")(graph, conv_args); + } + + // Dequantize packed int8 output to floating point + add_q8ta_dequantize_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_q8ta_conv2d_dw.default, test_q8ta_conv2d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 6633365838c..5e556ba6f08 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -93,5 +93,5 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q4gsw_linear") define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") - define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") + define_custom_op_test_binary("test_q8ta_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp similarity index 83% rename from backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp rename to backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp index 2d8d32dde74..b4583071acd 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp @@ -29,20 +29,9 @@ TestCase create_test_case_from_config( const Conv2dConfig& config, vkapi::ScalarType input_dtype, utils::StorageType fp_storage_type, - utils::StorageType int8_storage_type) { + utils::GPUMemoryLayout int8_memory_layout, + const std::string& impl_selector = "") { TestCase test_case; - test_case.set_name(config.test_case_name); - - std::string operator_suffix = ".test"; - if (int8_storage_type == utils::kTexture3D) { - operator_suffix += "_texture"; - } else { - operator_suffix += "_buffer"; - } - - // Set the operator name for the test case - std::string operator_name = "etvk." + config.op_name + operator_suffix; - test_case.set_operator_name(operator_name); // Calculate output dimensions int64_t H_out = config.get_output_height(); @@ -56,6 +45,26 @@ TestCase create_test_case_from_config( ? 
utils::kWidthPacked : utils::kChannelsPacked; + // Create test case name + // Format: ACCU/PERF OC->IC I=H,W g=groups k=kernel Tex(CP)->Buf(4C1W) + std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF" + std::string test_name = prefix + " " + std::to_string(config.channels.out) + + "->" + std::to_string(config.channels.in) + " " + + "I=" + std::to_string(config.input_size.h) + "," + + std::to_string(config.input_size.w) + " " + + "g=" + std::to_string(config.groups) + " " + + "k=" + std::to_string(config.kernel.h) + " " + + repr_str(fp_storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, int8_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case - use the new unified test operator + std::string operator_name = "test_etvk.test_q8ta_conv2d_dw.default"; + test_case.set_operator_name(operator_name); + ValueSpec input_tensor( input_size, input_dtype, @@ -179,10 +188,26 @@ TestCase create_test_case_from_config( test_case.add_input_spec(dilation); test_case.add_input_spec(groups); + // Add memory layout parameter for the quantized tensors + ValueSpec layout_int(static_cast(int8_memory_layout)); + test_case.add_input_spec(layout_int); + + // Add impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output); test_case.set_abs_tolerance(output_scale_val + 1e-4f); + // Filter out quantize/dequantize shaders from timing measurements + test_case.set_shader_filter({ + "nchw_to", + "to_nchw", + "q8ta_quantize", + "q8ta_dequantize", + }); + return test_case; } @@ -203,16 +228,32 @@ std::vector generate_quantized_conv2d_dw_easy_cases() { }; config.op_name = "conv2d_q8ta_q8csw_q8to"; - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { - config.test_case_name = make_test_case_name( - config, false, fp_storage_type, int8_storage_type); + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { + config.test_case_name = + make_test_case_name(config, false, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } @@ -310,9 +351,14 @@ std::vector generate_quantized_conv2d_dw_test_cases() { 32}}; // Test with different storage types and data types - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + 
utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination for (auto& config : configs) { bool is_performance = config.channels.out > kRefDimSizeLimit || @@ -322,12 +368,23 @@ std::vector generate_quantized_conv2d_dw_test_cases() { config.op_name = "conv2d_q8ta_q8csw_q8to"; - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { config.test_case_name = make_test_case_name( config, is_performance, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } } @@ -446,6 +503,9 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + int8_t quantized_input = 0; + int8_t quantized_weight = 0; + // Check bounds (zero padding) if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { // Get input value and quantize to int8 @@ -457,19 +517,12 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { input_zero_point; quant_input_f = std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); + quantized_input = static_cast(quant_input_f); // Get quantized weight using depthwise layout [K_h, K_w, OC] int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; - int8_t quantized_weight = weight_data[weight_idx]; - - if (false && in_w == 0 && in_h == 0 && out_c == 0) { - std::cout << "input: " << input_data[input_idx] << std::endl; - std::cout << "quantized_input: " << (int)quantized_input - << std::endl; - std::cout << "quantized_weight: " << (int)quantized_weight - << std::endl; - } + quantized_weight = weight_data[weight_idx]; + // Integer multiplication and accumulation int_sum += static_cast(quantized_input) * static_cast(quantized_weight); @@ -481,7 +534,10 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { // in weight_sum when input is effectively 0 (but quantized 0 // is input_zero_point) int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; - int8_t quantized_weight = weight_data[weight_idx]; + quantized_weight = weight_data[weight_idx]; + + // Use input_zero_point as the quantized input for padding + quantized_input = static_cast(input_zero_point); // Add contribution from zero-padded input (quantized zero = // input_zero_point) @@ -512,12 +568,6 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); int8_t quantized_output = static_cast(quant_output_f); - if (false && out_c < 4 && out_h < 1 && out_w < 4) { - std::cout << "int_sum[" << out_c << ", " << out_h << ", " << out_w - << "] = " << int_sum << ", " << float_result << ", " - << output_scale << ", " << quant_output_f << std::endl; - } - // Dequantize back to float float dequant_output = 
(static_cast(quantized_output) - output_zero_point) * @@ -593,8 +643,8 @@ int main(int argc, char* argv[]) { 0, 1, #else - 3, - 10, + 5, + 40, #endif ref_fn);
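Note: the per-element requantization performed by the new q8ta_conv2d_dw shader, and mirrored by the test reference implementation above, boils down to the math in the short host-side C++ sketch below. This is an illustration only; the standalone helper, its name, and its exact signature are assumptions for exposition and are not part of this change.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference math for one output element of the quantized depthwise conv.
// 'acc' is the int32 dot product of int8 weights with quantized inputs, where
// out-of-bounds taps contribute input_zp instead of a real input value, and
// 'weight_sum' is the sum of the int8 weights for this output channel.
inline int8_t requantize_dw_output(
    int32_t acc,
    int32_t weight_sum,
    float input_scale,
    int32_t input_zp,
    float weight_scale,
    float bias,
    float output_scale,
    int32_t output_zp) {
  // Remove the input zero-point contribution, rescale to float, and add the
  // (optional) bias, matching the facc computation in the shader.
  const float result =
      (static_cast<float>(acc) - static_cast<float>(weight_sum * input_zp)) *
          (weight_scale * input_scale) +
      bias;
  // Affine-quantize to int8 with the output scale / zero point and clamp,
  // matching the shader's quantize() helper (output_inv_scale == 1 / scale).
  float q = std::round(result / output_scale) + static_cast<float>(output_zp);
  q = std::min(std::max(q, -128.0f), 127.0f);
  return static_cast<int8_t>(q);
}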