12 changes: 12 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/common.glslh
@@ -98,6 +98,18 @@ void printVec4(vec4 texel) {
"texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w);
}

void printIVec4(ivec4 texel) {
debugPrintfEXT(
"texel: %d, %d, %d, %d\\n", texel.x, texel.y, texel.z, texel.w);
}

void printPackedInt(const int packed) {
ivec4 unpacked = unpack_int8x4(packed);
debugPrintfEXT(
"packed: 0x%08x -> [%d, %d, %d, %d]\\n",
packed, unpacked.x, unpacked.y, unpacked.z, unpacked.w);
}

#endif // DEBUG_MODE

#endif // COMMON_GLSLH
81 changes: 80 additions & 1 deletion backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -331,6 +331,52 @@ TensorIndex linear_idx_to_tensor_idx(
return linear_idx_to_tensor_idx(meta, linear_idx);
}

/*
* Convert a linear texel index to a TensorIndex4D.
*
* This function is used for texel-based dispatch where each thread handles
* one packed texel (4 elements along the packed dimension). The texel index
* is decomposed using the dim_order and strides from the tensor's layout.
*
* The strides in BufferMetadata should already be in texel space (with packed
* dimension size divided by 4).
*
* Parameters:
* meta: BufferMetadata with tensor sizes and texel-space strides
* texel_idx: Linear index into packed texels (0 to num_texels-1)
* hashed_layout: Packed layout info containing dim_order and packed_dim
*
 * Returns: TensorIndex4D with logical tensor coordinates (the packed dim coordinate points to the first element of its 4-element block)
*/
TensorIndex4D texel_idx_to_tensor4d_idx(
const BufferMetadata meta,
uint texel_idx,
const int hashed_layout) {
TensorIndex4D tidx;

const int packed_dim = get_packed_dim(hashed_layout);

// Decompose texel_idx using dim_order from hashed_layout and strides from meta
// Iterate from slowest-varying dimension (d=3) to fastest (d=0)
// This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
[[unroll]] for (int d = 3; d >= 0; d--) {
// Get dim index from hashed_layout's dim_order (bits 0-15)
int dim_idx = extract_4b(hashed_layout, d);

// Get stride for this dimension from BufferMetadata
uint dim_stride = meta.strides[0][dim_idx];

// Compute coordinate for this dimension
tidx.data[dim_idx] = int(texel_idx / dim_stride);
texel_idx = texel_idx % dim_stride;
}

// Convert packed dimension from texel index to element index
tidx.data[packed_dim] *= 4;

return tidx;
}
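
The decomposition above is a mixed-radix division by texel-space strides, walked in dim_order from slowest to fastest. A minimal host-side C sketch of the same math; the sizes, strides, and dim_order below are illustrative assumptions, not values read from an actual BufferMetadata:

#include <stdio.h>

int main(void) {
    // Example channels-packed tensor, sizes WHCN = [4, 3, 16, 1] with C packed
    // 4-wide, giving texel-space strides (W, H, C, N) = {1, 4, 12, 48}.
    // These values are made up for illustration.
    const unsigned strides[4] = {1, 4, 12, 48};
    const int dim_order[4] = {0, 1, 2, 3}; // d=0 is the fastest dim (W), d=3 the slowest (N)
    const int packed_dim = 2;              // channels

    unsigned texel_idx = 29;
    int tidx[4];

    // Same loop as texel_idx_to_tensor4d_idx: peel off the slowest dims first.
    for (int d = 3; d >= 0; d--) {
        const int dim = dim_order[d];
        tidx[dim] = (int)(texel_idx / strides[dim]);
        texel_idx %= strides[dim];
    }
    tidx[packed_dim] *= 4; // texel coordinate -> element coordinate

    // Prints: tensor idx (WHCN): [1, 1, 8, 0]
    printf("tensor idx (WHCN): [%d, %d, %d, %d]\n", tidx[0], tidx[1], tidx[2], tidx[3]);
    return 0;
}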

uint tensor_idx_to_linear_idx(
const BufferMetadata meta,
const TensorIndex tidx) {
@@ -524,6 +570,39 @@ int tensor4d_idx_to_buf_idx(
return block_idx * block_numel + intra_block_idx;
}

/*
* Convert a tensor index to a texel index for block-packed layouts.
*
* For texel-packed tensors (outer_block_size == 1):
* - Each block corresponds to one texel
* - Returns block_idx directly
*
 * For block-packed tensors (outer_block_size == 4, i.e. 4x4 blocks):
* - Each block contains 4 texels (16 elements / 4 elements per texel)
* - texel_idx = block_idx * 4 + (intra_block_idx / 4)
*
* Parameters:
* meta: BufferMetadata containing sizes and block-space strides
* tidx: TensorIndex4D with logical tensor coordinates
* hashed_layout: Packed layout info
*
* Returns: Linear texel index
*/
int tensor4d_idx_to_texel_idx(
const BufferMetadata meta,
const TensorIndex4D tidx,
const int hashed_layout) {
const int block_idx = tensor4d_idx_to_block_idx(meta, tidx, hashed_layout);

if (get_outer_packed_dim_block_size(hashed_layout) == 4) {
const int intra_block_idx =
tensor4d_idx_to_intra_block_idx(tidx, hashed_layout);
return block_idx * 4 + div_4(intra_block_idx);
}

return block_idx;
}
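
As a worked example of the block-packed branch (values chosen purely for illustration): with 4x4 blocks, an element whose block_idx is 7 and whose intra_block_idx is 13 lands at texel_idx = 7 * 4 + 13 / 4 = 31, i.e. the fourth texel of block 7. In the texel-packed case the block index is returned unchanged, since each block is exactly one texel.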

//
// Debug utilities
//
@@ -540,7 +619,7 @@ void printTensorIndex(const TensorIndex tidx) {

void printTensorIndex4D(const TensorIndex4D tidx) {
debugPrintfEXT(
"TensorIndex4D: [%u, %u, %u, %u]\\n",
"TensorIndex4: [%d, %d, %d, %d]\\n",
tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3]
);
}
227 changes: 227 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl
@@ -0,0 +1,227 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_load_type(DTYPE, "buffer")}
#define T ${texel_load_component_type(DTYPE, "buffer")}

${define_active_storage_type("buffer")}

layout(std430) buffer;

#include "indexing.glslh"
#include "common.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", "texture2d", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)}

// Metadata for input/output tensors (memory layout agnostic)
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
int input_zp;
float output_inv_scale;
int output_zp;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "apply_bias", "1")}

// Layout specialization constants
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

#include "block_indexing.glslh"

// Load a 4xint8 block of weights.
// Weights are stored in 4W4C format: [kH, kW/4, C/4, 4, 4] where the first 4 is
// the outer (kW) dimension and the second 4 is the inner (channel) dimension.
// Returns packed int32 containing 4 int8 values for channels c to c+3.
int load_weight(int kw, int kh, int c4, int KW4, int C4) {
// Find the packed block index (4W4C tiling)
const int kw4 = kw / 4; // W block
const int block_x_offset = kw % 4;
// Texture layout: x = c4, y = kh * KW4 + kw4
return texelFetch(t_packed_int8_weight, ivec2(c4, kh * KW4 + kw4), 0)[block_x_offset];
}
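
For example (illustrative values only): with a 5x5 kernel, KW4 = div_up_4(5) = 2, so the weight at kw = 3, kh = 1 for channel group c4 = 2 has kw4 = 0 and block_x_offset = 3, and is read from texel (x = 2, y = 1 * 2 + 0 = 2), component 3.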

ivec4 quantize(const vec4 texel, const float inv_scale, const int zp) {
vec4 quantized = round(texel * inv_scale) + zp;
return clamp(ivec4(quantized), -128, 127);
}
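
For instance, with inv_scale = 8.0 and zp = -2 (example values only), a dequantized value of 1.3 quantizes to round(1.3 * 8.0) + (-2) = 8, while 20.0 would saturate to 127 after clamping.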

void main() {
const int c4 = int(gl_GlobalInvocationID.z);

// Initialize output tensor index (WHCN order)
// Each thread handles 4 adjacent widths starting at base_out_w
TensorIndex4D outp_tidx;
outp_tidx.data[0] = int(gl_GlobalInvocationID.x) * 4;
outp_tidx.data[1] = int(gl_GlobalInvocationID.y);
outp_tidx.data[2] = c4 * 4;
outp_tidx.data[3] = 0;

const int W = int(outp.sizes[0][0]);
const int C4 = int(div_up_4(outp.sizes[0][2]));

// Bounds check
if (any(greaterThanEqual(outp_tidx.data, ivec4(outp.sizes[0])))) {
return;
}

// Compute weight addressing constants
const int KW4 = int(div_up_4(conv2d_params.kernel_size.x));

// Get strides for width and height dimensions (in texel space)
const int w_stride = int(inp.strides[0][0]);
const int h_stride = int(inp.strides[0][1]);

// Pre-compute step sizes for efficient indexing
const int w_texel_step = conv2d_params.dilation.x * w_stride;
const int h_texel_step = conv2d_params.dilation.y * h_stride;
// Step between adjacent output width positions in input texel space
const int subtile_w_step = conv2d_params.stride.x * w_stride;

// Compute base input position for subtile_w=0
TensorIndex4D inp_tidx;
inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x;
inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y;
inp_tidx.data[2] = outp_tidx.data[2];
inp_tidx.data[3] = 0; // batch = 0 since N == 1

int base_inp_texel_idx;
if (get_outer_packed_dim_block_size(inp_layout) == 1) {
base_inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout);
}

// Store the base width position so the width coordinate can be reset at the
// start of each kernel row
const int base_inp_w = inp_tidx.data[0];

// Initialize accumulators for 4 width positions × 4 channels each
ivec4 acc[4];
[[unroll]] for (int i = 0; i < 4; ++i) {
acc[i] = ivec4(0);
}

// Input dimensions for bounds checking
const int inp_W = int(inp.sizes[0][0]);
const int inp_H = int(inp.sizes[0][1]);

// Perform depthwise convolution
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H);

// Reset width coordinate at start of each kernel row
inp_tidx.data[0] = base_inp_w;

for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
// Load weight once, reuse for all 4 width positions
const int packed_weight = load_weight(kx, ky, c4, KW4, C4);
const ivec4 weight_4c = unpack_int8x4(packed_weight);

// Process 4 adjacent width positions using stride offsets
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
ivec4 input_4c = ivec4(input_zp);
if (h_in_bounds && inp_tidx.data[0] >= 0 && inp_tidx.data[0] < inp_W) {
// Compute texel index: base + kernel offset + subtile offset
int inp_texel_idx;
if (get_outer_packed_dim_block_size(inp_layout) == 1) {
inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step;
} else {
// const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x;
// inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset);
// inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout);
const int w4 = div_4(inp_tidx.data[0]);
inp_texel_idx = (inp_tidx.data[1] * h_stride + w4 * w_stride + c4) * 4 + mod_4(inp_tidx.data[0]);
}
const int packed_input = t_packed_int8_input[inp_texel_idx];
input_4c = unpack_int8x4(packed_input);
}

// Accumulate: element-wise multiply for depthwise conv
acc[subtile_w] += weight_4c * input_4c;

// Advance to next output position's input coordinate
inp_tidx.data[0] += conv2d_params.stride.x;
}

// We advanced by 4*stride.x during subtile loop; adjust for net dilation step
inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x;
}

// Advance height by dilation for next kernel row
inp_tidx.data[1] += conv2d_params.dilation.y;

if (get_outer_packed_dim_block_size(inp_layout) == 1) {
// Advance base index by height step for next kernel row
base_inp_texel_idx += h_texel_step;
}
}

// Apply input zero point as weight_sum * input_zp
const vec4 weight_sums = vec4(t_weight_sums[c4]);
const vec4 weight_scales = vec4(t_weight_scales[c4]);

// Convert to float, apply dequantization, and optionally add bias
vec4 facc[4];
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
facc[subtile_w] = vec4(acc[subtile_w]);
facc[subtile_w] -= weight_sums * input_zp;
facc[subtile_w] *= weight_scales * input_scale;
}

// Apply bias if enabled
if (apply_bias > 0) {
const vec4 bias = vec4(t_bias[c4]);
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
facc[subtile_w] += bias;
}
}

// Compute base output texel index (for subtile_w=0)
const int base_outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout);
const int out_w_stride = int(outp.strides[0][0]);

// Quantize and store outputs using stride offsets
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
// Skip out-of-bounds width positions
if (outp_tidx.data[0] >= W) {
continue;
}

const ivec4 quantized_out = quantize(facc[subtile_w], output_inv_scale, output_zp);
const int packed_out = pack_into_int32(quantized_out);

// Store using stride offset from base
int outp_texel_idx;
if (get_outer_packed_dim_block_size(outp_layout) == 1) {
outp_texel_idx = base_outp_texel_idx + subtile_w * out_w_stride;
} else {
// outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout);
outp_texel_idx = base_outp_texel_idx + subtile_w;
}

t_packed_int8_output[outp_texel_idx] = packed_out;

outp_tidx.data[0] += 1;
}
}
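
To sanity-check the dequantize/requantize arithmetic in main(), here is a scalar host-side C reference for a single output channel at one spatial position. All parameter and tensor values are illustrative assumptions, and the shader operates on packed int8x4 texels rather than this scalar loop:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Illustrative quantization parameters (arbitrary, not taken from a real model).
    const float input_scale = 0.05f, weight_scale = 0.02f, bias = 0.25f;
    const float output_inv_scale = 1.0f / 0.1f;
    const int input_zp = 3, output_zp = -1;

    // One 3x3 depthwise kernel and the matching 3x3 input window for one channel.
    // Out-of-bounds taps would be filled with input_zp, as in the shader.
    const int8_t w[9] = {1, -2, 3, 4, -5, 6, -7, 8, 9};
    const int8_t x[9] = {12, 0, -4, 7, 3, 3, -1, 5, 2}; // quantized activation codes

    int32_t acc = 0, weight_sum = 0;
    for (int i = 0; i < 9; i++) {
        acc += (int32_t)w[i] * (int32_t)x[i]; // integer accumulation, as in acc[subtile_w]
        weight_sum += w[i];                   // corresponds to t_weight_sums
    }

    // Dequantize: remove the input zero-point contribution, then apply the
    // combined weight/input scale and the bias.
    const float y = (float)(acc - weight_sum * input_zp) * (weight_scale * input_scale) + bias;

    // Requantize to int8 for the packed output tensor.
    int q = (int)roundf(y * output_inv_scale) + output_zp;
    q = q < -128 ? -128 : (q > 127 ? 127 : q);

    // Prints approximately: dequantized 0.2950 -> requantized 2
    printf("dequantized %.4f -> requantized %d\n", y, q);
    return 0;
}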
14 changes: 14 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml
@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

q8ta_conv2d_dw:
parameter_names_with_default_values:
DTYPE: float
generate_variant_forall:
DTYPE:
- VALUE: float
shader_variants:
- NAME: q8ta_conv2d_dw