diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 3b2010c7963..a752f23d3ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -98,6 +98,18 @@ void printVec4(vec4 texel) { "texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w); } +void printIVec4(ivec4 texel) { + debugPrintfEXT( + "texel: %d, %d, %d, %d\\n", texel.x, texel.y, texel.z, texel.w); +} + +void printPackedInt(const int packed) { + ivec4 unpacked = unpack_int8x4(packed); + debugPrintfEXT( + "packed: 0x%08x -> [%d, %d, %d, %d]\\n", + packed, unpacked.x, unpacked.y, unpacked.z, unpacked.w); +} + #endif // DEBUG_MODE #endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 16c4112547c..37c47795214 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -331,6 +331,52 @@ TensorIndex linear_idx_to_tensor_idx( return linear_idx_to_tensor_idx(meta, linear_idx); } +/* + * Convert a linear texel index to a TensorIndex4D. + * + * This function is used for texel-based dispatch where each thread handles + * one packed texel (4 elements along the packed dimension). The texel index + * is decomposed using the dim_order and strides from the tensor's layout. + * + * The strides in BufferMetadata should already be in texel space (with packed + * dimension size divided by 4). + * + * Parameters: + * meta: BufferMetadata with tensor sizes and texel-space strides + * texel_idx: Linear index into packed texels (0 to num_texels-1) + * hashed_layout: Packed layout info containing dim_order and packed_dim + * + * Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block) + */ +TensorIndex4D texel_idx_to_tensor4d_idx( + const BufferMetadata meta, + uint texel_idx, + const int hashed_layout) { + TensorIndex4D tidx; + + const int packed_dim = get_packed_dim(hashed_layout); + + // Decompose texel_idx using dim_order from hashed_layout and strides from meta + // Iterate from slowest-varying dimension (d=3) to fastest (d=0) + // This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh + [[unroll]] for (int d = 3; d >= 0; d--) { + // Get dim index from hashed_layout's dim_order (bits 0-15) + int dim_idx = extract_4b(hashed_layout, d); + + // Get stride for this dimension from BufferMetadata + uint dim_stride = meta.strides[0][dim_idx]; + + // Compute coordinate for this dimension + tidx.data[dim_idx] = int(texel_idx / dim_stride); + texel_idx = texel_idx % dim_stride; + } + + // Convert packed dimension from texel index to element index + tidx.data[packed_dim] *= 4; + + return tidx; +} + uint tensor_idx_to_linear_idx( const BufferMetadata meta, const TensorIndex tidx) { @@ -524,6 +570,39 @@ int tensor4d_idx_to_buf_idx( return block_idx * block_numel + intra_block_idx; } +/* + * Convert a tensor index to a texel index for block-packed layouts. 
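+ * As a quick illustrative example (hypothetical values, consistent with the
+ * formulas below): for a texel-packed layout each block is one texel, so the
+ * result is simply the block index; for a 4x4 block-packed layout,
+ * block_idx = 2 with intra_block_idx = 9 maps to texel 2 * 4 + 9 / 4 = 10.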
+ * + * For texel-packed tensors (outer_block_size == 1): + * - Each block corresponds to one texel + * - Returns block_idx directly + * + * For block-packed tensors (outer_block_size > 1, e.g., 4x4 blocks): + * - Each block contains 4 texels (16 elements / 4 elements per texel) + * - texel_idx = block_idx * 4 + (intra_block_idx / 4) + * + * Parameters: + * meta: BufferMetadata containing sizes and block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Linear texel index + */ +int tensor4d_idx_to_texel_idx( + const BufferMetadata meta, + const TensorIndex4D tidx, + const int hashed_layout) { + const int block_idx = tensor4d_idx_to_block_idx(meta, tidx, hashed_layout); + + if (get_outer_packed_dim_block_size(hashed_layout) == 4) { + const int intra_block_idx = + tensor4d_idx_to_intra_block_idx(tidx, hashed_layout); + return block_idx * 4 + div_4(intra_block_idx); + } + + return block_idx; +} + // // Debug utilities // @@ -540,7 +619,7 @@ void printTensorIndex(const TensorIndex tidx) { void printTensorIndex4D(const TensorIndex4D tidx) { debugPrintfEXT( - "TensorIndex4D: [%u, %u, %u, %u]\\n", + "TensorIndex4: [%d, %d, %d, %d]\\n", tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3] ); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl new file mode 100644 index 00000000000..e6be92e7ba1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl @@ -0,0 +1,227 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", "texture2d", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +// Metadata for input/output tensors (memory layout agnostic) +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +// Layout specialization constants +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} + +#include "block_indexing.glslh" + +// Load a 4xint8 block of weights. 
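+// Illustrative addressing example (hypothetical 3x3 kernel, so KW4 = 1): the
+// weight at (kw = 2, kh = 1, c4 = 0) is fetched from the texel at
+// x = c4 = 0, y = kh * KW4 + kw / 4 = 1, as component kw % 4 = 2.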
+// Weights are stored in 4W4C format: [kH, kW/4, C/4, 4, 4] where the first 4 is +// the outer (kW) dimension and the second 4 is the inner (channel) dimension. +// Returns packed int32 containing 4 int8 values for channels c to c+3. +int load_weight(int kw, int kh, int c4, int KW4, int C4) { + // Find the packed block index (4W4C tiling) + const int kw4 = kw / 4; // W block + const int block_x_offset = kw % 4; + // Texture layout: x = c4, y = kh * KW4 + kw4 + return texelFetch(t_packed_int8_weight, ivec2(c4, kh * KW4 + kw4), 0)[block_x_offset]; +} + +ivec4 quantize(const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +void main() { + const int c4 = int(gl_GlobalInvocationID.z); + + // Initialize output tensor index (WHCN order) + // Each thread handles 4 adjacent widths starting at base_out_w + TensorIndex4D outp_tidx; + outp_tidx.data[0] = int(gl_GlobalInvocationID.x) * 4; + outp_tidx.data[1] = int(gl_GlobalInvocationID.y); + outp_tidx.data[2] = c4 * 4; + outp_tidx.data[3] = 0; + + const int W = int(outp.sizes[0][0]); + const int C4 = int(div_up_4(outp.sizes[0][2])); + + // Bounds check + if (any(greaterThanEqual(outp_tidx.data, ivec4(outp.sizes[0])))) { + return; + } + + // Compute weight addressing constants + const int KW4 = int(div_up_4(conv2d_params.kernel_size.x)); + + // Get strides for width and height dimensions (in texel space) + const int w_stride = int(inp.strides[0][0]); + const int h_stride = int(inp.strides[0][1]); + + // Pre-compute step sizes for efficient indexing + const int w_texel_step = conv2d_params.dilation.x * w_stride; + const int h_texel_step = conv2d_params.dilation.y * h_stride; + // Step between adjacent output width positions in input texel space + const int subtile_w_step = conv2d_params.stride.x * w_stride; + + // Compute base input position for subtile_w=0 + TensorIndex4D inp_tidx; + inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; + inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[2] = outp_tidx.data[2]; + inp_tidx.data[3] = 0; // batch = 0 since N == 1 + + int base_inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + base_inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + } + + // Store the base width position to reset the index position at the beginning + // of each loop + const int base_inp_w = inp_tidx.data[0]; + + // Initialize accumulators for 4 width positions × 4 channels each + ivec4 acc[4]; + [[unroll]] for (int i = 0; i < 4; ++i) { + acc[i] = ivec4(0); + } + + // Input dimensions for bounds checking + const int inp_W = int(inp.sizes[0][0]); + const int inp_H = int(inp.sizes[0][1]); + + // Perform depthwise convolution + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); + + // Reset width coordinate at start of each kernel row + inp_tidx.data[0] = base_inp_w; + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + // Load weight once, reuse for all 4 width positions + const int packed_weight = load_weight(kx, ky, c4, KW4, C4); + const ivec4 weight_4c = unpack_int8x4(packed_weight); + + // Process 4 adjacent width positions using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + ivec4 input_4c = ivec4(input_zp); + if (h_in_bounds && inp_tidx.data[0] >= 0 && inp_tidx.data[0] < 
inp_W) { + // Compute texel index: base + kernel offset + subtile offset + int inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step; + } else { + // const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x; + // inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset); + // inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + const int w4 = div_4(inp_tidx.data[0]); + inp_texel_idx = (inp_tidx.data[1] * h_stride + w4 * w_stride + c4) * 4 + mod_4(inp_tidx.data[0]); + } + const int packed_input = t_packed_int8_input[inp_texel_idx]; + input_4c = unpack_int8x4(packed_input); + } + + // Accumulate: element-wise multiply for depthwise conv + acc[subtile_w] += weight_4c * input_4c; + + // Advance to next output position's input coordinate + inp_tidx.data[0] += conv2d_params.stride.x; + } + + // We advanced by 4*stride.x during subtile loop; adjust for net dilation step + inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + } + + // Advance height by dilation for next kernel row + inp_tidx.data[1] += conv2d_params.dilation.y; + + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + // Advance base index by height step for next kernel row + base_inp_texel_idx += h_texel_step; + } + } + + // Apply input zero point as weight_sum * input_zp + const vec4 weight_sums = vec4(t_weight_sums[c4]); + const vec4 weight_scales = vec4(t_weight_scales[c4]); + + // Convert to float, apply dequantization, and optionally add bias + vec4 facc[4]; + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] = vec4(acc[subtile_w]); + facc[subtile_w] -= weight_sums * input_zp; + facc[subtile_w] *= weight_scales * input_scale; + } + + // Apply bias if enabled + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[c4]); + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] += bias; + } + } + + // Compute base output texel index (for subtile_w=0) + const int base_outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + const int out_w_stride = int(outp.strides[0][0]); + + // Quantize and store outputs using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Skip out-of-bounds width positions + if (outp_tidx.data[0] >= W) { + continue; + } + + const ivec4 quantized_out = quantize(facc[subtile_w], output_inv_scale, output_zp); + const int packed_out = pack_into_int32(quantized_out); + + // Store using stride offset from base + int outp_texel_idx; + if (get_outer_packed_dim_block_size(outp_layout) == 1) { + outp_texel_idx = base_outp_texel_idx + subtile_w * out_w_stride; + } else { + // outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + outp_texel_idx = base_outp_texel_idx + subtile_w; + } + + t_packed_int8_output[outp_texel_idx] = packed_out; + + outp_tidx.data[0] += 1; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml new file mode 100644 index 00000000000..5b671e1e8d5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +q8ta_conv2d_dw: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_conv2d_dw diff --git a/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h b/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h new file mode 100644 index 00000000000..76c17cc80b3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +struct Conv2DParams { + utils::ivec2 kernel_size; + utils::ivec2 stride; + utils::ivec2 padding; + utils::ivec2 dilation; + int32_t groups; + int32_t out_channels_per_group; + int32_t in_channels_per_group; + int32_t logical_K_per_group; + int32_t K_per_group; + int32_t K4_per_group; + int32_t logical_K; + int32_t K; + int32_t K4; +}; + +Conv2DParams create_conv2d_params( + ComputeGraph& graph, + const ValueRef& conv_input, + const ValueRef& conv_output, + const ValueRef& kernel_size, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation, + const ValueRef& groups); + +vkapi::SpecVarList GenerateSpecConstants( + ComputeGraph& graph, + Conv2DParams& conv_params, + const ValueRef& groups, + uint32_t apply_bias = 1); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h new file mode 100644 index 00000000000..5f028caec12 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace vkcompute { + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size); + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size); + +void add_q8ta_conv2d_dw_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +void add_q8ta_conv2d_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp new file mode 100644 index 00000000000..5e1e8aab599 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +utils::uvec3 pick_q8ta_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, output); + const uint32_t H = graph->size_at(-2, output); + const uint32_t C = graph->size_at(-3, output); + + // Each thread processes 4 adjacent width positions and 4 channels (4Wx4C + // tile) + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +utils::uvec3 pick_q8ta_conv2d_dw_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)args; + (void)resize_args; + + // Some inactive invocations are okay; set 6 as the threshold to use the + // a square wg size. + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + // If channels dim is sufficiently small, then bias towards width dim to + // reduce the number of inactive invocations. + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + return {16u, 1u, 4u}; +} + +utils::uvec3 int8_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {C4 * W4 * H, 1, 1}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + std::vector weight_orig_sizes = graph.sizes_of(weight_data); + const int64_t ndim = graph.dim_of(weight_data); + + // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] + VK_CHECK_COND(ndim == 3); + int64_t K_h = weight_orig_sizes.at(0); + int64_t K_w = weight_orig_sizes.at(1); + int64_t aligned_K_w = utils::align_up_4(K_w); + int64_t OC = weight_orig_sizes.at(2); + + // The packing format packs the weight tensor into blocks of 4 output channels + // (OC) and 4 kernel elements (K_h * aligned_K_w) + int64_t OC_per_block = 4; + int64_t K_per_block = 4; + + // To figure out the size of the output tensor, determine the number of blocks + // along each dimension. + const int64_t total_K_elements = K_h * aligned_K_w; + const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); + const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); + + // The blocks are arranged in a transposed manner, such that the transposed + // weight block is indexed like packed_weights[k4][oc4] - this is to allow for + // optimal memory coalescing when computing the depthwise convolution. 
+ int64_t output_height = num_blocks_K; + // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit + // values) and each block is represented as a ivec4. Therefore the width dim + // of the packed tensor is multiplied by 4. + int64_t output_width = num_blocks_OC * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec3 orig_sizes = { + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(OC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_OC), + utils::safe_downcast(num_blocks_K), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_dw_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); + + return packed_weight; +} + +// +// Dispatch nodes +// + +void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + vkapi::ParamsBindList param_buffers = { + 
graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input)}; + + vkapi::SpecVarList spec_constants = + GenerateSpecConstants(graph, conv_params, groups, apply_bias); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + int8_conv2d_dw_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_q8ta_conv2d_dw_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + std::string kernel_name = "q8ta_conv2d_dw"; + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + // Pass metadata for both output and input tensors + vkapi::ParamsBindList param_buffers = { + graph.buffer_meta_ubo(packed_int8_output), + graph.buffer_meta_ubo(packed_int8_input), + graph.create_params_buffer(conv_params)}; + + // Build spec constants: apply_bias + layout constants + vkapi::SpecVarList spec_constants = { + apply_bias, + // Layout specialization constants + graph.hashed_layout_of(packed_int8_input), + graph.hashed_layout_of(packed_int8_output), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_conv2d_dw_global_wg_size, + pick_q8ta_conv2d_dw_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {})); +} + +// +// High level operator impl +// + +void q8ta_conv2d_dw(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 
0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + + // Prepack weight using depthwise-specific packing + ValueRef packed_weight = prepack_quantized_conv2d_dw_weight( + graph, weight_quant_config, weight_data, kernel_size); + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // Create a dummy tensor to fill the binding slot of the bias tensor if it is + // not provided. This helps simplify dispatch logic and makes it so that + // fewer shader variants need to be generated. + TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + add_q8ta_conv2d_dw_node( + graph, + packed_int8_input, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.q8ta_conv2d_dw.default, q8ta_conv2d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 2adb32d8c77..a4d959aec41 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,7 +9,10 @@ #include #include +#include +#include #include +#include #include #include #include @@ -101,22 +104,6 @@ bool should_use_im2col( return graph->get_int(groups) == 1; } -struct Conv2DParams { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; - int32_t groups; - int32_t out_channels_per_group; - int32_t in_channels_per_group; - int32_t logical_K_per_group; - int32_t K_per_group; - int32_t K4_per_group; - int32_t logical_K; - int32_t K; - int32_t K4; -}; - Conv2DParams create_conv2d_params( ComputeGraph& graph, const ValueRef& conv_input, @@ -377,23 +364,6 @@ utils::uvec3 pick_static_quantized_conv2d_local_wg_size( graph, shader, global_workgroup_size, args, resize_args); } -utils::uvec3 int8_conv2d_dw_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef packed_int8_output = args.at(0).refs.at(0); - - const uint32_t W = graph->size_at(-1, packed_int8_output); - const uint32_t H = graph->size_at(-2, packed_int8_output); - const uint32_t C = 
graph->size_at(-3, packed_int8_output); - - const uint32_t W4 = utils::div_up_4(W); - const uint32_t C4 = utils::div_up_4(C); - - return {C4 * W4 * H, 1, 1}; -} - // // Prepack nodes // @@ -481,91 +451,6 @@ ValueRef prepack_quantized_conv2d_weight( return packed_weight; } -ValueRef prepack_quantized_conv2d_dw_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef weight_data, - const ValueRef kernel_size) { - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - std::vector weight_orig_sizes = graph.sizes_of(weight_data); - const int64_t ndim = graph.dim_of(weight_data); - - // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] - VK_CHECK_COND(ndim == 3); - int64_t K_h = weight_orig_sizes.at(0); - int64_t K_w = weight_orig_sizes.at(1); - int64_t aligned_K_w = utils::align_up_4(K_w); - int64_t OC = weight_orig_sizes.at(2); - - // The packing format packs the weight tensor into blocks of 4 output channels - // (OC) and 4 kernel elements (K_h * aligned_K_w) - int64_t OC_per_block = 4; - int64_t K_per_block = 4; - - // To figure out the size of the output tensor, determine the number of blocks - // along each dimension. - const int64_t total_K_elements = K_h * aligned_K_w; - const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); - const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); - - // The blocks are arranged in a transposed manner, such that the transposed - // weight block is indexed like packed_weights[k4][oc4] - this is to allow for - // optimal memory coalescing when computing the depthwise convolution. - int64_t output_height = num_blocks_K; - // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit - // values) and each block is represented as a ivec4. Therefore the width dim - // of the packed tensor is multiplied by 4. 
- int64_t output_width = num_blocks_OC * 4; - - // Store the original sizes of the weight data to pass to the shader - utils::ivec3 orig_sizes = { - utils::safe_downcast(K_h), - utils::safe_downcast(K_w), - utils::safe_downcast(OC)}; - - std::vector packed_weight_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef packed_weight = graph.add_tensor( - packed_weight_sizes, - vkcompute::vkapi::kInt, - storage_type, - utils::kWidthPacked); - - utils::uvec3 global_wg_size = { - utils::safe_downcast(num_blocks_OC), - utils::safe_downcast(num_blocks_K), - 1u}; - - std::string kernel_name = "pack_q8_conv2d_dw_weights"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - weight_data, - packed_weight, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(packed_weight), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); - - return packed_weight; -} - // // Dispatch nodes // @@ -573,7 +458,7 @@ vkapi::SpecVarList GenerateSpecConstants( ComputeGraph& graph, Conv2DParams& conv_params, const ValueRef& groups, - uint32_t apply_bias = 1) { + uint32_t apply_bias) { uint32_t conv2d_params_stride_x = conv_params.stride[0]; uint32_t conv2d_params_stride_y = conv_params.stride[1]; uint32_t conv2d_params_padding_x = conv_params.padding[0]; @@ -1028,95 +913,6 @@ void add_conv2d_q8ta_q8csw_q8to_node( nullptr)); } -void add_conv2d_dw_q8ta_q8csw_q8to_node( - ComputeGraph& graph, - const ValueRef packed_int8_input, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef output_scale, - const ValueRef output_zp, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef packed_int8_output) { - Conv2DParams conv_params = create_conv2d_params( - graph, - packed_int8_input, - packed_int8_output, - kernel_size, - stride, - padding, - dilation, - groups); - - // Verify this is actually a depthwise convolution - const int64_t groups_val = graph.extract_scalar(groups); - const int64_t in_channels = graph.size_at(-3, packed_int8_input); - VK_CHECK_COND(groups_val == in_channels); - - float input_scale_val = graph.extract_scalar(input_scale); - int32_t input_zp_val = graph.extract_scalar(input_zp); - - float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); - int32_t output_zp_val = graph.extract_scalar(output_zp); - - std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; - add_storage_type_suffix( - kernel_name, graph.storage_type_of(packed_int8_output)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&input_scale_val, 
sizeof(input_scale_val)), - PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), - PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), - PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - vkapi::SpecVarList spec_constants = - GenerateSpecConstants(graph, conv_params, groups, apply_bias); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - int8_conv2d_dw_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{packed_int8_output, vkapi::kWrite}, - {{packed_int8_input, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_constants, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - // // High level operator impl // @@ -1423,7 +1219,7 @@ void static_quantized_conv2d_impl( // Depthwise conv path if (is_depthwise) { - add_conv2d_dw_q8ta_q8csw_q8to_node( + add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node( graph, packed_int8_input, input_scale, @@ -1544,10 +1340,9 @@ void conv2d_q8ta_q8csw_q8to( // Test operators // -void conv2d_q8ta_q8csw_q8to_test( +void test_conv2d_q8ta_q8csw_q8to( ComputeGraph& graph, - const std::vector& args, - utils::StorageType io_storage_type) { + const std::vector& args) { int32_t idx = 0; const ValueRef fp_input = args.at(idx++); const ValueRef input_scale = args.at(idx++); @@ -1563,21 +1358,23 @@ void conv2d_q8ta_q8csw_q8to_test( const ValueRef padding = args.at(idx++); const ValueRef dilation = args.at(idx++); const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); const ValueRef fp_output = args.at(idx++); + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + TmpTensor packed_int8_input( - &graph, - graph.sizes_of(fp_input), - vkapi::kInt8x4, - io_storage_type, - utils::kPackedInt8_4W4C); + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); TmpTensor packed_int8_output( &graph, graph.sizes_of(fp_output), vkapi::kInt8x4, - io_storage_type, - utils::kPackedInt8_4W4C); + utils::kBuffer, + layout); add_q8ta_quantize_node( graph, fp_input, input_scale, input_zp, packed_int8_input); @@ -1605,27 +1402,11 @@ void conv2d_q8ta_q8csw_q8to_test( graph, packed_int8_output, output_scale, output_zp, fp_output); } -void conv2d_q8ta_q8csw_q8to_test_buffer( - ComputeGraph& graph, - const std::vector& args) { - conv2d_q8ta_q8csw_q8to_test(graph, args, utils::kBuffer); -} - -void conv2d_q8ta_q8csw_q8to_test_texture( - ComputeGraph& graph, - const std::vector& args) { - conv2d_q8ta_q8csw_q8to_test(graph, args, utils::kBuffer); -} - REGISTER_OPERATORS { VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); VK_REGISTER_OP( - etvk.conv2d_q8ta_q8csw_q8to.test_texture, - conv2d_q8ta_q8csw_q8to_test_texture); - VK_REGISTER_OP( - etvk.conv2d_q8ta_q8csw_q8to.test_buffer, - conv2d_q8ta_q8csw_q8to_test_buffer); + etvk.test_conv2d_q8ta_q8csw_q8to.default, test_conv2d_q8ta_q8csw_q8to); VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw_q8to.default, conv2d_q8ta_q8csw_q8to); VK_REGISTER_OP( et_vk.conv2d_q8ta_q8csw_q8to_dw.default, conv2d_q8ta_q8csw_q8to); diff --git 
a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h index c3ea15bc318..8913e639f0e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h @@ -8,11 +8,10 @@ #pragma once -#include +#include namespace vkcompute { -// This header is intentionally empty as all quantize/dequantize functions -// have been moved to QuantizeDequantize.h +// This header re-exports ConvolutionUtils.h for backward compatibility } // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 781d69c10fe..3108565361e 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -99,6 +99,6 @@ if(TARGET vulkan_backend) add_operator_prototype(choose_qparams_per_row) add_operator_prototype(test_q8ta_qdq) add_operator_prototype(q8ta_q8csw_q8to_conv2d) - add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) + add_operator_prototype(test_q8ta_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp new file mode 100644 index 00000000000..861f25c0606 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +void test_q8ta_conv2d_dw( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Create temporary packed int8 tensors for input and output + TmpTensor packed_int8_input( + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Quantize floating point input to packed int8 + add_q8ta_quantize_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + // Build args for conv operator + std::vector conv_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + 
bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + if (impl_selector == "legacy_4w4c") { + // Use the general quantized conv2d operator for legacy path + VK_GET_OP_FN("et_vk.conv2d_q8ta_q8csw_q8to.default")(graph, conv_args); + } else { + // Use the dedicated depthwise conv2d operator + VK_GET_OP_FN("etvk.q8ta_conv2d_dw.default")(graph, conv_args); + } + + // Dequantize packed int8 output to floating point + add_q8ta_dequantize_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_q8ta_conv2d_dw.default, test_q8ta_conv2d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 6633365838c..5e556ba6f08 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -93,5 +93,5 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q4gsw_linear") define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") - define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") + define_custom_op_test_binary("test_q8ta_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp similarity index 83% rename from backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp rename to backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp index 2d8d32dde74..b4583071acd 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_dw.cpp @@ -29,20 +29,9 @@ TestCase create_test_case_from_config( const Conv2dConfig& config, vkapi::ScalarType input_dtype, utils::StorageType fp_storage_type, - utils::StorageType int8_storage_type) { + utils::GPUMemoryLayout int8_memory_layout, + const std::string& impl_selector = "") { TestCase test_case; - test_case.set_name(config.test_case_name); - - std::string operator_suffix = ".test"; - if (int8_storage_type == utils::kTexture3D) { - operator_suffix += "_texture"; - } else { - operator_suffix += "_buffer"; - } - - // Set the operator name for the test case - std::string operator_name = "etvk." + config.op_name + operator_suffix; - test_case.set_operator_name(operator_name); // Calculate output dimensions int64_t H_out = config.get_output_height(); @@ -56,6 +45,26 @@ TestCase create_test_case_from_config( ? 
utils::kWidthPacked : utils::kChannelsPacked; + // Create test case name + // Format: ACCU/PERF OC->IC I=H,W g=groups k=kernel Tex(CP)->Buf(4C1W) + std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF" + std::string test_name = prefix + " " + std::to_string(config.channels.out) + + "->" + std::to_string(config.channels.in) + " " + + "I=" + std::to_string(config.input_size.h) + "," + + std::to_string(config.input_size.w) + " " + + "g=" + std::to_string(config.groups) + " " + + "k=" + std::to_string(config.kernel.h) + " " + + repr_str(fp_storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, int8_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case - use the new unified test operator + std::string operator_name = "test_etvk.test_q8ta_conv2d_dw.default"; + test_case.set_operator_name(operator_name); + ValueSpec input_tensor( input_size, input_dtype, @@ -179,10 +188,26 @@ TestCase create_test_case_from_config( test_case.add_input_spec(dilation); test_case.add_input_spec(groups); + // Add memory layout parameter for the quantized tensors + ValueSpec layout_int(static_cast(int8_memory_layout)); + test_case.add_input_spec(layout_int); + + // Add impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output); test_case.set_abs_tolerance(output_scale_val + 1e-4f); + // Filter out quantize/dequantize shaders from timing measurements + test_case.set_shader_filter({ + "nchw_to", + "to_nchw", + "q8ta_quantize", + "q8ta_dequantize", + }); + return test_case; } @@ -203,16 +228,32 @@ std::vector generate_quantized_conv2d_dw_easy_cases() { }; config.op_name = "conv2d_q8ta_q8csw_q8to"; - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { - config.test_case_name = make_test_case_name( - config, false, fp_storage_type, int8_storage_type); + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { + config.test_case_name = + make_test_case_name(config, false, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } @@ -310,9 +351,14 @@ std::vector generate_quantized_conv2d_dw_test_cases() { 32}}; // Test with different storage types and data types - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + 
utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination for (auto& config : configs) { bool is_performance = config.channels.out > kRefDimSizeLimit || @@ -322,12 +368,23 @@ std::vector generate_quantized_conv2d_dw_test_cases() { config.op_name = "conv2d_q8ta_q8csw_q8to"; - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { config.test_case_name = make_test_case_name( config, is_performance, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } } @@ -446,6 +503,9 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + int8_t quantized_input = 0; + int8_t quantized_weight = 0; + // Check bounds (zero padding) if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { // Get input value and quantize to int8 @@ -457,19 +517,12 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { input_zero_point; quant_input_f = std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); + quantized_input = static_cast(quant_input_f); // Get quantized weight using depthwise layout [K_h, K_w, OC] int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; - int8_t quantized_weight = weight_data[weight_idx]; - - if (false && in_w == 0 && in_h == 0 && out_c == 0) { - std::cout << "input: " << input_data[input_idx] << std::endl; - std::cout << "quantized_input: " << (int)quantized_input - << std::endl; - std::cout << "quantized_weight: " << (int)quantized_weight - << std::endl; - } + quantized_weight = weight_data[weight_idx]; + // Integer multiplication and accumulation int_sum += static_cast(quantized_input) * static_cast(quantized_weight); @@ -481,7 +534,10 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { // in weight_sum when input is effectively 0 (but quantized 0 // is input_zero_point) int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; - int8_t quantized_weight = weight_data[weight_idx]; + quantized_weight = weight_data[weight_idx]; + + // Use input_zero_point as the quantized input for padding + quantized_input = static_cast(input_zero_point); // Add contribution from zero-padded input (quantized zero = // input_zero_point) @@ -512,12 +568,6 @@ void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); int8_t quantized_output = static_cast(quant_output_f); - if (false && out_c < 4 && out_h < 1 && out_w < 4) { - std::cout << "int_sum[" << out_c << ", " << out_h << ", " << out_w - << "] = " << int_sum << ", " << float_result << ", " - << output_scale << ", " << quant_output_f << std::endl; - } - // Dequantize back to float float dequant_output = 
(static_cast(quantized_output) - output_zero_point) * @@ -593,8 +643,8 @@ int main(int argc, char* argv[]) { 0, 1, #else - 3, - 10, + 5, + 40, #endif ref_fn);
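Note: the per-element requantization performed by the new q8ta_conv2d_dw shader, and mirrored by the test reference implementation above, boils down to the math in the short host-side C++ sketch below. This is an illustration only; the standalone helper, its name, and its exact signature are assumptions for exposition and are not part of this change.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference math for one output element of the quantized depthwise conv.
// 'acc' is the int32 dot product of int8 weights with quantized inputs, where
// out-of-bounds taps contribute input_zp instead of a real input value, and
// 'weight_sum' is the sum of the int8 weights for this output channel.
inline int8_t requantize_dw_output(
    int32_t acc,
    int32_t weight_sum,
    float input_scale,
    int32_t input_zp,
    float weight_scale,
    float bias,
    float output_scale,
    int32_t output_zp) {
  // Remove the input zero-point contribution, rescale to float, and add the
  // (optional) bias, matching the facc computation in the shader.
  const float result =
      (static_cast<float>(acc) - static_cast<float>(weight_sum * input_zp)) *
          (weight_scale * input_scale) +
      bias;
  // Affine-quantize to int8 with the output scale / zero point and clamp,
  // matching the shader's quantize() helper (output_inv_scale == 1 / scale).
  float q = std::round(result / output_scale) + static_cast<float>(output_zp);
  q = std::min(std::max(q, -128.0f), 127.0f);
  return static_cast<int8_t>(q);
}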