diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 5e794e3b48b..0121c84bb5b 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -100,6 +100,7 @@ if(TARGET vulkan_backend)
   add_operator_prototype(test_q8ta_qdq)
   add_operator_prototype(test_q8ta_clone)
   add_operator_prototype(test_q8ta_conv2d)
+  add_operator_prototype(test_q8ta_conv2d_pw)
   add_operator_prototype(test_q8ta_conv2d_dw)
   add_operator_prototype(q8ta_q8ta_q8to_add)
 endif()
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index bb84a94399b..63423ed410f 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -94,5 +94,6 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_qdq")
     define_custom_op_test_binary("test_q8ta_clone")
     define_custom_op_test_binary("test_q8ta_conv2d")
+    define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
     define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp
index 6fe5db293c8..5d5ad356122 100644
--- a/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp
@@ -258,35 +258,6 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
   }
 
   std::vector<Conv2dConfig> configs = {
-      // Pointwise convolutions: kernel size 1x1
-      {OutInChannels(32, 3),
-       InputSize2D(64, 64),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
-      {OutInChannels(64, 32),
-       InputSize2D(32, 32),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
-      {OutInChannels(96, 64),
-       InputSize2D(16, 16),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
-      {OutInChannels(13, 7),
-       InputSize2D(57, 33),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
       // General 2D convolutions
       {OutInChannels(32, 3),
        InputSize2D(64, 64),
@@ -352,21 +323,6 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
        Padding(2, 2),
        Dilation(1, 1),
        4},
-      // Performance cases (pointwise - will use im2col)
-      {OutInChannels(128, 128),
-       InputSize2D(128, 128),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
-      {OutInChannels(128, 128),
-       InputSize2D(128, 128),
-       KernelSize(1, 1),
-       Stride(1, 1),
-       Padding(0, 0),
-       Dilation(1, 1),
-       1},
       // Performance cases (3x3 convs - will use im2col)
       {OutInChannels(32, 3),
        InputSize2D(256, 256),
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_conv2d_pw.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_pw.cpp
new file mode 100644
index 00000000000..1cd6151071d
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d_pw.cpp
@@ -0,0 +1,591 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include "conv2d_utils.h"
+#include "utils.h"
+
+// #define DEBUG_MODE
+
+using namespace executorch::vulkan::prototyping;
+
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 100;
+
+// Utility function to create a test case from a Conv2dConfig
+static TestCase create_test_case_from_config(
+    const Conv2dConfig& config,
+    vkapi::ScalarType input_dtype,
+    utils::StorageType fp_storage_type,
+    utils::GPUMemoryLayout int8_memory_layout,
+    const std::string& impl_selector = "") {
+  TestCase test_case;
+
+  // Calculate output dimensions
+  int64_t H_out = config.get_output_height();
+  int64_t W_out = config.get_output_width();
+
+  // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1)
+  std::vector<int64_t> input_size = {
+      1, config.channels.in, config.input_size.h, config.input_size.w};
+
+  utils::GPUMemoryLayout fp_memory_layout = fp_storage_type == utils::kBuffer
+      ? utils::kWidthPacked
+      : utils::kChannelsPacked;
+
+  // Create test case name
+  // Format: ACCU/PERF OC->IC I=H,W g=groups k=kernel Tex(CP)->Buf(4C1W)
+  std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF"
+  std::string test_name = prefix + " " + std::to_string(config.channels.out) +
+      "->" + std::to_string(config.channels.in) + " " +
+      "I=" + std::to_string(config.input_size.h) + "," +
+      std::to_string(config.input_size.w) + " " +
+      "g=" + std::to_string(config.groups) + " " +
+      "k=" + std::to_string(config.kernel.h) + " " +
+      repr_str(fp_storage_type, fp_memory_layout) + "->" +
+      repr_str(utils::kBuffer, int8_memory_layout);
+  if (!impl_selector.empty()) {
+    test_name += " [" + impl_selector + "]";
+  }
+  test_case.set_name(test_name);
+
+  // Set the operator name for the test case - use the unified test operator
+  std::string operator_name = "test_etvk.test_q8ta_conv2d.default";
+  test_case.set_operator_name(operator_name);
+
+  ValueSpec input_tensor(
+      input_size,
+      input_dtype,
+      fp_storage_type,
+      fp_memory_layout,
+#ifdef DEBUG_MODE
+      DataGenType::RANDOM
+#else
+      DataGenType::RANDOM
+#endif
+  );
+
+  if (debugging()) {
+    print_valuespec_data(input_tensor, "input_tensor");
+  }
+
+  float input_scale_val = 0.008123;
+  ValueSpec input_scale(input_scale_val);
+
+  int32_t input_zero_point_val = 2;
+  ValueSpec input_zero_point(input_zero_point_val);
+
+  // Quantized weight tensor (int8) - [C_out, align_up_4(C_in_per_group * K_h * K_w)]
+  // Memory layout: height, width, then channels - in_c is innermost (stride 1)
+  // in the second dimension
+  const int64_t in_channels_per_group = config.channels.in / config.groups;
+  const int64_t in_features = utils::align_up_4(
+      in_channels_per_group * config.kernel.h * config.kernel.w);
+  std::vector<int64_t> weight_size = {config.channels.out, in_features};
+  ValueSpec quantized_weight(
+      weight_size,
+      vkapi::kChar, // int8 for quantized weights
+      fp_storage_type,
+      utils::kWidthPacked,
+      DataGenType::RANDINT8);
+  quantized_weight.set_constant(true);
+
+  if (debugging()) {
+    print_valuespec_data(quantized_weight, "weight_tensor");
+  }
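+
+  // The reference implementation below indexes this packed weight layout as
+  //   out_c * in_features + kh * (K_w * C_in_per_group) +
+  //       kw * C_in_per_group + (in_c % C_in_per_group)
+  // which for a 1x1 kernel reduces to out_c * in_features + in_c. For example,
+  // the OutInChannels(13, 7) config (groups = 1) has in_features =
+  // align_up_4(7) = 8, so weight element (out_c = 2, in_c = 5) sits at index
+  // 2 * 8 + 5 = 21.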
+
+  const int64_t aligned_out_channels = utils::align_up_4(config.channels.out);
+
+  // Weight quantization scales (float/half, per-channel)
+  ValueSpec weight_scales(
+      {aligned_out_channels}, // Per output channel
+      input_dtype,
+      fp_storage_type,
+      utils::kWidthPacked,
+      DataGenType::RANDOM_SCALES);
+  weight_scales.set_constant(true);
+
+  ValueSpec weight_sums(
+      {aligned_out_channels}, // Per output channel
+      vkapi::kInt,
+      fp_storage_type,
+      utils::kWidthPacked,
+      DataGenType::ZEROS);
+  weight_sums.set_constant(true);
+
+  // Compute weight_sums data based on quantized weights
+  compute_weight_sums(
+      weight_sums, quantized_weight, config.channels.out, in_features);
+
+  // Bias (optional, float/half) - [C_out]
+  ValueSpec bias(
+      {aligned_out_channels}, // Per output channel
+      input_dtype,
+      fp_storage_type,
+      utils::kWidthPacked,
+      DataGenType::ZEROS);
+  bias.set_constant(true);
+
+  // Output quantization parameters
+  float output_scale_val = 0.05314;
+  ValueSpec output_scale(output_scale_val);
+
+  int32_t output_zero_point_val = -1;
+  ValueSpec output_zero_point(output_zero_point_val);
+
+  // Stride and padding parameters
+  ValueSpec stride({config.stride.h, config.stride.w});
+  ValueSpec padding({config.padding.h, config.padding.w});
+
+  // Dilation and groups parameters
+  ValueSpec dilation({config.dilation.h, config.dilation.w});
+  ValueSpec groups(config.groups);
+
+  // Kernel size parameters
+  ValueSpec kernel_size({config.kernel.h, config.kernel.w});
+
+  // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1)
+  ValueSpec output(
+      {1, config.channels.out, H_out, W_out},
+      input_dtype,
+      fp_storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
+
+  // Add all specs to test case for q8ta_q8csw_q8to operation
+  test_case.add_input_spec(input_tensor);
+  test_case.add_input_spec(input_scale);
+  test_case.add_input_spec(input_zero_point);
+  test_case.add_input_spec(quantized_weight);
+  test_case.add_input_spec(weight_sums);
+  test_case.add_input_spec(weight_scales);
+  test_case.add_input_spec(output_scale);
+  test_case.add_input_spec(output_zero_point);
+  test_case.add_input_spec(bias);
+  test_case.add_input_spec(kernel_size);
+  test_case.add_input_spec(stride);
+  test_case.add_input_spec(padding);
+  test_case.add_input_spec(dilation);
+  test_case.add_input_spec(groups);
+
+  // Add memory layout parameter for the quantized tensors
+  ValueSpec layout_int(static_cast<int32_t>(int8_memory_layout));
+  test_case.add_input_spec(layout_int);
+
+  // Add impl_selector string
+  ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector);
+  test_case.add_input_spec(impl_selector_spec);
+
+  test_case.add_output_spec(output);
+
+  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
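+  // The tolerance above is one output quantization step plus a small epsilon,
+  // likely to absorb cases where the GPU kernel and the reference round a
+  // value to adjacent int8 levels during output requantization.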
+
+  // Filter out quantize/dequantize shaders from timing measurements
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
+  return test_case;
+}
+
+// Generate test cases for quantized pointwise conv2d operation
+static std::vector<TestCase> generate_quantized_conv2d_pw_test_cases() {
+  std::vector<TestCase> test_cases;
+  if (!vkcompute::api::context()->adapter_ptr()->supports_int8_dot_product()) {
+    return test_cases;
+  }
+
+  std::vector<Conv2dConfig> configs = {
+      // Pointwise convolutions: kernel size 1x1
+      {OutInChannels(32, 3),
+       InputSize2D(64, 64),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(64, 32),
+       InputSize2D(32, 32),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(96, 64),
+       InputSize2D(16, 16),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(13, 7),
+       InputSize2D(57, 33),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(80, 40),
+       InputSize2D(64, 64),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      // Performance cases (pointwise - will use im2col)
+      {OutInChannels(160, 480),
+       InputSize2D(8, 8),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(22, 48),
+       InputSize2D(256, 256),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(48, 48),
+       InputSize2D(128, 128),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+      {OutInChannels(128, 128),
+       InputSize2D(128, 128),
+       KernelSize(1, 1),
+       Stride(1, 1),
+       Padding(0, 0),
+       Dilation(1, 1),
+       1},
+  };
+
+  // Test with different storage types and memory layouts
+  std::vector<utils::StorageType> fp_storage_types = {
+      utils::kTexture3D, utils::kBuffer};
+
+  // Memory layouts for int8 tensors - test both optimized (4W4C) and general
+  // paths
+  std::vector<utils::GPUMemoryLayout> int8_memory_layouts = {
+      utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C};
+
+  // Generate test cases for each combination
+  for (auto& config : configs) {
+    bool is_performance = config.channels.out > kRefDimSizeLimit ||
+        config.channels.in > kRefDimSizeLimit ||
+        config.input_size.h > kRefDimSizeLimit ||
+        config.input_size.w > kRefDimSizeLimit;
+
+    config.op_name = "conv2d_q8ta_q8csw_q8to";
+
+    for (const utils::StorageType fp_storage_type : fp_storage_types) {
+      for (const utils::GPUMemoryLayout int8_memory_layout :
+           int8_memory_layouts) {
+        config.test_case_name = make_test_case_name(
+            config, is_performance, fp_storage_type, utils::kBuffer);
+        test_cases.push_back(create_test_case_from_config(
+            config, vkapi::kFloat, fp_storage_type, int8_memory_layout));
+
+        // For 4W4C layout, also test the legacy implementation
+        if (int8_memory_layout == utils::kPackedInt8_4W4C) {
+          test_cases.push_back(create_test_case_from_config(
+              config,
+              vkapi::kFloat,
+              fp_storage_type,
+              int8_memory_layout,
+              /*impl_selector=*/"legacy_4w4c"));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for activation, weight, and output quantized conv2d
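+//
+// A sketch of the zero-point algebra this reference relies on (restating what
+// the loop below computes, not a separate specification):
+//   input:   x ~= (q_x - z_x) * s_x    (per-tensor scale s_x, zero point z_x)
+//   weights: w ~= q_w * s_w            (per-output-channel scale, symmetric)
+//   sum_k x_k * w_k
+//     ~= s_x * s_w * sum_k (q_x_k - z_x) * q_w_k
+//      = s_x * s_w * (sum_k q_x_k * q_w_k - z_x * sum_k q_w_k)
+//      = s_x * s_w * (int_sum - zero_point_correction)
+// so the inner loop only needs int_sum and weight_sum, and the correction is
+// applied once per output element before adding the bias and requantizing.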
+static void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) {
+  // Extract input specifications
+  int32_t idx = 0;
+  const ValueSpec& input_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_zeros_spec = test_case.inputs()[idx++];
+  const ValueSpec& weight_spec = test_case.inputs()[idx++];
+  const ValueSpec& weight_sums_spec = test_case.inputs()[idx++];
+  (void)weight_sums_spec;
+  const ValueSpec& weight_scales_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_zeros_spec = test_case.inputs()[idx++];
+  const ValueSpec& bias_spec = test_case.inputs()[idx++];
+  const ValueSpec& kernel_size_spec = test_case.inputs()[idx++];
+  const ValueSpec& stride_spec = test_case.inputs()[idx++];
+  const ValueSpec& padding_spec = test_case.inputs()[idx++];
+  const ValueSpec& dilation_spec = test_case.inputs()[idx++];
+  const ValueSpec& groups_spec = test_case.inputs()[idx++];
+  const ValueSpec& layout_spec = test_case.inputs()[idx++];
+  (void)layout_spec; // Not used in reference implementation
+  const ValueSpec& impl_selector_spec = test_case.inputs()[idx++];
+  (void)impl_selector_spec; // Not used in reference implementation
+
+  // Extract output specification (mutable reference)
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions
+  auto input_sizes = input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in]
+  auto weight_sizes =
+      weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w]
+  auto output_sizes =
+      output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out]
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t H_in = input_sizes[2];
+  int64_t W_in = input_sizes[3];
+  int64_t C_out = output_sizes[1];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+
+  // Get kernel dimensions from kernel_size ValueSpec
+  auto kernel_size_data = kernel_size_spec.get_int32_data();
+  int64_t K_h = kernel_size_data[0];
+  int64_t K_w = kernel_size_data[1];
+
+  // Get stride, padding, dilation, and groups
+  auto stride_data = stride_spec.get_int32_data();
+  auto padding_data = padding_spec.get_int32_data();
+  auto dilation_data = dilation_spec.get_int32_data();
+  int64_t stride_h = stride_data[0];
+  int64_t stride_w = stride_data[1];
+  int64_t pad_h = padding_data[0];
+  int64_t pad_w = padding_data[1];
+  int64_t dilation_h = dilation_data[0];
+  int64_t dilation_w = dilation_data[1];
+  int64_t groups = groups_spec.get_int_value();
+
+  // Skip for large tensors since computation time will be extremely slow
+  if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit ||
+      H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit ||
+      C_out > kRefDimSizeLimit) {
+    throw std::invalid_argument(
+        "One or more dimensions exceed the allowed limit for reference implementation.");
+  }
+
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_data = input_spec.get_float_data();
+  const float input_scale = input_scale_spec.get_float_value();
+  const int32_t input_zero_point = input_zeros_spec.get_int_value();
+
+  auto& weight_data = weight_spec.get_int8_data();
+  auto& weight_scales_data = weight_scales_spec.get_float_data();
+  auto& bias_data = bias_spec.get_float_data();
+
+  const float output_scale = output_scale_spec.get_float_value();
+  const int32_t output_zero_point = output_zeros_spec.get_int_value();
+
+  // Calculate channels per group for grouped convolution
+  int64_t C_in_per_group = C_in / groups;
+  int64_t C_out_per_group = C_out / groups;
+
+  // Calculate number of output elements
+  int64_t num_output_elements = N * C_out * H_out * W_out;
+
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_output_elements);
+
+  const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w);
+
+  // Perform activation, weight, and output quantized conv2d operation
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t out_c = 0; out_c < C_out; ++out_c) {
+      for (int64_t out_h = 0; out_h < H_out; ++out_h) {
+        for (int64_t out_w = 0; out_w < W_out; ++out_w) {
+          int32_t int_sum = 0;
+          int32_t weight_sum = 0; // Track weight sum on the fly
+
+          // Determine which group this output channel belongs to
+          int64_t group_idx = out_c / C_out_per_group;
+          int64_t in_c_start = group_idx * C_in_per_group;
+          int64_t in_c_end = (group_idx + 1) * C_in_per_group;
+
+          // Convolution operation with integer accumulation
+          for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) {
+            for (int64_t kh = 0; kh < K_h; ++kh) {
+              for (int64_t kw = 0; kw < K_w; ++kw) {
+                // Calculate input position with dilation
+                int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h;
+                int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w;
+
+                // Check bounds (zero padding)
+                if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) {
+                  // Get input value and quantize to int8
+                  int64_t input_idx = n * (C_in * H_in * W_in) +
+                      in_c * (H_in * W_in) + in_h * W_in + in_w;
+
+                  float quant_input_f =
+                      std::round(input_data[input_idx] / input_scale) +
+                      input_zero_point;
+                  quant_input_f =
+                      std::min(std::max(quant_input_f, -128.0f), 127.0f);
+                  int8_t quantized_input = static_cast<int8_t>(quant_input_f);
+
+                  // Get quantized weight (already int8)
+                  // Weight layout: [C_out, C_in_per_group * K_h * K_w]
+                  int64_t weight_idx = out_c * in_features +
+                      (kh * (K_w * C_in_per_group) + kw * C_in_per_group +
+                       (in_c % C_in_per_group));
+                  int8_t quantized_weight = weight_data[weight_idx];
+
+                  // Integer multiplication and accumulation
+                  int_sum += static_cast<int32_t>(quantized_input) *
+                      static_cast<int32_t>(quantized_weight);
+
+                  // Track weight sum for this output channel on the fly
+                  weight_sum += static_cast<int32_t>(quantized_weight);
+                } else {
+                  // For zero padding, we still need to account for the weight
+                  // in weight_sum when input is effectively 0 (but quantized 0
+                  // is input_zero_point)
+                  int64_t weight_idx = out_c * in_features +
+                      (kh * (K_w * C_in_per_group) + kw * C_in_per_group +
+                       (in_c % C_in_per_group));
+                  int8_t quantized_weight = weight_data[weight_idx];
+
+                  // Add contribution from zero-padded input (quantized zero =
+                  // input_zero_point)
+                  int_sum += static_cast<int32_t>(input_zero_point) *
+                      static_cast<int32_t>(quantized_weight);
+
+                  // Track weight sum for this output channel on the fly
+                  weight_sum += static_cast<int32_t>(quantized_weight);
+                }
+              }
+            }
+          }
+
+          // Convert accumulated integer result to float and apply scales
+          // Final result = (int_sum - zero_point_correction) * input_scale *
+          //     weight_scale + bias
+          // zero_point_correction = input_zero_point *
+          //     sum_of_weights_for_this_output_channel
+          int32_t zero_point_correction = input_zero_point * weight_sum;
+          int32_t accum_adjusted = int_sum - zero_point_correction;
+          float float_result =
+              accum_adjusted * input_scale * weight_scales_data[out_c];
+
+          // Add bias and store result
+          float_result += bias_data[out_c];
+
+          // Quantize the output to int8
+          float quant_output_f =
+              std::round(float_result / output_scale) + output_zero_point;
+          quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
+          int8_t quantized_output = static_cast<int8_t>(quant_output_f);
+
+          // Dequantize back to float
+          float dequant_output =
+              (static_cast<float>(quantized_output) - output_zero_point) *
+              output_scale;
+
+          int64_t output_idx = n * (C_out * H_out * W_out) +
+              out_c * (H_out * W_out) + out_h * W_out + out_w;
+          ref_data[output_idx] = dequant_output;
+        }
+      }
+    }
+  }
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_q8ta_q8csw_q8to_reference_impl(test_case);
+}
+
+// Custom FLOP calculator for quantized conv2d operation
+static int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) {
+  int kernel_idx = 9; // kernel_size is at index 9 for q8ta_q8csw_q8to
+
+  // Get input and output dimensions
+  const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data();
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t C_out = output_sizes[1];
+  int64_t K_h = kernel_sizes[0];
+  int64_t K_w = kernel_sizes[1];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+
+  // Calculate FLOPs for quantized conv2d operation
+  // Each output element requires:
+  // - C_in * K_h * K_w multiply-accumulate operations
+  // - Additional operations for quantization/dequantization
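+  // For example, the largest pointwise perf case above (128 -> 128 channels,
+  // 128x128 input, 1x1 kernel) gives 1 * 128 * 128 * 128 = 2,097,152 output
+  // elements at 128 * 1 * 1 = 128 MACs each, i.e. roughly 2.7e8 counted ops.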
+  int64_t output_elements = N * C_out * H_out * W_out;
+  int64_t ops_per_output = C_in * K_h * K_w;
+
+  int64_t flop = output_elements * (ops_per_output);
+
+  return flop;
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+#ifdef DEBUG_MODE
+  set_print_latencies(true);
+#else
+  set_print_latencies(false);
+#endif
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout
+      << "Quantized Pointwise Conv2d (1x1) Operation Prototyping Framework"
+      << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  // Execute test cases using the new framework with custom FLOP calculator
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_quantized_conv2d_pw_test_cases,
+#else
+      generate_quantized_conv2d_pw_test_cases,
+#endif
+      quantized_conv2d_flop_calculator,
+      "QuantizedConv2dPW",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}