diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2645be6478e..874b2a65168 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1137,7 +1137,7 @@ jobs: ./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row - ./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations + ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add # "Classic" Operator tests diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh new file mode 100644 index 00000000000..e7e64a601ef --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef BLOCK_GLSLH +#define BLOCK_GLSLH + +#include "block_int8x4_utils.glslh" +#include "common.glslh" + +// +// Block Layout Int Utils +// + +// These macros extract fields from the packed int returned by +// BlockConfig::as_packed_int(). See Common.h for the bit layout. +// +// Bit layout matches hashed layout format: +// bits 0- 3: block_dim_order[0] (inner_dim if not transposed, outer_dim if transposed) +// bits 4- 7: block_dim_order[1] (outer_dim if not transposed, inner_dim if transposed) +// bits 8-11: block_dim_order[2] (first_nonblock_dim) +// bits 12-15: block_dim_order[3] (second_nonblock_dim) +// bits 16-19: inner_dim +// bits 20-23: outer_dim +// bits 24-27: inner_dim_block_size +// bits 28-31: outer_dim_block_size + +// Extract block_dim_order elements (bits 0-15) +#define get_block_dim_order_0(x) ((x) & 0xF) +#define get_block_dim_order_1(x) (((x) >> 4) & 0xF) +#define get_block_dim_order_2(x) (((x) >> 8) & 0xF) +#define get_block_dim_order_3(x) (((x) >> 12) & 0xF) + +// Extract packed_dim_info (bits 16-31) +#define get_block_inner_dim(x) (((x) >> 16) & 0xF) +#define get_block_outer_dim(x) (((x) >> 20) & 0xF) +#define get_block_inner_dim_block_size(x) (((x) >> 24) & 0xF) +#define get_block_outer_dim_block_size(x) (((x) >> 28) & 0xF) + +/* + * Block-based programming utilities for compute shaders. + * + * A "block" is a 4x4 tile of tensor elements. The two dimensions of the block + * are called the "inner" dimension and the "outer" dimension. The inner dim is + * the one that is kept contiguous in memory (i.e. the packed dimension of the + * tensor), and the outer dim forms the other axis of the 2D block. + * + * For texel-packed tensors (single level of packing), a block is effectively a + * single texel repeated 4 times along the outer dimension. For block-packed + * tensors (two levels of packing), a block corresponds exactly to the 4x4 + * packed unit. 
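For illustration (this sketch is not part of the change), the packed block-config bit layout documented above can be exercised from host-side C++; the encode below mirrors the documented bit fields and the decode mirrors the get_block_* macros, using a hypothetical configuration value.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical config: inner_dim = 2 (C), outer_dim = 0 (W), 4x4 block,
      // not transposed, so block_dim_order = {2, 0, 1, 3}.
      const int32_t packed = 2 | (0 << 4) | (1 << 8) | (3 << 12) | (2 << 16) |
          (0 << 20) | (4 << 24) | (4 << 28);

      // Decode using the same bit positions as the macros above.
      const int inner_dim = (packed >> 16) & 0xF;   // 2
      const int outer_dim = (packed >> 20) & 0xF;   // 0
      const int inner_block = (packed >> 24) & 0xF; // 4
      const int outer_block = (packed >> 28) & 0xF; // 4
      std::printf(
          "inner=%d outer=%d block=%dx%d\n",
          inner_dim, outer_dim, inner_block, outer_block);
      return 0;
    }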
+ * + * When dispatching a block-based shader: + * - gl_GlobalInvocationID.x = block index along inner dimension + * - gl_GlobalInvocationID.y = block index along outer dimension + * - gl_GlobalInvocationID.z = plane index (remaining dimensions flattened) + */ + +// +// Index Conversion Utilities (TensorIndex4D versions) +// + +TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config( + const BufferMetadata meta, + const uint block_idx, + const int block_config) { + TensorIndex4D tidx; + + uint block_strides[4]; + + uint stride = 1; + // Inner block dim + const int packed_dim_1 = get_block_inner_dim(block_config); + block_strides[packed_dim_1] = 1; + const uint block_size_1 = uint(get_block_inner_dim_block_size(block_config)); + stride = div_up(meta.sizes[0][packed_dim_1], block_size_1); + // Outer block dim + const int packed_dim_2 = get_block_outer_dim(block_config); + block_strides[packed_dim_2] = stride; + const uint block_size_2 = + uint(get_block_outer_dim_block_size(block_config)); + stride *= div_up(meta.sizes[0][packed_dim_2], block_size_2); + // First non-block dim + const int outer_dim_1 = get_block_dim_order_2(block_config); + block_strides[outer_dim_1] = stride; + stride *= meta.sizes[0][outer_dim_1]; + // Second non-block dim + const int outer_dim_2 = get_block_dim_order_3(block_config); + block_strides[outer_dim_2] = stride; + + uint contig_idx = block_idx; + // Second non-block dim + tidx.data[outer_dim_2] = int(contig_idx / block_strides[outer_dim_2]); + contig_idx %= block_strides[outer_dim_2]; + // First non-block dim (1; height) + tidx.data[outer_dim_1] = int(contig_idx / block_strides[outer_dim_1]); + contig_idx %= block_strides[outer_dim_1]; + // Outer block dim (0; width) + tidx.data[packed_dim_2] = + int(mul_4(contig_idx / block_strides[packed_dim_2])); + contig_idx %= block_strides[packed_dim_2]; + // Inner block dim (2; channels) + tidx.data[packed_dim_1] = int(mul_4(contig_idx)); + + return tidx; +} + +// +// TextureMetadata variants of block indexing +// + +TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config( + const TextureMetadata meta, + const uint block_idx, + const int block_config) { + TensorIndex4D tidx; + + uint block_strides[4]; + + uint stride = 1; + // Inner block dim + const int packed_dim_1 = get_block_inner_dim(block_config); + block_strides[packed_dim_1] = 1; + const uint block_size_1 = uint(get_block_inner_dim_block_size(block_config)); + stride = div_up(meta.sizes[packed_dim_1], block_size_1); + // Outer block dim + const int packed_dim_2 = get_block_outer_dim(block_config); + block_strides[packed_dim_2] = stride; + const uint block_size_2 = + uint(get_block_outer_dim_block_size(block_config)); + stride *= div_up(meta.sizes[packed_dim_2], block_size_2); + // First non-block dim + const int outer_dim_1 = get_block_dim_order_2(block_config); + block_strides[outer_dim_1] = stride; + stride *= meta.sizes[outer_dim_1]; + // Second non-block dim + const int outer_dim_2 = get_block_dim_order_3(block_config); + block_strides[outer_dim_2] = stride; + + uint contig_idx = block_idx; + // Second non-block dim + tidx.data[outer_dim_2] = int(contig_idx / block_strides[outer_dim_2]); + contig_idx %= block_strides[outer_dim_2]; + // First non-block dim + tidx.data[outer_dim_1] = int(contig_idx / block_strides[outer_dim_1]); + contig_idx %= block_strides[outer_dim_1]; + // Outer block dim + tidx.data[packed_dim_2] = + int(mul_4(contig_idx / block_strides[packed_dim_2])); + contig_idx %= block_strides[packed_dim_2]; + // Inner block 
dim + tidx.data[packed_dim_1] = int(mul_4(contig_idx)); + + return tidx; +} + +// +// 3D Block Index Conversion Utilities (WHCN Dispatch) +// +// These functions convert a 3D thread index (gl_GlobalInvocationID) to a +// TensorIndex4D using a dispatch pattern: +// - thread_idx.x = W threads (divided by 4 if W is part of block) +// - thread_idx.y = H threads (divided by 4 if H is part of block) +// - thread_idx.z = C * N threads (C divided by 4 if C is part of block) +// +// Note: GLSL tensor metadata is in WHCN order (sizes[0]=W, sizes[1]=H, +// sizes[2]=C, sizes[3]=N), while C++ uses NCHW order. +// + +/* + * Convert a 3D block index to a TensorIndex4D using WHCN dispatch. + * + * Parameters: + * meta: BufferMetadata with tensor sizes in WHCN order + * thread_idx: 3D thread index (x=W, y=H, z=C*N) + * block_config: Packed block configuration from BlockConfig::as_packed_int() + * + * Returns: TensorIndex4D with logical tensor coordinates + */ +TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config( + const BufferMetadata meta, + const uvec3 thread_idx, + const int block_config) { + TensorIndex4D tidx; + const int inner_dim = get_block_inner_dim(block_config); + const int outer_dim = get_block_outer_dim(block_config); + + // GLSL metadata is in WHCN order: sizes[0]=W, sizes[1]=H, sizes[2]=C, sizes[3]=N + + // Compute C threads for decomposing thread_idx.z + // C is blocked (divided by 4) only if it's part of the block + uint C_size = uint(meta.sizes[0][2]); + uint num_C; + if (inner_dim == 2 || outer_dim == 2) { + num_C = div_up_4(C_size); + } else { + num_C = C_size; + } + + // W (dim 0): blocked if inner or outer + if (inner_dim == 0 || outer_dim == 0) { + tidx.data[0] = int(thread_idx.x) * 4; // Block-aligned + } else { + tidx.data[0] = int(thread_idx.x); // Single value + } + + // H (dim 1): blocked if inner or outer + if (inner_dim == 1 || outer_dim == 1) { + tidx.data[1] = int(thread_idx.y) * 4; // Block-aligned + } else { + tidx.data[1] = int(thread_idx.y); // Single value + } + + // C (dim 2): blocked if inner or outer + if (inner_dim == 2 || outer_dim == 2) { + tidx.data[2] = int(thread_idx.z % num_C) * 4; // Block-aligned + } else { + tidx.data[2] = int(thread_idx.z % num_C); // Single value + } + + // N (dim 3): never blocked + tidx.data[3] = int(thread_idx.z / num_C); + + return tidx; +} + +/* + * Convert a 3D block index to a TensorIndex4D (TextureMetadata variant). 
+ */ +TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config( + const TextureMetadata meta, + const uvec3 thread_idx, + const int block_config) { + TensorIndex4D tidx; + const int inner_dim = get_block_inner_dim(block_config); + const int outer_dim = get_block_outer_dim(block_config); + + // GLSL metadata is in WHCN order: sizes[0]=W, sizes[1]=H, sizes[2]=C, sizes[3]=N + + // Compute C threads for decomposing thread_idx.z + uint C_size = uint(meta.sizes[2]); + uint num_C; + if (inner_dim == 2 || outer_dim == 2) { + num_C = div_up_4(C_size); + } else { + num_C = C_size; + } + + // W (dim 0): blocked if inner or outer + if (inner_dim == 0 || outer_dim == 0) { + tidx.data[0] = int(thread_idx.x) * 4; + } else { + tidx.data[0] = int(thread_idx.x); + } + + // H (dim 1): blocked if inner or outer + if (inner_dim == 1 || outer_dim == 1) { + tidx.data[1] = int(thread_idx.y) * 4; + } else { + tidx.data[1] = int(thread_idx.y); + } + + // C (dim 2): blocked if inner or outer + if (inner_dim == 2 || outer_dim == 2) { + tidx.data[2] = int(thread_idx.z % num_C) * 4; + } else { + tidx.data[2] = int(thread_idx.z % num_C); + } + + // N (dim 3): never blocked + tidx.data[3] = int(thread_idx.z / num_C); + + return tidx; +} + +#endif // BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh new file mode 100644 index 00000000000..6ea636a0a17 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro to generate int8x4 block loading functions for a specific buffer. + * + * Usage: + * define_load_int8x4_buffer_fns(t_inp) + * + * This generates: + * - load_int8x4_block_from_t_inp(meta, tidx_base, layout, block_outer_dim) + * + * IMPORTANT: block_outer_dim must be such that the inner dimension (packed_dim) + * contains 4 contiguous int8 elements packed into one int32. If the loaded + * block needs to be transposed to match a different output layout, that + * transposition must be done by the caller. 
+ */ + +#ifndef BLOCK_INT8X4_LOAD_GLSLH +#define BLOCK_INT8X4_LOAD_GLSLH + +#define define_load_int8x4_buffer_fns(buffer_name) \ + \ + ivec4 load_int8x4_block_from_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + const int outer_packed_dim = get_outer_packed_dim(hashed_layout); \ + const int outer_block_size = \ + get_outer_packed_dim_block_size(hashed_layout); \ + \ + /* Compute base packed index using block-based indexing */ \ + const uint block_idx = \ + tensor4d_idx_to_block_idx(meta, tidx_base, hashed_layout); \ + const uint texels_per_block = div_4(get_block_numel(hashed_layout)); \ + uint buf_idx = block_idx * texels_per_block; \ + \ + /* Fast path: contiguous texels when iterating along outer_packed_dim */ \ + if (outer_block_size == 4) { \ + if (block_outer_dim == outer_packed_dim) { \ + return ivec4( \ + buffer_name[buf_idx], \ + buffer_name[buf_idx + 1], \ + buffer_name[buf_idx + 2], \ + buffer_name[buf_idx + 3]); \ + } \ + else { \ + buf_idx += mod_4(tidx_base.data[outer_packed_dim]); \ + } \ + } \ + \ + /* General path: use stride for non-contiguous access */ \ + const uint outer_stride = \ + stride_at(meta, block_outer_dim) * texels_per_block; \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + ivec4 block = ivec4(0); \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + block[block_y] = buffer_name[buf_idx]; \ + } \ + buf_idx += outer_stride; \ + } \ + return block; \ + } + +#endif // BLOCK_INT8X4_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh new file mode 100644 index 00000000000..2a0e037c291 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_store.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro to generate int8x4 block storing functions for a specific buffer. + * + * Usage: + * define_store_int8x4_buffer_fns(t_out) + * + * This generates: + * - store_int8x4_block_to_t_out(meta, tidx_base, layout, block_outer_dim, + * block) + * + * IMPORTANT: block_outer_dim must be such that the inner dimension (packed_dim) + * contains 4 contiguous int8 elements packed into one int32. If the block needs + * to be transposed to match the output layout, that transposition must be done + * by the caller before storing. 
+ */ + +#ifndef BLOCK_INT8X4_STORE_GLSLH +#define BLOCK_INT8X4_STORE_GLSLH + +#define define_store_int8x4_buffer_fns(buffer_name) \ + \ + void store_int8x4_block_to_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const ivec4 block) { \ + const int outer_packed_dim = get_outer_packed_dim(hashed_layout); \ + const int outer_block_size = \ + get_outer_packed_dim_block_size(hashed_layout); \ + \ + /* Compute base packed index using block-based indexing */ \ + const uint block_idx = \ + tensor4d_idx_to_block_idx(meta, tidx_base, hashed_layout); \ + const uint texels_per_block = div_4(get_block_numel(hashed_layout)); \ + uint buf_idx = block_idx * texels_per_block; \ + \ + /* Fast path: contiguous texels when iterating along outer_packed_dim */ \ + if (outer_block_size == 4) { \ + if (block_outer_dim == outer_packed_dim) { \ + buffer_name[buf_idx] = block[0]; \ + buffer_name[buf_idx + 1] = block[1]; \ + buffer_name[buf_idx + 2] = block[2]; \ + buffer_name[buf_idx + 3] = block[3]; \ + return; \ + } \ + else { \ + buf_idx += mod_4(tidx_base.data[outer_packed_dim]); \ + } \ + } \ + \ + /* General path: use stride for non-contiguous access */ \ + const uint outer_stride = \ + stride_at(meta, block_outer_dim) * texels_per_block; \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + buffer_name[buf_idx] = block[block_y]; \ + } \ + buf_idx += outer_stride; \ + } \ + } + +#endif // BLOCK_INT8X4_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh new file mode 100644 index 00000000000..fea87428d0d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_int8x4_utils.glslh @@ -0,0 +1,109 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Utility functions for working with int8x4 blocks. + * + * An int8x4 block is a 4x4 tile of int8 values stored as an ivec4, + * where each ivec4 element contains 4 packed int8 values (one row of the + * block). + */ + +#ifndef BLOCK_INT8X4_UTILS_GLSLH +#define BLOCK_INT8X4_UTILS_GLSLH + +/* + * Write a single int8 value to a specific position in an int8x4 block. + * + * Parameters: + * block: The block to modify (ivec4 where each element has 4 packed int8) + * y: Row index (0-3) + * x: Column index (0-3) + * val: The int8 value to write + */ +void write_int8x4_block_element( + inout ivec4 block, + const int y, + const int x, + const int val) { + int texel = block[y]; + // Create a mask to clear the byte at position x, then insert val + int shift = x * 8; + int mask = ~(0xFF << shift); + block[y] = (texel & mask) | ((val & 0xFF) << shift); +} + +/* + * Transpose a 4x4 int8 block. + * + * Given block[y][x], produces result[x][y]. + * Each ivec4 element contains 4 packed int8 values. 
+ */ +ivec4 transpose_int8x4_block(const ivec4 block) { + ivec4 result; + [[unroll]] for (int y = 0; y < 4; ++y) { + int packed = 0; + [[unroll]] for (int x = 0; x < 4; ++x) { + // Extract byte y from block[x] + int val = (block[x] >> (8 * y)) & 0xFF; + // Pack into position x + packed |= (val << (8 * x)); + } + result[y] = packed; + } + return result; +} + +// +// Debug print functions +// + +#ifdef DEBUG_MODE +/* + * Debug print function for Int8x4Block. + * + * Prints all 16 int8 values as a 4x4 matrix. + */ +void printInt8x4Block(const ivec4 block) { + // Unpack all 16 int8 values into an array + int v[16]; + [[unroll]] for (int i = 0; i < 4; ++i) { + int packed = block[i]; + v[i * 4 + 0] = (packed >> 0) & 0xFF; + v[i * 4 + 1] = (packed >> 8) & 0xFF; + v[i * 4 + 2] = (packed >> 16) & 0xFF; + v[i * 4 + 3] = (packed >> 24) & 0xFF; + } + // Sign extend from 8-bit to print as signed + [[unroll]] for (int i = 0; i < 16; ++i) { + if (v[i] > 127) + v[i] -= 256; + } + // Print as a 4x4 square in a single call to avoid interleaving + debugPrintfEXT( + "Int8x4Block:\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n [%d, %d, %d, %d]\\n", + v[0], + v[1], + v[2], + v[3], + v[4], + v[5], + v[6], + v[7], + v[8], + v[9], + v[10], + v[11], + v[12], + v[13], + v[14], + v[15]); +} +#endif // DEBUG_MODE + +#endif // BLOCK_INT8X4_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh new file mode 100644 index 00000000000..d72a176aa0e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_load.glslh @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macros to generate block load functions for buffers and textures. + * + * Buffer usage: + * define_load_buffer_fns(t_inp) + * + * Texture usage: + * define_load_texture_fns(t_inp) + * + * Both generate functions with the same signature: + * - load_fp_block_from_t_inp(meta, tidx_base, layout, block_outer_dim) + * + * The block_inner_dim is derived from the packed_dim of the hashed_layout. + * If the loaded block needs to be transposed to match a different output + * layout, that transposition must be done by the caller. 
+ * + * Parameters: + * buffer_name/texture_name: The name of the input buffer/texture (e.g., t_inp) + */ + +#ifndef BLOCK_LOAD_GLSLH +#define BLOCK_LOAD_GLSLH + +// +// Buffer load functions +// + +#define define_load_buffer_fns(buffer_name) \ + \ + mat4 load_fp_block_from_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + const int block_inner_dim = get_packed_dim(hashed_layout); \ + \ + /* Compute base buffer index once and use strides for iteration */ \ + const uint base_idx = \ + tensor4d_idx_to_buf_idx(meta, tidx_base, hashed_layout); \ + const uint outer_stride = stride_at(meta, block_outer_dim); \ + /* Inner stride is 1 since packed_dim == block_inner_dim */ \ + \ + /* Pre-compute bounds for efficient checking */ \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const uint inner_size = size_at(meta, block_inner_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + const int base_inner_idx = tidx_base.data[block_inner_dim]; \ + \ + mat4 block; \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + const uint row_idx = base_idx + block_y * outer_stride; \ + [[unroll]] for (int block_x = 0; block_x < 4; ++block_x) { \ + if (base_inner_idx + block_x < int(inner_size)) { \ + block[block_y][block_x] = float(buffer_name[row_idx + block_x]); \ + } else { \ + block[block_y][block_x] = 0.0; \ + } \ + } \ + } else { \ + block[block_y] = vec4(0.0); \ + } \ + } \ + return block; \ + } + +// +// Texture load functions +// + +#define define_load_texture_fns(texture_name) \ + \ + mat4 load_fp_block_from_##texture_name( \ + const TextureMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim) { \ + /* Convert tensor index to texture position */ \ + /* Use tensor4d_idx_to_texel_pos_simple to properly map the packed dim */ \ + ivec3 tex_pos = tensor4d_idx_to_texel_pos_simple(meta, tidx_base); \ + const int tex_outer_dim = mod_4(block_outer_dim); \ + const int outer_size = meta.sizes[block_outer_dim]; \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + mat4 block; \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < outer_size) { \ + block[block_y] = vec4(texelFetch(texture_name, tex_pos, 0)); \ + } else { \ + block[block_y] = vec4(0.0); \ + } \ + tex_pos[tex_outer_dim]++; \ + } \ + return block; \ + } + +#endif // BLOCK_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh new file mode 100644 index 00000000000..66e9ab9fa2b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/block_store.glslh @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macros to generate block store functions for buffers and textures. + * + * Buffer usage: + * define_store_buffer_fns(t_outp, T) + * + * Texture usage: + * define_store_texture_fns(t_outp, VEC4_T) + * + * Both generate functions with the same signature: + * - store_fp_block_to_t_outp(meta, tidx_base, layout, block_outer_dim, block) + * + * The block_inner_dim is derived from the packed_dim of the hashed_layout. 
+ * + * Parameters: + * buffer_name/texture_name: The name of the output buffer/texture (e.g., t_outp) + * scalar_type/vec4_type: The type for casting (e.g., float/vec4) + */ + +#ifndef BLOCK_STORE_GLSLH +#define BLOCK_STORE_GLSLH + +// +// Buffer store functions +// + +#define define_store_buffer_fns(buffer_name, scalar_type) \ + \ + void store_fp_block_to_##buffer_name( \ + const BufferMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const mat4 block) { \ + const int block_inner_dim = get_packed_dim(hashed_layout); \ + \ + /* Compute base buffer index once and use strides for iteration */ \ + const uint base_idx = \ + tensor4d_idx_to_buf_idx(meta, tidx_base, hashed_layout); \ + const uint outer_stride = stride_at(meta, block_outer_dim); \ + /* Inner stride is 1 since packed_dim == block_inner_dim */ \ + \ + /* Pre-compute bounds for efficient checking */ \ + const uint outer_size = size_at(meta, block_outer_dim); \ + const uint inner_size = size_at(meta, block_inner_dim); \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + const int base_inner_idx = tidx_base.data[block_inner_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < int(outer_size)) { \ + const uint row_idx = base_idx + block_y * outer_stride; \ + [[unroll]] for (int block_x = 0; block_x < 4; ++block_x) { \ + if (base_inner_idx + block_x < int(inner_size)) { \ + buffer_name[row_idx + block_x] = \ + scalar_type(block[block_y][block_x]); \ + } \ + } \ + } \ + } \ + } + +// +// Texture store functions +// + +#define define_store_texture_fns(texture_name, vec4_type) \ + \ + void store_fp_block_to_##texture_name( \ + const TextureMetadata meta, \ + const TensorIndex4D tidx_base, \ + const int hashed_layout, \ + const int block_outer_dim, \ + const mat4 block) { \ + /* Convert tensor index to texture position */ \ + /* Use tensor4d_idx_to_texel_pos_simple to properly map the packed dim */ \ + ivec3 tex_pos = tensor4d_idx_to_texel_pos_simple(meta, tidx_base); \ + const int tex_outer_dim = mod_4(block_outer_dim); \ + const int outer_size = meta.sizes[block_outer_dim]; \ + const int base_outer_idx = tidx_base.data[block_outer_dim]; \ + \ + [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \ + if (base_outer_idx + block_y < outer_size) { \ + imageStore(texture_name, tex_pos, vec4_type(block[block_y])); \ + } \ + tex_pos[tex_outer_dim]++; \ + } \ + } + +#endif // BLOCK_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 9ade64910f2..3b2010c7963 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -25,6 +25,8 @@ #define div_up_4(x) (((x) + 3) >> 2) #define div_up_8(x) (((x) + 7) >> 3) +#define div_up(x, y) (((x) + (y) - 1) / (y)) + #define align_up_2(x) ((x + 1) & -2) #define align_up_4(x) ((x + 3) & -4) #define align_up_8(x) ((x + 7) & -8) @@ -33,6 +35,7 @@ #define mod_4(x) ((x) & 3) #define mod_8(x) ((x) & 7) + int sign_extend_8bit(const int val) { if ((val & 0x80) != 0) { return val | (~0xFF); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index deb6f0a9e30..24f050b694e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -46,6 +46,7 @@ int extract_4b(const int packed, const int pos) { 
return (packed >> (pos * 4)) & 0xF; } +#define extract_buffer_packed_dim(layout) int(layout & 0xF) // Corresponds to dim_order[:4] = [0, 1, 2, 3] #define CONTIGUOUS_BUFFER_LAYOUT_ID 0x3210 @@ -69,6 +70,13 @@ bool is_channels_last(const int hashed_layout) { return layout_id(hashed_layout) == CHANNELS_LAST_BUFFER_LAYOUT_ID; } +// Extract packed dim info from hashed_layout (bits 16-31) +// These match the format created by create_hashed_layout() in Tensor.cpp +#define get_packed_dim(layout) (((layout) >> 16) & 0xF) +#define get_outer_packed_dim(layout) (((layout) >> 20) & 0xF) +#define get_packed_dim_block_size(layout) (((layout) >> 24) & 0xF) +#define get_outer_packed_dim_block_size(layout) (((layout) >> 28) & 0xF) + // // BufferMetadata // @@ -185,6 +193,11 @@ uint x(const TensorIndex tidx) { return tidx.data[0][0]; } +bool out_of_bounds(const TensorIndex tidx, const BufferMetadata meta) { + return any(greaterThanEqual(tidx.data[0], meta.sizes[0])) || + any(greaterThanEqual(tidx.data[1], meta.sizes[1])); +} + // // TensorIndex4D (useful for texture backed tensors) // @@ -406,6 +419,109 @@ uint tensor4d_idx_to_linear_idx( return lin_idx; } +// +// Block-packed tensor indexing +// + +/* + * Get the number of elements per block from hashed_layout. + */ +int get_block_numel(const int hashed_layout) { + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + return inner_block_size * outer_block_size; +} + +/* + * Compute the intra-block index (position within a block). + * + * Within a block, elements are stored with inner dimension varying fastest: + * intra_block_idx = outer_offset * inner_block_size + inner_offset + * + * Parameters: + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Intra-block index (0 to block_numel-1) + */ +int tensor4d_idx_to_intra_block_idx( + const TensorIndex4D tidx, + const int hashed_layout) { + const int inner_dim = get_packed_dim(hashed_layout); + const int outer_dim = get_outer_packed_dim(hashed_layout); + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + + const int inner_offset = tidx.data[inner_dim] % inner_block_size; + const int outer_offset = tidx.data[outer_dim] % outer_block_size; + + return outer_offset * inner_block_size + inner_offset; +} + +/* + * Convert a tensor index to a block-space linear index. + * + * The tensor index is converted to block-space coordinates by dividing + * the packed dimensions by their block sizes, then the linear index is + * computed using the block-space strides from BufferMetadata. 
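For illustration (not part of the change), the block-packed indexing described above reduces to a few lines of standalone arithmetic; the sizes, block-space strides, and coordinate below are hypothetical, with a 4x4 block over W (inner) and H (outer) and the strides assumed to already be in block space.

    #include <cstdio>

    int main() {
      const int inner_dim = 0, outer_dim = 1;     // W inner, H outer (WHCN)
      const int inner_bs = 4, outer_bs = 4;       // 4x4 block
      const int block_strides[4] = {1, 2, 4, 12}; // block-space strides for a
                                                  // hypothetical 8x8x3x1 tensor
      const int tidx[4] = {5, 6, 2, 0};           // logical element coordinate

      // Position within the block: inner dimension varies fastest.
      const int intra = (tidx[outer_dim] % outer_bs) * inner_bs +
          (tidx[inner_dim] % inner_bs);

      // Block-space coordinate: divide the two blocked dims by their block size.
      int bidx[4] = {tidx[0] / inner_bs, tidx[1] / outer_bs, tidx[2], tidx[3]};
      int block_idx = 0;
      for (int d = 0; d < 4; ++d) {
        block_idx += block_strides[d] * bidx[d];
      }

      // Buffer index = block index * elements per block + intra-block offset.
      const int buf_idx = block_idx * (inner_bs * outer_bs) + intra;
      std::printf("block=%d intra=%d buf=%d\n", block_idx, intra, buf_idx); // 11 9 185
      return 0;
    }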
+ * + * Parameters: + * meta: BufferMetadata with block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info + * + * Returns: Linear index in block space + */ +int tensor4d_idx_to_block_idx( + const BufferMetadata meta, + TensorIndex4D tidx, + const int hashed_layout) { + // Extract packed dim info + const int inner_dim = get_packed_dim(hashed_layout); + const int outer_dim = get_outer_packed_dim(hashed_layout); + const int inner_block_size = get_packed_dim_block_size(hashed_layout); + const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout); + + // Convert to block-space coordinates + tidx.data[inner_dim] = tidx.data[inner_dim] / inner_block_size; + tidx.data[outer_dim] = tidx.data[outer_dim] / outer_block_size; + + // Compute block-space linear index + int block_idx = 0; + [[unroll]] for (int d = 0; d < 4; ++d) { + block_idx += int(meta.strides[0][d]) * tidx.data[d]; + } + return block_idx; +} + +/* + * Convert a tensor index to a linear buffer index for block-packed layouts. + * + * For block-packed tensors: + * - Elements are grouped into blocks of (inner_block_size × outer_block_size) + * - Strides in BufferMetadata are in "block space" + * - Final index = block_index × block_numel + intra_block_index + * + * Parameters: + * meta: BufferMetadata containing sizes and block-space strides + * tidx: TensorIndex4D with logical tensor coordinates + * hashed_layout: Packed layout info from create_hashed_layout() (see + * indexing.glslh) + * + * Returns: Linear buffer index for the element + */ +int tensor4d_idx_to_buf_idx( + const BufferMetadata meta, + const TensorIndex4D tidx, + const int hashed_layout) { + const int block_idx = tensor4d_idx_to_block_idx(meta, tidx, hashed_layout); + const int intra_block_idx = + tensor4d_idx_to_intra_block_idx(tidx, hashed_layout); + const int block_numel = get_block_numel(hashed_layout); + + return block_idx * block_numel + intra_block_idx; +} + // // Debug utilities // diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl new file mode 100644 index 00000000000..6989dc2d87d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.glsl @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +${define_required_extensions(OUTPUT_STORAGE, DTYPE)} + +#define PRECISION ${PRECISION} +#define T ${buffer_scalar_type(DTYPE)} +$if OUTPUT_STORAGE == "texture3d": + #define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +$if OUTPUT_STORAGE == "buffer": + ${define_active_storage_type("buffer")} +$else: + ${define_active_storage_type("texture3d")} + +layout(std430) buffer; + +#include "indexing.glslh" + +// Output: dequantized floating point values (buffer or texture) +${layout_declare_tensor(B, "w", "t_outp", DTYPE, OUTPUT_STORAGE)} +// Input buffer: quantized int32 values (each int32 contains 4 packed int8s) +${layout_declare_tensor(B, "r", "t_inp", "int", "buffer")} + +// Metadata for output tensor (floating point) - buffer or texture +$if OUTPUT_STORAGE == "buffer": + ${layout_declare_ubo(B, "BufferMetadata", "outp")} +$else: + ${layout_declare_ubo(B, "TextureMetadata", "outp")} +// Metadata for input tensor (quantized int8x4) - always buffer +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(push_constant) uniform restrict Block { + float scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_block_config", "0")} +${layout_declare_spec_const(C, "int", "inp_block_config", "0")} + +#include "block_indexing.glslh" +#include "block_int8x4_load.glslh" +#include "block_store.glslh" + +// Generate loading functions for t_inp buffer +define_load_int8x4_buffer_fns(t_inp) +// Generate storing functions for t_outp +$if OUTPUT_STORAGE == "buffer": + define_store_buffer_fns(t_outp, T) +$else: + define_store_texture_fns(t_outp, VEC4_T) + +mat4 dequantize_int8x4_block( + const ivec4 block, const float scale, const int zp) { + mat4 result; + [[unroll]] for (int i = 0; i < 4; ++i) { + // Unpack 4 int8 values from packed int32 + int packed = block[i]; + ivec4 unpacked = ivec4( + (packed >> 0) & 0xFF, + (packed >> 8) & 0xFF, + (packed >> 16) & 0xFF, + (packed >> 24) & 0xFF); + // Sign extend from 8-bit + unpacked = (unpacked ^ 0x80) - 0x80; + // Dequantize: (q - zp) * scale + result[i] = vec4(unpacked - zp) * scale; + } + return result; +} + +void main() { + TensorIndex4D tidx; + +#ifdef USING_BUFFER + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + inp, contig_block_idx, inp_block_config); +#else + // Texture storage: use 3D extents dispatch + const uvec3 thread_idx = gl_GlobalInvocationID; + tidx = block_idx_3d_to_tensor4d_idx_with_block_config( + inp, thread_idx, inp_block_config); +#endif + + if (out_of_bounds(tidx, inp)) { + return; + } + + // Load int8 block from input using the thread's block index + const int inp_block_outer_dim = get_block_outer_dim(inp_block_config); + ivec4 int8_block = load_int8x4_block_from_t_inp( + inp, tidx, inp_layout, inp_block_outer_dim); + + // If input and output have different block configs (different packed dims), + // transpose the block to match output's layout + if (inp_block_config != outp_block_config) { + int8_block = transpose_int8x4_block(int8_block); + } + + // Dequantize the int8 block to float values + mat4 fp_block = dequantize_int8x4_block(int8_block, scale, zp); + + // Store dequantized values to output buffer using output's block config + const int 
outp_block_outer_dim = get_block_outer_dim(outp_block_config); + store_fp_block_to_t_outp( + outp, tidx, outp_layout, outp_block_outer_dim, fp_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml new file mode 100644 index 00000000000..d3a531d6385 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_dequantize.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_dequantize: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: buffer + generate_variant_forall: + OUTPUT_STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_dequantize diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl new file mode 100644 index 00000000000..c0c2a3a914a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.glsl @@ -0,0 +1,118 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions(INPUT_STORAGE, DTYPE)} + +#define PRECISION ${PRECISION} +#define T ${buffer_scalar_type(DTYPE)} +$if INPUT_STORAGE == "texture3d": + #define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +$if INPUT_STORAGE == "buffer": + ${define_active_storage_type("buffer")} +$else: + ${define_active_storage_type("texture3d")} + +layout(std430) buffer; + +#include "indexing.glslh" + +// Output buffer: quantized int32 values (each int32 contains 4 packed int8s) +${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")} +// Input: floating point values (buffer or texture) +${layout_declare_tensor(B, "r", "t_inp", DTYPE, INPUT_STORAGE)} + +// Metadata for output tensor (quantized int8x4) - always buffer +${layout_declare_ubo(B, "BufferMetadata", "outp")} +// Metadata for input tensor (floating point) - buffer or texture +$if INPUT_STORAGE == "buffer": + ${layout_declare_ubo(B, "BufferMetadata", "inp")} +$else: + ${layout_declare_ubo(B, "TextureMetadata", "inp")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "inp_block_config", "0")} +${layout_declare_spec_const(C, "int", "outp_block_config", "0")} + +#include "block_indexing.glslh" +#include "block_load.glslh" +$if INPUT_STORAGE == "buffer": + // Generate loading functions for t_inp buffer + define_load_buffer_fns(t_inp) +$else: + // Generate loading functions for t_inp texture + define_load_texture_fns(t_inp) +#include "block_int8x4_store.glslh" + +// Generate storing functions for t_outp buffer +define_store_int8x4_buffer_fns(t_outp) + +ivec4 quantize_fp_block( + const mat4 block, const float inv_scale, const int zp) { + ivec4 result; + [[unroll]] for (int i = 0; i < 4; ++i) { + // Quantize: round(val * inv_scale) + zp, clamped to [-128, 127] + ivec4 quantized = ivec4(round(block[i] * inv_scale)) + zp; + quantized 
= clamp(quantized, -128, 127); + // Pack 4 int8 values into one int32 + result[i] = ((quantized[0] & 0xFF) << 0) | + ((quantized[1] & 0xFF) << 8) | + ((quantized[2] & 0xFF) << 16) | + ((quantized[3] & 0xFF) << 24); + } + return result; +} + +void main() { + TensorIndex4D tidx; + +#ifdef USING_BUFFER + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + inp, contig_block_idx, inp_block_config); +#else + // Texture storage: use 3D extents dispatch + const uvec3 thread_idx = gl_GlobalInvocationID; + tidx = block_idx_3d_to_tensor4d_idx_with_block_config( + inp, thread_idx, inp_block_config); +#endif + + if (out_of_bounds(tidx, inp)) { + return; + } + + // Load FP block from input using the thread's block index + const int inp_block_outer_dim = get_block_outer_dim(inp_block_config); + mat4 fp_block = load_fp_block_from_t_inp( + inp, tidx, inp_layout, inp_block_outer_dim); + + // If input and output have different block configs (different packed dims), + // transpose the block to match output's layout + if (inp_block_config != outp_block_config) { + fp_block = transpose(fp_block); + } + + // Quantize the float block to int8 values + ivec4 int8_block = quantize_fp_block(fp_block, inv_scale, zp); + + // Store quantized values to output buffer using output's block config + const int outp_block_outer_dim = get_block_outer_dim(outp_block_config); + store_int8x4_block_to_t_outp( + outp, tidx, outp_layout, outp_block_outer_dim, int8_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml new file mode 100644 index 00000000000..b3124fb6d75 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_quantize.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_quantize: + parameter_names_with_default_values: + DTYPE: float + INPUT_STORAGE: buffer + generate_variant_forall: + INPUT_STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_quantize diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp index 71690ffc604..0286889de5c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -6,10 +6,93 @@ * LICENSE file in the root directory of this source tree. 
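For illustration (not part of the change), the per-element arithmetic used by the q8ta_quantize and q8ta_dequantize shaders above can be checked with a small host-side round trip; the scale and zero point are arbitrary, and a uint32_t stands in for the packed int8x4 texel.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const float scale = 0.05f, inv_scale = 1.0f / scale;
      const int zp = 3;
      const float vals[4] = {-1.0f, -0.2f, 0.0f, 1.4f};

      // Quantize: round(v * inv_scale) + zp, clamp to [-128, 127], then pack
      // four int8 lanes into one 32-bit word (lane 0 in the low byte).
      uint32_t packed = 0;
      for (int i = 0; i < 4; ++i) {
        int q = static_cast<int>(std::round(vals[i] * inv_scale)) + zp;
        q = std::min(std::max(q, -128), 127);
        packed |= static_cast<uint32_t>(q & 0xFF) << (8 * i);
      }

      // Unpack and dequantize: extract a byte, sign-extend via (x ^ 0x80) - 0x80,
      // then apply (q - zp) * scale.
      for (int i = 0; i < 4; ++i) {
        int q = static_cast<int>((packed >> (8 * i)) & 0xFF);
        q = (q ^ 0x80) - 0x80;
        std::printf("%f -> %d -> %f\n", vals[i], q, (q - zp) * scale);
      }
      return 0;
    }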
*/ +#include #include namespace vkcompute { +// +// BlockConfig implementation +// + +BlockConfig::BlockConfig( + int32_t inner, + int32_t inner_block_size, + int32_t outer, + int32_t outer_block_size, + bool transposed) + : inner_dim(inner), + inner_dim_block_size(inner_block_size), + outer_dim(outer), + outer_dim_block_size(outer_block_size), + block_transposed(transposed), + block_dim_order{0, 0, 0, 0} { + // Block dims must be different + VK_CHECK_COND(outer_dim != inner_dim); + + // Find the two lowest dim indices that are not inner_dim or outer_dim + int32_t first_nonblock_dim = -1; + int32_t second_nonblock_dim = -1; + int32_t other_idx = 0; + for (int32_t d = 0; other_idx < 2; ++d) { + if (d != inner_dim && d != outer_dim) { + if (other_idx == 0) { + first_nonblock_dim = d; + } else { + second_nonblock_dim = d; + } + ++other_idx; + } + } + + // Set block_dim_order based on block_transposed + if (block_transposed) { + // Transposed: {outer_dim, inner_dim, first_nonblock_dim, + // second_nonblock_dim} + block_dim_order[0] = outer_dim; + block_dim_order[1] = inner_dim; + } else { + // Normal: {inner_dim, outer_dim, first_nonblock_dim, second_nonblock_dim} + block_dim_order[0] = inner_dim; + block_dim_order[1] = outer_dim; + } + block_dim_order[2] = first_nonblock_dim; + block_dim_order[3] = second_nonblock_dim; + + // Validate all dims are in valid range [0, 3] + for (int i = 0; i < 4; ++i) { + VK_CHECK_COND(block_dim_order[i] >= 0 && block_dim_order[i] < 4); + } +} + +int32_t BlockConfig::as_packed_int() const { + int32_t packed = 0; + // Pack block_dim_order in bits 0-15 (matches hashed layout format) + packed |= (block_dim_order[0] & 0xF); // bits 0-3 + packed |= (block_dim_order[1] & 0xF) << 4; // bits 4-7 + packed |= (block_dim_order[2] & 0xF) << 8; // bits 8-11 + packed |= (block_dim_order[3] & 0xF) << 12; // bits 12-15 + // Pack packed_dim_info in bits 16-31 (matches hashed layout format) + packed |= (inner_dim & 0xF) << 16; // bits 16-19 + packed |= (outer_dim & 0xF) << 20; // bits 20-23 + packed |= (inner_dim_block_size & 0xF) << 24; // bits 24-27 + packed |= (outer_dim_block_size & 0xF) << 28; // bits 28-31 + + return packed; +} + +int32_t BlockConfig::inner_dim_from_packed_int(int32_t packed_int) { + return (packed_int >> 16) & 0xF; // bits 16-19 +} + +int32_t BlockConfig::outer_dim_from_packed_int(int32_t packed_int) { + return (packed_int >> 20) & 0xF; // bits 20-23 +} + +// +// Default workgroup size functions +// + utils::uvec3 default_pick_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -79,4 +162,202 @@ utils::uvec3 pick_wc_square_wg_size( return {16u, 1u, 4u}; } +BlockConfig create_block_config_from_io_packed_dims( + ComputeGraph& graph, + const ValueRef output, + const ValueRef input) { + const int32_t block_inner_dim = graph.packed_dim_of(output); + int32_t block_outer_dim = graph.packed_dim_of(input); + + // If inner and outer dims are the same, pick a different outer dim + if (block_outer_dim == block_inner_dim) { + if (block_inner_dim == 0) { + block_outer_dim = 1; + } else { + block_outer_dim = 0; + } + } + + // Create a BlockConfig with block sizes of 4 for both dimensions + return BlockConfig{block_inner_dim, 4, block_outer_dim, 4}; +} + +BlockConfig create_block_config_for_tensor( + ComputeGraph& graph, + const ValueRef tensor) { + const int32_t packed_dim = graph.packed_dim_of(tensor); + + // Pick an outer dimension that differs from the packed dimension + const int32_t outer_dim = (packed_dim == 0) ? 
1 : 0; + + // Create a BlockConfig with block sizes of 4 for both dimensions + return BlockConfig{packed_dim, 4, outer_dim, 4}; +} + +BlockConfig create_block_config_from_other( + ComputeGraph& graph, + const ValueRef tensor, + const BlockConfig& other) { + const int32_t packed_dim = graph.packed_dim_of(tensor); + + // If tensor's packed dim matches other's inner dim, use same config + if (packed_dim == other.inner_dim) { + return other; + } + + // Otherwise, transpose: swap inner and outer dimensions + return BlockConfig{ + other.outer_dim, + other.outer_dim_block_size, + other.inner_dim, + other.inner_dim_block_size}; +} + +utils::uvec3 pick_linear_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args) { + (void)shader; + + const ValueRef output = args.at(0).refs.at(0); + // extra_args contains the packed block config directly as a ValueRef + // (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + const std::vector& sizes = graph->sizes_of(output); + const size_t ndim = sizes.size(); + + // Compute number of blocks along inner and outer dimensions + const int64_t inner_size = sizes[ndim - 1 - inner_dim]; + const int64_t outer_size = sizes[ndim - 1 - outer_dim]; + + const uint32_t num_inner_blocks = + utils::safe_downcast(utils::div_up(inner_size, int64_t(4))); + const uint32_t num_outer_blocks = + utils::safe_downcast(utils::div_up(outer_size, int64_t(4))); + + // Compute number of planes (product of dimensions not in the block) + uint32_t num_planes = 1; + for (size_t i = 0; i < ndim; ++i) { + const int32_t whcn_dim = ndim - 1 - i; + if (whcn_dim != inner_dim && whcn_dim != outer_dim) { + num_planes *= utils::safe_downcast(sizes[i]); + } + } + + // Return linear workgroup size: {total_blocks, 1u, 1u} + const uint32_t total_blocks = + num_inner_blocks * num_outer_blocks * num_planes; + return {total_blocks, 1u, 1u}; +} + +utils::uvec3 pick_extents_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args) { + (void)shader; + + const ValueRef output = args.at(0).refs.at(0); + // extra_args contains the packed block config directly as a ValueRef + // (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + // Note: inner_dim and outer_dim use WHCN order (0=W, 1=H, 2=C, 3=N) + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + const std::vector& sizes = graph->sizes_of(output); + + // C++ sizes are in NCHW order: sizes[0]=N, sizes[1]=C, sizes[2]=H, sizes[3]=W + // Access dimensions from the end for tensors with fewer than 4 dims + const int64_t W = utils::val_at(-1, sizes); + const int64_t H = utils::val_at(-2, sizes); + const int64_t C = utils::val_at(-3, sizes); + const int64_t N = utils::val_at(-4, sizes); + + // Dispatch structure: {x_threads, y_threads, z_threads} + // - x corresponds to W dimension + // - y corresponds to H dimension + // - z corresponds to C * N (combined) + // + // Block dimensions (inner_dim and outer_dim) are divided 
by 4, + // non-block dimensions are not divided. + + uint32_t x_threads, y_threads; + int64_t C_for_z; + + // X dimension (W, WHCN dim 0) + if (inner_dim == 0 || outer_dim == 0) { + x_threads = utils::safe_downcast(utils::div_up(W, int64_t(4))); + } else { + x_threads = utils::safe_downcast(W); + } + + // Y dimension (H, WHCN dim 1) + if (inner_dim == 1 || outer_dim == 1) { + y_threads = utils::safe_downcast(utils::div_up(H, int64_t(4))); + } else { + y_threads = utils::safe_downcast(H); + } + + // Z dimension: C * N where C is blocked if it's part of the block + if (inner_dim == 2 || outer_dim == 2) { + C_for_z = utils::div_up(C, int64_t(4)); + } else { + C_for_z = C; + } + const uint32_t z_threads = utils::safe_downcast(C_for_z * N); + + return {x_threads, y_threads, z_threads}; +} + +utils::uvec3 pick_square_local_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& extra_args) { + (void)graph; + (void)shader; + (void)args; + + // Detect linear dispatch pattern: global_wg = {total_blocks, 1, 1} + if (global_workgroup_size[1u] == 1u && global_workgroup_size[2u] == 1u) { + return {64u, 1u, 1u}; + } + + // Extents dispatch: use 8x8 square on inner_dim and outer_dim axes + // extra_args contains the packed block config as a ValueRef (int32_t) + const int32_t packed_block_config = static_cast(extra_args.at(0)); + + // Extract block configuration from packed integer + // inner_dim and outer_dim use WHCN order (0=W, 1=H, 2=C, 3=N) + const int32_t inner_dim = + BlockConfig::inner_dim_from_packed_int(packed_block_config); + const int32_t outer_dim = + BlockConfig::outer_dim_from_packed_int(packed_block_config); + + // Build local workgroup size: + // - x corresponds to W (WHCN dim 0) + // - y corresponds to H (WHCN dim 1) + // - z corresponds to C*N (WHCN dim 2 for C) + // Set axes in the block (inner_dim, outer_dim) to 8, others to 1 + uint32_t local_x = (inner_dim == 0 || outer_dim == 0) ? 8u : 1u; + uint32_t local_y = (inner_dim == 1 || outer_dim == 1) ? 8u : 1u; + uint32_t local_z = (inner_dim == 2 || outer_dim == 2) ? 8u : 1u; + + return {local_x, local_y, local_z}; +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h index b412f737c13..84cacc8e4f7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ b/backends/vulkan/runtime/graph/ops/impl/Common.h @@ -13,6 +13,73 @@ namespace vkcompute { +/** + * BlockConfig describes how a tensor is partitioned into blocks for the purpose + * of thread mapping in GPU compute shaders. Each thread processes one block + * of elements. + * + * This is distinct from PackedDimInfo in Tensor.h which describes memory + * layout. BlockConfig is used solely for operator implementations to define 4x4 + * block partitioning schemes. 
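For illustration (not part of the change), the way the BlockConfig constructor in Common.cpp above derives block_dim_order can be restated as a few standalone lines; the helper below only mirrors that logic for a worked example and is not part of the API.

    #include <cstdio>

    // The two block dims come first (swapped when transposed), followed by the
    // two remaining WHCN dims in ascending order.
    static void dim_order(int inner, int outer, bool transposed, int out[4]) {
      out[0] = transposed ? outer : inner;
      out[1] = transposed ? inner : outer;
      int k = 2;
      for (int d = 0; d < 4; ++d) {
        if (d != inner && d != outer) {
          out[k++] = d;
        }
      }
    }

    int main() {
      int order[4];
      // Channels-packed tensor (inner = C = 2) blocked against W (outer = 0).
      dim_order(2, 0, false, order);
      std::printf("%d %d %d %d\n", order[0], order[1], order[2], order[3]); // 2 0 1 3
      return 0;
    }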
+ * + * The block configuration has two dimensions: + * - inner_dim: The dimension where 4 consecutive elements are processed + * together within a single thread + * - outer_dim: A second dimension where 4 elements are grouped, resulting + * in a 4x4 block of 16 elements per thread + */ +struct BlockConfig { + // The inner block dimension (WHCN index: 0=W, 1=H, 2=C, 3=N) + // 4 consecutive elements along this dimension form the inner part of a block + int32_t inner_dim; + // Block size along the inner dimension (typically 4) + int32_t inner_dim_block_size; + // The outer block dimension (WHCN index: 0=W, 1=H, 2=C, 3=N) + // 4 elements along this dimension form the outer part of a block + int32_t outer_dim; + // Block size along the outer dimension (typically 4) + int32_t outer_dim_block_size; + // Whether the block is transposed (swaps stride ordering of inner/outer dim) + bool block_transposed; + // Dimension order for the block: + // - If block_transposed = false: {inner_dim, outer_dim, first_nonblock_dim, + // second_nonblock_dim} + // - If block_transposed = true: {outer_dim, inner_dim, first_nonblock_dim, + // second_nonblock_dim} + int32_t block_dim_order[4]; + + BlockConfig( + int32_t inner, + int32_t inner_block_size, + int32_t outer, + int32_t outer_block_size, + bool transposed = false); + + /** + * Returns a packed int32_t encoding the block configuration. The structure + * matches the hashed layout int format used in shaders: + * bits 0- 3: block_dim_order[0] + * bits 4- 7: block_dim_order[1] + * bits 8-11: block_dim_order[2] + * bits 12-15: block_dim_order[3] + * bits 16-19: inner_dim + * bits 20-23: outer_dim + * bits 24-27: inner_dim_block_size + * bits 28-31: outer_dim_block_size + */ + int32_t as_packed_int() const; + + /** + * Extracts inner_dim from a packed int32_t representation. + */ + static int32_t inner_dim_from_packed_int(int32_t packed_int); + + /** + * Extracts outer_dim from a packed int32_t representation. + */ + static int32_t outer_dim_from_packed_int(int32_t packed_int); +}; + /** * Creates a global workgroup size based on the first output tensor in the args. * This is a utility function that extracts the output tensor from @@ -61,4 +128,133 @@ utils::uvec3 pick_wc_square_wg_size( const std::vector& args, const std::vector& resize_args); +/** + * Creates a BlockConfig based on the packed dimensions of an output and input + * tensor pair. This is useful for operations like dequantize where the block + * configuration depends on both tensors. + * + * The inner dimension is determined by the output tensor's packed dimension, + * and the outer dimension is determined by the input tensor's packed dimension. + * If they are the same, the outer dimension is adjusted to avoid conflict. + * + * @param graph The compute graph + * @param output The output tensor reference + * @param input The input tensor reference + * @return A BlockConfig configured for block-based operations + */ +BlockConfig create_block_config_from_io_packed_dims( + ComputeGraph& graph, + const ValueRef output, + const ValueRef input); + +/** + * Creates a BlockConfig based on the packed dimension of a single tensor. + * This is useful when you need separate block configs for input and output + * tensors. + * + * The inner dimension is determined by the tensor's packed dimension. + * The outer dimension is set to an adjacent dimension that differs from + * the packed dimension. 
+ * + * @param graph The compute graph + * @param tensor The tensor reference + * @return A BlockConfig configured for block-based operations + */ +BlockConfig create_block_config_for_tensor( + ComputeGraph& graph, + const ValueRef tensor); + +/** + * Creates a BlockConfig for a tensor based on another block config, ensuring + * the inner dimension matches the tensor's packed dimension. + * + * This is useful when you need block configs for both input and output tensors + * that share the same block axes but may need to be transposed if the tensors + * have different packed dimensions. + * + * If the tensor's packed dim matches the other config's inner dim, returns + * the same config. Otherwise, returns a transposed config (inner/outer + * swapped). + * + * @param graph The compute graph + * @param tensor The tensor to create a block config for + * @param other The reference block config to base the new config on + * @return A BlockConfig with inner_dim = tensor's packed_dim + */ +BlockConfig create_block_config_from_other( + ComputeGraph& graph, + const ValueRef tensor, + const BlockConfig& other); + +/** + * Picks a global workgroup size for block-based dispatching using a linear + * (1D flattened) dispatch pattern. This is optimized for buffer storage. + * + * This function expects: + * - args.at(0).refs.at(0): Output tensor reference + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * The global workgroup size is computed as: + * - x = total_blocks = num_inner_blocks * num_outer_blocks * num_planes + * - y = 1 + * - z = 1 + * + * @return Global workgroup size as {total_blocks, 1, 1} + */ +utils::uvec3 pick_linear_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args); + +/** + * Picks a global workgroup size for block-based dispatching using a 3D + * extents-style dispatch pattern. This is optimized for texture storage. + * + * This function expects: + * - args.at(0).refs.at(0): Output tensor reference + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * The global workgroup size is computed as a WHCN-based 3D dispatch: + * - x = W threads (divided by 4 if W is inner or outer dim) + * - y = H threads (divided by 4 if H is inner or outer dim) + * - z = C * N threads (C divided by 4 if C is inner or outer dim) + * + * @return Global workgroup size as {x_threads, y_threads, z_threads} + */ +utils::uvec3 pick_extents_global_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& extra_args); + +/** + * Picks a local workgroup size for block-based dispatching that is optimized + * for the dispatch pattern in use. 
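For illustration (not part of the change), the local workgroup selection rule implemented earlier in Common.cpp reduces to a small pure function; the helper and values below are only a restatement for the example.

    #include <cstdio>

    struct Wg { unsigned x, y, z; };

    // A {N, 1, 1} global size indicates a linear (buffer) dispatch and gets a
    // flat 64-wide local group; otherwise the WHCN axes carried by the block
    // (0=W->x, 1=H->y, 2=C->z) get 8 threads each and the remaining axis gets 1.
    static Wg local_wg(Wg global, int inner_dim, int outer_dim) {
      if (global.y == 1u && global.z == 1u) {
        return {64u, 1u, 1u};
      }
      auto blocked = [&](int d) { return inner_dim == d || outer_dim == d; };
      return {blocked(0) ? 8u : 1u, blocked(1) ? 8u : 1u, blocked(2) ? 8u : 1u};
    }

    int main() {
      const Wg a = local_wg({4096u, 1u, 1u}, 0, 2); // linear -> {64, 1, 1}
      const Wg b = local_wg({32u, 16u, 8u}, 0, 2);  // W and C blocked -> {8, 1, 8}
      std::printf("{%u,%u,%u} {%u,%u,%u}\n", a.x, a.y, a.z, b.x, b.y, b.z);
      return 0;
    }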
+ * + * This function expects: + * - extra_args.at(0): Packed int32_t block configuration cast to ValueRef + * (created via static_cast(BlockConfig::as_packed_int())) + * + * For linear dispatch (buffer storage, global_wg = {total_blocks, 1, 1}): + * - Returns {64, 1, 1} + * + * For extents dispatch (texture storage, global_wg = {x, y, z}): + * - Returns an 8x8 square configuration where: + * - Axes corresponding to inner_dim and outer_dim are set to 8 + * - The remaining axis is set to 1 + * - For example: inner_dim=W, outer_dim=H -> {8, 8, 1} + * inner_dim=W, outer_dim=C -> {8, 1, 8} + * + * @return Local workgroup size optimized for the dispatch pattern + */ +utils::uvec3 pick_square_local_wg_with_block_config( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& extra_args); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp new file mode 100644 index 00000000000..bca36444725 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +void add_q8ta_quantize_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_output) { + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + // Detect input storage type to select appropriate shader variant + utils::StorageType inp_storage = graph.storage_type_of(fp_input); + + // Build shader name: q8ta_quantize_{buffer|texture3d}_{dtype} + std::string kernel_name = "q8ta_quantize"; + add_storage_type_suffix(kernel_name, inp_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_input)); + + // Pass metadata for both output and input tensors + // Output is always buffer, input can be buffer or texture + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.buffer_meta_ubo(packed_int8_output)); + param_buffers.append(graph.meta_ubo(fp_input)); + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + // Create block config for output tensor: inner_dim = output's packed_dim + const BlockConfig outp_block_config = create_block_config_from_io_packed_dims( + graph, packed_int8_output, fp_input); + + // Create block config for input tensor: based on outp_block_config but with + // inner_dim = input's packed_dim. If input and output have different packed + // dims, the block axes are transposed. 
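+  // For example (illustrative): if outp_block_config has inner_dim = W and
+  // outer_dim = C while fp_input is channels packed (packed_dim = C), the
+  // returned config is the transposed block, i.e. inner_dim = C and
+  // outer_dim = W.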
+ const BlockConfig inp_block_config = + create_block_config_from_other(graph, fp_input, outp_block_config); + + // Cast block config to ValueRef for pick_*_global_wg_with_block_config + // Use inp_block_config since shader uses inp_block_config for indexing + const ValueRef block_config_ref = + static_cast(inp_block_config.as_packed_int()); + + // Choose dispatch function based on FP input storage type: + // - Buffer: use linear dispatch (better performance) + // - Texture: use extents-style 3D dispatch (better performance) + auto pick_global_wg_size = (inp_storage == utils::kBuffer) + ? pick_linear_global_wg_with_block_config + : pick_extents_global_wg_with_block_config; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_global_wg_size, + pick_square_local_wg_with_block_config, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, {fp_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {graph.hashed_layout_of(fp_input), + graph.hashed_layout_of(packed_int8_output), + inp_block_config.as_packed_int(), + outp_block_config.as_packed_int()}, + // Resize args + {block_config_ref})); +} + +void add_q8ta_dequantize_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output) { + float scale = graph.extract_scalar(output_scale); + int32_t zp = graph.extract_scalar(output_zp); + + // Detect output storage type to select appropriate shader variant + utils::StorageType outp_storage = graph.storage_type_of(fp_output); + + // Build shader name: q8ta_dequantize_{buffer|texture3d}_{dtype} + std::string kernel_name = "q8ta_dequantize"; + add_storage_type_suffix(kernel_name, outp_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_output)); + + // Pass metadata for both output and input tensors + // Output can be buffer or texture, input is always buffer + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.meta_ubo(fp_output)); + param_buffers.append(graph.buffer_meta_ubo(packed_int8_input)); + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + // Create block config for output tensor: inner_dim = output's packed_dim + const BlockConfig outp_block_config = create_block_config_from_io_packed_dims( + graph, fp_output, packed_int8_input); + + // Create block config for input tensor: based on outp_block_config but with + // inner_dim = input's packed_dim. If input and output have different packed + // dims, the block axes are transposed. + const BlockConfig inp_block_config = create_block_config_from_other( + graph, packed_int8_input, outp_block_config); + + // Cast block config to ValueRef for pick_*_global_wg_with_block_config + // Use inp_block_config since shader uses inp_block_config for indexing + const ValueRef block_config_ref = + static_cast(inp_block_config.as_packed_int()); + + // Choose dispatch function based on FP output storage type: + // - Buffer: use linear dispatch (better performance) + // - Texture: use extents-style 3D dispatch (better performance) + auto pick_global_wg_size = (outp_storage == utils::kBuffer) + ? 
pick_linear_global_wg_with_block_config + : pick_extents_global_wg_with_block_config; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_global_wg_size, + pick_square_local_wg_with_block_config, + // Inputs and Outputs + {{fp_output, vkapi::kWrite}, {packed_int8_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {graph.hashed_layout_of(fp_output), + graph.hashed_layout_of(packed_int8_input), + outp_block_config.as_packed_int(), + inp_block_config.as_packed_int()}, + // Resize args + {block_config_ref})); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h new file mode 100644 index 00000000000..e2c6a2c51c3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +// +// Generic Quantize, Dequantize (memory layout agnostic) +// + +void add_q8ta_quantize_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_output); + +void add_q8ta_dequantize_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp index 8ebbf6dcb99..e02a42f60e1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp @@ -9,9 +9,12 @@ #include #include +#include #include #include +#include + namespace vkcompute { // @@ -383,11 +386,10 @@ void quantize_per_tensor_impl( const ValueRef int8_output = args[last_arg_idx]; - VK_CHECK_COND( - graph.estimate_memory_layout_of(int8_output) == utils::kPackedInt8_4W4C); + VK_CHECK_COND(graph.dtype_of(int8_output) == vkapi::kInt8x4); - add_quantize_and_pack_4w4c_node( - graph, fp_input, scale, zero_point, int8_output); + // Use unified block-based dispatch for all layouts + add_q8ta_quantize_node(graph, fp_input, scale, zero_point, int8_output); } void dequantize_per_tensor_impl( @@ -409,34 +411,10 @@ void dequantize_per_tensor_impl( const ValueRef fp_output = args[last_arg_idx]; - VK_CHECK_COND( - graph.estimate_memory_layout_of(int8_input) == utils::kPackedInt8_4W4C); - - add_unpack_4w4c_and_dequantize_node( - graph, int8_input, scale, zero_point, fp_output); -} + VK_CHECK_COND(graph.dtype_of(int8_input) == vkapi::kInt8x4); -void qdq8ta_conv2d_input( - ComputeGraph& graph, - const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef scale = args.at(idx++); - const ValueRef zero_point = args.at(idx++); - const ValueRef fp_output = args.at(idx++); - - TmpTensor packed_int8_input( - &graph, - graph.sizes_of(fp_input), - vkapi::kInt8x4, - utils::kBuffer, - utils::kPackedInt8_4W4C); - - add_quantize_and_pack_4w4c_node( - graph, fp_input, scale, zero_point, packed_int8_input); - - add_unpack_4w4c_and_dequantize_node( - graph, 
packed_int8_input, scale, zero_point, fp_output); + // Use unified block-based dispatch for all layouts + add_q8ta_dequantize_node(graph, int8_input, scale, zero_point, fp_output); } REGISTER_OPERATORS { @@ -446,7 +424,6 @@ REGISTER_OPERATORS { VK_REGISTER_OP( quantized_decomposed.dequantize_per_tensor.default, dequantize_per_tensor_impl); - VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index ff00215efbc..2adb32d8c77 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include @@ -1579,7 +1579,7 @@ void conv2d_q8ta_q8csw_q8to_test( io_storage_type, utils::kPackedInt8_4W4C); - add_quantize_and_pack_4w4c_node( + add_q8ta_quantize_node( graph, fp_input, input_scale, input_zp, packed_int8_input); std::vector conv2d_args = { @@ -1601,7 +1601,7 @@ void conv2d_q8ta_q8csw_q8to_test( conv2d_q8ta_q8csw_q8to(graph, conv2d_args); - add_unpack_4w4c_and_dequantize_node( + add_q8ta_dequantize_node( graph, packed_int8_output, output_scale, output_zp, fp_output); } diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 6db814815fb..781d69c10fe 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -97,7 +97,7 @@ if(TARGET vulkan_backend) add_operator_prototype(q8csw_conv2d) add_operator_prototype(q4gsw_linear) add_operator_prototype(choose_qparams_per_row) - add_operator_prototype(qdq8ta_conv2d_activations) + add_operator_prototype(test_q8ta_qdq) add_operator_prototype(q8ta_q8csw_q8to_conv2d) add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp new file mode 100644 index 00000000000..24f19b4d309 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taQuantizeDequantize.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace vkcompute { + +void q_dq_8bit_per_tensor( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Use legacy 4W4C implementation if requested and layout matches + if (impl_selector == "legacy_4w4c" && layout == utils::kPackedInt8_4W4C) { + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_4w4c_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_unpack_4w4c_and_dequantize_node( + graph, packed_int8_input, scale, zero_point, fp_output); + } else { + // Create temporary tensor with the specified layout + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Use unified block-based dispatch + add_q8ta_quantize_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_q8ta_dequantize_node( + graph, packed_int8_input, scale, zero_point, fp_output); + } +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.q_dq_8bit_per_tensor.default, q_dq_8bit_per_tensor); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp deleted file mode 100644 index b8b33f30623..00000000000 --- a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp +++ /dev/null @@ -1,255 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 512; - -// QDQ8TA Conv2D configuration struct for 4D tensor quantize-dequantize testing -struct QDQ8TAConv2DConfig { - int64_t batch_size; // N dimension - int64_t in_channels; // C dimension - int64_t height; // H dimension - int64_t width; // W dimension - std::string test_case_name = "placeholder"; - std::string op_name = "qdq8ta_conv2d_input"; -}; - -// Utility function to create a test case from a QDQ8TAConv2DConfig -TestCase create_test_case_from_config( - const QDQ8TAConv2DConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "etvk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Input tensor (float) - [N, C, H, W] - std::vector input_size = { - config.batch_size, config.in_channels, config.height, config.width}; - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, // Use channels packed for conv2d tensors - DataGenType::RANDOM); - - float scale_val = 0.007112; - ValueSpec scale(scale_val); - - // Generate random zero point within quantization range - int32_t zero_point_val = -2; - ValueSpec zero_point(zero_point_val); - - // Output tensor (float) - same shape as input [N, C, H, W] - ValueSpec output_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(scale); - test_case.add_input_spec(zero_point); - test_case.add_output_spec(output_tensor); - - test_case.set_abs_tolerance(scale_val + 1e-4); - - // Use layout-only filter for this test since quantize/dequantize ARE the - // operations being tested, not overhead - test_case.set_shader_filter(kLayoutOnlyShaderFilter); - - return test_case; -} - -// Generate easy test cases for qdq8ta_conv2d operation (for debugging) -std::vector generate_qdq8ta_conv2d_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - QDQ8TAConv2DConfig config = { - 1, // batch_size - 3, // in_channels - 4, // height - 4, // width - "simple", // test_case_name - }; - - // Test with both storage types - std::vector storage_types = {utils::kTexture3D}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for qdq8ta_conv2d operation -std::vector generate_qdq8ta_conv2d_test_cases() { - std::vector test_cases; - - std::vector configs = { - // Small test cases for correctness - {1, 3, 16, 16}, - {1, 8, 32, 32}, - {1, 16, 24, 24}, - {1, 32, 12, 12}, - {1, 1, 64, 64}, - {1, 3, 64, 64}, - {1, 4, 16, 16}, - - // Different tensor sizes - {1, 8, 20, 20}, - {1, 16, 14, 14}, - {1, 8, 28, 28}, - - // Odd tensor sizes - {1, 3, 15, 15}, - {1, 13, 31, 31}, - {1, 17, 23, 23}, - - // Performance test cases (larger tensors) - {1, 64, 128, 128}, - {1, 32, 64, 64}, - {1, 128, 56, 56}, - }; - - // Test with different storage types - std::vector storage_types = {utils::kTexture3D}; - - for (auto config : configs) { - std::string prefix = - (config.batch_size < kRefDimSizeLimit && - config.in_channels < kRefDimSizeLimit && - config.height < kRefDimSizeLimit && config.width < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + - std::to_string(config.batch_size) + "_" + - std::to_string(config.in_channels) + "_" + - std::to_string(config.height) + "_" + std::to_string(config.width); - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for qdq8ta_conv2d operation -void qdq8ta_conv2d_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& scale_spec = test_case.inputs()[idx++]; - const ValueSpec& zero_point_spec = test_case.inputs()[idx++]; - - // Extract output specification - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C, H, W] - int64_t N = input_sizes[0]; - int64_t C = input_sizes[1]; - int64_t H = input_sizes[2]; - int64_t W = input_sizes[3]; - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C > kRefDimSizeLimit || H > kRefDimSizeLimit || - W > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (N, C, H, W) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - - // Extract the randomized scale and zero point values (following - // q8csw_conv2d.cpp pattern) - float scale = scale_spec.get_float_value(); - int32_t zero_point = zero_point_spec.get_int_value(); - int32_t quant_min = -128; - int32_t quant_max = 127; - - // Prepare output data - auto& ref_data = output_spec.get_ref_float_data(); - int64_t num_elements = N * C * H * W; - ref_data.resize(num_elements); - - // Perform quantize-dequantize operation on each element - for (int64_t i = 0; i < num_elements; ++i) { - float input_val = input_data[i]; - - // Quantize: quantized = round(input / scale + zero_point) - float quantized_float = std::round(input_val / scale) + zero_point; - - // Clamp to quantization range - quantized_float = std::max(quantized_float, static_cast(quant_min)); - quantized_float = std::min(quantized_float, static_cast(quant_max)); - - int32_t quantized_int = static_cast(quantized_float); - - // Dequantize: output = (quantized - zero_point) * scale - float dequantized = (quantized_int - zero_point) * scale; - - ref_data[i] = dequantized; - } -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "QDQ8TA Conv2D Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = qdq8ta_conv2d_reference_impl; - - auto results = execute_test_cases( - generate_qdq8ta_conv2d_test_cases, "QDQ8TAConv2D", 0, 1, ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 6f35444570b..6633365838c 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -91,7 +91,7 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q8csw_conv2d") define_custom_op_test_binary("choose_qparams_per_row") 
define_custom_op_test_binary("q4gsw_linear") - define_custom_op_test_binary("qdq8ta_conv2d_activations") + define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp new file mode 100644 index 00000000000..e0efd6ea85d --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp @@ -0,0 +1,348 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#include + +// #define DEBUG_MODE + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 512; + +// Configuration struct for tensor quantize-dequantize testing +struct QDQ8BitConfig { + std::vector shape; // Tensor shape (can be any dimensionality) + std::string test_case_name = "placeholder"; + std::string op_name = "q_dq_8bit_per_tensor"; +}; + +// Utility function to create a test case from a QDQ8BitConfig +TestCase create_test_case_from_config( + const QDQ8BitConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype, + utils::GPUMemoryLayout fp_memory_layout, + utils::GPUMemoryLayout quantized_memory_layout, + const std::string& impl_selector = "") { + TestCase test_case; + + // Create a descriptive name for the test case + // Format: ACCU/PERF I=N,C,H,W Tex_FP->Buf_Quant + std::string shape_str = shape_string(config.shape); + std::string test_name = config.test_case_name + " I=" + shape_str + " " + + repr_str(storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, quantized_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "test_etvk." 
+ config.op_name + ".default"; + test_case.set_operator_name(operator_name); + + // Input tensor (float) - any dimensionality + ValueSpec input_tensor( + config.shape, + input_dtype, + storage_type, + fp_memory_layout, + DataGenType::RANDOM); + + float scale_val = 0.007112; + ValueSpec scale(scale_val); + + // Generate random zero point within quantization range + int32_t zero_point_val = 0; + ValueSpec zero_point(zero_point_val); + + // GPUMemoryLayout as integer (will be cast in the operator) + int32_t layout_int = static_cast(quantized_memory_layout); + ValueSpec layout_spec(layout_int); + + // impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + + // Output tensor (float) - same shape as input + ValueSpec output_tensor( + config.shape, + input_dtype, + storage_type, + fp_memory_layout, + DataGenType::ZEROS); + + // Add all specs to test case + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(scale); + test_case.add_input_spec(zero_point); + test_case.add_input_spec(layout_spec); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output_tensor); + + test_case.set_abs_tolerance(scale_val + 1e-4); + + // Use layout-only filter for this test since quantize/dequantize ARE the + // operations being tested, not overhead + test_case.set_shader_filter(kLayoutOnlyShaderFilter); + + return test_case; +} + +// Generate easy test cases for q_dq_8bit operation (for debugging) +std::vector generate_q_dq_8bit_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + QDQ8BitConfig config = { + {1, 16, 16, 16}, // shape: [N, C, H, W] + "ACCU", // test_case_name + }; + + // FP memory layouts to test + std::vector fp_layouts = { + utils::kWidthPacked, + utils::kChannelsPacked, + }; + + // Quantized memory layouts to test + std::vector quant_layouts = { + utils::kPackedInt8_4W, + utils::kPackedInt8_4C, + utils::kPackedInt8_4W4C, + utils::kPackedInt8_4H4W, + utils::kPackedInt8_4C1W, + }; + + std::vector storage_types = {utils::kBuffer}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& fp_layout : fp_layouts) { + for (const auto& quant_layout : quant_layouts) { + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back(create_test_case_from_config( + config, storage_type, input_dtype, fp_layout, quant_layout)); + // For 4W4C layout, also test with legacy implementation + if (quant_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + storage_type, + input_dtype, + fp_layout, + quant_layout, + /*impl_selector=*/"legacy_4w4c")); + } + } + } + } + } + + return test_cases; +} + +// Generate test cases for q_dq_8bit operation +std::vector generate_q_dq_8bit_test_cases() { + std::vector test_cases; + + // Shapes to test (no layout specified - will be combined with all layouts) + std::vector> shapes = { + // Small test cases for correctness + {1, 3, 16, 16}, + {1, 8, 32, 32}, + {1, 16, 24, 24}, + {1, 32, 12, 12}, + {1, 1, 64, 64}, + {1, 3, 64, 64}, + {1, 4, 16, 16}, + + // Different tensor sizes + {1, 8, 20, 20}, + {1, 16, 14, 14}, + {1, 8, 28, 28}, + + // Odd tensor sizes + {1, 3, 15, 15}, + {1, 13, 31, 31}, + {1, 17, 23, 23}, + + // Performance test cases (larger tensors) + {1, 64, 128, 128}, + {1, 32, 64, 64}, + {1, 128, 56, 56}, + }; + + // FP memory layouts to test + std::vector fp_layouts = { + utils::kWidthPacked, + 
utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test both buffer and texture storage for the floating-point tensors; the
+  // unified block-based shaders select a buffer or texture variant based on
+  // the storage type of the fp tensor.
+  std::vector<utils::StorageType> storage_types = {
+      utils::kBuffer, utils::kTexture3D};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          QDQ8BitConfig config;
+          config.shape = shape;
+          config.test_case_name = prefix;
+
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, vkapi::kFloat, fp_layout, quant_layout));
+          // For 4W4C layout, also test with legacy implementation
+          if (fp_layout == utils::kChannelsPacked &&
+              quant_layout == utils::kPackedInt8_4W4C) {
+            test_cases.push_back(create_test_case_from_config(
+                config,
+                storage_type,
+                vkapi::kFloat,
+                fp_layout,
+                quant_layout,
+                /*impl_selector=*/"legacy_4w4c"));
+          }
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for q_dq_8bit operation
+void q_dq_8bit_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_spec = test_case.inputs()[idx++];
+  const ValueSpec& scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& layout_spec = test_case.inputs()[idx++];
+  (void)layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions (arbitrary dimensionality)
+  auto input_sizes = input_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference implementation.");
+    }
+  }
+
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_data = input_spec.get_float_data();
+
+  // Extract the scale and zero point values
+  float scale = scale_spec.get_float_value();
+  int32_t zero_point = zero_point_spec.get_int_value();
+  int32_t quant_min = -128;
+  int32_t quant_max = 127;
+
+  // Prepare output data
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantize-dequantize operation on each element
+  for (int64_t i = 0; i < num_elements; ++i) {
+    float input_val = input_data[i];
+
+    // Quantize: quantized = round(input / scale) + zero_point
+    float quantized_float = std::round(input_val / scale) + zero_point;
+
+    // Clamp to quantization range
+    quantized_float = std::max(quantized_float, static_cast<float>(quant_min));
+    quantized_float = std::min(quantized_float, static_cast<float>(quant_max));
+
+    int32_t quantized_int = static_cast<int32_t>(quantized_float);
+
+    // Dequantize: output = (quantized - zero_point) * scale
+    float dequantized = (quantized_int - zero_point) * scale;
+
+    ref_data[i] = dequantized;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q/DQ 8-bit Per Tensor Operation Prototyping Framework"
+            << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q_dq_8bit_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q_dq_8bit_easy_cases,
+#else
+      generate_q_dq_8bit_test_cases,
+#endif
+      "QDQ8Bit",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}
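For reference, below is a minimal standalone sketch of the per-tensor quantize-dequantize round trip that the reference implementation above computes. It is plain C++, independent of the ExecuTorch Vulkan prototyping framework, and the sample values are illustrative only; it is not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// 8-bit affine quantize followed by dequantize, matching the reference
// implementation above: q = clamp(round(x / scale) + zp, -128, 127) and
// x' = (q - zp) * scale.
float q_dq_8bit(float x, float scale, int32_t zp) {
  float q = std::round(x / scale) + static_cast<float>(zp);
  q = std::max(q, -128.0f);
  q = std::min(q, 127.0f);
  const int32_t q_int = static_cast<int32_t>(q);
  return static_cast<float>(q_int - zp) * scale;
}

int main() {
  const float scale = 0.007112f; // same scale as the test cases above
  const int32_t zp = 0;
  for (float x : {-0.5f, -0.0071f, 0.0f, 0.25f, 0.9f}) {
    const float x_qdq = q_dq_8bit(x, scale, zp);
    // For values inside the representable range the round-trip error is
    // bounded by scale / 2, which is why the test cases above use
    // scale + 1e-4 as the absolute tolerance.
    std::printf(
        "x = %+.4f  ->  %+.4f  (err %.5f)\n", x, x_qdq, std::abs(x - x_qdq));
  }
  return 0;
}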