Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,7 @@ jobs:
./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
# "Classic" Operator tests
Expand Down
280 changes: 280 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/block_indexing.glslh
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#ifndef BLOCK_GLSLH
#define BLOCK_GLSLH

#include "block_int8x4_utils.glslh"
#include "common.glslh"

//
// Block Layout Int Utils
//

// These macros extract fields from the packed int returned by
// BlockConfig::as_packed_int(). See Common.h for the bit layout.
//
// Bit layout matches hashed layout format:
// bits 0- 3: block_dim_order[0] (inner_dim if not transposed, outer_dim if transposed)
// bits 4- 7: block_dim_order[1] (outer_dim if not transposed, inner_dim if transposed)
// bits 8-11: block_dim_order[2] (first_nonblock_dim)
// bits 12-15: block_dim_order[3] (second_nonblock_dim)
// bits 16-19: inner_dim
// bits 20-23: outer_dim
// bits 24-27: inner_dim_block_size
// bits 28-31: outer_dim_block_size

// Extract block_dim_order elements (bits 0-15)
#define get_block_dim_order_0(x) ((x) & 0xF)
#define get_block_dim_order_1(x) (((x) >> 4) & 0xF)
#define get_block_dim_order_2(x) (((x) >> 8) & 0xF)
#define get_block_dim_order_3(x) (((x) >> 12) & 0xF)

// Extract packed_dim_info (bits 16-31)
#define get_block_inner_dim(x) (((x) >> 16) & 0xF)
#define get_block_outer_dim(x) (((x) >> 20) & 0xF)
#define get_block_inner_dim_block_size(x) (((x) >> 24) & 0xF)
#define get_block_outer_dim_block_size(x) (((x) >> 28) & 0xF)

/*
* Block-based programming utilities for compute shaders.
*
* A "block" is a 4x4 tile of tensor elements. The two dimensions of the block
* are called the "inner" dimension and the "outer" dimension. The inner dim is
* the one that is kept contiguous in memory (i.e. the packed dimension of the
* tensor), and the outer dim forms the other axis of the 2D block.
*
* For texel-packed tensors (single level of packing), a block is effectively a
* single texel repeated 4 times along the outer dimension. For block-packed
* tensors (two levels of packing), a block corresponds exactly to the 4x4
* packed unit.
*
* When dispatching a block-based shader:
* - gl_GlobalInvocationID.x = block index along inner dimension
* - gl_GlobalInvocationID.y = block index along outer dimension
* - gl_GlobalInvocationID.z = plane index (remaining dimensions flattened)
*/

//
// Index Conversion Utilities (TensorIndex4D versions)
//

/*
 * Converts a flat (contiguous) block index into the TensorIndex4D of the
 * block's first element, for a buffer-backed tensor.
 *
 * meta:         BufferMetadata; tensor sizes in WHCN order (meta.sizes[0][d]).
 * block_idx:    linear index over all 4x4 blocks of the tensor, with the
 *               inner block dim fastest-moving, then the outer block dim,
 *               then the two non-block dims.
 * block_config: packed block configuration from BlockConfig::as_packed_int(),
 *               decoded with the get_block_* macros above.
 */
TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config(
    const BufferMetadata meta,
    const uint block_idx,
    const int block_config) {
  TensorIndex4D tidx;

  // Stride (in blocks) of each tensor dim within the flattened block index.
  uint block_strides[4];

  uint stride = 1;
  // Inner block dim: fastest moving, so its block stride is 1.
  const int packed_dim_1 = get_block_inner_dim(block_config);
  block_strides[packed_dim_1] = 1;
  const uint block_size_1 = uint(get_block_inner_dim_block_size(block_config));
  stride = div_up(meta.sizes[0][packed_dim_1], block_size_1);
  // Outer block dim: number of blocks along it is rounded up (div_up) so
  // partially-filled blocks at the boundary are counted.
  const int packed_dim_2 = get_block_outer_dim(block_config);
  block_strides[packed_dim_2] = stride;
  const uint block_size_2 =
      uint(get_block_outer_dim_block_size(block_config));
  stride *= div_up(meta.sizes[0][packed_dim_2], block_size_2);
  // First non-block dim: not blocked, so its full size contributes.
  const int outer_dim_1 = get_block_dim_order_2(block_config);
  block_strides[outer_dim_1] = stride;
  stride *= meta.sizes[0][outer_dim_1];
  // Second non-block dim: slowest moving.
  const int outer_dim_2 = get_block_dim_order_3(block_config);
  block_strides[outer_dim_2] = stride;

  // Peel dims off the flat index from slowest- to fastest-moving. Block dims
  // are scaled by 4 (mul_4) to convert block coordinates back to element
  // coordinates.
  uint contig_idx = block_idx;
  // Second non-block dim
  tidx.data[outer_dim_2] = int(contig_idx / block_strides[outer_dim_2]);
  contig_idx %= block_strides[outer_dim_2];
  // First non-block dim
  tidx.data[outer_dim_1] = int(contig_idx / block_strides[outer_dim_1]);
  contig_idx %= block_strides[outer_dim_1];
  // Outer block dim (block coord -> element coord via mul_4)
  tidx.data[packed_dim_2] =
      int(mul_4(contig_idx / block_strides[packed_dim_2]));
  contig_idx %= block_strides[packed_dim_2];
  // Inner block dim (block coord -> element coord via mul_4)
  tidx.data[packed_dim_1] = int(mul_4(contig_idx));

  return tidx;
}

//
// TextureMetadata variants of block indexing
//

/*
 * TextureMetadata variant: identical decomposition to the BufferMetadata
 * overload, but tensor sizes are indexed directly (meta.sizes[dim]).
 */
TensorIndex4D contiguous_block_idx_to_tensor4d_idx_with_block_config(
    const TextureMetadata meta,
    const uint block_idx,
    const int block_config) {
  TensorIndex4D tidx;

  // Decode the four dims of the block layout up front.
  const int inner_bdim = get_block_inner_dim(block_config);
  const int outer_bdim = get_block_outer_dim(block_config);
  const int free_dim_a = get_block_dim_order_2(block_config);
  const int free_dim_b = get_block_dim_order_3(block_config);

  const uint inner_bsize = uint(get_block_inner_dim_block_size(block_config));
  const uint outer_bsize = uint(get_block_outer_dim_block_size(block_config));

  // Block strides per dim, built from fastest- to slowest-moving.
  uint bstrides[4];
  bstrides[inner_bdim] = 1;
  uint running = div_up(meta.sizes[inner_bdim], inner_bsize);
  bstrides[outer_bdim] = running;
  running *= div_up(meta.sizes[outer_bdim], outer_bsize);
  bstrides[free_dim_a] = running;
  running *= meta.sizes[free_dim_a];
  bstrides[free_dim_b] = running;

  // Decompose the flat block index, slowest dim first; the two block dims
  // are scaled by 4 to yield element coordinates.
  uint rem = block_idx;
  tidx.data[free_dim_b] = int(rem / bstrides[free_dim_b]);
  rem %= bstrides[free_dim_b];
  tidx.data[free_dim_a] = int(rem / bstrides[free_dim_a]);
  rem %= bstrides[free_dim_a];
  tidx.data[outer_bdim] = int(mul_4(rem / bstrides[outer_bdim]));
  rem %= bstrides[outer_bdim];
  tidx.data[inner_bdim] = int(mul_4(rem));

  return tidx;
}

//
// 3D Block Index Conversion Utilities (WHCN Dispatch)
//
// These functions convert a 3D thread index (gl_GlobalInvocationID) to a
// TensorIndex4D using a dispatch pattern:
// - thread_idx.x = W threads (divided by 4 if W is part of block)
// - thread_idx.y = H threads (divided by 4 if H is part of block)
// - thread_idx.z = C * N threads (C divided by 4 if C is part of block)
//
// Note: GLSL tensor metadata is in WHCN order (sizes[0]=W, sizes[1]=H,
// sizes[2]=C, sizes[3]=N), while C++ uses NCHW order.
//

/*
* Convert a 3D block index to a TensorIndex4D using WHCN dispatch.
*
* Parameters:
* meta: BufferMetadata with tensor sizes in WHCN order
* thread_idx: 3D thread index (x=W, y=H, z=C*N)
* block_config: Packed block configuration from BlockConfig::as_packed_int()
*
* Returns: TensorIndex4D with logical tensor coordinates
*/
/*
 * Convert a 3D block index to a TensorIndex4D using WHCN dispatch.
 *
 * Parameters:
 *   meta: BufferMetadata with tensor sizes in WHCN order
 *   thread_idx: 3D thread index (x=W, y=H, z=C*N)
 *   block_config: Packed block configuration from BlockConfig::as_packed_int()
 *
 * Returns: TensorIndex4D with logical tensor coordinates
 */
TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config(
    const BufferMetadata meta,
    const uvec3 thread_idx,
    const int block_config) {
  TensorIndex4D tidx;
  const int inner_dim = get_block_inner_dim(block_config);
  const int outer_dim = get_block_outer_dim(block_config);

  // A dim is part of the 4x4 block iff it is the inner or outer block dim.
  // Blocked dims advance 4 elements per thread; the rest advance 1.
  const bool w_blocked = (inner_dim == 0 || outer_dim == 0);
  const bool h_blocked = (inner_dim == 1 || outer_dim == 1);
  const bool c_blocked = (inner_dim == 2 || outer_dim == 2);

  // GLSL metadata is WHCN: sizes[0][2] is the channel count. thread_idx.z
  // covers C * N, with C counted in blocks of 4 when channels are blocked.
  const uint C_size = uint(meta.sizes[0][2]);
  const uint num_C = c_blocked ? div_up_4(C_size) : C_size;

  tidx.data[0] = int(thread_idx.x) * (w_blocked ? 4 : 1);
  tidx.data[1] = int(thread_idx.y) * (h_blocked ? 4 : 1);
  tidx.data[2] = int(thread_idx.z % num_C) * (c_blocked ? 4 : 1);
  // N (dim 3) is never blocked.
  tidx.data[3] = int(thread_idx.z / num_C);

  return tidx;
}

/*
* Convert a 3D block index to a TensorIndex4D (TextureMetadata variant).
*/
/*
 * Convert a 3D block index to a TensorIndex4D (TextureMetadata variant).
 * Same contract as the BufferMetadata overload; sizes are indexed directly.
 */
TensorIndex4D block_idx_3d_to_tensor4d_idx_with_block_config(
    const TextureMetadata meta,
    const uvec3 thread_idx,
    const int block_config) {
  TensorIndex4D tidx;
  const int inner_dim = get_block_inner_dim(block_config);
  const int outer_dim = get_block_outer_dim(block_config);

  // Per-dim scale: 4 when the dim participates in the 4x4 block, else 1.
  const int w_scale = (inner_dim == 0 || outer_dim == 0) ? 4 : 1;
  const int h_scale = (inner_dim == 1 || outer_dim == 1) ? 4 : 1;
  const int c_scale = (inner_dim == 2 || outer_dim == 2) ? 4 : 1;

  // thread_idx.z flattens C * N; C is counted in blocks of 4 when blocked.
  const uint C_size = uint(meta.sizes[2]);
  const uint num_C = (c_scale == 4) ? div_up_4(C_size) : C_size;

  tidx.data[0] = int(thread_idx.x) * w_scale;
  tidx.data[1] = int(thread_idx.y) * h_scale;
  tidx.data[2] = int(thread_idx.z % num_C) * c_scale;
  // N (dim 3) is never blocked.
  tidx.data[3] = int(thread_idx.z / num_C);

  return tidx;
}

#endif // BLOCK_GLSLH
74 changes: 74 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/block_int8x4_load.glslh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

/*
* Macro to generate int8x4 block loading functions for a specific buffer.
*
* Usage:
* define_load_int8x4_buffer_fns(t_inp)
*
* This generates:
* - load_int8x4_block_from_t_inp(meta, tidx_base, layout, block_outer_dim)
*
* IMPORTANT: block_outer_dim must be such that the inner dimension (packed_dim)
* contains 4 contiguous int8 elements packed into one int32. If the loaded
* block needs to be transposed to match a different output layout, that
* transposition must be done by the caller.
*/

#ifndef BLOCK_INT8X4_LOAD_GLSLH
#define BLOCK_INT8X4_LOAD_GLSLH

#define define_load_int8x4_buffer_fns(buffer_name) \
  \
  /* Loads a 4x4 block of int8 elements (returned as 4 packed int32 texels) */ \
  /* from buffer_name. tidx_base is the tensor index of the block's first */ \
  /* element; block_outer_dim selects the dim traversed across the 4 rows */ \
  /* of the returned block. The inner (packed) dim must supply the 4 */ \
  /* contiguous int8 values inside each returned int32 (see file header). */ \
  ivec4 load_int8x4_block_from_##buffer_name( \
      const BufferMetadata meta, \
      const TensorIndex4D tidx_base, \
      const int hashed_layout, \
      const int block_outer_dim) { \
    const int outer_packed_dim = get_outer_packed_dim(hashed_layout); \
    const int outer_block_size = \
        get_outer_packed_dim_block_size(hashed_layout); \
    \
    /* Compute base packed index using block-based indexing */ \
    const uint block_idx = \
        tensor4d_idx_to_block_idx(meta, tidx_base, hashed_layout); \
    const uint texels_per_block = div_4(get_block_numel(hashed_layout)); \
    uint buf_idx = block_idx * texels_per_block; \
    \
    /* Fast path: contiguous texels when iterating along outer_packed_dim */ \
    if (outer_block_size == 4) { \
      if (block_outer_dim == outer_packed_dim) { \
        return ivec4( \
            buffer_name[buf_idx], \
            buffer_name[buf_idx + 1], \
            buffer_name[buf_idx + 2], \
            buffer_name[buf_idx + 3]); \
      } \
      else { \
        /* Block-packed but iterating a different dim: offset to this */ \
        /* row within the 4-texel packed unit, then fall through to the */ \
        /* strided general path below. */ \
        buf_idx += mod_4(tidx_base.data[outer_packed_dim]); \
      } \
    } \
    \
    /* General path: use stride for non-contiguous access */ \
    const uint outer_stride = \
        stride_at(meta, block_outer_dim) * texels_per_block; \
    const uint outer_size = size_at(meta, block_outer_dim); \
    const int base_outer_idx = tidx_base.data[block_outer_dim]; \
    \
    /* Rows that fall past the tensor boundary are left zero-filled. */ \
    ivec4 block = ivec4(0); \
    [[unroll]] for (int block_y = 0; block_y < 4; ++block_y) { \
      if (base_outer_idx + block_y < int(outer_size)) { \
        block[block_y] = buffer_name[buf_idx]; \
      } \
      buf_idx += outer_stride; \
    } \
    return block; \
  }

#endif // BLOCK_INT8X4_LOAD_GLSLH
Loading
Loading