From 178f98e71e261d05105983f0ab6a8eb9ce8a9ad7 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Tue, 3 Feb 2026 14:58:29 -0800
Subject: [PATCH] [ET][Cadence] patch transpose kernel for safe load & store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Original patch: https://github.com/cad-audio/executorch/commit/1cb9ee714731d4d615850d26a34b8d4480bf26fe#diff-72cb6b6b59f6d8be4021885fa14490cd182776f883b6d2132678968b9e7cb267

# Fixing a Crash in `xa_nn_transpose_32` for Unaligned Buffers

## Overview

This document explains the code changes made to `xa_nn_transpose_32.c` to fix a **memory-alignment crash** in the transpose operation for Cadence HiFi DSPs. The patch addresses two bugs that caused crashes when processing unaligned input/output buffers.

## Files Modified

| File | Description |
|------|-------------|
| `backends/cadence/hifi/operators/op_permute_copy.cpp` | Permute copy operator |
| `backends/cadence/hifi/operators/op_transpose_copy.cpp` | Transpose copy operator |
| `backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c` | Core transpose kernel |

---

## Change 1: Simplified Inner Loop (Lines 170-191)

### The Problem

The original code had a **flawed alignment check**:

```c
// OLD CODE - BUGGY
if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0))
{
  // "Aligned" path using AE_L32X2_IP / AE_S32X2_IP
  ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
  ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
  ae_int32x2 d0;
  for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
  {
    AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));  // Aligned load
    AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));  // Aligned store
  }
  // ... handle remainder
}
else
{
  // Unaligned path using AE_LA32X2_IP / AE_SA32X2_IP
  // ...
}
```

### Root Cause Analysis

The alignment check `& 1` only verifies **2-byte alignment** (it checks whether the least significant bit is 0). However:

| Data Type | Size | Required Alignment |
|-----------|------|--------------------|
| `WORD32` | 4 bytes | 4-byte alignment |
| `ae_int32x2` | 8 bytes (64-bit SIMD) | **8-byte alignment** |

**The bug:** The check should be:

- `(ptr & 0x7) == 0` for 8-byte alignment, OR
- `(ptr & 0x3) == 0` for 4-byte alignment (minimum)

The incorrect `& 1` check meant that when pointers were 2-byte aligned but **NOT** 8-byte aligned, the code incorrectly took the "aligned" path.

### Consequences of the Bug

When `AE_L32X2_IP` / `AE_S32X2_IP` (aligned intrinsics) are used on misaligned addresses:

1. **Hardware Exception**: Cadence HiFi DSPs may generate an alignment exception
2. **Crash**: The program terminates unexpectedly
3. **Data Corruption**: In some configurations, incorrect data may be loaded/stored silently
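To make the failure mode concrete, here is a minimal host-side sketch of the two checks (the address value is hypothetical, and `uintptr_t` stands in for the kernel's `unsigned` cast):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Hypothetical buffer address: 4-byte aligned, but NOT 8-byte aligned.
  uintptr_t addr = 0x2000000Cu;

  // Old check: tests only the least significant bit (2-byte alignment).
  int old_check = ((addr & 1) == 0);    // 1 -> old code takes the "aligned" path

  // Check the 64-bit SIMD path actually needs (8-byte alignment).
  int safe_check = ((addr & 0x7) == 0); // 0 -> must take the unaligned path

  printf("old: %d, required: %d\n", old_check, safe_check);
  // Prints "old: 1, required: 0" - at this address the old code would issue
  // an aligned AE_L32X2_IP load and fault on hardware.
  return 0;
}
```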
### The Fix

```c
// NEW CODE - SAFE (always uses unaligned intrinsics)
WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_valign a_inp = AE_LA64_PP(pae_i);  // Prime alignment register for input
ae_valign a_out = AE_ZALIGN64();      // Initialize alignment register for output
ae_int32x2 d0;
for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
{
  AE_LA32X2_IP(d0, a_inp, pae_i);  // Unaligned load (safe for any alignment)
  AE_SA32X2_IP(d0, a_out, pae_o);  // Unaligned store (safe for any alignment)
}
AE_SA64POS_FP(a_out, pae_o);  // Flush remaining data in alignment register
ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
#pragma loop_count max=3
for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
{
  puae_o[itr4] = puae_i[itr4];  // Handle odd element
}
```

### Why This Fix Works

1. **Always uses unaligned intrinsics** (`AE_LA32X2_IP` / `AE_SA32X2_IP`)
   - These intrinsics work correctly for **both** aligned and unaligned data
   - They use internal alignment registers to handle any pointer alignment
2. **Eliminates the faulty branch entirely**
   - No alignment check needed
   - No risk of taking the wrong code path
3. **Minimal performance impact**
   - On modern HiFi DSPs, unaligned intrinsics on aligned data have negligible overhead
   - The hardware efficiently handles the alignment internally

### Intrinsic Comparison

| Intrinsic | Type | Alignment Requirement | Use Case |
|-----------|------|-----------------------|----------|
| `AE_L32X2_IP` | Aligned load | **Requires 8-byte aligned pointer** | Only when alignment is guaranteed |
| `AE_LA32X2_IP` | Unaligned load | Works with any alignment | Safe for all cases |
| `AE_S32X2_IP` | Aligned store | **Requires 8-byte aligned pointer** | Only when alignment is guaranteed |
| `AE_SA32X2_IP` | Unaligned store | Works with any alignment | Safe for all cases |
| `AE_SA64POS_FP` | Flush | N/A | Must be called after unaligned stores |
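The fix is an instance of the general prime/loop/flush idiom for unaligned SIMD access. The following sketch restates it as a standalone 32-bit copy routine; it is illustrative only, compiles only for a HiFi target, and assumes the same headers and `WORD32` typedef that `xa_nn_transpose_32.c` already uses (the function name is made up):

```c
// Minimal sketch of the prime/loop/flush idiom, assuming the NNLib/Xtensa
// headers used by xa_nn_transpose_32.c are in scope (WORD32, ae_int32x2,
// ae_valign, and the AE_* intrinsics).
void copy_w32_any_alignment(WORD32 *p_out, WORD32 *p_inp, int n)
{
  ae_int32x2 *pae_i = (ae_int32x2 *)p_inp;
  ae_int32x2 *pae_o = (ae_int32x2 *)p_out;

  ae_valign a_inp = AE_LA64_PP(pae_i); // 1. Prime: capture input misalignment
  ae_valign a_out = AE_ZALIGN64();     // 1. Prime: empty output alignment register

  ae_int32x2 d0;
  int i;
  for (i = 0; i < (n >> 1); i++)       // 2. Loop: two WORD32 (64 bits) per step
  {
    AE_LA32X2_IP(d0, a_inp, pae_i);
    AE_SA32X2_IP(d0, a_out, pae_o);
  }
  AE_SA64POS_FP(a_out, pae_o);         // 3. Flush: commit bytes still buffered
                                       //    in the output alignment register
  if (n & 1)                           // Scalar tail for an odd element count
  {
    ((ae_int32 *)pae_o)[0] = ((ae_int32 *)pae_i)[0];
  }
}
```

Forgetting the `AE_SA64POS_FP` flush is the classic mistake with this idiom: the final partial word can remain buffered in the output alignment register and never reach memory.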
---

## Change 2: Fixed Strided Load (Lines 216-222)

### The Problem

```c
// OLD CODE - PROBLEMATIC
AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
```

### Root Cause Analysis

The `AE_L32_XP` intrinsic:

- Loads a 32-bit value from the pointer
- **Modifies the pointer** by adding the byte offset (second parameter)
- Has specific alignment and type requirements

**Issues with the old code:**

1. **Pointer update through a cast**: `AE_L32_XP` updates its pointer operand in place, but here the operand is the cast expression `(ae_int32 *)p_inp4`, not `p_inp4` itself. Whether the post-increment propagates back to the `WORD32 *` variable depends on compiler-specific treatment of casts as lvalues, which is fragile at best and undefined behavior at worst
2. **Byte offset calculation**: the `inp_stride[...] << 2` stride-to-bytes conversion must stay in sync with the pointer type used in the cast, which is easy to get wrong
3. **Intrinsic alignment requirements**: `AE_L32_XP` may have stricter alignment requirements than expected on some HiFi variants

### The Fix

```c
// NEW CODE - EXPLICIT AND SAFE
d0 = AE_L32_X((ae_int32 *)p_inp4, 0);       // Load from p_inp4 + 0 bytes
p_inp4 += inp_stride[p_5D_permute_vec[4]];  // Manual pointer increment (elements)
d1 = AE_L32_X((ae_int32 *)p_inp4, 0);       // Load from updated p_inp4
p_inp4 += inp_stride[p_5D_permute_vec[4]];  // Manual pointer increment (elements)
```

### Why This Fix Works

1. **`AE_L32_X(ptr, offset)`**
   - Loads from `ptr + offset` **without modifying** the pointer
   - More predictable behavior
   - No side effects on the pointer variable
2. **Explicit pointer arithmetic**
   - `p_inp4 += stride` increments by **elements** (not bytes)
   - Correct for the `WORD32 *` pointer type
   - The `<< 2` byte conversion is no longer needed
3. **Clearer semantics**
   - Load and pointer update are separate operations
   - Easier to debug and maintain
   - No reliance on macro-specific behavior

### Comparison

| Aspect | Old (`AE_L32_XP`) | New (`AE_L32_X` + manual increment) |
|--------|-------------------|-------------------------------------|
| Pointer update | Inside intrinsic (byte offset) | Explicit (element offset) |
| Offset calculation | `stride << 2` (bytes) | `stride` (elements) |
| Clarity | Implicit side effect | Explicit, readable |
| Safety | Potential alignment/lvalue issues | Well-defined behavior |
| Debuggability | Hard to trace | Easy to step through |

---

## Summary

| Change | Root Cause | Fix Strategy | Impact |
|--------|------------|--------------|--------|
| Inner loop simplification | Wrong alignment check (`& 1` instead of `& 0x7`) | Always use unaligned intrinsics | Prevents crashes on misaligned buffers |
| Strided load fix | `AE_L32_XP` pointer-update issues | Use `AE_L32_X` + manual pointer increment | Prevents potential undefined behavior |

### Key Takeaway

**Both fixes follow the same principle:** instead of trying to detect alignment and switch between code paths, use intrinsics that handle unaligned data correctly in all cases. This approach is:

- ✅ **Safer** - No risk of incorrect alignment detection
- ✅ **Simpler** - Less code, fewer branches
- ✅ **Minimal performance impact** - Modern HiFi DSPs handle unaligned access efficiently
- ✅ **More maintainable** - Easier to understand and debug

---

## Additional Changes in Operator Files

### `op_permute_copy.cpp`

```cpp
// Changed: Use input tensor's scalar type, not output
const auto in_type = in.scalar_type();  // Was: out.scalar_type()

// Changed: Reduced max dimensions from 16 to 5
constexpr int kNnlibMaxDim = 5;  // Was: 16

// Changed: Separate loops for clarity
for (int i = 0; i < num_inp_dims; i++) {
  p_inp_shape[i] = in.size(i);
}

for (int i = 0; i < num_out_dims; i++) {
  p_out_shape[i] = out.size(i);
}

for (int i = 0; i < num_inp_dims; i++) {
  p_permute_vec[i] = dims[i];
}
```

### `op_transpose_copy.cpp`

```cpp
// Changed: Use input tensor's scalar type
const auto in_type = in.scalar_type();  // Was: out.scalar_type()

// Changed: Use consistent loop variable
for (int i = 0; i < num_inp_dims; i++) {  // Was: in.dim()
  p_inp_shape[i] = in.size(i);
  p_out_shape[i] = out.size(i);
}
```

These changes keep the type check and the dimension bookkeeping consistent with the data actually being processed.

---

## Testing Recommendations

1. **Test with unaligned buffers**: Create test cases where input/output buffers are not 8-byte aligned (see the sketch after this list)
2. **Test various tensor shapes**: Include odd dimensions that result in unaligned memory access patterns
3. **Test on actual HiFi hardware**: Alignment issues may not manifest in simulation
4. **Stress test with random data**: Verify correctness across many iterations
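As a concrete starting point for recommendation 1, the sketch below forces 4-byte-but-not-8-byte alignment by offsetting into over-allocated, 8-byte-aligned arenas. It is a hedged example: the shape and buffer names are hypothetical, and the `xa_nn_transpose_32_32` argument order is assumed from the usual NNLib convention, so verify it against the actual prototype:

```c
// Sketch of an unaligned-buffer test. Assumes the NNLib headers are in scope
// for WORD32 and the xa_nn_transpose_32_32 prototype; the argument order used
// here (out ptr/shape, in ptr/shape, permute vector, dim counts) is an
// assumption to check against the header.
#define N_ELEMS 64  /* hypothetical 8x8 transpose */

static WORD32 arena_in[N_ELEMS + 1] __attribute__((aligned(8)));
static WORD32 arena_out[N_ELEMS + 1] __attribute__((aligned(8)));

int test_unaligned_transpose(void)
{
  // +1 WORD32 = +4 bytes: both pointers pass the old "& 1" check while being
  // only 4-byte aligned - exactly the case that used to crash.
  WORD32 *p_inp = arena_in + 1;
  WORD32 *p_out = arena_out + 1;

  WORD32 inp_shape[2] = {8, 8};
  WORD32 out_shape[2] = {8, 8};
  WORD32 permute_vec[2] = {1, 0};  // plain 2-D transpose

  for (int i = 0; i < N_ELEMS; i++)
    p_inp[i] = i;

  xa_nn_transpose_32_32(p_out, out_shape, p_inp, inp_shape, permute_vec,
                        /*num_out_dims=*/2, /*num_inp_dims=*/2);

  // Verify out[j][i] == in[i][j].
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      if (p_out[j * 8 + i] != p_inp[i * 8 + j])
        return -1;
  return 0;
}
```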
---

## References

- Cadence HiFi DSP Programmer's Reference Manual
- Xtensa Intrinsics Guide for HiFi4/HiFi5
- ExecuTorch Cadence Backend Documentation

Differential Revision: [D92083443](https://our.internmc.facebook.com/intern/diff/D92083443/)

[ghstack-poisoned]
---
 .../hifi/operators/op_permute_copy.cpp        | 12 +++-
 .../hifi/operators/op_transpose_copy.cpp      |  8 +--
 .../third-party/nnlib/xa_nn_transpose_32.c    | 57 +++++++------------
 3 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp
index fc162d6c7f1..3d5f8aabde8 100644
--- a/backends/cadence/hifi/operators/op_permute_copy.cpp
+++ b/backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -68,8 +68,8 @@ Tensor& permute_copy_out(
       InvalidArgument,
       out);
 
-  const auto in_type = out.scalar_type();
-  constexpr int kNnlibMaxDim = 16;
+  const auto in_type = in.scalar_type();
+  constexpr int kNnlibMaxDim = 5;
 
   bool optimized = false;
 
@@ -91,7 +91,13 @@ Tensor& permute_copy_out(
 
     for (int i = 0; i < num_inp_dims; i++) {
       p_inp_shape[i] = in.size(i);
-      p_out_shape[i] = in.size(dims[i]);
+    }
+
+    for (int i = 0; i < num_out_dims; i++) {
+      p_out_shape[i] = out.size(i);
+    }
+
+    for (int i = 0; i < num_inp_dims; i++) {
       p_permute_vec[i] = dims[i];
     }
 
diff --git a/backends/cadence/hifi/operators/op_transpose_copy.cpp b/backends/cadence/hifi/operators/op_transpose_copy.cpp
index a21a7f6178c..d872bb8ed58 100644
--- a/backends/cadence/hifi/operators/op_transpose_copy.cpp
+++ b/backends/cadence/hifi/operators/op_transpose_copy.cpp
@@ -64,7 +64,7 @@ Tensor& transpose_copy_int_out(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  const auto in_type = out.scalar_type();
+  const auto in_type = in.scalar_type();
   constexpr int kNnlibMaxDim = 5;
 
   bool optimized = false;
 
@@ -85,14 +85,12 @@ Tensor& transpose_copy_int_out(
     WORD32 p_out_shape[kNnlibMaxDim];
     WORD32 p_permute_vec[kNnlibMaxDim];
 
-    for (int i = 0; i < in.dim(); i++) {
+    for (int i = 0; i < num_inp_dims; i++) {
       p_inp_shape[i] = in.size(i);
-    }
-    for (int i = 0; i < out.dim(); i++) {
       p_out_shape[i] = out.size(i);
     }
 
-    for (int i = 0; i < in.dim(); i++) {
+    for (int i = 0; i < num_inp_dims; i++) {
       p_permute_vec[i] = i;
     }
 
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
index e7b80e3a1d9..5b3ed385568 100644
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
@@ -170,44 +170,23 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
       for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
       {
         WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-        if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0))
+        ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
+        ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
+        ae_valign a_inp = AE_LA64_PP(pae_i);
+        ae_valign a_out = AE_ZALIGN64();
+        ae_int32x2 d0;
+        for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
         {
-          ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-          ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-          ae_int32x2 d0;
-          for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-          {
-            AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));
-            AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));
-          }
-          ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-          ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
-#pragma loop_count max=3
-          for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-          {
-            puae_o[itr4] = puae_i[itr4];
-          }
+          AE_LA32X2_IP(d0, a_inp, pae_i);
+          AE_SA32X2_IP(d0, a_out, pae_o);
         }
-        else
-        {
-          ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-          ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-          ae_valign a_inp = AE_LA64_PP(pae_i);
-          ae_valign a_out = AE_ZALIGN64();
-          ae_int32x2 d0;
-          for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-          {
-            AE_LA32X2_IP(d0, a_inp, pae_i);
-            AE_SA32X2_IP(d0, a_out, pae_o);
-          }
-          AE_SA64POS_FP(a_out, pae_o);
-          ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-          ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
+        AE_SA64POS_FP(a_out, pae_o);
+        ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
+        ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
 #pragma loop_count max=3
-          for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-          {
-            puae_o[itr4] = puae_i[itr4];
-          }
+        for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
+        {
+          puae_o[itr4] = puae_i[itr4];
         }
       }
     }
@@ -237,8 +216,10 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
           ae_int32x2 d0, d1;
           ae_int32x2 tmp0;
 
-          AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
-          AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
+          d0 = AE_L32_X((ae_int32 *)p_inp4, 0);
+          p_inp4 += inp_stride[p_5D_permute_vec[4]];
+          d1 = AE_L32_X((ae_int32 *)p_inp4, 0);
+          p_inp4 += inp_stride[p_5D_permute_vec[4]];
 
           tmp0 = AE_SEL32_HH(d0, d1);
 
@@ -257,4 +238,4 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
   }
 
   return 0;
-}
\ No newline at end of file
+}