From 178f98e71e261d05105983f0ab6a8eb9ce8a9ad7 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Tue, 3 Feb 2026 14:58:29 -0800
Subject: [PATCH] [ET][Cadence] patch transpose kernel for safe load & store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Original patch: https://github.com/cad-audio/executorch/commit/1cb9ee714731d4d615850d26a34b8d4480bf26fe#diff-72cb6b6b59f6d8be4021885fa14490cd182776f883b6d2132678968b9e7cb267

# Fixing a Crash in `xa_nn_transpose_32` for Unaligned Buffers

## Overview

This document explains the code changes made to `xa_nn_transpose_32.c` to fix a **memory-alignment crash** in the transpose operation for Cadence HiFi DSPs. The patch addresses two bugs that caused crashes when processing unaligned input/output buffers.

## Files Modified

| File | Description |
|------|-------------|
| `backends/cadence/hifi/operators/op_permute_copy.cpp` | Permute copy operator |
| `backends/cadence/hifi/operators/op_transpose_copy.cpp` | Transpose copy operator |
| `backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c` | Core transpose kernel |

---

## Change 1: Simplified Inner Loop (Lines 170-191)

### The Problem

The original code had a **flawed alignment check**:

```c
// OLD CODE - BUGGY
if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0))
{
  // "Aligned" path using AE_L32X2_IP / AE_S32X2_IP
  ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
  ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
  ae_int32x2 d0;
  for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
  {
    AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));  // Aligned load
    AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));  // Aligned store
  }
  // ... handle remainder
}
else
{
  // Unaligned path using AE_LA32X2_IP / AE_SA32X2_IP
  // ...
}
```

### Root Cause Analysis

The alignment check `& 1` only verifies **2-byte alignment** (it checks whether the least significant bit is 0). However:

| Data Type | Size | Required Alignment |
|-----------|------|--------------------|
| `WORD32` | 4 bytes | 4-byte alignment |
| `ae_int32x2` | 8 bytes (64-bit SIMD) | **8-byte alignment** |

**The bug:** The check should be:

- `(ptr & 0x7) == 0` for 8-byte alignment, OR
- `(ptr & 0x3) == 0` for 4-byte alignment (minimum)

The incorrect `& 1` check meant that when pointers were 2-byte aligned but **NOT** 8-byte aligned, the code incorrectly took the "aligned" path.

### Consequences of the Bug

When `AE_L32X2_IP` / `AE_S32X2_IP` (aligned intrinsics) are used on misaligned addresses:

1. **Hardware Exception**: Cadence HiFi DSPs may generate an alignment exception
2. **Crash**: The program terminates unexpectedly
3. **Data Corruption**: In some configurations, incorrect data may be loaded/stored silently
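To make the failure mode concrete, here is a minimal host-side sketch of the two checks (the address value is hypothetical, and `uintptr_t` stands in for the kernel's `unsigned` cast):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Hypothetical buffer address: 4-byte aligned, but NOT 8-byte aligned.
  uintptr_t addr = 0x2000000Cu;

  // Old check: tests only the least significant bit (2-byte alignment).
  int old_check = ((addr & 1) == 0);    // 1 -> old code takes the "aligned" path

  // Check the 64-bit SIMD path actually needs (8-byte alignment).
  int safe_check = ((addr & 0x7) == 0); // 0 -> must take the unaligned path

  printf("old: %d, required: %d\n", old_check, safe_check);
  // Prints "old: 1, required: 0" - at this address the old code would issue
  // an aligned AE_L32X2_IP load and fault on hardware.
  return 0;
}
```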
### The Fix

```c
// NEW CODE - SAFE (always uses unaligned intrinsics)
WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_valign a_inp = AE_LA64_PP(pae_i);  // Prime alignment register for input
ae_valign a_out = AE_ZALIGN64();      // Initialize alignment register for output
ae_int32x2 d0;
for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
{
  AE_LA32X2_IP(d0, a_inp, pae_i);  // Unaligned load (safe for any alignment)
  AE_SA32X2_IP(d0, a_out, pae_o);  // Unaligned store (safe for any alignment)
}
AE_SA64POS_FP(a_out, pae_o);  // Flush remaining data in alignment register
ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
#pragma loop_count max=3
for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
{
  puae_o[itr4] = puae_i[itr4];  // Handle odd element
}
```

### Why This Fix Works

1. **Always uses unaligned intrinsics** (`AE_LA32X2_IP` / `AE_SA32X2_IP`)
   - These intrinsics work correctly for **both** aligned and unaligned data
   - They use internal alignment registers to handle any pointer alignment
2. **Eliminates the faulty branch entirely**
   - No alignment check needed
   - No risk of taking the wrong code path
3. **Minimal performance impact**
   - On modern HiFi DSPs, unaligned intrinsics on aligned data have negligible overhead
   - The hardware efficiently handles the alignment internally

### Intrinsic Comparison

| Intrinsic | Type | Alignment Requirement | Use Case |
|-----------|------|-----------------------|----------|
| `AE_L32X2_IP` | Aligned load | **Requires 8-byte aligned pointer** | Only when alignment is guaranteed |
| `AE_LA32X2_IP` | Unaligned load | Works with any alignment | Safe for all cases |
| `AE_S32X2_IP` | Aligned store | **Requires 8-byte aligned pointer** | Only when alignment is guaranteed |
| `AE_SA32X2_IP` | Unaligned store | Works with any alignment | Safe for all cases |
| `AE_SA64POS_FP` | Flush | N/A | Must be called after unaligned stores |
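The fix is an instance of the general prime/loop/flush idiom for unaligned SIMD access. The following sketch restates it as a standalone 32-bit copy routine; it is illustrative only, compiles only for a HiFi target, and assumes the same headers and `WORD32` typedef that `xa_nn_transpose_32.c` already uses (the function name is made up):

```c
// Minimal sketch of the prime/loop/flush idiom, assuming the NNLib/Xtensa
// headers used by xa_nn_transpose_32.c are in scope (WORD32, ae_int32x2,
// ae_valign, and the AE_* intrinsics).
void copy_w32_any_alignment(WORD32 *p_out, WORD32 *p_inp, int n)
{
  ae_int32x2 *pae_i = (ae_int32x2 *)p_inp;
  ae_int32x2 *pae_o = (ae_int32x2 *)p_out;

  ae_valign a_inp = AE_LA64_PP(pae_i); // 1. Prime: capture input misalignment
  ae_valign a_out = AE_ZALIGN64();     // 1. Prime: empty output alignment register

  ae_int32x2 d0;
  int i;
  for (i = 0; i < (n >> 1); i++)       // 2. Loop: two WORD32 (64 bits) per step
  {
    AE_LA32X2_IP(d0, a_inp, pae_i);
    AE_SA32X2_IP(d0, a_out, pae_o);
  }
  AE_SA64POS_FP(a_out, pae_o);         // 3. Flush: commit bytes still buffered
                                       //    in the output alignment register
  if (n & 1)                           // Scalar tail for an odd element count
  {
    ((ae_int32 *)pae_o)[0] = ((ae_int32 *)pae_i)[0];
  }
}
```

Forgetting the `AE_SA64POS_FP` flush is the classic mistake with this idiom: the final partial word can remain buffered in the output alignment register and never reach memory.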
---

## Change 2: Fixed Strided Load (Lines 216-222)

### The Problem

```c
// OLD CODE - PROBLEMATIC
AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
```

### Root Cause Analysis

The `AE_L32_XP` intrinsic:

- Loads a 32-bit value from the pointer
- **Modifies the pointer** by adding the byte offset (second parameter)
- Has specific alignment and type requirements

**Issues with the old code:**

1. **Pointer update through a cast**: `AE_L32_XP` updates its pointer operand in place, but here the operand is the cast expression `(ae_int32 *)p_inp4`, not `p_inp4` itself. Whether the post-increment propagates back to the `WORD32 *` variable depends on compiler-specific treatment of casts as lvalues, which is fragile at best and undefined behavior at worst
2. **Byte offset calculation**: the `inp_stride[...] << 2` stride-to-bytes conversion must stay in sync with the pointer type used in the cast, which is easy to get wrong
3. **Intrinsic alignment requirements**: `AE_L32_XP` may have stricter alignment requirements than expected on some HiFi variants

### The Fix

```c
// NEW CODE - EXPLICIT AND SAFE
d0 = AE_L32_X((ae_int32 *)p_inp4, 0);       // Load from p_inp4 + 0 bytes
p_inp4 += inp_stride[p_5D_permute_vec[4]];  // Manual pointer increment (elements)
d1 = AE_L32_X((ae_int32 *)p_inp4, 0);       // Load from updated p_inp4
p_inp4 += inp_stride[p_5D_permute_vec[4]];  // Manual pointer increment (elements)
```

### Why This Fix Works

1. **`AE_L32_X(ptr, offset)`**
   - Loads from `ptr + offset` **without modifying** the pointer
   - More predictable behavior
   - No side effects on the pointer variable
2. **Explicit pointer arithmetic**
   - `p_inp4 += stride` increments by **elements** (not bytes)
   - Correct for the `WORD32 *` pointer type
   - The `<< 2` byte conversion is no longer needed
3. **Clearer semantics**
   - Load and pointer update are separate operations
   - Easier to debug and maintain
   - No reliance on macro-specific behavior

### Comparison

| Aspect | Old (`AE_L32_XP`) | New (`AE_L32_X` + manual increment) |
|--------|-------------------|-------------------------------------|
| Pointer update | Inside intrinsic (byte offset) | Explicit (element offset) |
| Offset calculation | `stride << 2` (bytes) | `stride` (elements) |
| Clarity | Implicit side effect | Explicit, readable |
| Safety | Potential alignment/lvalue issues | Well-defined behavior |
| Debuggability | Hard to trace | Easy to step through |

---

## Summary

| Change | Root Cause | Fix Strategy | Impact |
|--------|------------|--------------|--------|
| Inner loop simplification | Wrong alignment check (`& 1` instead of `& 0x7`) | Always use unaligned intrinsics | Prevents crashes on misaligned buffers |
| Strided load fix | `AE_L32_XP` pointer-update issues | Use `AE_L32_X` + manual pointer increment | Prevents potential undefined behavior |

### Key Takeaway

**Both fixes follow the same principle:** instead of trying to detect alignment and switch between code paths, use intrinsics that handle unaligned data correctly in all cases. This approach is:

- ✅ **Safer** - No risk of incorrect alignment detection
- ✅ **Simpler** - Less code, fewer branches
- ✅ **Minimal performance impact** - Modern HiFi DSPs handle unaligned access efficiently
- ✅ **More maintainable** - Easier to understand and debug

---

## Additional Changes in Operator Files

### `op_permute_copy.cpp`

```cpp
// Changed: Use input tensor's scalar type, not output
const auto in_type = in.scalar_type();  // Was: out.scalar_type()

// Changed: Reduced max dimensions from 16 to 5
constexpr int kNnlibMaxDim = 5;  // Was: 16

// Changed: Separate loops for clarity
for (int i = 0; i < num_inp_dims; i++) {
  p_inp_shape[i] = in.size(i);
}

for (int i = 0; i < num_out_dims; i++) {
  p_out_shape[i] = out.size(i);
}

for (int i = 0; i < num_inp_dims; i++) {
  p_permute_vec[i] = dims[i];
}
```

### `op_transpose_copy.cpp`

```cpp
// Changed: Use input tensor's scalar type
const auto in_type = in.scalar_type();  // Was: out.scalar_type()

// Changed: Use consistent loop variable
for (int i = 0; i < num_inp_dims; i++) {  // Was: in.dim()
  p_inp_shape[i] = in.size(i);
  p_out_shape[i] = out.size(i);
}
```

These changes keep the type check and the dimension bookkeeping consistent with the data actually being processed.

---

## Testing Recommendations

1. **Test with unaligned buffers**: Create test cases where input/output buffers are not 8-byte aligned (see the sketch after this list)
2. **Test various tensor shapes**: Include odd dimensions that result in unaligned memory access patterns
3. **Test on actual HiFi hardware**: Alignment issues may not manifest in simulation
4. **Stress test with random data**: Verify correctness across many iterations
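As a concrete starting point for recommendation 1, the sketch below forces 4-byte-but-not-8-byte alignment by offsetting into over-allocated, 8-byte-aligned arenas. It is a hedged example: the shape and buffer names are hypothetical, and the `xa_nn_transpose_32_32` argument order is assumed from the usual NNLib convention, so verify it against the actual prototype:

```c
// Sketch of an unaligned-buffer test. Assumes the NNLib headers are in scope
// for WORD32 and the xa_nn_transpose_32_32 prototype; the argument order used
// here (out ptr/shape, in ptr/shape, permute vector, dim counts) is an
// assumption to check against the header.
#define N_ELEMS 64  /* hypothetical 8x8 transpose */

static WORD32 arena_in[N_ELEMS + 1] __attribute__((aligned(8)));
static WORD32 arena_out[N_ELEMS + 1] __attribute__((aligned(8)));

int test_unaligned_transpose(void)
{
  // +1 WORD32 = +4 bytes: both pointers pass the old "& 1" check while being
  // only 4-byte aligned - exactly the case that used to crash.
  WORD32 *p_inp = arena_in + 1;
  WORD32 *p_out = arena_out + 1;

  WORD32 inp_shape[2] = {8, 8};
  WORD32 out_shape[2] = {8, 8};
  WORD32 permute_vec[2] = {1, 0};  // plain 2-D transpose

  for (int i = 0; i < N_ELEMS; i++)
    p_inp[i] = i;

  xa_nn_transpose_32_32(p_out, out_shape, p_inp, inp_shape, permute_vec,
                        /*num_out_dims=*/2, /*num_inp_dims=*/2);

  // Verify out[j][i] == in[i][j].
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      if (p_out[j * 8 + i] != p_inp[i * 8 + j])
        return -1;
  return 0;
}
```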
---

## References

- Cadence HiFi DSP Programmer's Reference Manual
- Xtensa Intrinsics Guide for HiFi4/HiFi5
- ExecuTorch Cadence Backend Documentation

Differential Revision: [D92083443](https://our.internmc.facebook.com/intern/diff/D92083443/)

[ghstack-poisoned]
---
 .../hifi/operators/op_permute_copy.cpp        | 12 +++-
 .../hifi/operators/op_transpose_copy.cpp      |  8 +--
 .../third-party/nnlib/xa_nn_transpose_32.c    | 57 +++++++------------
 3 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp
index fc162d6c7f1..3d5f8aabde8 100644
--- a/backends/cadence/hifi/operators/op_permute_copy.cpp
+++ b/backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -68,8 +68,8 @@ Tensor& permute_copy_out(
       InvalidArgument,
       out);
 
-  const auto in_type = out.scalar_type();
-  constexpr int kNnlibMaxDim = 16;
+  const auto in_type = in.scalar_type();
+  constexpr int kNnlibMaxDim = 5;
 
   bool optimized = false;
 
@@ -91,7 +91,13 @@ Tensor& permute_copy_out(
 
     for (int i = 0; i < num_inp_dims; i++) {
       p_inp_shape[i] = in.size(i);
-      p_out_shape[i] = in.size(dims[i]);
+    }
+
+    for (int i = 0; i < num_out_dims; i++) {
+      p_out_shape[i] = out.size(i);
+    }
+
+    for (int i = 0; i < num_inp_dims; i++) {
       p_permute_vec[i] = dims[i];
     }
 
diff --git a/backends/cadence/hifi/operators/op_transpose_copy.cpp b/backends/cadence/hifi/operators/op_transpose_copy.cpp
index a21a7f6178c..d872bb8ed58 100644
--- a/backends/cadence/hifi/operators/op_transpose_copy.cpp
+++ b/backends/cadence/hifi/operators/op_transpose_copy.cpp
@@ -64,7 +64,7 @@ Tensor& transpose_copy_int_out(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  const auto in_type = out.scalar_type();
+  const auto in_type = in.scalar_type();
   constexpr int kNnlibMaxDim = 5;
 
   bool optimized = false;
 
@@ -85,14 +85,12 @@ Tensor& transpose_copy_int_out(
     WORD32 p_out_shape[kNnlibMaxDim];
     WORD32 p_permute_vec[kNnlibMaxDim];
 
-    for (int i = 0; i < in.dim(); i++) {
+    for (int i = 0; i < num_inp_dims; i++) {
       p_inp_shape[i] = in.size(i);
-    }
-    for (int i = 0; i < out.dim(); i++) {
       p_out_shape[i] = out.size(i);
     }
 
-    for (int i = 0; i < in.dim(); i++) {
+    for (int i = 0; i < num_inp_dims; i++) {
       p_permute_vec[i] = i;
     }
 
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
index e7b80e3a1d9..5b3ed385568 100644
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
@@ -170,44 +170,23 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
       for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
       {
         WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-        if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0))
+        ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
+        ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
+        ae_valign a_inp = AE_LA64_PP(pae_i);
+        ae_valign a_out = AE_ZALIGN64();
+        ae_int32x2 d0;
+        for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
         {
-          ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-          ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-          ae_int32x2 d0;
-          for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-          {
-            AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));
-            AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));
-          }
-          ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-          ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
-#pragma loop_count max=3
-          for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-          {
-            puae_o[itr4] = puae_i[itr4];
-          }
+          AE_LA32X2_IP(d0, a_inp, pae_i);
+          AE_SA32X2_IP(d0, a_out, pae_o);
         }
-        else
-        {
-          ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-          ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-          ae_valign a_inp = AE_LA64_PP(pae_i);
-          ae_valign a_out = AE_ZALIGN64();
-          ae_int32x2 d0;
-          for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-          {
-            AE_LA32X2_IP(d0, a_inp, pae_i);
-            AE_SA32X2_IP(d0, a_out, pae_o);
-          }
-          AE_SA64POS_FP(a_out, pae_o);
-          ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-          ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
+        AE_SA64POS_FP(a_out, pae_o);
+        ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
+        ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
 #pragma loop_count max=3
-          for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-          {
-            puae_o[itr4] = puae_i[itr4];
-          }
+        for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
+        {
+          puae_o[itr4] = puae_i[itr4];
         }
       }
     }
@@ -237,8 +216,10 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
           ae_int32x2 d0, d1;
           ae_int32x2 tmp0;
 
-          AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
-          AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
+          d0 = AE_L32_X((ae_int32 *)p_inp4, 0);
+          p_inp4 += inp_stride[p_5D_permute_vec[4]];
+          d1 = AE_L32_X((ae_int32 *)p_inp4, 0);
+          p_inp4 += inp_stride[p_5D_permute_vec[4]];
 
           tmp0 = AE_SEL32_HH(d0, d1);
 
@@ -257,4 +238,4 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
   }
 
   return 0;
-}
\ No newline at end of file
+}