From 6c8650156ce0eb9c205dd783a99d8635d65d9cb1 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Tue, 3 Feb 2026 16:01:59 -0800
Subject: [PATCH] Adding fixes for im2row and conv1d, link im2row, patch
 transpose kernel for safe load & store, add cpp test for im2row, link
 transpose to internal, add cpp test for transpose (#17180)

Summary:
Adding fixes for im2row and conv1d.

Original patch: https://github.com/cad-audio/executorch/commit/1cb9ee714731d4d615850d26a34b8d4480bf26fe#diff-72cb6b6b59f6d8be4021885fa14490cd182776f883b6d2132678968b9e7cb267

Explanation of the code changes in xa_nn_transpose_32.c

This patch fixes a memory-alignment crash in the transpose operation for Cadence HiFi DSPs. There are two changes.

Change 1: Simplified inner loop (lines 170-191)

The problem: the original code had a flawed alignment check.

```c
// OLD CODE - BUGGY
if ((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0)) {
  // "Aligned" path using AE_L32X2_IP / AE_S32X2_IP
} else {
  // Unaligned path using AE_LA32X2_IP / AE_SA32X2_IP
}
```

The bug: the check `& 1` only verifies 2-byte alignment (it checks that the least significant bit is zero). However:
- ae_int32x2 is a 64-bit (8-byte) SIMD type.
- AE_L32X2_IP / AE_S32X2_IP are aligned load/store intrinsics that require 8-byte alignment.
- The correct check would be `& 0x7` (8-byte alignment), or at minimum `& 0x3` (4-byte alignment).

Result: when the pointers were 2-byte aligned but not 8-byte aligned, the code incorrectly took the "aligned" path, causing crashes on HiFi DSPs due to misaligned memory accesses, and data corruption in some cases.

The fix:

```c
// NEW CODE - SAFE
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_valign a_inp = AE_LA64_PP(pae_i); // Prime alignment register for input
ae_valign a_out = AE_ZALIGN64();     // Initialize alignment register for output
ae_int32x2 d0;
for (itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) {
  AE_LA32X2_IP(d0, a_inp, pae_i); // Unaligned load
  AE_SA32X2_IP(d0, a_out, pae_o); // Unaligned store
}
AE_SA64POS_FP(a_out, pae_o); // Flush remaining data
```

Why this works:
- It always uses the unaligned intrinsics (AE_LA32X2_IP / AE_SA32X2_IP), which are correct for both aligned and unaligned data.
- It eliminates the faulty branch entirely; no alignment check is needed.
- The performance impact is minimal: on modern HiFi DSPs, unaligned intrinsics on aligned data have negligible overhead.
- The code is simpler, easier to maintain, and less error-prone.

| Intrinsic    | Type            | Requirement                     |
|--------------|-----------------|---------------------------------|
| AE_L32X2_IP  | Aligned load    | Requires 8-byte aligned pointer |
| AE_LA32X2_IP | Unaligned load  | Works with any alignment        |
| AE_S32X2_IP  | Aligned store   | Requires 8-byte aligned pointer |
| AE_SA32X2_IP | Unaligned store | Works with any alignment        |

Change 2: Fixed strided load (lines 216-222)

The problem:

```c
// OLD CODE - PROBLEMATIC
AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
```

The AE_L32_XP intrinsic loads a 32-bit value from the pointer and then advances the pointer by the given byte offset; it also has specific alignment and type requirements. Issues:
- The `<< 2` converts the stride to bytes, but the pointer-increment behavior inside the macro can be problematic with unaligned addresses.
- The intrinsic may have stricter alignment requirements than expected.
- The cast `(ae_int32 *)p_inp4`, combined with the post-increment behavior, can cause undefined behavior on some platforms.

The fix:

```c
// NEW CODE - EXPLICIT AND SAFE
d0 = AE_L32_X((ae_int32 *)p_inp4, 0);      // Load with explicit offset
p_inp4 += inp_stride[p_5D_permute_vec[4]]; // Manual pointer increment
d1 = AE_L32_X((ae_int32 *)p_inp4, 0);      // Load with explicit offset
p_inp4 += inp_stride[p_5D_permute_vec[4]]; // Manual pointer increment
```

Why this works:
- AE_L32_X(ptr, offset) loads from ptr + offset without modifying the pointer.
- The pointer arithmetic is explicit: p_inp4 += stride advances by elements (not bytes), which is correct for a WORD32*.
- The semantics are clearer: the load and the pointer update are separated, so the behavior is predictable.
- It avoids intrinsic quirks: there is no reliance on how the intrinsic updates the pointer internally.

| Aspect         | Old (AE_L32_XP)                | New (AE_L32_X + manual)   |
|----------------|--------------------------------|---------------------------|
| Pointer update | Inside intrinsic (byte offset) | Explicit (element offset) |
| Clarity        | Implicit behavior              | Explicit, readable        |
| Safety         | Potential alignment issues     | Guaranteed correct        |

Summary of the two fixes:

| Change                    | Root cause                                         | Fix strategy                            |
|---------------------------|----------------------------------------------------|-----------------------------------------|
| Inner loop simplification | Wrong alignment check (& 1 instead of & 0x7)       | Always use unaligned intrinsics         |
| Strided load fix          | AE_L32_XP intrinsic issues with unaligned pointers | Use AE_L32_X + manual pointer increment |

Both fixes follow the same principle: instead of trying to detect alignment and switch between code paths, use intrinsics that handle unaligned data correctly in all cases. This is safer, simpler, and has minimal performance impact on modern HiFi DSPs.
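For illustration only (not part of this patch), a minimal host-side C sketch of the alignment distinction the fix relies on. The helper names are made up for the example; only the `& 0x1` vs. `& 0x7` masks correspond to the checks discussed above, and the 8-byte requirement is the one stated for AE_L32X2_IP / AE_S32X2_IP.

```c
/* Illustration only: why an "& 1" check does not guarantee the 8-byte
 * alignment that 64-bit aligned loads/stores require. */
#include <stdint.h>
#include <stdio.h>

static int is_2byte_aligned(const void* p) { return ((uintptr_t)p & 0x1) == 0; }
static int is_8byte_aligned(const void* p) { return ((uintptr_t)p & 0x7) == 0; }

int main(void) {
  /* 8-byte aligned buffer of 32-bit words. */
  _Alignas(8) int32_t buf[8] = {0};

  /* Offsetting by one 32-bit element yields a 4-byte aligned pointer:
   * it passes the old "& 1" test but is NOT 8-byte aligned, so the
   * "aligned" SIMD path would be taken incorrectly. */
  const int32_t* p = buf + 1;

  printf("passes old & 1 check:  %d\n", is_2byte_aligned(p)); /* prints 1 */
  printf("actually 8-byte aligned: %d\n", is_8byte_aligned(p)); /* prints 0 */
  return 0;
}
```

The patch sidesteps the question entirely by always using the unaligned AE_LA32X2_IP / AE_SA32X2_IP sequence, which is valid regardless of pointer alignment.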
Differential Revision: D92207944
---
 backends/cadence/aot/functions_hifi.yaml      |  15 +
 backends/cadence/hifi/kernels/CMakeLists.txt  |   1 +
 backends/cadence/hifi/kernels/kernels.h       |  22 +
 .../cadence/hifi/operators/CMakeLists.txt     |   6 +
 .../cadence/hifi/operators/op_im2row_out.cpp  | 376 ++++++++++++++++
 .../hifi/operators/op_permute_copy.cpp        |  12 +-
 ...ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp | 113 +++--
 ...ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp |  74 +--
 ...nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp |  82 +++-
 ...nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp |  12 +-
 .../hifi/operators/op_transpose_copy.cpp      | 164 +++++++
 backends/cadence/hifi/operators/operators.h   |  29 ++
 backends/cadence/hifi/operators/targets.bzl   |   2 +
 backends/cadence/hifi/operators/tests/BUCK    |  42 ++
 .../operators/tests/test_op_im2row_out.cpp    | 402 +++++++++++++++++
 .../tests/test_op_transpose_copy.cpp          | 425 ++++++++++++++++++
 .../hifi/third-party/nnlib/xa_nn_im2row.c     | 106 +++++
 .../third-party/nnlib/xa_nn_transpose_32.c    |  57 +--
 18 files changed, 1816 insertions(+), 124 deletions(-)
 create mode 100644 backends/cadence/hifi/operators/op_im2row_out.cpp
 create mode 100644 backends/cadence/hifi/operators/op_transpose_copy.cpp
 create mode 100644 backends/cadence/hifi/operators/tests/test_op_im2row_out.cpp
 create mode 100644 backends/cadence/hifi/operators/tests/test_op_transpose_copy.cpp
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 3bdbb33d59b..2d33353001a 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -267,6 +267,11 @@ - arg_meta: null kernel_name: impl::HiFi::tanh_out +- op: transpose_copy.int_out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::transpose_copy_int_out + - op: view_copy.out kernels: - arg_meta: null @@ -278,6 +283,16 @@ kernel_name: impl::HiFi::where_self_out # custom ops +- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::im2row_out + +- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::im2row_per_tensor_out + - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 936e28e2241..c366cecbe0c 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -18,6 +18,7 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 08343e2528b..6a3dcd1d245 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -196,6 +196,28 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( const unsigned char* __restrict__ p_condition, const WORD32* const p_condition_shape); +extern "C" WORD32 xa_nn_im2row_quantized( + const WORD8* __restrict__ data_im, + const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, + const WORD32 height, + const WORD32 width, + /* output parameters */ + const WORD32 out_height, + const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, + const WORD32 kernel_w, + const WORD32 pad_h, + const WORD32 pad_w, + const WORD32 stride_h, + const WORD32 stride_w, + const WORD32 dilation_h, + const WORD32 dilation_w, + WORD8* __restrict__ data_col, + WORD32 channels_last); + extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 26555da9760..c4f1773d57f 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -16,6 +16,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # ATen compliant ops that are needed to run this model. 
set(_aten_ops__srcs + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_im2row_out.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" @@ -52,6 +53,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_transpose_copy.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_view_copy.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" @@ -96,6 +98,10 @@ add_library( "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" + "op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out" + "op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out" + "op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out" + "op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out" "op_quantized_conv2d_nchw_out.cpp" "op_quantized_conv2d_nhwc_out.cpp" "op_quantized_fully_connected_out" diff --git a/backends/cadence/hifi/operators/op_im2row_out.cpp b/backends/cadence/hifi/operators/op_im2row_out.cpp new file mode 100644 index 00000000000..0ff977c471c --- /dev/null +++ b/backends/cadence/hifi/operators/op_im2row_out.cpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +namespace impl { +namespace HiFi { +namespace native { + +template +__attribute__((always_inline)) void im2row_( + const T* __restrict__ data_im, + const int32_t in_zero_point, + /* input parameters*/ + const int32_t channels, + const int32_t height, + const int32_t width, + /* output parameters */ + const int32_t out_height, + const int32_t out_width, + /* convolution parameters */ + const int32_t kernel_h, + const int32_t kernel_w, + const int32_t pad_h, + const int32_t pad_w, + const int32_t stride_h, + const int32_t stride_w, + const int32_t dilation_h, + const int32_t dilation_w, + T* __restrict__ data_col, + bool channels_last) { + // Consider convolving the input image of dimensions channels * height * width + // (or height * width * channels for NHWC layout) with a filter of dimensions + // channels * kernels_h * kernels_w. Assume that this convolution will produce + // an output of dimensinos out_height x out_width. For each point the output, + // im2row takes the data from the input that is used in the computation of + // that output point, and flattens it into a vector of size channels_col = + // channels * kernel_h * kernel_w. The output of im2row will therefore be a 2D + // array of size (out_height * out_width) x channels_col + const int32_t channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. 
+ if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input. But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from where we need to copy 'channels' points. + const T* __restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + T* __restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we will fill the output + // with 0's. + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::memcpy(slice_col, slice_im, channels * sizeof(T)); + } else { + std::fill_n(slice_col, channels, T(in_zero_point)); + } + } + } + } + } + } else { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + + // Each point in the output domain is the result of applying a filter + // of size chanenls * kernel_h x kernel_w on the input + for (int _c = 0; _c < channels; ++_c) { + for (int _kh = 0; _kh < kernel_h; ++_kh) { + for (int _kw = 0; _kw < kernel_w; ++_kw) { + // c_col is the linearized access in the channels_col vector. + int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw; + // h_im and w_im are the actual height and width coordinates of + // the input tensor that we need to copy to the output. + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + // If the current data access is within the input tensor, copy the + // value + data_col[i_col * channels_col + c_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im[(_c * height + h_im) * width + w_im] + : static_cast(in_zero_point); + } + } + } + } + } + } +} + +void im2row_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + const Tensor& in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. 
+ int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + // Check if the input is per-tensor quantized or per-channel quantized. The + // zero point for each batch could differ for per-channel quantized input. + bool per_tensor_quantized = in_zero_point.numel() == 1; + + bool optimized = false; + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) + optimized = true; + + if (optimized) { + const int8_t* __restrict__ in_data = + (WORD8* __restrict__)input.const_data_ptr(); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ zero_point = + in_zero_point.const_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + per_tensor_quantized ? zero_point[0] : zero_point[n], + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + channel_last ? 1 : 0); + } + } else { +#define typed_im2row(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + const int32_t* __restrict__ zero_point = \ + in_zero_point.const_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + per_tensor_quantized ? zero_point[0] : zero_point[n], \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row + } +} + +void im2row_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. 
+ int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + + bool optimized = false; + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) + optimized = true; + + if (optimized) { + const int8_t* __restrict__ in_data = + (WORD8* __restrict__)input.const_data_ptr(); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + (int32_t)in_zero_point, + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + channel_last ? 1 : 0); + } + } else { +#define typed_im2row_per_tensor(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + in_zero_point, \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row_per_tensor + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index fc162d6c7f1..3d5f8aabde8 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -68,8 +68,8 @@ Tensor& permute_copy_out( InvalidArgument, out); - const auto in_type = out.scalar_type(); - constexpr int kNnlibMaxDim = 16; + const auto in_type = in.scalar_type(); + constexpr int kNnlibMaxDim = 5; bool optimized = false; @@ -91,7 +91,13 @@ Tensor& permute_copy_out( for (int i = 0; i < num_inp_dims; i++) { p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); + } + + for (int i = 0; i < num_out_dims; i++) { + p_out_shape[i] = out.size(i); + } + + for (int i = 0; i < num_inp_dims; i++) { p_permute_vec[i] = dims[i]; } diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp index b5ab0cdbaa2..f543f4633cf 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -35,7 +35,7 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( float output_scale, int32_t 
output_zero_point, Tensor& out) { - constexpr int kNnlibMaxDim = 3; + constexpr int kNnlibMaxDim = 5; WORD8* __restrict__ p_out = (WORD8* __restrict__)out.mutable_data_ptr(); @@ -49,19 +49,29 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( WORD32 batches = input.size(0); WORD32 input_channels = input.size(1); WORD32 input_width = input.size(2); + WORD32 input_height = 1; + WORD32 kernel_height = 1; WORD32 out_channels = weight.size(0); WORD32 kernel_channels = weight.size(1); WORD32 kernel_width = weight.size(2); WORD32 out_width = out.size(2); + WORD32 out_height = 1; WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_height = 1; + WORD32 dilation_width = 1; WORD32 input_zero_bias = -in_zero_point; - WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; WORD32 kernel_zero_bias = -weight_zero_point; WORD32 out_zero_bias = output_zero_point; + + WORD32 input_precision = 8; + WORD32 kernel_precision = 8; + WORD32 out_data_format = 1; + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( @@ -71,16 +81,20 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); WORD32 p_inp_shape[kNnlibMaxDim]; - p_inp_shape[0] = batches; - p_inp_shape[1] = input_channels; - p_inp_shape[2] = input_width; + p_inp_shape[0] = 1; + p_inp_shape[1] = 1; + p_inp_shape[2] = batches; + p_inp_shape[3] = input_channels; + p_inp_shape[4] = input_width; WORD32 p_out_shape[kNnlibMaxDim]; - p_out_shape[0] = batches; - p_out_shape[1] = input_width; - p_out_shape[2] = input_channels; + p_out_shape[0] = 1; + p_out_shape[1] = 1; + p_out_shape[2] = batches; + p_out_shape[3] = input_width; + p_out_shape[4] = input_channels; - WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; xa_nn_transpose_8_8( pin, @@ -92,14 +106,18 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( kNnlibMaxDim); WORD32 p_inp_shape1[kNnlibMaxDim]; - p_inp_shape1[0] = out_channels; - p_inp_shape1[1] = kernel_channels; - p_inp_shape1[2] = kernel_width; + p_inp_shape1[0] = 1; + p_inp_shape1[1] = 1; + p_inp_shape1[2] = out_channels; + p_inp_shape1[3] = kernel_channels; + p_inp_shape1[4] = kernel_width; WORD32 p_out_shape1[kNnlibMaxDim]; - p_out_shape1[0] = out_channels; - p_out_shape1[1] = kernel_width; - p_out_shape1[2] = kernel_channels; + p_out_shape1[0] = 1; + p_out_shape1[1] = 1; + p_out_shape1[2] = out_channels; + p_out_shape1[3] = kernel_width; + p_out_shape1[4] = kernel_channels; xa_nn_transpose_8_8( pkernel, @@ -110,34 +128,71 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( kNnlibMaxDim, kNnlibMaxDim); - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + WORD32 p_out_multiplier32[out_channels]; + WORD32 p_out_shift32[out_channels]; + + float out_scale = 1. 
/ output_scale; + + for (int i = 0; i < out_channels; i++) { + p_out_multiplier32[i] = bias_scale * out_scale * 2147483648; + p_out_shift32[i] = 0; + } + + WORD32 scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + x_stride, + x_padding, + y_stride, + y_padding, + out_height, + out_width, + out_channels, + input_precision, + kernel_precision, + out_data_format); + scratch_size = scratch_size < 0 ? 0 : scratch_size; + + pVOID p_scratch = nullptr; WORD32* ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); for (int _n = 0; _n < batches; _n++) { - WORD8* in_batch = pin + _n * input_channels * input_width; - WORD8* out_batch = p_out + _n * out_channels * out_width; + WORD8* in_batch = pin + _n * input_channels * 1 * input_width; + WORD8* out_batch = p_out + _n * out_channels * 1 * out_width; - xa_nn_conv1d_std_asym8xasym8( - (UWORD8*)out_batch, - (UWORD8*)in_batch, - (UWORD8*)pkernel, + xa_nn_conv2d_per_chan_sym8sxasym8s( + out_batch, + in_batch, + pkernel, p_bias, - 1, + input_height, input_width, input_channels, + kernel_height, kernel_width, + kernel_channels, + dilation_height, + dilation_width, out_channels, x_stride, + y_stride, x_padding, + y_padding, + out_height, out_width, input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, + p_out_multiplier32, + p_out_shift32, out_zero_bias, out_data_format, p_scratch); diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp index 60e700f563b..4ad36a3b5fa 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -35,7 +35,7 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( float output_scale, int32_t output_zero_point, Tensor& out) { - constexpr int kNnlibMaxDim = 3; + constexpr int kNnlibMaxDim = 5; UWORD8* __restrict__ p_out = (UWORD8* __restrict__)out.mutable_data_ptr(); @@ -49,10 +49,13 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( WORD32 batches = input.size(0); WORD32 input_channels = input.size(1); WORD32 input_width = input.size(2); + WORD32 input_height = 1; + WORD32 kernel_height = 1; WORD32 out_channels = weight.size(0); WORD32 kernel_channels = weight.size(1); WORD32 kernel_width = weight.size(2); WORD32 out_width = out.size(2); + WORD32 out_height = 1; WORD32 x_stride = stride[1]; WORD32 x_padding = padding[1]; WORD32 input_zero_bias = -in_zero_point; @@ -62,25 +65,37 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( WORD32 out_zero_bias = output_zero_point; WORD32 out_data_format = 1; - UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory( - ctx, ((batches * input_channels * input_width) + 8) * sizeof(UWORD8)); - UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory( + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( ctx, - ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(UWORD8)); - UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8); - UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8); + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); WORD32 p_inp_shape[kNnlibMaxDim]; - p_inp_shape[0] = batches; - p_inp_shape[1] = input_channels; - p_inp_shape[2] = input_width; + p_inp_shape[0] = 1; + p_inp_shape[1] = 1; + p_inp_shape[2] = batches; + p_inp_shape[3] = input_channels; + p_inp_shape[4] = input_width; WORD32 p_out_shape[kNnlibMaxDim]; - p_out_shape[0] = batches; - p_out_shape[1] = input_width; - p_out_shape[2] = input_channels; + p_out_shape[0] = 1; + p_out_shape[1] = 1; + p_out_shape[2] = batches; + p_out_shape[3] = input_width; + p_out_shape[4] = input_channels; - WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; xa_nn_transpose_8_8( (WORD8*)pin, @@ -92,14 +107,18 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( kNnlibMaxDim); WORD32 p_inp_shape1[kNnlibMaxDim]; - p_inp_shape1[0] = out_channels; - p_inp_shape1[1] = kernel_channels; - p_inp_shape1[2] = kernel_width; + p_inp_shape1[0] = 1; + p_inp_shape1[1] = 1; + p_inp_shape1[2] = out_channels; + p_inp_shape1[3] = kernel_channels; + p_inp_shape1[4] = kernel_width; WORD32 p_out_shape1[kNnlibMaxDim]; - p_out_shape1[0] = out_channels; - p_out_shape1[1] = kernel_width; - p_out_shape1[2] = kernel_channels; + p_out_shape1[0] = 1; + p_out_shape1[1] = 1; + p_out_shape1[2] = out_channels; + p_out_shape1[3] = kernel_width; + p_out_shape1[4] = kernel_channels; xa_nn_transpose_8_8( (WORD8*)pkernel, @@ -110,24 +129,17 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( kNnlibMaxDim, kNnlibMaxDim); - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); - scratch_size = scratch_size < 0 ? 
0 : scratch_size; - WORD32* ptr_scratch = - (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); - for (int _n = 0; _n < batches; _n++) { - UWORD8* in_batch = pin + _n * input_channels * input_width; - UWORD8* out_batch = p_out + _n * out_channels * out_width; + UWORD8* in_batch = (UWORD8*)(pin + _n * input_channels * input_width); + UWORD8* out_batch = (UWORD8*)(p_out + _n * out_channels * out_width); xa_nn_conv1d_std_asym8uxasym8u( out_batch, in_batch, - pkernel, + (UWORD8*)pkernel, p_bias, - 1, input_width, + input_height, input_channels, kernel_width, out_channels, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp index c9a3d2b58de..3b1c7b9a900 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -47,46 +47,94 @@ void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( WORD32 batches = input.size(0); WORD32 input_channels = input.size(1); WORD32 input_width = input.size(2); + WORD32 input_height = 1; + WORD32 kernel_height = 1; WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); WORD32 kernel_width = weight.size(2); WORD32 out_width = out.size(2); + WORD32 out_height = 1; WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_height = 1; + WORD32 dilation_width = 1; WORD32 input_zero_bias = -in_zero_point; - WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; WORD32 kernel_zero_bias = -weight_zero_point; + WORD32 input_precision = 8; + WORD32 kernel_precision = 8; + WORD32 out_zero_bias = output_zero_point; - WORD32 out_data_format = 0; - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + + WORD32 out_data_format = 1; + + WORD32 p_out_multiplier32[out_channels]; + WORD32 p_out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + p_out_multiplier32[i] = bias_scale * out_scale * 2147483648; + p_out_shift32[i] = 0; + } + + WORD32 scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + input_precision, + kernel_precision, + out_data_format); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + + pVOID p_scratch = nullptr; WORD32* ptr_scratch = - (WORD32*)::impl::HiFi::kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); for (int _n = 0; _n < batches; _n++) { - WORD8* in_batch = p_inp + _n * input_channels * input_width; - WORD8* out_batch = p_out + _n * out_channels * out_width; + WORD8* in_batch = p_inp + _n * input_channels * 1 * input_width; + WORD8* out_batch = p_out + _n * out_channels * 1 * out_width; - xa_nn_conv1d_std_asym8xasym8( - (UWORD8*)out_batch, - (UWORD8*)in_batch, - (UWORD8*)p_kernel, + xa_nn_conv2d_per_chan_sym8sxasym8s( + out_batch, + in_batch, + p_kernel, p_bias, - 1, + input_height, input_width, input_channels, + kernel_height, kernel_width, + kernel_channels, + dilation_height, + dilation_width, out_channels, x_stride, + y_stride, x_padding, + y_padding, + out_height, out_width, input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, + p_out_multiplier32, + p_out_shift32, out_zero_bias, out_data_format, p_scratch); diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp index 2d7a4cba509..5539410f46e 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -45,11 +45,11 @@ void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( (WORD32* __restrict__)bias.const_data_ptr(); WORD32 batches = input.size(0); - WORD32 input_channels = input.size(1); - WORD32 input_width = input.size(2); - WORD32 out_channels = weight.size(0); - WORD32 kernel_width = weight.size(2); - WORD32 out_width = out.size(2); + WORD32 input_channels = input.size(2); + WORD32 input_width = input.size(1); + WORD32 out_channels = weight.size(2); + WORD32 kernel_width = weight.size(1); + WORD32 out_width = out.size(1); WORD32 x_stride = stride[1]; WORD32 x_padding = padding[1]; WORD32 input_zero_bias = -in_zero_point; @@ -75,8 +75,8 @@ void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( in_batch, p_kernel, p_bias, - 1, input_width, + 1, input_channels, kernel_width, out_channels, diff --git a/backends/cadence/hifi/operators/op_transpose_copy.cpp b/backends/cadence/hifi/operators/op_transpose_copy.cpp new file mode 100644 index 00000000000..d872bb8ed58 --- /dev/null +++ b/backends/cadence/hifi/operators/op_transpose_copy.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::SizesType; +using executorch::aten::Tensor; +using executorch::runtime::Error; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::kTensorDimensionLimit; +using executorch::runtime::nonzero_dim; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::check_transpose_copy_args; +using torch::executor::get_transpose_out_target_size; +using torch::executor::transpose_tensors; + +namespace impl { +namespace HiFi { +namespace native { + +/** + * Swaps dimension 'dim0' of 'a' with 'dim1', and copying + * that mutation into `out` in a manner such that the data is densely packed + * and is_contiguous() would return true (stride dim[size-1] = 1). + * + * transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) + */ +Tensor& transpose_copy_int_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim0, + int64_t dim1, + Tensor& out) { + (void)ctx; + + if (dim0 < 0) { + dim0 += nonzero_dim(in); + } + if (dim1 < 0) { + dim1 += nonzero_dim(in); + } + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_transpose_out_target_size( + in, dim0, dim1, expected_out_size, &expected_out_dim); + + // Resize for dynamic shape + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + const auto in_type = in.scalar_type(); + constexpr int kNnlibMaxDim = 5; + + bool optimized = false; + + if (out.scalar_type() == ScalarType::Float || + out.scalar_type() == ScalarType::Char || + out.scalar_type() == ScalarType::Byte) + optimized = true; + + if (in.dim() > kNnlibMaxDim) + optimized = false; + + if (optimized) { + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = out.size(i); + } + + for (int i = 0; i < num_inp_dims; i++) { + p_permute_vec[i] = i; + } + + p_permute_vec[dim0] = dim1; + p_permute_vec[dim1] = dim0; + + if (in_type == ScalarType::Float) { + WORD32* p_inp = (WORD32*)in.const_data_ptr(); + WORD32* p_out = (WORD32*)out.mutable_data_ptr(); + + WORD32 ret_val = xa_nn_transpose_32_32( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else if (in_type == ScalarType::Char) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + + } else if (in_type == ScalarType::Byte) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + } + + return out; + } + + ET_KERNEL_CHECK( + ctx, + check_transpose_copy_args(in, dim0, dim1, out), + InvalidArgument, + out); + + ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, 
CTYPE, [&] { + transpose_tensors(in, dim0, dim1, out); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h index 90028535848..6adc027fed9 100644 --- a/backends/cadence/hifi/operators/operators.h +++ b/backends/cadence/hifi/operators/operators.h @@ -196,6 +196,35 @@ void quantized_add_asym8uxasym8u_asym8u_per_tensor_out( int64_t out_zero_point, ::executorch::aten::Tensor& out); +void im2row_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& input, + ::executorch::aten::IntArrayRef kernel_size, + ::executorch::aten::IntArrayRef dilation, + ::executorch::aten::IntArrayRef padding, + ::executorch::aten::IntArrayRef stride, + const ::executorch::aten::Tensor& in_zero_point, + bool channel_last, + ::executorch::aten::Tensor& out); + +void im2row_per_tensor_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& input, + ::executorch::aten::IntArrayRef kernel_size, + ::executorch::aten::IntArrayRef dilation, + ::executorch::aten::IntArrayRef padding, + ::executorch::aten::IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& transpose_copy_int_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim0, + int64_t dim1, + ::executorch::aten::Tensor& out); + } // namespace native } // namespace HiFi } // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 7937bea0682..9b33afd0b02 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -53,6 +53,7 @@ OPERATORS = [ "ge", "gt", "hardtanh", + "im2row_out", "le", "lt", "masked_fill", @@ -103,6 +104,7 @@ OPERATORS = [ "split_with_sizes_copy", "sub", "tanh", + "transpose_copy", "view_copy", "where" ] diff --git a/backends/cadence/hifi/operators/tests/BUCK b/backends/cadence/hifi/operators/tests/BUCK index b7a8d5fcf8d..56de9784be2 100644 --- a/backends/cadence/hifi/operators/tests/BUCK +++ b/backends/cadence/hifi/operators/tests/BUCK @@ -199,3 +199,45 @@ jarvis_wrapper.cxx_test( "fbsource//xplat/executorch/runtime/core/exec_aten/testing_util:tensor_util", ], ) + +jarvis_wrapper.cxx_test( + name = "test_op_im2row_out", + srcs = [ + "test_op_im2row_out.cpp", + ], + compatible_backends = ["hifi"], + labels = [ci.linux(ci.mode("fbsource//arvr/mode/platform010/dev"))], + remote_execution = XTENSA_TEST_REMOTE_EXECUTION_ASIC, + platforms = CXX, + visibility = [ + "fbsource//xplat/executorch/backends/cadence/...", + "fbcode//executorch/backends/cadence/...", + ], + deps = [ + "fbsource//xplat/executorch/kernels/test:gtest_utils", + "fbsource//xplat/executorch/backends/cadence/hifi/operators:op_im2row_out", + "fbsource//xplat/executorch/backends/cadence/runtime:et_pal", + "fbsource//xplat/executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], +) + +jarvis_wrapper.cxx_test( + name = "test_op_transpose_copy", + srcs = [ + "test_op_transpose_copy.cpp", + ], + compatible_backends = ["hifi"], + labels = [ci.linux(ci.mode("fbsource//arvr/mode/platform010/dev"))], + remote_execution = XTENSA_TEST_REMOTE_EXECUTION_ASIC, + platforms = CXX, + visibility = [ + "fbsource//xplat/executorch/backends/cadence/...", + "fbcode//executorch/backends/cadence/...", + ], + deps = [ + 
"fbsource//xplat/executorch/kernels/test:gtest_utils", + "fbsource//xplat/executorch/backends/cadence/hifi/operators:op_transpose_copy", + "fbsource//xplat/executorch/backends/cadence/runtime:et_pal", + "fbsource//xplat/executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], +) diff --git a/backends/cadence/hifi/operators/tests/test_op_im2row_out.cpp b/backends/cadence/hifi/operators/tests/test_op_im2row_out.cpp new file mode 100644 index 00000000000..bddb1776e1e --- /dev/null +++ b/backends/cadence/hifi/operators/tests/test_op_im2row_out.cpp @@ -0,0 +1,402 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; + +class HiFiIm2rowTest : public OperatorTest { + public: + protected: + void im2row_out( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + const Tensor& in_zero_point, + bool channel_last, + Tensor& out) { + ::impl::HiFi::native::im2row_out( + context_, + input, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + out); + } + + void im2row_per_tensor_out( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + ::impl::HiFi::native::im2row_per_tensor_out( + context_, + input, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + out); + } + + // Helper to count occurrences of a value in output tensor + int countValue(const Tensor& tensor, float value) { + const float* data = tensor.const_data_ptr(); + int count = 0; + for (int i = 0; i < tensor.numel(); ++i) { + if (data[i] == value) { + count++; + } + } + return count; + } +}; + +// Test basic 3x3 kernel with NCHW layout, no padding +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, Basic3x3Kernel) { + TensorFactory tf; + TensorFactory tf_int; + + // Input: (1, 8, 5, 4) - batch=1, channels=8, height=5, width=4 + const std::vector input_sizes{1, 8, 5, 4}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {1, 1}; + const bool channel_last = false; + + // out_h = (5 - 3) / 1 + 1 = 3 + // out_w = (4 - 3) / 1 + 1 = 2 + // output: (1, 3*2, 3*3*8) = (1, 6, 72) + const std::vector output_sizes{1, 6, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor zero_point = tf_int.zeros({1}); + Tensor out = tf.zeros(output_sizes); + + im2row_out( + input, + kernel_size, + dilation, + padding, + stride, + zero_point, + channel_last, + out); + + // Without padding, all output values should be 1.0 (from input) + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +// Test with 
stride=2, no padding +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, WithStride2) { + TensorFactory tf; + TensorFactory tf_int; + + const std::vector input_sizes{1, 8, 5, 4}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {2, 2}; + const bool channel_last = false; + + // out_h = (5 - 3) / 2 + 1 = 2 + // out_w = (4 - 3) / 2 + 1 = 1 + // output: (1, 2*1, 3*3*8) = (1, 2, 72) + const std::vector output_sizes{1, 2, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor zero_point = tf_int.zeros({1}); + Tensor out = tf.zeros(output_sizes); + + im2row_out( + input, + kernel_size, + dilation, + padding, + stride, + zero_point, + channel_last, + out); + + // Without padding, all output values should be 1.0 (from input) + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +// Test with padding=1 +// With zero_point=0, padded regions produce zeros in output +TEST_F(HiFiIm2rowTest, WithPadding) { + TensorFactory tf; + TensorFactory tf_int; + + const std::vector input_sizes{1, 8, 5, 4}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {1, 1}; + const int64_t stride[] = {1, 1}; + const bool channel_last = false; + + // out_h = (5 + 2*1 - 3) / 1 + 1 = 5 + // out_w = (4 + 2*1 - 3) / 1 + 1 = 4 + // output: (1, 5*4, 3*3*8) = (1, 20, 72) + const std::vector output_sizes{1, 20, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor zero_point = tf_int.zeros({1}); + Tensor out = tf.zeros(output_sizes); + + im2row_out( + input, + kernel_size, + dilation, + padding, + stride, + zero_point, + channel_last, + out); + + int one_count = countValue(out, 1.0f); + int zero_count = countValue(out, 0.0f); + + // With padding and zero_point=0: expect both ones (from input) and zeros + // (from padding) + EXPECT_GT(one_count, 0) << "Should have ones from input data"; + EXPECT_GT(zero_count, 0) << "Should have zeros from padded regions"; + EXPECT_EQ(one_count + zero_count, out.numel()) + << "All values should be 0 or 1"; +} + +// Test channels last (NHWC) layout, no padding +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, ChannelsLast) { + TensorFactory tf; + TensorFactory tf_int; + + // Input for NHWC: (1, 5, 8, 8) - batch=1, height=5, width=8, channels=8 + const std::vector input_sizes{1, 5, 8, 8}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {1, 1}; + const bool channel_last = true; + + // out_h = (5 - 3) / 1 + 1 = 3 + // out_w = (8 - 3) / 1 + 1 = 6 + // output: (1, 3*6, 3*3*8) = (1, 18, 72) + const std::vector output_sizes{1, 18, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor zero_point = tf_int.zeros({1}); + Tensor out = tf.zeros(output_sizes); + + im2row_out( + input, + kernel_size, + dilation, + padding, + stride, + zero_point, + channel_last, + out); + + // Without padding, all output values should be 1.0 (from input) + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +// Test with dilation=2 and padding=2 +// With zero_point=0, dilated regions outside input produce zeros +TEST_F(HiFiIm2rowTest, WithDilation) { + TensorFactory tf; + TensorFactory tf_int; + + const std::vector input_sizes{1, 8, 6, 5}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {2, 2}; + const int64_t padding[] = {2, 2}; + const int64_t 
stride[] = {1, 1}; + const bool channel_last = false; + + // effective_kernel_h = 2*(3-1) + 1 = 5 + // effective_kernel_w = 2*(3-1) + 1 = 5 + // out_h = (6 + 2*2 - 5) / 1 + 1 = 6 + // out_w = (5 + 2*2 - 5) / 1 + 1 = 5 + // output: (1, 6*5, 3*3*8) = (1, 30, 72) + const std::vector output_sizes{1, 30, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor zero_point = tf_int.zeros({1}); + Tensor out = tf.zeros(output_sizes); + + im2row_out( + input, + kernel_size, + dilation, + padding, + stride, + zero_point, + channel_last, + out); + + int one_count = countValue(out, 1.0f); + int zero_count = countValue(out, 0.0f); + + // With dilation/padding and zero_point=0: expect both ones and zeros + EXPECT_GT(one_count, 0) << "Should have ones from input data"; + EXPECT_GT(zero_count, 0) << "Should have zeros from padded/dilated regions"; + EXPECT_EQ(one_count + zero_count, out.numel()) + << "All values should be 0 or 1"; +} + +// Test im2row_per_tensor_out with zero_point=0, no padding +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, PerTensorZeroPointZero) { + TensorFactory tf; + + const std::vector input_sizes{1, 8, 5, 4}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {1, 1}; + const int64_t in_zero_point = 0; + const bool channel_last = false; + + // output: (1, 6, 72) + const std::vector output_sizes{1, 6, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor out = tf.zeros(output_sizes); + + im2row_per_tensor_out( + input, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + out); + + // Without padding, all output values should be 1.0 (from input) + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +// Test im2row_per_tensor_out with non-zero zero_point=128, no padding +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, PerTensorNonZeroZeroPoint) { + TensorFactory tf; + + const std::vector input_sizes{1, 8, 5, 4}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {1, 1}; + const int64_t in_zero_point = 128; + const bool channel_last = false; + + // output: (1, 6, 72) + const std::vector output_sizes{1, 6, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor out = tf.zeros(output_sizes); + + im2row_per_tensor_out( + input, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + out); + + // Without padding, all output values should be 1.0 (from input) + // zero_point only affects padded regions, which don't exist here + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +// Test im2row_per_tensor_out with channels last layout and non-zero zero_point +// Input is all ones, so output should be all ones +TEST_F(HiFiIm2rowTest, PerTensorChannelsLastNonZeroZeroPoint) { + TensorFactory tf; + + const std::vector input_sizes{1, 5, 8, 8}; + const int64_t kernel_size[] = {3, 3}; + const int64_t dilation[] = {1, 1}; + const int64_t padding[] = {0, 0}; + const int64_t stride[] = {1, 1}; + const int64_t in_zero_point = 64; + const bool channel_last = true; + + // output: (1, 18, 72) + const std::vector output_sizes{1, 18, 72}; + + Tensor input = tf.ones(input_sizes); + Tensor out = tf.zeros(output_sizes); + + im2row_per_tensor_out( + input, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + out); + + // 
Without padding, all output values should be 1.0 (from input) + EXPECT_EQ(countValue(out, 1.0f), out.numel()); + EXPECT_EQ(countValue(out, 0.0f), 0); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/tests/test_op_transpose_copy.cpp b/backends/cadence/hifi/operators/tests/test_op_transpose_copy.cpp new file mode 100644 index 00000000000..200135848a7 --- /dev/null +++ b/backends/cadence/hifi/operators/tests/test_op_transpose_copy.cpp @@ -0,0 +1,425 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; + +class HiFiTransposeCopyTest : public OperatorTest { + public: + protected: + Tensor& transpose_copy_int_out( + const Tensor& in, + int64_t dim0, + int64_t dim1, + Tensor& out) { + return ::impl::HiFi::native::transpose_copy_int_out( + context_, in, dim0, dim1, out); + } +}; + +// Test basic 2D float transpose (matrix transpose) +// Verifies that the optimized xa_nn_transpose_32_32 path works correctly +TEST_F(HiFiTransposeCopyTest, Basic2DFloatTranspose) { + TensorFactory tf; + + // Input: (2, 3) matrix with sequential values + const std::vector input_sizes{2, 3}; + const std::vector output_sizes{3, 2}; + + // Input matrix: + // [[1, 2, 3], + // [4, 5, 6]] + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + // Expected transposed: + // [[1, 4], + // [2, 5], + // [3, 6]] + std::vector expected_data = {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test 2D int8 transpose - uses optimized xa_nn_transpose_8_8 path +TEST_F(HiFiTransposeCopyTest, Basic2DInt8Transpose) { + TensorFactory tf; + + const std::vector input_sizes{2, 3}; + const std::vector output_sizes{3, 2}; + + std::vector input_data = {1, 2, 3, 4, 5, 6}; + std::vector expected_data = {1, 4, 2, 5, 3, 6}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test 2D uint8 transpose - uses optimized xa_nn_transpose_8_8 path +TEST_F(HiFiTransposeCopyTest, Basic2DUInt8Transpose) { + TensorFactory tf; + + const std::vector input_sizes{2, 3}; + const std::vector output_sizes{3, 2}; + + std::vector input_data = {10, 20, 30, 40, 50, 60}; + std::vector expected_data = {10, 40, 20, 50, 30, 60}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +// 
Test 3D transpose swapping first two dimensions +TEST_F(HiFiTransposeCopyTest, Transpose3DDim0Dim1) { + TensorFactory tf; + + // Input: (2, 3, 4) -> Output: (3, 2, 4) + const std::vector input_sizes{2, 3, 4}; + const std::vector output_sizes{3, 2, 4}; + + // Create input with sequential values for proper verification + std::vector input_data(24); + for (int i = 0; i < 24; ++i) { + input_data[i] = static_cast(i + 1); + } + + // Expected output after swapping dims 0 and 1 + // out[j][i][k] = in[i][j][k] + // Input layout (2, 3, 4): in[i][j][k] = i*12 + j*4 + k + 1 + // Output layout (3, 2, 4): position = j*8 + i*4 + k + std::vector expected_data = {1.0f, 2.0f, 3.0f, 4.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 17.0f, 18.0f, 19.0f, 20.0f, 9.0f, 10.0f, + 11.0f, 12.0f, 21.0f, 22.0f, 23.0f, 24.0f}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_THAT( + std::vector( + out.sizes().begin(), out.sizes().end()), + ::testing::ElementsAre(3, 2, 4)); + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test 3D transpose swapping last two dimensions +TEST_F(HiFiTransposeCopyTest, Transpose3DDim1Dim2) { + TensorFactory tf; + + // Input: (2, 3, 4) -> Output: (2, 4, 3) + const std::vector input_sizes{2, 3, 4}; + const std::vector output_sizes{2, 4, 3}; + + Tensor input = tf.ones(input_sizes); + Tensor out = tf.zeros(output_sizes); + + transpose_copy_int_out(input, 1, 2, out); + + EXPECT_EQ(out.size(0), 2); + EXPECT_EQ(out.size(1), 4); + EXPECT_EQ(out.size(2), 3); + + // Verify all values are preserved + const float* out_data = out.const_data_ptr(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], 1.0f); + } +} + +// Test 3D transpose swapping first and last dimensions +TEST_F(HiFiTransposeCopyTest, Transpose3DDim0Dim2) { + TensorFactory tf; + + // Input: (2, 3, 4) -> Output: (4, 3, 2) + const std::vector input_sizes{2, 3, 4}; + const std::vector output_sizes{4, 3, 2}; + + // Create input with sequential values for proper verification + std::vector input_data(24); + for (int i = 0; i < 24; ++i) { + input_data[i] = static_cast(i + 1); + } + + // Expected output after swapping dims 0 and 2 + // Input shape: (2, 3, 4), Output shape: (4, 3, 2) + // out[k][j][i] = in[i][j][k] + // Input layout: in[i][j][k] = (i * 3 * 4) + (j * 4) + k + 1 + // Output layout: out[k][j][i] with shape (4, 3, 2) + std::vector expected_data = {1.0f, 13.0f, 5.0f, 17.0f, 9.0f, 21.0f, + 2.0f, 14.0f, 6.0f, 18.0f, 10.0f, 22.0f, + 3.0f, 15.0f, 7.0f, 19.0f, 11.0f, 23.0f, + 4.0f, 16.0f, 8.0f, 20.0f, 12.0f, 24.0f}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 2, out); + + EXPECT_THAT( + std::vector( + out.sizes().begin(), out.sizes().end()), + ::testing::ElementsAre(4, 3, 2)); + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test 4D transpose (common in batch normalization and conv operations) +// Use sizes aligned to 4 bytes for Float type on Xtensa +TEST_F(HiFiTransposeCopyTest, Transpose4DNCHW) { + TensorFactory tf; + + // Input: (2, 4, 4, 4) NCHW-like -> swap C and H -> (2, 4, 4, 4) + // Using aligned dimensions to avoid alignment issues on Xtensa + const std::vector input_sizes{2, 4, 4, 4}; + const std::vector output_sizes{2, 4, 4, 4}; + + Tensor input = tf.ones(input_sizes); + Tensor out = tf.zeros(output_sizes); + 
+ transpose_copy_int_out(input, 1, 2, out); + + EXPECT_THAT( + std::vector( + out.sizes().begin(), out.sizes().end()), + ::testing::ElementsAre(2, 4, 4, 4)); +} + +// Test with negative dimension indices +TEST_F(HiFiTransposeCopyTest, NegativeDimensionIndices) { + TensorFactory tf; + + // Input: (2, 3, 4) with dims -2 and -1 should be equivalent to dims 1 and 2 + const std::vector input_sizes{2, 3, 4}; + const std::vector output_sizes{2, 4, 3}; + + std::vector input_data(24); + for (int i = 0; i < 24; ++i) { + input_data[i] = static_cast(i); + } + + Tensor input = tf.make(input_sizes, input_data); + Tensor out_negative = tf.zeros(output_sizes); + Tensor out_positive = tf.zeros(output_sizes); + + // Use negative indices + transpose_copy_int_out(input, -2, -1, out_negative); + // Use positive indices (should give same result) + transpose_copy_int_out(input, 1, 2, out_positive); + + EXPECT_TENSOR_CLOSE(out_negative, out_positive); +} + +// Test square matrix transpose (special case) +TEST_F(HiFiTransposeCopyTest, SquareMatrixTranspose) { + TensorFactory tf; + + // Input: (3, 3) square matrix + const std::vector sizes{3, 3}; + + // Input matrix: + // [[1, 2, 3], + // [4, 5, 6], + // [7, 8, 9]] + std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + // Expected transposed: + // [[1, 4, 7], + // [2, 5, 8], + // [3, 6, 9]] + std::vector expected_data = { + 1.0f, 4.0f, 7.0f, 2.0f, 5.0f, 8.0f, 3.0f, 6.0f, 9.0f}; + + Tensor input = tf.make(sizes, input_data); + Tensor out = tf.zeros(sizes); + Tensor expected = tf.make(sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test large tensor transpose with non-trivial values +TEST_F(HiFiTransposeCopyTest, LargeTensorTranspose) { + TensorFactory tf; + + // Input: (8, 16) - larger tensor to test transpose correctness + const std::vector input_sizes{8, 16}; + const std::vector output_sizes{16, 8}; + + // Create input with sequential values + std::vector input_data(128); + for (int i = 0; i < 128; ++i) { + input_data[i] = static_cast(i + 1); + } + + // Create expected output: out[j][i] = in[i][j] + std::vector expected_data(128); + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 16; ++j) { + expected_data[j * 8 + i] = input_data[i * 16 + j]; + } + } + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + // Verify shape + EXPECT_THAT( + std::vector( + out.sizes().begin(), out.sizes().end()), + ::testing::ElementsAre(16, 8)); + + // Verify values + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test int8 with actual quantized-like values +TEST_F(HiFiTransposeCopyTest, Int8QuantizedValues) { + TensorFactory tf; + + const std::vector input_sizes{4, 4}; + const std::vector output_sizes{4, 4}; + + // Simulate quantized values (typical range for int8 quantization) + std::vector input_data = { + -128, -64, 0, 64, 127, -100, 50, -25, 10, -10, 20, -20, 30, -30, 40, -40}; + std::vector expected_data = { + -128, 127, 10, 30, -64, -100, -10, -30, 0, 50, 20, 40, 64, -25, -20, -40}; + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test 4D transpose shape (2, 3, 5, 7) with dim0=0, dim1=1 +// Reproduces the failing test case 
test_aten_transpose_copy_int_5 +TEST_F(HiFiTransposeCopyTest, Transpose4D_2_3_5_7_Dim01) { + TensorFactory tf; + + // Input: (2, 3, 5, 7) -> Output: (3, 2, 5, 7) after swapping dims 0 and 1 + const std::vector input_sizes{2, 3, 5, 7}; + const std::vector output_sizes{3, 2, 5, 7}; + const int numel = 2 * 3 * 5 * 7; // 210 elements + + // Create input with sequential values for proper verification + std::vector input_data(numel); + for (int i = 0; i < numel; ++i) { + input_data[i] = static_cast(i + 1); + } + + // Compute expected output after swapping dims 0 and 1 + // out[j][i][k][l] = in[i][j][k][l] + // Input strides: (3*5*7=105, 5*7=35, 7, 1) + // Output strides: (2*5*7=70, 5*7=35, 7, 1) + std::vector expected_data(numel); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + int in_idx = i * 105 + j * 35 + k * 7 + l; + int out_idx = j * 70 + i * 35 + k * 7 + l; + expected_data[out_idx] = input_data[in_idx]; + } + } + } + } + + Tensor input = tf.make(input_sizes, input_data); + Tensor out = tf.zeros(output_sizes); + Tensor expected = tf.make(output_sizes, expected_data); + + transpose_copy_int_out(input, 0, 1, out); + + EXPECT_THAT( + std::vector( + out.sizes().begin(), out.sizes().end()), + ::testing::ElementsAre(3, 2, 5, 7)); + EXPECT_TENSOR_CLOSE(out, expected); +} + +// Test preserving values through transpose (round-trip verification) +TEST_F(HiFiTransposeCopyTest, TransposePreservesValues) { + TensorFactory tf; + + const std::vector input_sizes{3, 5}; + const std::vector transposed_sizes{5, 3}; + + // Create input with distinct values using std::array for stack allocation + std::array input_data; + for (int i = 0; i < 15; ++i) { + input_data[i] = static_cast(i * 2 + 1); // 1, 3, 5, 7, ... + } + + Tensor input = tf.make( + input_sizes, std::vector(input_data.begin(), input_data.end())); + Tensor transposed = tf.zeros(transposed_sizes); + Tensor restored = tf.zeros(input_sizes); + + // First transpose + transpose_copy_int_out(input, 0, 1, transposed); + // Second transpose (should restore original) + transpose_copy_int_out(transposed, 0, 1, restored); + + // Verify round-trip preserves all values + EXPECT_TENSOR_CLOSE(input, restored); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c new file mode 100644 index 00000000000..7008ee58f0a --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c @@ -0,0 +1,106 @@ +#include "xa_nn_common.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nnlib_err_chk.h" +#include "xa_type_def.h" +// #include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + +WORD32 xa_nn_im2row_quantized( + const WORD8 *__restrict__ data_im, const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, const WORD32 height, const WORD32 width, + /* output parameters */ + const WORD32 out_height, const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, const WORD32 kernel_w, const WORD32 pad_h, + const WORD32 pad_w, const WORD32 stride_h, const WORD32 stride_w, + const WORD32 dilation_h, const WORD32 dilation_w, + WORD8 *__restrict__ data_col, WORD32 channels_last) { + const WORD32 channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. 
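For reference while reading the NHWC branch that follows, a minimal portable sketch of the same indexing is given below; it replaces the HiFi intrinsics with memcpy/memset and is illustrative only (the name ref_im2row_nhwc and its exact signature are not part of this patch).

#include <cstdint>
#include <cstring>

// Portable reference for the channels_last (NHWC) branch of
// xa_nn_im2row_quantized: one contiguous run of 'channels' values is copied
// per kernel tap, or filled with the input zero-point for padding.
static void ref_im2row_nhwc(
    const int8_t* data_im, int32_t in_zero_point,
    int32_t channels, int32_t height, int32_t width,
    int32_t out_height, int32_t out_width,
    int32_t kernel_h, int32_t kernel_w,
    int32_t pad_h, int32_t pad_w,
    int32_t stride_h, int32_t stride_w,
    int32_t dilation_h, int32_t dilation_w,
    int8_t* data_col) {
  const int32_t channels_col = channels * kernel_h * kernel_w;
  for (int32_t oh = 0; oh < out_height; ++oh) {
    for (int32_t ow = 0; ow < out_width; ++ow) {
      const int32_t i_col = oh * out_width + ow;
      for (int32_t kh = 0; kh < kernel_h; ++kh) {
        const int32_t h_im = oh * stride_h - pad_h + kh * dilation_h;
        for (int32_t kw = 0; kw < kernel_w; ++kw) {
          const int32_t w_im = ow * stride_w - pad_w + kw * dilation_w;
          int8_t* dst = data_col + i_col * channels_col +
              (kh * kernel_w + kw) * channels;
          if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
            // In bounds: copy a contiguous run of 'channels' values.
            std::memcpy(dst, data_im + (h_im * width + w_im) * channels,
                        channels);
          } else {
            // Padding region: fill with the input zero-point.
            std::memset(dst, static_cast<unsigned char>(in_zero_point),
                        channels);
          }
        }
      }
    }
  }
}

A scalar translation like this can also serve as a host-side reference when unit-testing the vectorized kernel.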
+ if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input. But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from which we need to copy 'channels' points. + const int8_t *__restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + int8_t *__restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we fill the output + // with in_zero_point (quantized zero). + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)slice_im; + ae_int32x2 *pae_out = (ae_int32x2 *)slice_col; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + + ae_int32x2 d0; + for (int ic = 0; ic < channels >> 3; ic++) { + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + + int remainder = channels & 7; + int8_t *ptmp_in = (int8_t *)pae_inp; + int8_t *ptmp_out = (int8_t *)pae_out; + for (int ic = 0; ic < remainder; ic++) { + *ptmp_out++ = *ptmp_in++; + } + } else { + for (int i = 0; i < channels; i++) { + slice_col[i] = (int8_t)(in_zero_point); + } + } + } + } + } + } + } else { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + + // Each point in the output domain is the result of applying a filter + // of size channels x kernel_h x kernel_w on the input. + for (int _c = 0; _c < channels; ++_c) { + for (int _kh = 0; _kh < kernel_h; ++_kh) { + for (int _kw = 0; _kw < kernel_w; ++_kw) { + // c_col is the linearized access in the channels_col vector. + int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw; + // h_im and w_im are the actual height and width coordinates of + // the input tensor that we need to copy to the output.
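As a sanity check on the c_col linearization used in this channels-first branch, the mapping (_c, _kh, _kw) -> (_c * kernel_h + _kh) * kernel_w + _kw covers [0, channels * kernel_h * kernel_w) exactly once, and each index can be recovered by division and modulo. The small standalone check below is illustrative only and not part of this patch.

#include <cassert>
#include <cstdint>

int main() {
  // Small illustrative shape: channels=2, 3x3 kernel.
  const int32_t channels = 2, kernel_h = 3, kernel_w = 3;
  for (int32_t c = 0; c < channels; ++c) {
    for (int32_t kh = 0; kh < kernel_h; ++kh) {
      for (int32_t kw = 0; kw < kernel_w; ++kw) {
        const int32_t c_col = (c * kernel_h + kh) * kernel_w + kw;
        // e.g. (c=1, kh=2, kw=0) -> (1*3 + 2)*3 + 0 = 15
        assert(c_col / (kernel_h * kernel_w) == c);
        assert((c_col / kernel_w) % kernel_h == kh);
        assert(c_col % kernel_w == kw);
      }
    }
  }
  return 0;
}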
+ int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + // If the current data access is within the input tensor, copy the + // value + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + data_col[i_col * channels_col + c_col] = + data_im[(_c * height + h_im) * width + w_im]; + else + data_col[i_col * channels_col + c_col] = (int8_t)in_zero_point; + } + } + } + } + } + } + + return 0; +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c index e7b80e3a1d9..5b3ed385568 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c @@ -170,44 +170,23 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) { WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0)) + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_valign a_inp = AE_LA64_PP(pae_i); + ae_valign a_out = AE_ZALIGN64(); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) { - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32)); - AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32)); - } - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } + AE_LA32X2_IP(d0, a_inp, pae_i); + AE_SA32X2_IP(d0, a_out, pae_o); } - else - { - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_valign a_inp = AE_LA64_PP(pae_i); - ae_valign a_out = AE_ZALIGN64(); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_LA32X2_IP(d0, a_inp, pae_i); - AE_SA32X2_IP(d0, a_out, pae_o); - } - AE_SA64POS_FP(a_out, pae_o); - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); + AE_SA64POS_FP(a_out, pae_o); + ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); #pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; } } } @@ -237,8 +216,10 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out ae_int32x2 d0, d1; ae_int32x2 tmp0; - AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); - AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + d0 = AE_L32_X((ae_int32 *)p_inp4, 0); + p_inp4 += inp_stride[p_5D_permute_vec[4]]; + d1 = AE_L32_X((ae_int32 *)p_inp4, 0); + p_inp4 += inp_stride[p_5D_permute_vec[4]]; tmp0 = AE_SEL32_HH(d0, d1); @@ -257,4 +238,4 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out } return 0; -} \ No newline at end of file +}
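For off-target reasoning about the two xa_nn_transpose_32_32 hunks above, the data movement can be modeled in plain C++ as sketched below. This is a semantic model only, not a drop-in replacement for the HiFi intrinsics; copy_row and strided_pair are illustrative names, and int32_t stands in for WORD32.

#include <cstdint>

// Models the rewritten inner loop: copy out_dim4 32-bit elements, two per
// iteration (as with AE_LA32X2_IP / AE_SA32X2_IP), then the (out_dim4 & 1)
// scalar tail.
static void copy_row(const int32_t* src, int32_t* dst, int32_t out_dim4) {
  int32_t i = 0;
  for (; i + 1 < out_dim4; i += 2) {
    dst[i] = src[i];
    dst[i + 1] = src[i + 1];
  }
  for (; i < out_dim4; ++i) {
    dst[i] = src[i];
  }
}

// Models the strided hunk: read two 32-bit elements spaced 'stride' elements
// apart, advancing the pointer in element units (not bytes), matching the
// explicit-offset load plus manual pointer increment in the new code.
static void strided_pair(
    const int32_t*& p, int32_t stride, int32_t& d0, int32_t& d1) {
  d0 = p[0];
  p += stride;
  d1 = p[0];
  p += stride;
}

int main() {
  int32_t src[6] = {10, 11, 12, 13, 14, 15};
  int32_t dst[6] = {0, 0, 0, 0, 0, 0};
  copy_row(src, dst, 5);                  // copies indices 0..4, odd tail handled
  const int32_t* p = src;
  int32_t d0 = 0, d1 = 0;
  strided_pair(p, /*stride=*/2, d0, d1);  // d0 = src[0], d1 = src[2]
  return (dst[4] == 14 && d0 == 10 && d1 == 12) ? 0 : 1;
}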