diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index fc162d6c7f1..3d5f8aabde8 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -68,8 +68,8 @@ Tensor& permute_copy_out( InvalidArgument, out); - const auto in_type = out.scalar_type(); - constexpr int kNnlibMaxDim = 16; + const auto in_type = in.scalar_type(); + constexpr int kNnlibMaxDim = 5; bool optimized = false; @@ -91,7 +91,13 @@ Tensor& permute_copy_out( for (int i = 0; i < num_inp_dims; i++) { p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); + } + + for (int i = 0; i < num_out_dims; i++) { + p_out_shape[i] = out.size(i); + } + + for (int i = 0; i < num_inp_dims; i++) { p_permute_vec[i] = dims[i]; } diff --git a/backends/cadence/hifi/operators/op_transpose_copy.cpp b/backends/cadence/hifi/operators/op_transpose_copy.cpp index a21a7f6178c..d872bb8ed58 100644 --- a/backends/cadence/hifi/operators/op_transpose_copy.cpp +++ b/backends/cadence/hifi/operators/op_transpose_copy.cpp @@ -64,7 +64,7 @@ Tensor& transpose_copy_int_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); - const auto in_type = out.scalar_type(); + const auto in_type = in.scalar_type(); constexpr int kNnlibMaxDim = 5; bool optimized = false; @@ -85,14 +85,12 @@ Tensor& transpose_copy_int_out( WORD32 p_out_shape[kNnlibMaxDim]; WORD32 p_permute_vec[kNnlibMaxDim]; - for (int i = 0; i < in.dim(); i++) { + for (int i = 0; i < num_inp_dims; i++) { p_inp_shape[i] = in.size(i); - } - for (int i = 0; i < out.dim(); i++) { p_out_shape[i] = out.size(i); } - for (int i = 0; i < in.dim(); i++) { + for (int i = 0; i < num_inp_dims; i++) { p_permute_vec[i] = i; } diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c index e7b80e3a1d9..5b3ed385568 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c @@ -170,44 +170,23 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) { WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0)) + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_valign a_inp = AE_LA64_PP(pae_i); + ae_valign a_out = AE_ZALIGN64(); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) { - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32)); - AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32)); - } - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } + AE_LA32X2_IP(d0, a_inp, pae_i); + AE_SA32X2_IP(d0, a_out, pae_o); } - else - { - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_valign a_inp = AE_LA64_PP(pae_i); - ae_valign a_out = AE_ZALIGN64(); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_LA32X2_IP(d0, a_inp, pae_i); - AE_SA32X2_IP(d0, a_out, pae_o); - } - AE_SA64POS_FP(a_out, pae_o); - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); + AE_SA64POS_FP(a_out, pae_o); + ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); #pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; } } } @@ -237,8 +216,10 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out ae_int32x2 d0, d1; ae_int32x2 tmp0; - AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); - AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + d0 = AE_L32_X((ae_int32 *)p_inp4, 0); + p_inp4 += inp_stride[p_5D_permute_vec[4]]; + d1 = AE_L32_X((ae_int32 *)p_inp4, 0); + p_inp4 += inp_stride[p_5D_permute_vec[4]]; tmp0 = AE_SEL32_HH(d0, d1); @@ -257,4 +238,4 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out } return 0; -} \ No newline at end of file +}