Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions backends/cadence/hifi/operators/op_permute_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ Tensor& permute_copy_out(
InvalidArgument,
out);

const auto in_type = out.scalar_type();
constexpr int kNnlibMaxDim = 16;
const auto in_type = in.scalar_type();
constexpr int kNnlibMaxDim = 5;

bool optimized = false;

Expand All @@ -91,7 +91,13 @@ Tensor& permute_copy_out(

for (int i = 0; i < num_inp_dims; i++) {
p_inp_shape[i] = in.size(i);
p_out_shape[i] = in.size(dims[i]);
}

for (int i = 0; i < num_out_dims; i++) {
p_out_shape[i] = out.size(i);
}

for (int i = 0; i < num_inp_dims; i++) {
p_permute_vec[i] = dims[i];
}

Expand Down
8 changes: 3 additions & 5 deletions backends/cadence/hifi/operators/op_transpose_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Tensor& transpose_copy_int_out(
ET_KERNEL_CHECK(
ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

const auto in_type = out.scalar_type();
const auto in_type = in.scalar_type();
constexpr int kNnlibMaxDim = 5;

bool optimized = false;
Expand All @@ -85,14 +85,12 @@ Tensor& transpose_copy_int_out(
WORD32 p_out_shape[kNnlibMaxDim];
WORD32 p_permute_vec[kNnlibMaxDim];

for (int i = 0; i < in.dim(); i++) {
for (int i = 0; i < num_inp_dims; i++) {
p_inp_shape[i] = in.size(i);
}
for (int i = 0; i < out.dim(); i++) {
p_out_shape[i] = out.size(i);
}

for (int i = 0; i < in.dim(); i++) {
for (int i = 0; i < num_inp_dims; i++) {
p_permute_vec[i] = i;
}

Expand Down
57 changes: 19 additions & 38 deletions backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,44 +170,23 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
{
WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0))
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_valign a_inp = AE_LA64_PP(pae_i);
ae_valign a_out = AE_ZALIGN64();
ae_int32x2 d0;
for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
{
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_int32x2 d0;
for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
{
AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));
AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));
}
ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
#pragma loop_count max=3
for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
{
puae_o[itr4] = puae_i[itr4];
}
AE_LA32X2_IP(d0, a_inp, pae_i);
AE_SA32X2_IP(d0, a_out, pae_o);
}
else
{
ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
ae_valign a_inp = AE_LA64_PP(pae_i);
ae_valign a_out = AE_ZALIGN64();
ae_int32x2 d0;
for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
{
AE_LA32X2_IP(d0, a_inp, pae_i);
AE_SA32X2_IP(d0, a_out, pae_o);
}
AE_SA64POS_FP(a_out, pae_o);
ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
AE_SA64POS_FP(a_out, pae_o);
ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
#pragma loop_count max=3
for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
{
puae_o[itr4] = puae_i[itr4];
}
for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
{
puae_o[itr4] = puae_i[itr4];
}
}
}
Expand Down Expand Up @@ -237,8 +216,10 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
ae_int32x2 d0, d1;
ae_int32x2 tmp0;

AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
d0 = AE_L32_X((ae_int32 *)p_inp4, 0);
p_inp4 += inp_stride[p_5D_permute_vec[4]];
d1 = AE_L32_X((ae_int32 *)p_inp4, 0);
p_inp4 += inp_stride[p_5D_permute_vec[4]];

tmp0 = AE_SEL32_HH(d0, d1);

Expand All @@ -257,4 +238,4 @@ WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
}

return 0;
}
}
Loading