30 changes: 24 additions & 6 deletions .ci/scripts/export_model_artifact.sh
@@ -5,16 +5,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export model to CUDA/Metal format with optional quantization
# Export model to CUDA/Metal/XNNPACK format with optional quantization

show_help() {
cat << EOF
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA/Metal format with optional quantization.
Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.

Arguments:
device cuda or metal (required)
device cuda, metal, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
@@ -28,6 +28,7 @@ Arguments:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-8da4w (XNNPACK only)

output_dir Output directory for artifacts (optional, default: current directory)

@@ -36,6 +37,7 @@ Examples:
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
EOF
}

@@ -64,9 +66,11 @@ case "$DEVICE" in
;;
metal)
;;
xnnpack)
;;
*)
echo "Error: Unsupported device '$DEVICE'"
echo "Supported devices: cuda, cuda-windows, metal"
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
exit 1
;;
esac
@@ -139,9 +143,16 @@ case "$QUANT_NAME" in
fi
EXTRA_ARGS="--qlinear_encoder 4w"
;;
quantized-8da4w)
if [ "$DEVICE" != "xnnpack" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack device"
exit 1
fi
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
exit 1
;;
esac
Expand All @@ -157,10 +168,17 @@ pip list
if [ "$MODEL_NAME" = "parakeet" ]; then
pip install -r examples/models/parakeet/install_requirements.txt

# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
if [ "$DEVICE" = "xnnpack" ]; then
DTYPE_ARG=""
else
DTYPE_ARG="--dtype bf16"
fi

python -m executorch.examples.models.parakeet.export_parakeet_tdt \
--backend "$DEVICE" \
--output-dir "${OUTPUT_DIR}" \
--dtype bf16 \
${DTYPE_ARG} \
${EXTRA_ARGS}

test -f "${OUTPUT_DIR}/model.pte"
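For reference, the `--qlinear 8da4w --qlinear_group_size 32` flags correspond to the torchao config built in `examples/models/parakeet/quantize.py` (see that file's diff below). A minimal sketch, assuming torchao's public `quantize_` API; the toy model here is illustrative:

```python
import torch
from torch import nn
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig

model = nn.Sequential(nn.Linear(256, 256))  # stand-in for a decoder linear layer

# "8da4w": 8-bit dynamically quantized activations, 4-bit weights,
# with per-group weight scales (matches --qlinear_group_size 32).
config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
)
quantize_(model, config)  # rewrites the linear layers in place
```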
19 changes: 13 additions & 6 deletions .ci/scripts/test_model_e2e.sh
@@ -5,16 +5,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
# Test CUDA/Metal/XNNPACK model end-to-end, need to run .ci/scripts/export_model_artifact.sh first

show_help() {
cat << EOF
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA/Metal models.
Build and run end-to-end tests for CUDA/Metal/XNNPACK models.

Arguments:
device cuda or metal (required)
device cuda, metal, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
@@ -28,6 +28,7 @@ Arguments:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-8da4w (XNNPACK only)

model_dir Directory containing model artifacts (optional, default: current directory)
Expected files: model.pte, aoti_cuda_blob.ptd (CUDA only)
@@ -37,6 +38,7 @@ Examples:
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
EOF
}

@@ -174,12 +176,17 @@ echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"

if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
exit 1
fi

MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
# Map device to make target (xnnpack uses cpu target which includes XNNPACK)
if [ "$DEVICE" = "xnnpack" ]; then
MAKE_TARGET="${RUNNER_PATH}-cpu"
else
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
fi
make "${MAKE_TARGET}"
echo "::endgroup::"

33 changes: 33 additions & 0 deletions .github/workflows/pull.yml
@@ -163,6 +163,39 @@ jobs:
# Build and test ExecuTorch
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"

test-parakeet-xnnpack-linux:
name: test-parakeet-xnnpack-linux
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
runner: linux.4xlarge.memory
docker-image: ci-image:executorch-ubuntu-22.04-clang12
submodules: 'recursive'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 120
script: |
set -eux

# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

echo "::group::Setup ExecuTorch"
./install_executorch.sh
echo "::endgroup::"

echo "::group::Export Parakeet with XNNPACK"
bash .ci/scripts/export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
echo "::endgroup::"

echo "::group::Test Parakeet with XNNPACK"
bash .ci/scripts/test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
echo "::endgroup::"

test-llama-runner-linux:
# Test Both linux x86 and linux aarch64
name: test-llama-runner-linux
19 changes: 15 additions & 4 deletions examples/models/parakeet/CMakeLists.txt
@@ -24,13 +24,21 @@ find_package(gflags REQUIRED)
# Find executorch libraries
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)
get_target_property(_executorch_imported executorch IMPORTED)
if(NOT _executorch_imported)
executorch_target_link_options_shared_lib(executorch)
endif()

set(link_libraries executorch gflags)

# Common ops for all builds
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
if(TARGET optimized_native_cpu_ops_lib)
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
if(NOT _is_imported)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
endif()
endif()

# CPU-only builds need quantized and custom ops
if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
@@ -46,7 +54,10 @@ if(TARGET xnnpack_backend)
list(APPEND xnnpack_backend_libs kleidiai)
endif()
list(APPEND link_libraries ${xnnpack_backend_libs})
executorch_target_link_options_shared_lib(xnnpack_backend)
get_target_property(_xnnpack_imported xnnpack_backend IMPORTED)
if(NOT _xnnpack_imported)
executorch_target_link_options_shared_lib(xnnpack_backend)
endif()
endif()

# Needed for cpuinfo where it uses android specific log lib
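A hedged reading of the `IMPORTED` guards added in this file: `executorch_target_link_options_shared_lib` force-links a static library whole-archive, which only works on targets built by the current project; presumably CMake would reject those link options on `IMPORTED` targets coming from an installed `executorch` package, so the guard skips the call in that configuration.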
14 changes: 13 additions & 1 deletion examples/models/parakeet/README.md
@@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
| `8da4w` | 8-bit dynamic activation, 4-bit weight |
| `8da8w` | 8-bit dynamic activation, 8-bit weight |

#### Example: 4-bit Weight Quantization with Tile Packing
#### Example: Dynamic Quantization for XNNPACK

```bash
python export_parakeet_tdt.py \
--backend xnnpack \
--qlinear_encoder 8da4w \
--qlinear_encoder_group_size 32 \
--qlinear 8da4w \
--qlinear_group_size 32 \
--output-dir ./parakeet_quantized_xnnpack
```

#### Example: 4-bit Weight Quantization with Tile Packing for CUDA

```bash
python export_parakeet_tdt.py \
10 changes: 9 additions & 1 deletion examples/models/parakeet/export_parakeet_tdt.py
@@ -461,6 +461,7 @@ def export_all(
def _create_xnnpack_partitioners(programs):
"""Create XNNPACK partitioners for all programs except preprocessor."""
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
XnnpackDynamicallyQuantizedPartitioner,
XnnpackPartitioner,
)

@@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
if key == "preprocessor":
partitioner[key] = []
else:
partitioner[key] = [XnnpackPartitioner()]
# Use both partitioners:
# 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w)
# 2. XnnpackPartitioner for remaining ops
partitioner[key] = [
XnnpackDynamicallyQuantizedPartitioner(),
XnnpackPartitioner(),
]
return partitioner, programs


@@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
config=ExecutorchBackendConfig(
extract_delegate_segments=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
do_quant_fusion_and_const_prop=True,
),
)

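Why two partitioners, and in this order: below is a hedged sketch of how the list is consumed, assuming ExecuTorch's `to_edge_transform_and_lower` API (the toy module is illustrative). Partitioners are applied in sequence, so `XnnpackDynamicallyQuantizedPartitioner` gets first claim on the dynamic-quant linear patterns before the generic `XnnpackPartitioner` sweeps up the remaining ops:

```python
import torch
from torch.export import export
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
    XnnpackPartitioner,
)
from executorch.exir import to_edge_transform_and_lower

module = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
ep = export(module, (torch.randn(1, 16),))

edge = to_edge_transform_and_lower(
    ep,
    partitioner=[
        XnnpackDynamicallyQuantizedPartitioner(),  # claims 8da4w linears first
        XnnpackPartitioner(),                      # then the remaining supported ops
    ],
)
et_program = edge.to_executorch()
```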
1 change: 1 addition & 0 deletions examples/models/parakeet/quantize.py
@@ -89,6 +89,7 @@ def quantize_model_( # noqa: C901
config = Int8DynamicActivationIntxWeightConfig(
weight_dtype=torch.int4,
weight_granularity=granularity,
intx_choose_qparams_algorithm="hqq_scale_only",
)
elif qlinear_config == "8da8w":
config = Int8DynamicActivationIntxWeightConfig(
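For context on the added line: per torchao's naming, `intx_choose_qparams_algorithm="hqq_scale_only"` appears to switch the quantization-parameter search to an HQQ-style (Half-Quadratic Quantization) solver that optimizes only the scales while leaving zero-points fixed, an export-time accuracy tweak for the 4-bit weights with no runtime cost.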