pytorch · mergennachin · Feb 3, 2026
@@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
 | `8da4w` | 8-bit dynamic activation, 4-bit weight |
 | `8da8w` | 8-bit dynamic activation, 8-bit weight |
 
-#### Example: 4-bit Weight Quantization with Tile Packing
+#### Example: Dynamic Quantization for XNNPACK
+
+```bash
+python export_parakeet_tdt.py \
+    --backend xnnpack \
+    --qlinear_encoder 8da4w \
+    --qlinear_encoder_group_size 32 \
+    --qlinear 8da4w \
+    --qlinear_group_size 32 \
+    --output-dir ./parakeet_quantized_xnnpack
+```
+
+#### Example: 4-bit Weight Quantization with Tile Packing for CUDA
 
 ```bash
 python export_parakeet_tdt.py \

@@ -461,6 +461,7 @@ def export_all(
 def _create_xnnpack_partitioners(programs):
     """Create XNNPACK partitioners for all programs except preprocessor."""
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
+        XnnpackDynamicallyQuantizedPartitioner,
         XnnpackPartitioner,
     )
 
@@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
         if key == "preprocessor":
             partitioner[key] = []
         else:
-            partitioner[key] = [XnnpackPartitioner()]
+            # Use both partitioners:
+            # 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w, 8da8w)
+            # 2. XnnpackPartitioner for remaining ops
+            partitioner[key] = [
+                XnnpackDynamicallyQuantizedPartitioner(),
+                XnnpackPartitioner(),
+            ]
     return partitioner, programs
 
 
@@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
         config=ExecutorchBackendConfig(
             extract_delegate_segments=True,
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+            do_quant_fusion_and_const_prop=True,
         ),
     )
 

@@ -89,6 +89,7 @@ def quantize_model_(  # noqa: C901
             config = Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=torch.int4,
                 weight_granularity=granularity,
+                intx_choose_qparams_algorithm="hqq_scale_only",
             )
         elif qlinear_config == "8da8w":
             config = Int8DynamicActivationIntxWeightConfig(