diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 23593d324d1..695e70ed472 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
 | `8da4w` | 8-bit dynamic activation, 4-bit weight |
 | `8da8w` | 8-bit dynamic activation, 8-bit weight |
 
-#### Example: 4-bit Weight Quantization with Tile Packing
+#### Example: Dynamic Quantization for XNNPACK
+
+```bash
+python export_parakeet_tdt.py \
+  --backend xnnpack \
+  --qlinear_encoder 8da4w \
+  --qlinear_encoder_group_size 32 \
+  --qlinear 8da4w \
+  --qlinear_group_size 32 \
+  --output-dir ./parakeet_quantized_xnnpack
+```
+
+#### Example: 4-bit Weight Quantization with Tile Packing for CUDA
 
 ```bash
 python export_parakeet_tdt.py \
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index 5a5b36c0549..fa21d4e3dc1 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -461,6 +461,7 @@ def export_all(
 def _create_xnnpack_partitioners(programs):
     """Create XNNPACK partitioners for all programs except preprocessor."""
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
+        XnnpackDynamicallyQuantizedPartitioner,
         XnnpackPartitioner,
     )
 
@@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
         if key == "preprocessor":
             partitioner[key] = []
         else:
-            partitioner[key] = [XnnpackPartitioner()]
+            # Use both partitioners:
+            # 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w)
+            # 2. XnnpackPartitioner for remaining ops
+            partitioner[key] = [
+                XnnpackDynamicallyQuantizedPartitioner(),
+                XnnpackPartitioner(),
+            ]
 
     return partitioner, programs
 
@@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
         config=ExecutorchBackendConfig(
             extract_delegate_segments=True,
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+            do_quant_fusion_and_const_prop=True,
         ),
     )
 
diff --git a/examples/models/parakeet/quantize.py b/examples/models/parakeet/quantize.py
index 3e540d84834..5d602d7a3e4 100644
--- a/examples/models/parakeet/quantize.py
+++ b/examples/models/parakeet/quantize.py
@@ -89,6 +89,7 @@ def quantize_model_( # noqa: C901
         config = Int8DynamicActivationIntxWeightConfig(
             weight_dtype=torch.int4,
             weight_granularity=granularity,
+            intx_choose_qparams_algorithm="hqq_scale_only",
         )
     elif qlinear_config == "8da8w":
         config = Int8DynamicActivationIntxWeightConfig(
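
Reviewer note: the three changes above compose into one flow (torchao 8da4w quantization → export → lowering with both XNNPACK partitioners → quant fusion during `to_executorch`). Below is a minimal, self-contained sketch of that flow using a stand-in `nn.Linear` module in place of the Parakeet encoder/decoder; the module, shapes, and group size are illustrative assumptions, not part of this PR.

```python
import torch

from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
    XnnpackPartitioner,
)
from executorch.exir import ExecutorchBackendConfig, to_edge_transform_and_lower
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig

# Stand-in module (assumption): the real script quantizes the Parakeet
# encoder/decoder linears selected by --qlinear_encoder / --qlinear.
model = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.ReLU()).eval()

# 8da4w with group size 32, plus the HQQ scale-only qparams algorithm
# this PR enables in quantize.py.
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        weight_granularity=PerGroup(32),
        intx_choose_qparams_algorithm="hqq_scale_only",
    ),
)

exported = torch.export.export(model, (torch.randn(1, 256),))

# Mirror _create_xnnpack_partitioners: the dynamically-quantized partitioner
# claims the 8da4w linears; the general XNNPACK partitioner takes the rest.
edge = to_edge_transform_and_lower(
    exported,
    partitioner=[
        XnnpackDynamicallyQuantizedPartitioner(),
        XnnpackPartitioner(),
    ],
)

# do_quant_fusion_and_const_prop matches the ExecutorchBackendConfig change
# in lower_to_executorch.
et_program = edge.to_executorch(
    ExecutorchBackendConfig(do_quant_fusion_and_const_prop=True)
)
```

Listing `XnnpackDynamicallyQuantizedPartitioner` before `XnnpackPartitioner` lets the dynamic-quantization patterns be claimed first, with the general partitioner picking up whatever remains, which matches the comment added in `_create_xnnpack_partitioners`.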