Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion examples/models/parakeet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
| `8da4w` | 8-bit dynamic activation, 4-bit weight |
| `8da8w` | 8-bit dynamic activation, 8-bit weight |

#### Example: 4-bit Weight Quantization with Tile Packing
#### Example: Dynamic Quantization for XNNPACK

```bash
python export_parakeet_tdt.py \
--backend xnnpack \
--qlinear_encoder 8da4w \
--qlinear_encoder_group_size 32 \
--qlinear 8da4w \
--qlinear_group_size 32 \
--output-dir ./parakeet_quantized_xnnpack
```

#### Example: 4-bit Weight Quantization with Tile Packing for CUDA

```bash
python export_parakeet_tdt.py \
Expand Down
10 changes: 9 additions & 1 deletion examples/models/parakeet/export_parakeet_tdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ def export_all(
def _create_xnnpack_partitioners(programs):
"""Create XNNPACK partitioners for all programs except preprocessor."""
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
XnnpackDynamicallyQuantizedPartitioner,
XnnpackPartitioner,
)

Expand All @@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
if key == "preprocessor":
partitioner[key] = []
else:
partitioner[key] = [XnnpackPartitioner()]
# Use both partitioners:
# 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w)
# 2. XnnpackPartitioner for remaining ops
partitioner[key] = [
XnnpackDynamicallyQuantizedPartitioner(),
XnnpackPartitioner(),
]
return partitioner, programs


Expand Down Expand Up @@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
config=ExecutorchBackendConfig(
extract_delegate_segments=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
do_quant_fusion_and_const_prop=True,
),
)

Expand Down
1 change: 1 addition & 0 deletions examples/models/parakeet/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def quantize_model_( # noqa: C901
config = Int8DynamicActivationIntxWeightConfig(
weight_dtype=torch.int4,
weight_granularity=granularity,
intx_choose_qparams_algorithm="hqq_scale_only",
)
elif qlinear_config == "8da8w":
config = Int8DynamicActivationIntxWeightConfig(
Expand Down
Loading