diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 78e005950b7..341475afe6d 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -26,14 +26,16 @@ Arguments:
   quant_name    Quantization type (optional, default: non-quantized)
                 Options:
                   - non-quantized
-                  - quantized-int4-tile-packed
-                  - quantized-int4-weight-only
+                  - quantized-int4-tile-packed (CUDA only)
+                  - quantized-int4-weight-only (CUDA only)
+                  - quantized-int4-metal (Metal only)
                   - quantized-8da4w (XNNPACK only)
 
   output_dir    Output directory for artifacts (optional, default: current directory)
 
 Examples:
   export_model_artifact.sh metal "openai/whisper-small"
+  export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
   export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
   export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
   export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
@@ -131,18 +133,25 @@ case "$QUANT_NAME" in
         ;;
     quantized-int4-tile-packed)
         if [ "$DEVICE" = "metal" ]; then
-            echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
+            echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
             exit 1
         fi
         EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
         ;;
     quantized-int4-weight-only)
         if [ "$DEVICE" = "metal" ]; then
-            echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
+            echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
             exit 1
         fi
         EXTRA_ARGS="--qlinear_encoder 4w"
         ;;
+    quantized-int4-metal)
+        if [ "$DEVICE" != "metal" ]; then
+            echo "Error: Quantization '$QUANT_NAME' only supported on Metal backend"
+            exit 1
+        fi
+        EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
+        ;;
     quantized-8da4w)
         if [ "$DEVICE" != "xnnpack" ]; then
             echo "Error: quantized-8da4w is only supported with xnnpack device"
@@ -152,7 +161,7 @@ case "$QUANT_NAME" in
         ;;
     *)
         echo "Error: Unsupported quantization '$QUANT_NAME'"
-        echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
+        echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-int4-metal, quantized-8da4w"
         exit 1
         ;;
 esac
diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml
index bf86b01aff8..ec15c87f737 100644
--- a/.github/workflows/metal.yml
+++ b/.github/workflows/metal.yml
@@ -41,11 +41,10 @@ jobs:
           set -eux
 
           echo "::group::Setup ExecuTorch"
-          PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
+          PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
           echo "::endgroup::"
 
           echo "::group::Build Metal Runtime"
-          ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --update-ao
           ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
           echo "::endgroup::"
 
@@ -73,6 +72,12 @@
             name: "parakeet-tdt"
         quant:
           - "non-quantized"
+        # Only test int4 quantization with parakeet-tdt
+        include:
+          - model:
+              repo: "nvidia"
+              name: "parakeet-tdt"
+            quant: "quantized-int4-metal"
     with:
       runner: macos-m2-stable
       python-version: '3.11'
@@ -123,6 +128,12 @@
             name: "parakeet-tdt"
         quant:
           - "non-quantized"
+        # Only test int4 quantization with parakeet-tdt
+        include:
+          - model:
+              repo: "nvidia"
+              name: "parakeet-tdt"
+            quant: "quantized-int4-metal"
     with:
       runner: macos-m2-stable
       python-version: '3.11'
diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 695e70ed472..649a2225536 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -39,10 +39,10 @@ The export script supports quantizing encoder and decoder linear layers using [t
 | Argument | Description |
 |----------|-------------|
-| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
+| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
 | `--qlinear_encoder_group_size` | Group size for encoder linear quantization (default: 32) |
 | `--qlinear_encoder_packing_format` | Packing format for encoder: `tile_packed_to_4d` |
-| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
+| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
 | `--qlinear_group_size` | Group size for decoder linear quantization (default: 32) |
 | `--qlinear_packing_format` | Packing format for decoder: `tile_packed_to_4d` |
 | `--qembedding` | Quantization config for decoder embedding layer: `4w`, `8w` |
 
@@ -50,12 +50,13 @@ The export script supports quantizing encoder and decoder linear layers using [t
 
 #### Quantization Configs
 
-| Config | Description |
-|--------|-------------|
-| `4w` | 4-bit weight only quantization |
-| `8w` | 8-bit weight only quantization |
-| `8da4w` | 8-bit dynamic activation, 4-bit weight |
-| `8da8w` | 8-bit dynamic activation, 8-bit weight |
+| Config | Description | Backends |
+|--------|-------------|----------|
+| `4w` | 4-bit weight only quantization | CUDA |
+| `8w` | 8-bit weight only quantization | CUDA |
+| `8da4w` | 8-bit dynamic activation, 4-bit weight | CUDA |
+| `8da8w` | 8-bit dynamic activation, 8-bit weight | CUDA |
+| `fpa4w` | Floating point activation, 4-bit weight | Metal |
 
 #### Example: Dynamic Quantization for XNNPACK
 
@@ -86,6 +87,30 @@ python export_parakeet_tdt.py \
 
 **Note:** The `tile_packed_to_4d` packing format is optimized for CUDA.
 
+#### Example: Metal 4-bit Quantization
+
+```bash
+python export_parakeet_tdt.py \
+  --backend metal \
+  --qlinear_encoder fpa4w \
+  --qlinear_encoder_group_size 32 \
+  --qlinear fpa4w \
+  --qlinear_group_size 32 \
+  --output-dir ./parakeet_metal_quantized
+```
+
+**Note:** Metal 4-bit quantization requires torchao built with experimental MPS (Metal) ops.
+
+You can install torchao with Metal support from the `ao` repo:
+```bash
+USE_CPP=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 pip install . --no-build-isolation
+```
+
+Alternatively, you can build torchao with Metal support while installing ExecuTorch:
+```bash
+EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
+```
+
 ### Metal Export (macOS)
 
 ```bash
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index fa21d4e3dc1..6747880cd9e 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -622,7 +622,7 @@ def main():
     parser.add_argument(
         "--qlinear",
         type=str,
-        choices=["4w", "8w", "8da4w", "8da8w"],
+        choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
         help="Quantization config for decoder linear layers",
     )
     parser.add_argument(
@@ -642,7 +642,7 @@ def main():
     parser.add_argument(
         "--qlinear_encoder",
         type=str,
-        choices=["4w", "8w", "8da4w", "8da8w"],
+        choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
         help="Quantization config for encoder linear layers",
     )
     parser.add_argument(
@@ -678,6 +678,12 @@ def main():
     if args.dtype == "fp16":
         parser.error("fp16 is not yet supported")
 
+    # fpa4w quantization is only supported on the Metal backend
+    if args.qlinear == "fpa4w" and args.backend != "metal":
+        parser.error("--qlinear=fpa4w can only be used with --backend=metal")
+    if args.qlinear_encoder == "fpa4w" and args.backend != "metal":
+        parser.error("--qlinear_encoder=fpa4w can only be used with --backend=metal")
+
     os.makedirs(args.output_dir, exist_ok=True)
 
     print("Extracting tokenizer...")
diff --git a/examples/models/parakeet/quantize.py b/examples/models/parakeet/quantize.py
index 5d602d7a3e4..a08a681fdc6 100644
--- a/examples/models/parakeet/quantize.py
+++ b/examples/models/parakeet/quantize.py
@@ -17,7 +17,7 @@ def quantize_model_(  # noqa: C901
 
     Args:
         module: The PyTorch module to quantize.
-        qlinear_config: Quantization config for linear layers ("4w", "8w", "8da4w", "8da8w").
+        qlinear_config: Quantization config for linear layers ("4w", "8w", "8da4w", "8da8w", "fpa4w").
         qlinear_group_size: Group size for linear quantization (default: 32).
         qlinear_packing_format: Packing format for linear layers (e.g., "tile_packed_to_4d").
         qembedding_config: Quantization config for embedding layers ("4w", "8w").
@@ -26,12 +26,41 @@
     if not qlinear_config and not qembedding_config:
         return
 
+    from torchao.quantization.quant_api import quantize_
+
+    # Metal (MPS) quantization uses a different API
+    if qlinear_config == "fpa4w":
+        # Load MPS ops
+        import torchao.experimental.ops.mps  # noqa: F401
+        from torchao.experimental.quant_api import UIntxWeightOnlyConfig
+
+        config = UIntxWeightOnlyConfig(
+            group_size=qlinear_group_size,
+            bitwidth=4,
+        )
+
+        def linear_filter(m, fqn):
+            if isinstance(m, torch.nn.Linear):
+                if m.weight.shape[1] % qlinear_group_size != 0:
+                    raise ValueError(
+                        f"Metal int4 quantization requires the weight dimension (K) to be a multiple of group_size. "
+                        f"Layer {fqn} has weight shape {m.weight.shape} (K={m.weight.shape[1]}, group_size={qlinear_group_size})"  # noqa: E501
+                    )
+                return True
+            return False
+
+        print(
+            f" Applying {qlinear_config} linear quantization "
+            f"(group_size={qlinear_group_size})..."
+        )
+        quantize_(module, config, filter_fn=linear_filter)
+        return
+
     from torchao.quantization.granularity import PerAxis, PerGroup
     from torchao.quantization.quant_api import (
         Int4WeightOnlyConfig,
         Int8DynamicActivationIntxWeightConfig,
         IntxWeightOnlyConfig,
-        quantize_,
     )
 
     # Quantize embedding layers first
diff --git a/third-party/ao b/third-party/ao
index 28306f08500..1b4b6d998bf 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 28306f085003b892bc0a250c209d80f5d4a5147b
+Subproject commit 1b4b6d998bf988f059e97a10181cbc4aec269b69
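
For anyone who wants to exercise the new `fpa4w` path outside the export script, below is a minimal sketch (not part of this PR; the toy module and layer sizes are invented) of what `quantize_model_` now does for Metal, using the same torchao APIs the diff introduces. It assumes torchao was built with `TORCHAO_BUILD_EXPERIMENTAL_MPS=1` as described in the README changes above, and unlike the PR's `linear_filter` it simply skips layers whose K dimension is not a multiple of the group size instead of raising.

```python
import torch

# Registers the experimental Metal (MPS) low-bit ops; this import fails if
# torchao was not built with TORCHAO_BUILD_EXPERIMENTAL_MPS=1.
import torchao.experimental.ops.mps  # noqa: F401
from torchao.experimental.quant_api import UIntxWeightOnlyConfig
from torchao.quantization.quant_api import quantize_

group_size = 32

# Toy stand-in for the decoder: both Linear layers have K (weight.shape[1])
# divisible by group_size, which the Metal int4 kernels require.
model = torch.nn.Sequential(
    torch.nn.Linear(256, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 128),
)

# fpa4w: floating-point activations, 4-bit grouped weights.
config = UIntxWeightOnlyConfig(group_size=group_size, bitwidth=4)

# Same selection rule as the PR's linear_filter, but skipping (rather than
# rejecting) layers whose input dimension does not divide into groups.
quantize_(
    model,
    config,
    filter_fn=lambda m, fqn: isinstance(m, torch.nn.Linear)
    and m.weight.shape[1] % group_size == 0,
)
```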