19 changes: 14 additions & 5 deletions .ci/scripts/export_model_artifact.sh
@@ -26,14 +26,16 @@ Arguments:
quant_name Quantization type (optional, default: non-quantized)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-int4-tile-packed (CUDA only)
- quantized-int4-weight-only (CUDA only)
- quantized-int4-metal (Metal only)
- quantized-8da4w (XNNPACK only)

output_dir Output directory for artifacts (optional, default: current directory)

Examples:
export_model_artifact.sh metal "openai/whisper-small"
export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
@@ -131,18 +133,25 @@ case "$QUANT_NAME" in
;;
quantized-int4-tile-packed)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
;;
quantized-int4-weight-only)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear_encoder 4w"
;;
quantized-int4-metal)
if [ "$DEVICE" != "metal" ]; then
echo "Error: Quantization '$QUANT_NAME' only supported on Metal backend"
exit 1
fi
EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
;;
quantized-8da4w)
if [ "$DEVICE" != "xnnpack" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack device"
@@ -152,7 +161,7 @@ case "$QUANT_NAME" in
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-int4-metal, quantized-8da4w"
exit 1
;;
esac
15 changes: 13 additions & 2 deletions .github/workflows/metal.yml
@@ -41,11 +41,10 @@ jobs:
set -eux

echo "::group::Setup ExecuTorch"
PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
echo "::endgroup::"

echo "::group::Build Metal Runtime"
${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --update-ao
${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
echo "::endgroup::"

@@ -73,6 +72,12 @@ jobs:
name: "parakeet-tdt"
quant:
- "non-quantized"
# Only test int4 quantization with parakeet-tdt
include:
- model:
repo: "nvidia"
name: "parakeet-tdt"
quant: "quantized-int4-metal"
with:
runner: macos-m2-stable
python-version: '3.11'
@@ -123,6 +128,12 @@ jobs:
name: "parakeet-tdt"
quant:
- "non-quantized"
# Only test int4 quantization with parakeet-tdt
include:
- model:
repo: "nvidia"
name: "parakeet-tdt"
quant: "quantized-int4-metal"
with:
runner: macos-m2-stable
python-version: '3.11'
41 changes: 33 additions & 8 deletions examples/models/parakeet/README.md
@@ -39,23 +39,24 @@ The export script supports quantizing encoder and decoder linear layers using [t

| Argument | Description |
|----------|-------------|
| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
| `--qlinear_encoder_group_size` | Group size for encoder linear quantization (default: 32) |
| `--qlinear_encoder_packing_format` | Packing format for encoder: `tile_packed_to_4d` |
| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
| `--qlinear_group_size` | Group size for decoder linear quantization (default: 32) |
| `--qlinear_packing_format` | Packing format for decoder: `tile_packed_to_4d` |
| `--qembedding` | Quantization config for decoder embedding layer: `4w`, `8w` |
| `--qembedding_group_size` | Group size for embedding quantization (default: 0 = per-axis) |

#### Quantization Configs

| Config | Description |
|--------|-------------|
| `4w` | 4-bit weight only quantization |
| `8w` | 8-bit weight only quantization |
| `8da4w` | 8-bit dynamic activation, 4-bit weight |
| `8da8w` | 8-bit dynamic activation, 8-bit weight |
| Config | Description | Backends |
|--------|-------------|----------|
| `4w` | 4-bit weight only quantization | CUDA |
| `8w` | 8-bit weight only quantization | CUDA |
| `8da4w` | 8-bit dynamic activation, 4-bit weight | CUDA |
| `8da8w` | 8-bit dynamic activation, 8-bit weight | CUDA |
| `fpa4w` | Floating point activation, 4-bit weight | Metal |

#### Example: Dynamic Quantization for XNNPACK

@@ -86,6 +87,30 @@ python export_parakeet_tdt.py \

**Note:** The `tile_packed_to_4d` packing format is optimized for CUDA.

#### Example: Metal 4-bit Quantization

```bash
python export_parakeet_tdt.py \
--backend metal \
--qlinear_encoder fpa4w \
--qlinear_encoder_group_size 32 \
--qlinear fpa4w \
--qlinear_group_size 32 \
--output-dir ./parakeet_metal_quantized
```

**Note:** Metal 4-bit quantization requires torchao built with experimental MPS (Metal) ops.

You can install torchao with Metal support from the `ao` repo:
```bash
USE_CPP=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 pip install . --no-build-isolation
```

Alternatively, you can build torchao with Metal support while installing ExecuTorch:
```bash
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
```
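
For a quick sanity check that the experimental MPS ops are actually available before exporting, a minimal Python sketch can help. It mirrors the imports used by the `fpa4w` path in `quantize.py`; the exact torchao module layout is assumed from the version pinned in this PR and may change:

```python
# Sanity check (sketch): verify torchao's experimental Metal (MPS) ops are importable.
import torchao.experimental.ops.mps  # noqa: F401  # registers the Metal kernels
from torchao.experimental.quant_api import UIntxWeightOnlyConfig

# 4-bit weight-only config with group size 32, matching the export example above.
config = UIntxWeightOnlyConfig(group_size=32, bitwidth=4)
print("torchao Metal (MPS) ops available; config:", config)
```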

### Metal Export (macOS)

```bash
10 changes: 8 additions & 2 deletions examples/models/parakeet/export_parakeet_tdt.py
@@ -622,7 +622,7 @@ def main():
parser.add_argument(
"--qlinear",
type=str,
choices=["4w", "8w", "8da4w", "8da8w"],
choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
help="Quantization config for decoder linear layers",
)
parser.add_argument(
@@ -642,7 +642,7 @@
parser.add_argument(
"--qlinear_encoder",
type=str,
choices=["4w", "8w", "8da4w", "8da8w"],
choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
help="Quantization config for encoder linear layers",
)
parser.add_argument(
@@ -678,6 +678,12 @@ def main():
if args.dtype == "fp16":
parser.error("fp16 is not yet supported")

# Validate fpa4w quantization requires Metal backend
if args.qlinear == "fpa4w" and args.backend != "metal":
parser.error("--qlinear=fpa4w can only be used with --backend=metal")
if args.qlinear_encoder == "fpa4w" and args.backend != "metal":
parser.error("--qlinear_encoder=fpa4w can only be used with --backend=metal")

os.makedirs(args.output_dir, exist_ok=True)

print("Extracting tokenizer...")
33 changes: 31 additions & 2 deletions examples/models/parakeet/quantize.py
@@ -17,7 +17,7 @@ def quantize_model_( # noqa: C901

Args:
module: The PyTorch module to quantize.
qlinear_config: Quantization config for linear layers ("4w", "8w", "8da4w", "8da8w").
qlinear_config: Quantization config for linear layers ("4w", "8w", "8da4w", "8da8w", "fpa4w").
qlinear_group_size: Group size for linear quantization (default: 32).
qlinear_packing_format: Packing format for linear layers (e.g., "tile_packed_to_4d").
qembedding_config: Quantization config for embedding layers ("4w", "8w").
@@ -26,12 +26,41 @@
if not qlinear_config and not qembedding_config:
return

from torchao.quantization.quant_api import quantize_

# Metal (MPS) quantization uses different API
if qlinear_config == "fpa4w":
# Load MPS ops
import torchao.experimental.ops.mps # noqa: F401
from torchao.experimental.quant_api import UIntxWeightOnlyConfig

config = UIntxWeightOnlyConfig(
group_size=qlinear_group_size,
bitwidth=4,
@mergennachin (Contributor), Feb 5, 2026:
Update the pin past pytorch/ao#3829, and set uintx_choose_qparams_algorithm="hqq"

@mergennachin (Contributor):
Could be done in a follow-up PR too

@manuelcandales (Contributor, Author):
yes, that's my plan, to do in follow-up PR

@manuelcandales (Contributor, Author):
here #17258
)

def linear_filter(m, fqn):
if isinstance(m, torch.nn.Linear):
if m.weight.shape[1] % qlinear_group_size != 0:
raise ValueError(
f"Metal int4 quantization requires weight dimension (K) to be multiple of group_size. "
f"Layer {fqn} has weight shape {m.weight.shape} (K={m.weight.shape[1]}, group_size={qlinear_group_size})" # noqa: E501
)
return True
return False

print(
f" Applying {qlinear_config} linear quantization "
f"(group_size={qlinear_group_size})..."
)
quantize_(module, config, filter_fn=linear_filter)
return

from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
Int4WeightOnlyConfig,
Int8DynamicActivationIntxWeightConfig,
IntxWeightOnlyConfig,
quantize_,
)

# Quantize embedding layers first
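
For reference, a minimal sketch of how the new `fpa4w` path in `quantize_model_` might be exercised directly. The import path and keyword argument names follow the docstring above but are otherwise assumptions, and it requires torchao built with the experimental MPS ops as described in the README:

```python
import torch

# Assumed import path within examples/models/parakeet; adjust to your checkout.
from quantize import quantize_model_

# Toy module: in_features (K = 64 and 128) are multiples of the group size (32),
# which the Metal int4 filter above enforces.
model = torch.nn.Sequential(
    torch.nn.Linear(64, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 64),
)

# Apply Metal 4-bit weight-only (fpa4w) quantization to all Linear layers in place.
quantize_model_(model, qlinear_config="fpa4w", qlinear_group_size=32)
```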
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 228 files