From 39db621505b31fe217eed504cf4419912d5ae2cc Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 30 Jan 2026 19:25:33 -0500 Subject: [PATCH 1/6] Update [ghstack-poisoned] --- .github/workflows/metal.yml | 24 + backends/apple/metal/tests/run_metal_test.sh | 126 +++ backends/apple/metal/tests/test_modules.py | 817 +++++++++++++++++++ 3 files changed, 967 insertions(+) create mode 100755 backends/apple/metal/tests/run_metal_test.sh create mode 100644 backends/apple/metal/tests/test_modules.py diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 1e0ad2f9587..63466f36abb 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -28,6 +28,30 @@ jobs: PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh echo "::endgroup::" + test-metal-modules: + name: test-metal-backend-modules + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m2-stable + python-version: '3.11' + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 120 + script: | + set -eux + + echo "::group::Setup ExecuTorch" + PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Build Metal Runtime" + ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build + echo "::endgroup::" + + echo "::group::Run Metal Backend Module Tests" + ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules + echo "::endgroup::" + export-model-metal-artifact: name: export-model-metal-artifact # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) diff --git a/backends/apple/metal/tests/run_metal_test.sh b/backends/apple/metal/tests/run_metal_test.sh new file mode 100755 index 00000000000..95c0cb1c6a7 --- /dev/null +++ b/backends/apple/metal/tests/run_metal_test.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Script to build and run Metal backend tests +# Usage: +# ./run_metal_test.sh --build # Build the Metal runtime +# ./run_metal_test.sh --run # Run inference with given model files +# ./run_metal_test.sh --check-build # Check if runtime is already built + +set -e # Exit on any error + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +BUILD_DIR="$EXECUTORCH_ROOT/cmake-out" +EXECUTOR_RUNNER="$BUILD_DIR/executor_runner" + +# Function to check if Metal runtime is built +check_build() { + if [[ -f "$EXECUTOR_RUNNER" ]]; then + echo "true" + return 0 + else + echo "false" + return 1 + fi +} + +# Function to build the Metal runtime +build_runtime() { + echo "Building Metal runtime..." 
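+    # What follows, in order: refuse to run on non-macOS hosts, configure
+    # CMake with the Metal/AOTI options, build with all available cores, and
+    # verify that executor_runner was actually produced.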
+ + # Check if we're on macOS + if [[ "$(uname)" != "Darwin" ]]; then + echo "Error: Metal backend is only supported on macOS" + exit 1 + fi + + # Create build directory + mkdir -p "$BUILD_DIR" + cd "$BUILD_DIR" + + # CMake configuration for Metal backend + CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DAOTI_METAL=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DCMAKE_BUILD_TYPE=Release" + + echo "Running cmake..." + eval cmake $CMAKE_ARGS "$EXECUTORCH_ROOT" + + echo "Building..." + cmake --build . -j$(sysctl -n hw.ncpu) + + cd "$EXECUTORCH_ROOT" + + if [[ -f "$EXECUTOR_RUNNER" ]]; then + echo "Build successful: $EXECUTOR_RUNNER" + else + echo "Error: Build failed - executor_runner not found" + exit 1 + fi +} + +# Function to run inference +run_inference() { + local pte_path="$1" + local ptd_path="$2" + + if [[ ! -f "$EXECUTOR_RUNNER" ]]; then + echo "Error: executor_runner not found at $EXECUTOR_RUNNER" + echo "Run '$0 --build' first to build the Metal runtime" + exit 1 + fi + + if [[ ! -f "$pte_path" ]]; then + echo "Error: PTE file not found: $pte_path" + exit 1 + fi + + if [[ ! -f "$ptd_path" ]]; then + echo "Error: PTD file not found: $ptd_path" + exit 1 + fi + + echo "Running inference..." + echo " PTE: $pte_path" + echo " PTD: $ptd_path" + + "$EXECUTOR_RUNNER" --model_path "$pte_path" --data_path "$ptd_path" +} + +# Parse command line arguments +case "$1" in + --build) + build_runtime + ;; + --run) + if [[ -z "$2" ]] || [[ -z "$3" ]]; then + echo "Usage: $0 --run " + exit 1 + fi + run_inference "$2" "$3" + ;; + --check-build) + check_build + ;; + *) + echo "Metal Backend Test Runner" + echo "" + echo "Usage:" + echo " $0 --build Build the Metal runtime" + echo " $0 --run Run inference with given model files" + echo " $0 --check-build Check if runtime is already built" + exit 1 + ;; +esac diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py new file mode 100644 index 00000000000..424d736b3b7 --- /dev/null +++ b/backends/apple/metal/tests/test_modules.py @@ -0,0 +1,817 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Unit tests for Metal backend modules. + +These tests export and run various model modules through the Metal backend +to verify that the export and execution pipeline works correctly. + +These tests require MPS to be available. On systems without MPS support, +the export tests will be skipped. 
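+
+Typical local invocation, mirroring the CI job in .github/workflows/metal.yml:
+
+    backends/apple/metal/tests/run_metal_test.sh --build
+    python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules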
+""" + +import os +import platform +import subprocess +import tempfile +import unittest +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import numpy as np +import torch +from executorch.backends.apple.metal.metal_backend import MetalBackend +from executorch.backends.apple.metal.metal_partitioner import MetalPartitioner +from executorch.exir import to_edge_transform_and_lower +from torch import nn +from torch.export import export +from torch.nn.attention import SDPBackend + + +# Check if MPS is available for export tests +MPS_AVAILABLE = torch.backends.mps.is_available() +IS_MACOS = platform.system() == "Darwin" +SKIP_EXPORT_TESTS = not MPS_AVAILABLE +SKIP_REASON = "MPS not available - Metal export tests require MPS support" + +# Paths +TESTS_DIR = Path(__file__).parent +EXECUTORCH_ROOT = TESTS_DIR.parent.parent.parent.parent +BUILD_DIR = EXECUTORCH_ROOT / "cmake-out" +EXECUTOR_RUNNER = BUILD_DIR / "executor_runner" +RUN_METAL_TEST_SCRIPT = TESTS_DIR / "run_metal_test.sh" + +# Check if executor_runner is built +EXECUTOR_RUNNER_AVAILABLE = EXECUTOR_RUNNER.exists() +SKIP_RUNTIME_TESTS = not EXECUTOR_RUNNER_AVAILABLE or SKIP_EXPORT_TESTS +SKIP_RUNTIME_REASON = ( + "executor_runner not built - run 'backends/apple/metal/tests/run_metal_test.sh --build'" + if not EXECUTOR_RUNNER_AVAILABLE + else SKIP_REASON +) + +# Data types to test +DTYPES = [torch.float32, torch.bfloat16] + +# Map dtype to short name for test method naming +DTYPE_NAMES = { + torch.float32: "float32", + torch.bfloat16: "bfloat16", +} + +# Registry mapping model names to their configurations +MODULE_REGISTRY: Dict[str, Dict[str, Any]] = {} + + +# ============================================================================= +# Model Definitions +# ============================================================================= + + +class Add(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return x + y + +MODULE_REGISTRY["add"] = { + "model_class": Add, + "input_shapes": [(10,), (10,)], + "description": "Simple tensor addition model", +} + + +# ------------------------------------------------------------------------- +# Matrix Multiplication Modules +# ------------------------------------------------------------------------- + +class Mm(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return x.mm(y) + +MODULE_REGISTRY["mm"] = { + "model_class": Mm, + "input_shapes": [(3, 4), (4, 5)], + "description": "Simple mm layer model", +} + +# ------------------------------------------------------------------------- +class MmWeights(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(torch.arange(20, dtype=torch.float).reshape(4, 5)) + + def forward(self, x: torch.Tensor): + return x.mm(self.weight) + +MODULE_REGISTRY["mm_weights"] = { + "model_class": MmWeights, + "input_shapes": [(3, 4)], + "description": "Matrix multiplication with weight parameter", +} + +# ------------------------------------------------------------------------- +class TwoMm(nn.Module): + def __init__(self): + super().__init__() + self.left_weight = nn.Parameter( + torch.arange(20, dtype=torch.float).reshape(4, 5) + ) + self.right_weight = nn.Parameter( + torch.arange(42, dtype=torch.float).reshape(6, 7) + ) + + def forward(self, x: torch.Tensor): + return self.left_weight.mm(x).mm(self.right_weight) + +MODULE_REGISTRY["two_mm"] = { + "model_class": TwoMm, + "input_shapes": [(5, 6)], + "description": "Two consecutive matrix multiplications", +} + +# 
------------------------------------------------------------------------- +class ElementwiseMmReduction(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + x1 = x.sin() + x + y2 = y.cos() + 3 + z = x1.mm(y2) + return z + z.sum() + +MODULE_REGISTRY["elementwise_mm_reduction"] = { + "model_class": ElementwiseMmReduction, + "input_shapes": [(11, 45), (45, 8)], + "description": "Combining mm with elementwise and reduction ops", +} + + +# ------------------------------------------------------------------------- +# Linear Modules +# ------------------------------------------------------------------------- + +class LinearNoBias(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(7, 101, bias=False) + + def forward(self, x: torch.Tensor): + return self.linear(x) + +MODULE_REGISTRY["linear_nobias"] = { + "model_class": LinearNoBias, + "input_shapes": [(127, 7)], + "description": "Simple linear layer model with no bias", +} + + +# ------------------------------------------------------------------------- +# Convolution Modules +# ------------------------------------------------------------------------- + +class SingleConv2d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d( + in_channels=3, out_channels=5, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x: torch.Tensor): + return self.conv(x) + +MODULE_REGISTRY["conv2d"] = { + "model_class": SingleConv2d, + "input_shapes": [(4, 3, 8, 8)], + "description": "Single Conv2d layer model", +} + +# ------------------------------------------------------------------------- +class DepthwiseConv(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=32, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + +MODULE_REGISTRY["depthwise_conv"] = { + "model_class": DepthwiseConv, + "input_shapes": [(1, 32, 112, 112)], + "description": "Single Depthwise Conv2d layer model", +} + +# ------------------------------------------------------------------------- +class SmallConv1d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + in_channels=8, + out_channels=6, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + +MODULE_REGISTRY["small_conv1d"] = { + "model_class": SmallConv1d, + "input_shapes": [(1, 8, 5)], + "description": "Conv1d layer with 8 input channels, 6 output channels", +} + +# ------------------------------------------------------------------------- +class MockConv1d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + in_channels=80, + out_channels=384, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + +MODULE_REGISTRY["conv1d"] = { + "model_class": MockConv1d, + "input_shapes": [(1, 80, 3000)], + "description": "Conv1d layer with 80 input channels, 384 output channels", +} + +# ------------------------------------------------------------------------- +class VoxtralConv1d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + in_channels=128, + out_channels=1280, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + +MODULE_REGISTRY["voxtral_conv1d"] = { + "model_class": VoxtralConv1d, + "input_shapes": 
[(10, 128, 3000)], + "description": "Conv1d layer with 128 input channels, 1280 output channels", +} + + +# ------------------------------------------------------------------------- +# Attention (SDPA) Modules +# ------------------------------------------------------------------------- + +class SimpleSDPA(nn.Module): + """Minimal SDPA test model.""" + + def forward( + self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor + ) -> torch.Tensor: + output = torch.nn.functional.scaled_dot_product_attention( + query, key, value, dropout_p=0.0, is_causal=False + ) + return output + +MODULE_REGISTRY["sdpa"] = { + "model_class": SimpleSDPA, + "input_shapes": [(2, 4, 16, 64), (2, 4, 16, 64), (2, 4, 16, 64)], + "description": "Simple Scaled Dot Product Attention model", +} + +# ------------------------------------------------------------------------- +class AddSDPA(nn.Module): + """SDPA model with Q, K, V as parameters that adds input to SDPA output.""" + + def __init__(self, batch_size=2, num_heads=4, seq_len=16, head_dim=64): + super().__init__() + self.query = nn.Parameter( + torch.randn(batch_size, num_heads, seq_len, head_dim) + ) + self.key = nn.Parameter(torch.randn(batch_size, num_heads, seq_len, head_dim)) + self.value = nn.Parameter( + torch.randn(batch_size, num_heads, seq_len, head_dim) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + sdpa_output = torch.nn.functional.scaled_dot_product_attention( + self.query, self.key, self.value, dropout_p=0.0, is_causal=False + ) + return sdpa_output + x + +MODULE_REGISTRY["add_sdpa"] = { + "model_class": AddSDPA, + "input_shapes": [(2, 4, 16, 64)], + "description": "SDPA model with Q,K,V as parameters that adds input to output", +} + +# ------------------------------------------------------------------------- +class BaseAddStridedSDPA(nn.Module): + """SDPA model with strided Q, K, V parameters.""" + + def __init__(self, q_size, k_size, v_size, q_stride, k_stride, v_stride, attn_mask_size=None): + super().__init__() + self.q_size = q_size + self.k_size = k_size + self.v_size = v_size + self.q_stride = q_stride + self.k_stride = k_stride + self.v_stride = v_stride + self.attn_mask_size = attn_mask_size + + self.query = nn.Parameter(torch.randn(q_size)) + self.key = nn.Parameter(torch.randn(k_size)) + self.value = nn.Parameter(torch.randn(v_size)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + query = torch.as_strided(self.query, size=self.q_size, stride=self.q_stride) + key = torch.as_strided(self.key, size=self.k_size, stride=self.k_stride) + value = torch.as_strided(self.value, size=self.v_size, stride=self.v_stride) + attn_mask = None + if self.attn_mask_size: + attn_mask = torch.zeros(self.attn_mask_size) + + sdpa_output = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask, dropout_p=0.0, is_causal=False, scale=1.0 + ) + return sdpa_output + x + +# ------------------------------------------------------------------------- +class AddStridedSDPA(BaseAddStridedSDPA): + def __init__(self): + super().__init__( + q_size=(10, 20, 1500, 64), + k_size=(10, 20, 1500, 64), + v_size=(10, 20, 1500, 64), + q_stride=(1920000, 64, 1280, 1), + k_stride=(1920000, 64, 1280, 1), + v_stride=(1920000, 64, 1280, 1), + ) + +MODULE_REGISTRY["audio_encoder_sdpa1"] = { + "model_class": AddStridedSDPA, + "input_shapes": [(10, 20, 1500, 64)], + "description": "Audio Encoder model with strided SDPA", +} + +# ------------------------------------------------------------------------- +class 
AddStridedSDPA1(BaseAddStridedSDPA): + def __init__(self): + super().__init__( + q_size=(1, 20, 1, 64), + k_size=(1, 20, 1500, 64), + v_size=(1, 20, 1500, 64), + q_stride=(1280, 64, 1280, 1), + k_stride=(1920000, 64, 1280, 1), + v_stride=(1920000, 64, 1280, 1), + ) + +MODULE_REGISTRY["whisper_strided_sdpa1"] = { + "model_class": AddStridedSDPA1, + "input_shapes": [(1, 20, 1, 64)], + "description": "Whisper-like strided SDPA variant 1", +} + +# ------------------------------------------------------------------------- +class AddStridedSDPA2(BaseAddStridedSDPA): + def __init__(self): + super().__init__( + q_size=(1, 20, 1, 64), + k_size=(1, 20, 1024, 64), + v_size=(1, 20, 1024, 64), + q_stride=(1280, 64, 1280, 1), + k_stride=(1310720, 65536, 64, 1), + v_stride=(1310720, 65536, 64, 1), + attn_mask_size=(1, 1, 1, 1024), + ) + +MODULE_REGISTRY["whisper_strided_sdpa2"] = { + "model_class": AddStridedSDPA2, + "input_shapes": [(1, 20, 1, 64)], + "description": "Whisper-like strided SDPA variant 2", +} + + +# ------------------------------------------------------------------------- +# Normalization Modules +# ------------------------------------------------------------------------- + +class BatchNorm(nn.Module): + def __init__(self): + super().__init__() + self.bn = nn.BatchNorm2d(num_features=16) + + def forward(self, x): + return self.bn(x) + +MODULE_REGISTRY["batchnorm"] = { + "model_class": BatchNorm, + "input_shapes": [(1, 16, 32, 32)], + "description": "Single BatchNorm2d layer model", +} + + +# ------------------------------------------------------------------------- +# Block/Composite Modules +# ------------------------------------------------------------------------- + +class SingleResNetBlock(nn.Module): + def __init__(self, in_channels=64, out_channels=64, stride=1): + super().__init__() + self.conv1 = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + self.bn1 = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.skip_connection = None + if stride != 1 or in_channels != out_channels: + self.skip_connection = nn.Sequential( + nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.skip_connection is not None: + identity = self.skip_connection(x) + + out += identity + out = self.relu(out) + + return out + +MODULE_REGISTRY["single_resnet_block"] = { + "model_class": SingleResNetBlock, + "input_shapes": [(1, 64, 8, 8)], + "description": "Single ResNet block with skip connection", +} + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_model_and_inputs( + model_name: str, dtype: torch.dtype = torch.float32 +) -> Tuple[nn.Module, Tuple[torch.Tensor, ...]]: + """Get model and example inputs based on model name.""" + if model_name not in MODULE_REGISTRY: + available_models = ", ".join(MODULE_REGISTRY.keys()) + raise ValueError( + f"Unsupported model: {model_name}. 
Available models: {available_models}" + ) + + model_config = MODULE_REGISTRY[model_name] + model_class = model_config["model_class"] + input_shapes = model_config["input_shapes"] + + model = model_class().eval() + if dtype is not None: + model = model.to(dtype) + + example_inputs = tuple( + torch.randn(*shape, dtype=dtype) for shape in input_shapes + ) + + return model, example_inputs + + +def export_model_to_metal( + model: nn.Module, example_inputs: Tuple[torch.Tensor, ...] +) -> Any: + """Export model through the Metal backend pipeline.""" + method_name = "forward" + + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + aten_dialect = export(model, example_inputs, strict=False) + + edge_program = to_edge_transform_and_lower( + aten_dialect, + partitioner=[ + MetalPartitioner( + [MetalBackend.generate_method_name_compile_spec(method_name)] + ) + ], + ) + + executorch_program = edge_program.to_executorch() + return executorch_program + + +def export_model_to_files( + model: nn.Module, + example_inputs: Tuple[torch.Tensor, ...], + output_dir: Path, + model_name: str, +) -> Tuple[Path, Path, torch.Tensor]: + """ + Export model to .pte and .ptd files, and compute expected output. + + Returns: + Tuple of (pte_path, ptd_path, expected_output) + """ + # Compute expected output using all-ones input (matching export_aoti_metal.py) + all_ones_input = tuple(torch.ones_like(inp) for inp in example_inputs) + with torch.no_grad(): + expected_output = model(*all_ones_input) + + # Export to executorch + executorch_program = export_model_to_metal(model, example_inputs) + + # Save .pte file + pte_path = output_dir / f"{model_name}.pte" + with open(pte_path, "wb") as f: + f.write(executorch_program.buffer) + + # Save .ptd file (tensor data) + executorch_program.write_tensor_data_to_file(str(output_dir)) + ptd_path = output_dir / "aoti_metal_blob.ptd" + + return pte_path, ptd_path, expected_output + + +def run_executor_runner(pte_path: Path, ptd_path: Path) -> bool: + """ + Run the executor_runner binary with the given model files. + + Returns: + True if execution succeeded, False otherwise. + """ + if not EXECUTOR_RUNNER.exists(): + raise RuntimeError( + f"executor_runner not found at {EXECUTOR_RUNNER}. " + f"Run '{RUN_METAL_TEST_SCRIPT} --build' to build." + ) + + cmd = [ + str(EXECUTOR_RUNNER), + "--model_path", str(pte_path), + "--data_path", str(ptd_path), + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=60, + cwd=str(EXECUTORCH_ROOT), + ) + return result.returncode == 0 + except subprocess.TimeoutExpired: + return False + except Exception: + return False + + +def read_output_file(filepath: Path) -> Optional[np.ndarray]: + """Read comma-separated output values from a file.""" + try: + with open(filepath, "r") as f: + content = f.read().strip() + if not content: + return None + values = [float(x.strip()) for x in content.split(",") if x.strip()] + return np.array(values) + except (FileNotFoundError, ValueError): + return None + + +def compare_outputs( + expected: torch.Tensor, + runtime_output_file: Path, + atol: float = 1e-5, + rtol: float = 1e-5, +) -> Tuple[bool, Optional[float], Optional[float]]: + """ + Compare expected PyTorch output with runtime output from file. 
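+
+    The relative difference is computed against a symmetric denominator,
+    max(|runtime|, |expected|, eps), so the result does not depend on which
+    argument is treated as the reference.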
+ + Returns: + Tuple of (is_close, max_atol, max_rtol) + """ + runtime_values = read_output_file(runtime_output_file) + if runtime_values is None: + return False, None, None + + # Flatten expected output + if isinstance(expected, tuple): + expected_values = np.concatenate([t.flatten().numpy() for t in expected]) + else: + expected_values = expected.flatten().numpy() + + if len(runtime_values) != len(expected_values): + return False, None, None + + # Calculate tolerances + abs_diff = np.abs(runtime_values - expected_values) + max_atol_val = np.max(abs_diff) + + eps = 1e-8 + denominator = np.maximum( + np.maximum(np.abs(runtime_values), np.abs(expected_values)), eps + ) + rel_diff = abs_diff / denominator + max_rtol_val = np.max(rel_diff) + + is_close = np.allclose(runtime_values, expected_values, atol=atol, rtol=rtol) + + return is_close, max_atol_val, max_rtol_val + + +# ============================================================================= +# Test Class +# ============================================================================= + + +class TestMetalBackendModules(unittest.TestCase): + """ + Test Metal backend modules export and execution. + + Each test exports a model through the Metal backend and verifies: + 1. The export process completes without errors + 2. The exported program has non-zero buffer size + 3. The runtime output matches the expected PyTorch output + """ + + def _test_module_export( + self, model_name: str, dtype: torch.dtype = torch.float32 + ) -> None: + """Generic test for module export.""" + if SKIP_EXPORT_TESTS: + self.skipTest(SKIP_REASON) + + model, example_inputs = get_model_and_inputs(model_name, dtype=dtype) + + # Verify model forward pass works before export + with torch.no_grad(): + model_output = model(*example_inputs) + + self.assertIsNotNone( + model_output, + f"{model_name} ({DTYPE_NAMES[dtype]}): Forward pass returned None", + ) + + # Export to Metal backend + executorch_program = export_model_to_metal(model, example_inputs) + + self.assertIsNotNone( + executorch_program, + f"{model_name} ({DTYPE_NAMES[dtype]}): Export returned None", + ) + self.assertGreater( + len(executorch_program.buffer), + 0, + f"{model_name} ({DTYPE_NAMES[dtype]}): Exported buffer is empty", + ) + + def _test_module_output_consistency( + self, model_name: str, dtype: torch.dtype = torch.float32 + ) -> None: + """ + Test that Metal backend runtime output matches PyTorch output. + + This test: + 1. Exports the model to .pte and .ptd files + 2. Runs the model using executor_runner + 3. 
Compares the runtime output with expected PyTorch output + """ + if SKIP_RUNTIME_TESTS: + self.skipTest(SKIP_RUNTIME_REASON) + + model, example_inputs = get_model_and_inputs(model_name, dtype=dtype) + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + # Create aoti_debug_data directory for output files + debug_dir = tmpdir_path / "aoti_debug_data" + debug_dir.mkdir(exist_ok=True) + + # Export model and get expected output + pte_path, ptd_path, expected_output = export_model_to_files( + model, example_inputs, tmpdir_path, model_name + ) + + self.assertTrue( + pte_path.exists(), + f"{model_name}: PTE file not created at {pte_path}", + ) + self.assertTrue( + ptd_path.exists(), + f"{model_name}: PTD file not created at {ptd_path}", + ) + + # Run executor_runner + success = run_executor_runner(pte_path, ptd_path) + self.assertTrue( + success, + f"{model_name}: executor_runner failed", + ) + + # Compare outputs + runtime_output_file = debug_dir / "final_runtime_output.txt" + + if runtime_output_file.exists(): + is_close, max_atol, max_rtol = compare_outputs( + expected_output, runtime_output_file + ) + + self.assertTrue( + is_close, + f"{model_name} ({DTYPE_NAMES[dtype]}): Output mismatch - max_atol={max_atol}, max_rtol={max_rtol}", + ) + + +# ============================================================================= +# Dynamically generate test methods for each module and dtype in MODULE_REGISTRY +# ============================================================================= + + +def _make_export_test(model_name: str, dtype: torch.dtype): + """Factory function to create an export test method for a given model and dtype.""" + def test_method(self): + self._test_module_export(model_name, dtype) + dtype_name = DTYPE_NAMES[dtype] + test_method.__doc__ = f"Test {model_name} module export with {dtype_name}." + return test_method + + +def _make_output_consistency_test(model_name: str, dtype: torch.dtype): + """Factory function to create an output consistency test method for a given model and dtype.""" + def test_method(self): + self._test_module_output_consistency(model_name, dtype) + dtype_name = DTYPE_NAMES[dtype] + test_method.__doc__ = f"Test {model_name} module output consistency with {dtype_name}." 
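+    # A factory is used here (rather than defining test_method inline in the
+    # registration loop) so that model_name and dtype are bound when the test
+    # is created, avoiding the late-binding closure pitfall where every
+    # generated test would see only the last loop values. The generated
+    # methods are named test_<model>_<dtype>_export and
+    # test_<model>_<dtype>_output_consistency.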
+ return test_method + + +# Add export and output consistency tests for each module and dtype in the registry +for _model_name in MODULE_REGISTRY: + for _dtype in DTYPES: + _dtype_name = DTYPE_NAMES[_dtype] + + # Create export test: test___export + _export_test_name = f"test_{_model_name}_{_dtype_name}_export" + setattr( + TestMetalBackendModules, + _export_test_name, + _make_export_test(_model_name, _dtype), + ) + + # Create output consistency test: test___output_consistency + _consistency_test_name = f"test_{_model_name}_{_dtype_name}_output_consistency" + setattr( + TestMetalBackendModules, + _consistency_test_name, + _make_output_consistency_test(_model_name, _dtype), + ) + + +if __name__ == "__main__": + unittest.main() From 0ed7c5c778087f16c3b06ab9463964f0b1e5287f Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 30 Jan 2026 21:59:19 -0500 Subject: [PATCH 2/6] Update [ghstack-poisoned] --- backends/apple/metal/tests/run_metal_test.sh | 2 +- backends/apple/metal/tests/test_modules.py | 166 ++++++++++++++----- 2 files changed, 121 insertions(+), 47 deletions(-) diff --git a/backends/apple/metal/tests/run_metal_test.sh b/backends/apple/metal/tests/run_metal_test.sh index 95c0cb1c6a7..9595cbf0c3d 100755 --- a/backends/apple/metal/tests/run_metal_test.sh +++ b/backends/apple/metal/tests/run_metal_test.sh @@ -56,7 +56,7 @@ build_runtime() { -DCMAKE_BUILD_TYPE=Release" echo "Running cmake..." - eval cmake $CMAKE_ARGS "$EXECUTORCH_ROOT" + cmake $CMAKE_ARGS "$EXECUTORCH_ROOT" echo "Building..." cmake --build . -j$(sysctl -n hw.ncpu) diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py index 424d736b3b7..c97298a6bc2 100644 --- a/backends/apple/metal/tests/test_modules.py +++ b/backends/apple/metal/tests/test_modules.py @@ -38,6 +38,9 @@ SKIP_EXPORT_TESTS = not MPS_AVAILABLE SKIP_REASON = "MPS not available - Metal export tests require MPS support" +# Check if running in CI (GitHub Actions) +IS_CI = os.environ.get("GITHUB_ACTIONS") == "true" + # Paths TESTS_DIR = Path(__file__).parent EXECUTORCH_ROOT = TESTS_DIR.parent.parent.parent.parent @@ -45,6 +48,12 @@ EXECUTOR_RUNNER = BUILD_DIR / "executor_runner" RUN_METAL_TEST_SCRIPT = TESTS_DIR / "run_metal_test.sh" +# Test output directory - use current working directory in CI for reliable write access +if IS_CI: + TEST_OUTPUT_BASE_DIR = Path.cwd() / "aoti_debug_data" +else: + TEST_OUTPUT_BASE_DIR = None # Will use tempfile.TemporaryDirectory + # Check if executor_runner is built EXECUTOR_RUNNER_AVAILABLE = EXECUTOR_RUNNER.exists() SKIP_RUNTIME_TESTS = not EXECUTOR_RUNNER_AVAILABLE or SKIP_EXPORT_TESTS @@ -76,6 +85,7 @@ class Add(nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): return x + y + MODULE_REGISTRY["add"] = { "model_class": Add, "input_shapes": [(10,), (10,)], @@ -87,16 +97,19 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): # Matrix Multiplication Modules # ------------------------------------------------------------------------- + class Mm(nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): return x.mm(y) + MODULE_REGISTRY["mm"] = { "model_class": Mm, "input_shapes": [(3, 4), (4, 5)], "description": "Simple mm layer model", } + # ------------------------------------------------------------------------- class MmWeights(nn.Module): def __init__(self): @@ -106,12 +119,14 @@ def __init__(self): def forward(self, x: torch.Tensor): return x.mm(self.weight) + MODULE_REGISTRY["mm_weights"] = { "model_class": MmWeights, "input_shapes": [(3, 4)], 
"description": "Matrix multiplication with weight parameter", } + # ------------------------------------------------------------------------- class TwoMm(nn.Module): def __init__(self): @@ -126,12 +141,14 @@ def __init__(self): def forward(self, x: torch.Tensor): return self.left_weight.mm(x).mm(self.right_weight) + MODULE_REGISTRY["two_mm"] = { "model_class": TwoMm, "input_shapes": [(5, 6)], "description": "Two consecutive matrix multiplications", } + # ------------------------------------------------------------------------- class ElementwiseMmReduction(nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): @@ -140,6 +157,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): z = x1.mm(y2) return z + z.sum() + MODULE_REGISTRY["elementwise_mm_reduction"] = { "model_class": ElementwiseMmReduction, "input_shapes": [(11, 45), (45, 8)], @@ -151,6 +169,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): # Linear Modules # ------------------------------------------------------------------------- + class LinearNoBias(nn.Module): def __init__(self): super().__init__() @@ -159,6 +178,7 @@ def __init__(self): def forward(self, x: torch.Tensor): return self.linear(x) + MODULE_REGISTRY["linear_nobias"] = { "model_class": LinearNoBias, "input_shapes": [(127, 7)], @@ -170,6 +190,7 @@ def forward(self, x: torch.Tensor): # Convolution Modules # ------------------------------------------------------------------------- + class SingleConv2d(nn.Module): def __init__(self): super().__init__() @@ -180,12 +201,14 @@ def __init__(self): def forward(self, x: torch.Tensor): return self.conv(x) + MODULE_REGISTRY["conv2d"] = { "model_class": SingleConv2d, "input_shapes": [(4, 3, 8, 8)], "description": "Single Conv2d layer model", } + # ------------------------------------------------------------------------- class DepthwiseConv(nn.Module): def __init__(self): @@ -204,12 +227,14 @@ def __init__(self): def forward(self, x): return self.conv(x) + MODULE_REGISTRY["depthwise_conv"] = { "model_class": DepthwiseConv, "input_shapes": [(1, 32, 112, 112)], "description": "Single Depthwise Conv2d layer model", } + # ------------------------------------------------------------------------- class SmallConv1d(nn.Module): def __init__(self): @@ -228,14 +253,16 @@ def __init__(self): def forward(self, x): return self.conv(x) + MODULE_REGISTRY["small_conv1d"] = { "model_class": SmallConv1d, "input_shapes": [(1, 8, 5)], "description": "Conv1d layer with 8 input channels, 6 output channels", } + # ------------------------------------------------------------------------- -class MockConv1d(nn.Module): +class MediumConv1d(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv1d( @@ -252,12 +279,14 @@ def __init__(self): def forward(self, x): return self.conv(x) + MODULE_REGISTRY["conv1d"] = { - "model_class": MockConv1d, + "model_class": MediumConv1d, "input_shapes": [(1, 80, 3000)], "description": "Conv1d layer with 80 input channels, 384 output channels", } + # ------------------------------------------------------------------------- class VoxtralConv1d(nn.Module): def __init__(self): @@ -276,6 +305,7 @@ def __init__(self): def forward(self, x): return self.conv(x) + MODULE_REGISTRY["voxtral_conv1d"] = { "model_class": VoxtralConv1d, "input_shapes": [(10, 128, 3000)], @@ -287,6 +317,7 @@ def forward(self, x): # Attention (SDPA) Modules # ------------------------------------------------------------------------- + class SimpleSDPA(nn.Module): """Minimal SDPA test model.""" @@ -298,25 +329,23 @@ def 
forward( ) return output + MODULE_REGISTRY["sdpa"] = { "model_class": SimpleSDPA, "input_shapes": [(2, 4, 16, 64), (2, 4, 16, 64), (2, 4, 16, 64)], "description": "Simple Scaled Dot Product Attention model", } + # ------------------------------------------------------------------------- class AddSDPA(nn.Module): """SDPA model with Q, K, V as parameters that adds input to SDPA output.""" def __init__(self, batch_size=2, num_heads=4, seq_len=16, head_dim=64): super().__init__() - self.query = nn.Parameter( - torch.randn(batch_size, num_heads, seq_len, head_dim) - ) + self.query = nn.Parameter(torch.randn(batch_size, num_heads, seq_len, head_dim)) self.key = nn.Parameter(torch.randn(batch_size, num_heads, seq_len, head_dim)) - self.value = nn.Parameter( - torch.randn(batch_size, num_heads, seq_len, head_dim) - ) + self.value = nn.Parameter(torch.randn(batch_size, num_heads, seq_len, head_dim)) def forward(self, x: torch.Tensor) -> torch.Tensor: sdpa_output = torch.nn.functional.scaled_dot_product_attention( @@ -324,17 +353,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) return sdpa_output + x + MODULE_REGISTRY["add_sdpa"] = { "model_class": AddSDPA, "input_shapes": [(2, 4, 16, 64)], "description": "SDPA model with Q,K,V as parameters that adds input to output", } + # ------------------------------------------------------------------------- class BaseAddStridedSDPA(nn.Module): """SDPA model with strided Q, K, V parameters.""" - def __init__(self, q_size, k_size, v_size, q_stride, k_stride, v_stride, attn_mask_size=None): + def __init__( + self, q_size, k_size, v_size, q_stride, k_stride, v_stride, attn_mask_size=None + ): super().__init__() self.q_size = q_size self.k_size = k_size @@ -361,6 +394,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) return sdpa_output + x + # ------------------------------------------------------------------------- class AddStridedSDPA(BaseAddStridedSDPA): def __init__(self): @@ -373,12 +407,14 @@ def __init__(self): v_stride=(1920000, 64, 1280, 1), ) + MODULE_REGISTRY["audio_encoder_sdpa1"] = { "model_class": AddStridedSDPA, "input_shapes": [(10, 20, 1500, 64)], "description": "Audio Encoder model with strided SDPA", } + # ------------------------------------------------------------------------- class AddStridedSDPA1(BaseAddStridedSDPA): def __init__(self): @@ -391,12 +427,14 @@ def __init__(self): v_stride=(1920000, 64, 1280, 1), ) + MODULE_REGISTRY["whisper_strided_sdpa1"] = { "model_class": AddStridedSDPA1, "input_shapes": [(1, 20, 1, 64)], "description": "Whisper-like strided SDPA variant 1", } + # ------------------------------------------------------------------------- class AddStridedSDPA2(BaseAddStridedSDPA): def __init__(self): @@ -410,6 +448,7 @@ def __init__(self): attn_mask_size=(1, 1, 1, 1024), ) + MODULE_REGISTRY["whisper_strided_sdpa2"] = { "model_class": AddStridedSDPA2, "input_shapes": [(1, 20, 1, 64)], @@ -421,6 +460,7 @@ def __init__(self): # Normalization Modules # ------------------------------------------------------------------------- + class BatchNorm(nn.Module): def __init__(self): super().__init__() @@ -429,6 +469,7 @@ def __init__(self): def forward(self, x): return self.bn(x) + MODULE_REGISTRY["batchnorm"] = { "model_class": BatchNorm, "input_shapes": [(1, 16, 32, 32)], @@ -440,6 +481,7 @@ def forward(self, x): # Block/Composite Modules # ------------------------------------------------------------------------- + class SingleResNetBlock(nn.Module): def __init__(self, in_channels=64, out_channels=64, stride=1): 
super().__init__() @@ -485,6 +527,7 @@ def forward(self, x): return out + MODULE_REGISTRY["single_resnet_block"] = { "model_class": SingleResNetBlock, "input_shapes": [(1, 64, 8, 8)], @@ -515,9 +558,7 @@ def get_model_and_inputs( if dtype is not None: model = model.to(dtype) - example_inputs = tuple( - torch.randn(*shape, dtype=dtype) for shape in input_shapes - ) + example_inputs = tuple(torch.randn(*shape, dtype=dtype) for shape in input_shapes) return model, example_inputs @@ -576,12 +617,13 @@ def export_model_to_files( return pte_path, ptd_path, expected_output -def run_executor_runner(pte_path: Path, ptd_path: Path) -> bool: +def run_executor_runner(pte_path: Path, ptd_path: Path) -> Tuple[bool, Optional[str]]: """ Run the executor_runner binary with the given model files. Returns: - True if execution succeeded, False otherwise. + Tuple of (success, error_message). If success is True, error_message is None. + If success is False, error_message contains details about the failure. """ if not EXECUTOR_RUNNER.exists(): raise RuntimeError( @@ -591,8 +633,10 @@ def run_executor_runner(pte_path: Path, ptd_path: Path) -> bool: cmd = [ str(EXECUTOR_RUNNER), - "--model_path", str(pte_path), - "--data_path", str(ptd_path), + "--model_path", + str(pte_path), + "--data_path", + str(ptd_path), ] try: @@ -603,11 +647,17 @@ def run_executor_runner(pte_path: Path, ptd_path: Path) -> bool: timeout=60, cwd=str(EXECUTORCH_ROOT), ) - return result.returncode == 0 - except subprocess.TimeoutExpired: - return False - except Exception: - return False + if result.returncode == 0: + return True, None + else: + error_msg = ( + f"executor_runner exited with code {result.returncode}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}" + ) + return False, error_msg + except subprocess.TimeoutExpired as e: + return False, f"executor_runner timed out after 60 seconds: {e}" def read_output_file(filepath: Path) -> Optional[np.ndarray]: @@ -639,11 +689,14 @@ def compare_outputs( if runtime_values is None: return False, None, None - # Flatten expected output + # Flatten expected output and move to CPU for numpy conversion + # (required when tensor is on MPS device) if isinstance(expected, tuple): - expected_values = np.concatenate([t.flatten().numpy() for t in expected]) + expected_values = np.concatenate( + [t.detach().cpu().flatten().numpy() for t in expected] + ) else: - expected_values = expected.flatten().numpy() + expected_values = expected.detach().cpu().flatten().numpy() if len(runtime_values) != len(expected_values): return False, None, None @@ -725,47 +778,62 @@ def _test_module_output_consistency( self.skipTest(SKIP_RUNTIME_REASON) model, example_inputs = get_model_and_inputs(model_name, dtype=dtype) + dtype_name = DTYPE_NAMES[dtype] + test_subdir_name = f"{model_name}_{dtype_name}" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir_path = Path(tmpdir) - - # Create aoti_debug_data directory for output files - debug_dir = tmpdir_path / "aoti_debug_data" - debug_dir.mkdir(exist_ok=True) + def run_test_in_directory(test_dir: Path) -> None: + """Run the actual test logic in the given directory.""" + # Create model output directory: aoti_debug_data/_/ + model_output_dir = test_dir / test_subdir_name + model_output_dir.mkdir(parents=True, exist_ok=True) # Export model and get expected output pte_path, ptd_path, expected_output = export_model_to_files( - model, example_inputs, tmpdir_path, model_name + model, example_inputs, model_output_dir, model_name ) self.assertTrue( pte_path.exists(), - 
f"{model_name}: PTE file not created at {pte_path}", + f"{model_name} ({dtype_name}): PTE file not created at {pte_path}", ) self.assertTrue( ptd_path.exists(), - f"{model_name}: PTD file not created at {ptd_path}", + f"{model_name} ({dtype_name}): PTD file not created at {ptd_path}", ) # Run executor_runner - success = run_executor_runner(pte_path, ptd_path) + success, error_msg = run_executor_runner(pte_path, ptd_path) self.assertTrue( success, - f"{model_name}: executor_runner failed", + f"{model_name} ({dtype_name}): executor_runner failed\n{error_msg}", ) - # Compare outputs - runtime_output_file = debug_dir / "final_runtime_output.txt" + # Compare outputs - executor_runner writes to aoti_debug_data/ in cwd + # In CI, this is TEST_OUTPUT_BASE_DIR; locally it may vary + runtime_output_file = model_output_dir / "final_runtime_output.txt" - if runtime_output_file.exists(): - is_close, max_atol, max_rtol = compare_outputs( - expected_output, runtime_output_file - ) + self.assertTrue( + runtime_output_file.exists(), + f"{model_name} ({dtype_name}): Runtime output file not created at {runtime_output_file}", + ) - self.assertTrue( - is_close, - f"{model_name} ({DTYPE_NAMES[dtype]}): Output mismatch - max_atol={max_atol}, max_rtol={max_rtol}", - ) + is_close, max_atol, max_rtol = compare_outputs( + expected_output, runtime_output_file + ) + + self.assertTrue( + is_close, + f"{model_name} ({dtype_name}): Output mismatch - max_atol={max_atol}, max_rtol={max_rtol}", + ) + + if IS_CI: + # In CI, use a persistent directory in the current working directory + TEST_OUTPUT_BASE_DIR.mkdir(parents=True, exist_ok=True) + run_test_in_directory(TEST_OUTPUT_BASE_DIR) + else: + # Locally, use a temporary directory that gets cleaned up + with tempfile.TemporaryDirectory() as tmpdir: + run_test_in_directory(Path(tmpdir)) # ============================================================================= @@ -775,8 +843,10 @@ def _test_module_output_consistency( def _make_export_test(model_name: str, dtype: torch.dtype): """Factory function to create an export test method for a given model and dtype.""" + def test_method(self): self._test_module_export(model_name, dtype) + dtype_name = DTYPE_NAMES[dtype] test_method.__doc__ = f"Test {model_name} module export with {dtype_name}." return test_method @@ -784,10 +854,14 @@ def test_method(self): def _make_output_consistency_test(model_name: str, dtype: torch.dtype): """Factory function to create an output consistency test method for a given model and dtype.""" + def test_method(self): self._test_module_output_consistency(model_name, dtype) + dtype_name = DTYPE_NAMES[dtype] - test_method.__doc__ = f"Test {model_name} module output consistency with {dtype_name}." + test_method.__doc__ = ( + f"Test {model_name} module output consistency with {dtype_name}." 
+ ) return test_method From b4310cc6f03ad25fd8e082b5d4995a18eb4e4491 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 30 Jan 2026 22:01:15 -0500 Subject: [PATCH 3/6] Update [ghstack-poisoned] --- .github/workflows/metal.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 63466f36abb..50ab0a70e1c 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -54,7 +54,7 @@ jobs: export-model-metal-artifact: name: export-model-metal-artifact - # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) + # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit From 94c823c8f1effbcf7f8c0bff3aa3db2c0ef570f3 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 30 Jan 2026 23:32:54 -0500 Subject: [PATCH 4/6] Update [ghstack-poisoned] --- backends/apple/metal/tests/run_metal_test.sh | 19 +- backends/apple/metal/tests/test_modules.py | 241 ++++++++++++++++--- 2 files changed, 212 insertions(+), 48 deletions(-) diff --git a/backends/apple/metal/tests/run_metal_test.sh b/backends/apple/metal/tests/run_metal_test.sh index 9595cbf0c3d..0f70c20ea4e 100755 --- a/backends/apple/metal/tests/run_metal_test.sh +++ b/backends/apple/metal/tests/run_metal_test.sh @@ -8,7 +8,7 @@ # Script to build and run Metal backend tests # Usage: # ./run_metal_test.sh --build # Build the Metal runtime -# ./run_metal_test.sh --run # Run inference with given model files +# ./run_metal_test.sh --run # Run inference with given model file # ./run_metal_test.sh --check-build # Check if runtime is already built set -e # Exit on any error @@ -74,7 +74,6 @@ build_runtime() { # Function to run inference run_inference() { local pte_path="$1" - local ptd_path="$2" if [[ ! -f "$EXECUTOR_RUNNER" ]]; then echo "Error: executor_runner not found at $EXECUTOR_RUNNER" @@ -87,16 +86,10 @@ run_inference() { exit 1 fi - if [[ ! -f "$ptd_path" ]]; then - echo "Error: PTD file not found: $ptd_path" - exit 1 - fi - echo "Running inference..." 
echo " PTE: $pte_path" - echo " PTD: $ptd_path" - "$EXECUTOR_RUNNER" --model_path "$pte_path" --data_path "$ptd_path" + "$EXECUTOR_RUNNER" --model_path "$pte_path" } # Parse command line arguments @@ -105,11 +98,11 @@ case "$1" in build_runtime ;; --run) - if [[ -z "$2" ]] || [[ -z "$3" ]]; then - echo "Usage: $0 --run " + if [[ -z "$2" ]]; then + echo "Usage: $0 --run " exit 1 fi - run_inference "$2" "$3" + run_inference "$2" ;; --check-build) check_build @@ -119,7 +112,7 @@ case "$1" in echo "" echo "Usage:" echo " $0 --build Build the Metal runtime" - echo " $0 --run Run inference with given model files" + echo " $0 --run Run inference with given model file" echo " $0 --check-build Check if runtime is already built" exit 1 ;; diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py index c97298a6bc2..fc3e2c6d4e8 100644 --- a/backends/apple/metal/tests/test_modules.py +++ b/backends/apple/metal/tests/test_modules.py @@ -72,7 +72,22 @@ torch.bfloat16: "bfloat16", } +# Default tolerances for output comparison by dtype +# bfloat16 has lower precision (7 bits mantissa vs 23 for float32) +DEFAULT_TOLERANCES = { + torch.float32: {"atol": 1e-5, "rtol": 1e-5}, + torch.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + + # Registry mapping model names to their configurations +# Each entry can optionally include: +# - "atol": float - Override absolute tolerance for all dtypes +# - "rtol": float - Override relative tolerance for all dtypes +# - "atol_": float - Override absolute tolerance for specific dtype (e.g., "atol_bfloat16") +# - "rtol_": float - Override relative tolerance for specific dtype (e.g., "rtol_bfloat16") +# - "skip": bool or str - Skip all tests for this module (True to skip, or string with reason) +# - "skip_": bool or str - Skip tests for specific dtype (e.g., "skip_bfloat16") MODULE_REGISTRY: Dict[str, Dict[str, Any]] = {} @@ -206,6 +221,7 @@ def forward(self, x: torch.Tensor): "model_class": SingleConv2d, "input_shapes": [(4, 3, 8, 8)], "description": "Single Conv2d layer model", + "skip": True, } @@ -232,6 +248,7 @@ def forward(self, x): "model_class": DepthwiseConv, "input_shapes": [(1, 32, 112, 112)], "description": "Single Depthwise Conv2d layer model", + "skip": True, } @@ -412,6 +429,8 @@ def __init__(self): "model_class": AddStridedSDPA, "input_shapes": [(10, 20, 1500, 64)], "description": "Audio Encoder model with strided SDPA", + "atol_float32": 1e-4, + "atol_bfloat16": 5e-2, } @@ -532,6 +551,45 @@ def forward(self, x): "model_class": SingleResNetBlock, "input_shapes": [(1, 64, 8, 8)], "description": "Single ResNet block with skip connection", + "skip": True, +} + + +# ------------------------------------------------------------------------- +class TransformerBlock(nn.Module): + def __init__(self, embed_dim=256, num_heads=8, ff_dim=1024, dropout=0.1): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.self_attn = nn.MultiheadAttention( + embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ) + + self.norm1 = nn.LayerNorm(embed_dim) + self.norm2 = nn.LayerNorm(embed_dim) + + self.ffn = nn.Sequential( + nn.Linear(embed_dim, ff_dim), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(ff_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + attn_output, _ = self.self_attn(x, x, x) + x = self.norm1(x + attn_output) + ff_output = self.ffn(x) + x = self.norm2(x + ff_output) + return x + + +MODULE_REGISTRY["transformer_block"] = { + "model_class": 
TransformerBlock, + "input_shapes": [(4, 32, 256)], + "description": "Single transformer block with multi-head attention and FFN", + "skip": True, } @@ -540,6 +598,59 @@ def forward(self, x): # ============================================================================= +def get_tolerances_for_model( + model_name: str, dtype: torch.dtype +) -> Tuple[float, float]: + """ + Get atol and rtol tolerances for a specific model and dtype. + + Priority order: + 1. Model-specific dtype tolerance (e.g., "atol_bfloat16") + 2. Model-specific general tolerance (e.g., "atol") + 3. Default dtype tolerance from DEFAULT_TOLERANCES + + Returns: + Tuple of (atol, rtol) + """ + model_config = MODULE_REGISTRY.get(model_name, {}) + dtype_name = DTYPE_NAMES.get(dtype, "float32") + default_tols = DEFAULT_TOLERANCES.get(dtype, DEFAULT_TOLERANCES[torch.float32]) + + # Check for dtype-specific override, then general override, then default + atol = model_config.get( + f"atol_{dtype_name}", model_config.get("atol", default_tols["atol"]) + ) + rtol = model_config.get( + f"rtol_{dtype_name}", model_config.get("rtol", default_tols["rtol"]) + ) + + return atol, rtol + + +def should_skip_model(model_name: str, dtype: torch.dtype) -> Tuple[bool, str]: + """ + Check if a model should be skipped for testing. + + Priority order: + 1. Model-specific dtype skip (e.g., "skip_bfloat16") + 2. Model-specific general skip (e.g., "skip") + + Returns: + Tuple of (should_skip, reason) + """ + model_config = MODULE_REGISTRY.get(model_name, {}) + dtype_name = DTYPE_NAMES.get(dtype, "float32") + + # Check for dtype-specific skip first, then general skip + skip_value = model_config.get(f"skip_{dtype_name}", model_config.get("skip", False)) + + if skip_value is True: + return True, f"{model_name} is marked as skipped" + elif isinstance(skip_value, str): + return True, skip_value + return False, "" + + def get_model_and_inputs( model_name: str, dtype: torch.dtype = torch.float32 ) -> Tuple[nn.Module, Tuple[torch.Tensor, ...]]: @@ -605,22 +716,24 @@ def export_model_to_files( # Export to executorch executorch_program = export_model_to_metal(model, example_inputs) - # Save .pte file + # Save .pte file (Metal backend embeds data into the .pte file, no separate .ptd) pte_path = output_dir / f"{model_name}.pte" with open(pte_path, "wb") as f: f.write(executorch_program.buffer) - # Save .ptd file (tensor data) - executorch_program.write_tensor_data_to_file(str(output_dir)) - ptd_path = output_dir / "aoti_metal_blob.ptd" - - return pte_path, ptd_path, expected_output + return pte_path, expected_output -def run_executor_runner(pte_path: Path, ptd_path: Path) -> Tuple[bool, Optional[str]]: +def run_executor_runner( + pte_path: Path, output_path: Path +) -> Tuple[bool, Optional[str]]: """ Run the executor_runner binary with the given model files. + Args: + pte_path: Path to the .pte model file + output_path: Base path for output files (executor_runner will create -0.bin, etc.) + Returns: Tuple of (success, error_message). If success is True, error_message is None. If success is False, error_message contains details about the failure. 
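The binary reader introduced below (read_binary_output_file) recovers float32
values from raw bfloat16 bytes by widening each stored 16-bit pattern into the
high half of a 32-bit word. A minimal standalone sketch of that conversion,
assuming a little-endian host and using an illustrative helper name:

    import numpy as np

    def bf16_bytes_to_float32(raw: bytes) -> np.ndarray:
        # bfloat16 keeps the top 16 bits of an IEEE-754 float32, so shifting
        # the stored bits left by 16 reconstructs an exact float32 value.
        u16 = np.frombuffer(raw, dtype=np.uint16)
        return (u16.astype(np.uint32) << 16).view(np.float32)

    # 1.5 is exactly representable in bfloat16; on a little-endian host its
    # bfloat16 encoding is the top two bytes of the float32 encoding.
    assert bf16_bytes_to_float32(np.float32(1.5).tobytes()[2:])[0] == 1.5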
@@ -635,8 +748,8 @@ def run_executor_runner(pte_path: Path, ptd_path: Path) -> Tuple[bool, Optional[ str(EXECUTOR_RUNNER), "--model_path", str(pte_path), - "--data_path", - str(ptd_path), + "--output_file", + str(output_path), ] try: @@ -660,32 +773,80 @@ def run_executor_runner(pte_path: Path, ptd_path: Path) -> Tuple[bool, Optional[ return False, f"executor_runner timed out after 60 seconds: {e}" -def read_output_file(filepath: Path) -> Optional[np.ndarray]: - """Read comma-separated output values from a file.""" +def read_binary_output_file(filepath: Path, dtype: torch.dtype) -> Optional[np.ndarray]: + """ + Read binary output values from an executor_runner output file. + + Args: + filepath: Path to the binary output file + dtype: The torch dtype to interpret the binary data as + + Returns: + numpy array of values, or None if file doesn't exist or is empty + """ + if not filepath.exists(): + return None + + # Map torch dtype to numpy dtype + dtype_map = { + torch.float32: np.float32, + torch.float16: np.float16, + torch.bfloat16: np.float32, # bfloat16 is read as float32 after conversion + torch.int32: np.int32, + torch.int64: np.int64, + } + + np_dtype = dtype_map.get(dtype, np.float32) + try: - with open(filepath, "r") as f: - content = f.read().strip() - if not content: + with open(filepath, "rb") as f: + binary_data = f.read() + if not binary_data: return None - values = [float(x.strip()) for x in content.split(",") if x.strip()] - return np.array(values) - except (FileNotFoundError, ValueError): + # For bfloat16, the runtime output is in bfloat16 format (2 bytes per element) + # We need to read it as uint16 and convert + if dtype == torch.bfloat16: + # Read as uint16 (2 bytes per element like bfloat16) + values_uint16 = np.frombuffer(binary_data, dtype=np.uint16) + # Convert bfloat16 to float32 by shifting left 16 bits + values_uint32 = values_uint16.astype(np.uint32) << 16 + values = values_uint32.view(np.float32) + else: + values = np.frombuffer(binary_data, dtype=np_dtype) + return values + except (FileNotFoundError, ValueError) as e: + print(f"Error reading binary file {filepath}: {e}") return None def compare_outputs( expected: torch.Tensor, runtime_output_file: Path, - atol: float = 1e-5, - rtol: float = 1e-5, + dtype: torch.dtype, + atol: Optional[float] = None, + rtol: Optional[float] = None, ) -> Tuple[bool, Optional[float], Optional[float]]: """ - Compare expected PyTorch output with runtime output from file. + Compare expected PyTorch output with runtime output from binary file. 
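+
+    Runtime bfloat16 output is up-converted to float32 while it is parsed, and
+    the expected tensor is likewise cast to float32, so the comparison itself
+    always runs in float32.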
+ + Args: + expected: Expected output tensor from PyTorch + runtime_output_file: Path to the binary output file from executor_runner + dtype: The dtype used for the model (needed to parse binary output) + atol: Absolute tolerance for comparison (if None, uses dtype-specific default) + rtol: Relative tolerance for comparison (if None, uses dtype-specific default) Returns: Tuple of (is_close, max_atol, max_rtol) """ - runtime_values = read_output_file(runtime_output_file) + # Use dtype-specific tolerances if not specified + tolerances = DEFAULT_TOLERANCES.get(dtype, DEFAULT_TOLERANCES[torch.float32]) + if atol is None: + atol = tolerances["atol"] + if rtol is None: + rtol = tolerances["rtol"] + + runtime_values = read_binary_output_file(runtime_output_file, dtype) if runtime_values is None: return False, None, None @@ -693,10 +854,10 @@ def compare_outputs( # (required when tensor is on MPS device) if isinstance(expected, tuple): expected_values = np.concatenate( - [t.detach().cpu().flatten().numpy() for t in expected] + [t.detach().cpu().float().flatten().numpy() for t in expected] ) else: - expected_values = expected.detach().cpu().flatten().numpy() + expected_values = expected.detach().cpu().float().flatten().numpy() if len(runtime_values) != len(expected_values): return False, None, None @@ -736,6 +897,11 @@ def _test_module_export( self, model_name: str, dtype: torch.dtype = torch.float32 ) -> None: """Generic test for module export.""" + # Check if this model/dtype combination should be skipped + skip, skip_reason = should_skip_model(model_name, dtype) + if skip: + self.skipTest(skip_reason) + if SKIP_EXPORT_TESTS: self.skipTest(SKIP_REASON) @@ -770,10 +936,15 @@ def _test_module_output_consistency( Test that Metal backend runtime output matches PyTorch output. This test: - 1. Exports the model to .pte and .ptd files + 1. Exports the model to a .pte file 2. Runs the model using executor_runner 3. 
Compares the runtime output with expected PyTorch output
         """
+        # Check if this model/dtype combination should be skipped
+        skip, skip_reason = should_skip_model(model_name, dtype)
+        if skip:
+            self.skipTest(skip_reason)
+
         if SKIP_RUNTIME_TESTS:
             self.skipTest(SKIP_RUNTIME_REASON)
 
@@ -788,7 +959,7 @@ def run_test_in_directory(test_dir: Path) -> None:
             model_output_dir.mkdir(parents=True, exist_ok=True)
 
             # Export model and get expected output
-            pte_path, ptd_path, expected_output = export_model_to_files(
+            pte_path, expected_output = export_model_to_files(
                 model, example_inputs, model_output_dir, model_name
             )
 
@@ -796,29 +967,29 @@ def run_test_in_directory(test_dir: Path) -> None:
                 pte_path.exists(),
                 f"{model_name} ({dtype_name}): PTE file not created at {pte_path}",
             )
-            self.assertTrue(
-                ptd_path.exists(),
-                f"{model_name} ({dtype_name}): PTD file not created at {ptd_path}",
-            )
 
-            # Run executor_runner
-            success, error_msg = run_executor_runner(pte_path, ptd_path)
+            # Run executor_runner with output file
+            output_base_path = model_output_dir / "output"
+            success, error_msg = run_executor_runner(pte_path, output_base_path)
             self.assertTrue(
                 success,
                 f"{model_name} ({dtype_name}): executor_runner failed\n{error_msg}",
             )
 
-            # Compare outputs - executor_runner writes to aoti_debug_data/ in cwd
-            # In CI, this is TEST_OUTPUT_BASE_DIR; locally it may vary
-            runtime_output_file = model_output_dir / "final_runtime_output.txt"
+            # executor_runner writes output files as <output_file>-<index>.bin
+            # For single-output models, this is output-0.bin
+            runtime_output_file = model_output_dir / "output-0.bin"
            self.assertTrue(
                 runtime_output_file.exists(),
                 f"{model_name} ({dtype_name}): Runtime output file not created at {runtime_output_file}",
             )
 
+            # Get model-specific tolerances (with dtype-specific overrides)
+            atol, rtol = get_tolerances_for_model(model_name, dtype)
+
             is_close, max_atol, max_rtol = compare_outputs(
-                expected_output, runtime_output_file
+                expected_output, runtime_output_file, dtype, atol=atol, rtol=rtol
             )
 
             self.assertTrue(

From 31b6f4555bae21d41797d01ac18fa3b888f441f4 Mon Sep 17 00:00:00 2001
From: Manuel Candales
Date: Mon, 2 Feb 2026 16:27:14 -0500
Subject: [PATCH 5/6] Update

[ghstack-poisoned]
---
 backends/apple/metal/tests/test_modules.py | 261 +++------------------
 1 file changed, 29 insertions(+), 232 deletions(-)

diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py
index fc3e2c6d4e8..6a5eaeb9a53 100644
--- a/backends/apple/metal/tests/test_modules.py
+++ b/backends/apple/metal/tests/test_modules.py
@@ -50,7 +50,7 @@
 
 # Test output directory - use current working directory in CI for reliable write access
 if IS_CI:
-    TEST_OUTPUT_BASE_DIR = Path.cwd() / "aoti_debug_data"
+    TEST_OUTPUT_BASE_DIR = Path.cwd() / "metal_backend_module_outputs"
 else:
     TEST_OUTPUT_BASE_DIR = None  # Will use tempfile.TemporaryDirectory
 
@@ -126,7 +126,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 # -------------------------------------------------------------------------
 
 
-class MmWeights(nn.Module):
+class MmWeightParam(nn.Module):
     def __init__(self):
         super().__init__()
         self.weight = nn.Parameter(torch.arange(20, dtype=torch.float).reshape(4, 5))
@@ -135,51 +135,13 @@ def forward(self, x: torch.Tensor):
         return x.mm(self.weight)
 
 
-MODULE_REGISTRY["mm_weights"] = {
-    "model_class": MmWeights,
+MODULE_REGISTRY["mm_weight_param"] = {
+    "model_class": MmWeightParam,
     "input_shapes": [(3, 4)],
     "description": "Matrix multiplication with weight parameter",
 }
 
 
-# 
------------------------------------------------------------------------- -class TwoMm(nn.Module): - def __init__(self): - super().__init__() - self.left_weight = nn.Parameter( - torch.arange(20, dtype=torch.float).reshape(4, 5) - ) - self.right_weight = nn.Parameter( - torch.arange(42, dtype=torch.float).reshape(6, 7) - ) - - def forward(self, x: torch.Tensor): - return self.left_weight.mm(x).mm(self.right_weight) - - -MODULE_REGISTRY["two_mm"] = { - "model_class": TwoMm, - "input_shapes": [(5, 6)], - "description": "Two consecutive matrix multiplications", -} - - -# ------------------------------------------------------------------------- -class ElementwiseMmReduction(nn.Module): - def forward(self, x: torch.Tensor, y: torch.Tensor): - x1 = x.sin() + x - y2 = y.cos() + 3 - z = x1.mm(y2) - return z + z.sum() - - -MODULE_REGISTRY["elementwise_mm_reduction"] = { - "model_class": ElementwiseMmReduction, - "input_shapes": [(11, 45), (45, 8)], - "description": "Combining mm with elementwise and reduction ops", -} - - # ------------------------------------------------------------------------- # Linear Modules # ------------------------------------------------------------------------- @@ -206,54 +168,7 @@ def forward(self, x: torch.Tensor): # ------------------------------------------------------------------------- -class SingleConv2d(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d( - in_channels=3, out_channels=5, kernel_size=3, stride=1, padding=1 - ) - - def forward(self, x: torch.Tensor): - return self.conv(x) - - -MODULE_REGISTRY["conv2d"] = { - "model_class": SingleConv2d, - "input_shapes": [(4, 3, 8, 8)], - "description": "Single Conv2d layer model", - "skip": True, -} - - -# ------------------------------------------------------------------------- -class DepthwiseConv(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d( - in_channels=32, - out_channels=32, - kernel_size=3, - stride=1, - padding=1, - dilation=1, - groups=32, - bias=False, - ) - - def forward(self, x): - return self.conv(x) - - -MODULE_REGISTRY["depthwise_conv"] = { - "model_class": DepthwiseConv, - "input_shapes": [(1, 32, 112, 112)], - "description": "Single Depthwise Conv2d layer model", - "skip": True, -} - - -# ------------------------------------------------------------------------- -class SmallConv1d(nn.Module): +class Conv1dNoBias(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv1d( @@ -271,15 +186,15 @@ def forward(self, x): return self.conv(x) -MODULE_REGISTRY["small_conv1d"] = { - "model_class": SmallConv1d, +MODULE_REGISTRY["conv1d_nobias"] = { + "model_class": Conv1dNoBias, "input_shapes": [(1, 8, 5)], "description": "Conv1d layer with 8 input channels, 6 output channels", } # ------------------------------------------------------------------------- -class MediumConv1d(nn.Module): +class Conv1dBias(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv1d( @@ -297,15 +212,15 @@ def forward(self, x): return self.conv(x) -MODULE_REGISTRY["conv1d"] = { - "model_class": MediumConv1d, +MODULE_REGISTRY["conv1d_bias"] = { + "model_class": Conv1dBias, "input_shapes": [(1, 80, 3000)], "description": "Conv1d layer with 80 input channels, 384 output channels", } # ------------------------------------------------------------------------- -class VoxtralConv1d(nn.Module): +class Conv1dVoxtral(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv1d( @@ -323,8 +238,8 @@ def forward(self, x): return self.conv(x) 
-MODULE_REGISTRY["voxtral_conv1d"] = { - "model_class": VoxtralConv1d, +MODULE_REGISTRY["conv1d_voxtral"] = { + "model_class": Conv1dVoxtral, "input_shapes": [(10, 128, 3000)], "description": "Conv1d layer with 128 input channels, 1280 output channels", } @@ -335,7 +250,7 @@ def forward(self, x): # ------------------------------------------------------------------------- -class SimpleSDPA(nn.Module): +class SDPA(nn.Module): """Minimal SDPA test model.""" def forward( @@ -348,14 +263,14 @@ def forward( MODULE_REGISTRY["sdpa"] = { - "model_class": SimpleSDPA, + "model_class": SDPA, "input_shapes": [(2, 4, 16, 64), (2, 4, 16, 64), (2, 4, 16, 64)], "description": "Simple Scaled Dot Product Attention model", } # ------------------------------------------------------------------------- -class AddSDPA(nn.Module): +class SDPAAdd(nn.Module): """SDPA model with Q, K, V as parameters that adds input to SDPA output.""" def __init__(self, batch_size=2, num_heads=4, seq_len=16, head_dim=64): @@ -371,15 +286,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return sdpa_output + x -MODULE_REGISTRY["add_sdpa"] = { - "model_class": AddSDPA, +MODULE_REGISTRY["sdpa_add"] = { + "model_class": SDPAAdd, "input_shapes": [(2, 4, 16, 64)], "description": "SDPA model with Q,K,V as parameters that adds input to output", } # ------------------------------------------------------------------------- -class BaseAddStridedSDPA(nn.Module): +class BaseStridedSDPA(nn.Module): """SDPA model with strided Q, K, V parameters.""" def __init__( @@ -413,7 +328,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # ------------------------------------------------------------------------- -class AddStridedSDPA(BaseAddStridedSDPA): +class SDPAStrided(BaseStridedSDPA): def __init__(self): super().__init__( q_size=(10, 20, 1500, 64), @@ -425,8 +340,8 @@ def __init__(self): ) -MODULE_REGISTRY["audio_encoder_sdpa1"] = { - "model_class": AddStridedSDPA, +MODULE_REGISTRY["sdpa_strided"] = { + "model_class": SDPAStrided, "input_shapes": [(10, 20, 1500, 64)], "description": "Audio Encoder model with strided SDPA", "atol_float32": 1e-4, @@ -435,7 +350,7 @@ def __init__(self): # ------------------------------------------------------------------------- -class AddStridedSDPA1(BaseAddStridedSDPA): +class SDPAStridedBroadcast(BaseStridedSDPA): def __init__(self): super().__init__( q_size=(1, 20, 1, 64), @@ -447,15 +362,15 @@ def __init__(self): ) -MODULE_REGISTRY["whisper_strided_sdpa1"] = { - "model_class": AddStridedSDPA1, +MODULE_REGISTRY["sdpa_strided_broadcast"] = { + "model_class": SDPAStridedBroadcast, "input_shapes": [(1, 20, 1, 64)], "description": "Whisper-like strided SDPA variant 1", } # ------------------------------------------------------------------------- -class AddStridedSDPA2(BaseAddStridedSDPA): +class SDPAStridedBroadcastAttnMask(BaseStridedSDPA): def __init__(self): super().__init__( q_size=(1, 20, 1, 64), @@ -468,131 +383,13 @@ def __init__(self): ) -MODULE_REGISTRY["whisper_strided_sdpa2"] = { - "model_class": AddStridedSDPA2, +MODULE_REGISTRY["sdpa_strided_broadcast_attn_mask"] = { + "model_class": SDPAStridedBroadcastAttnMask, "input_shapes": [(1, 20, 1, 64)], "description": "Whisper-like strided SDPA variant 2", } -# ------------------------------------------------------------------------- -# Normalization Modules -# ------------------------------------------------------------------------- - - -class BatchNorm(nn.Module): - def __init__(self): - super().__init__() - self.bn = nn.BatchNorm2d(num_features=16) - 
def forward(self, x):
-        return self.bn(x)
-
-
-MODULE_REGISTRY["batchnorm"] = {
-    "model_class": BatchNorm,
-    "input_shapes": [(1, 16, 32, 32)],
-    "description": "Single BatchNorm2d layer model",
-}
-
-
-# -------------------------------------------------------------------------
-# Block/Composite Modules
-# -------------------------------------------------------------------------
-
-
-class SingleResNetBlock(nn.Module):
-    def __init__(self, in_channels=64, out_channels=64, stride=1):
-        super().__init__()
-        self.conv1 = nn.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size=3,
-            stride=stride,
-            padding=1,
-            bias=False,
-        )
-        self.bn1 = nn.BatchNorm2d(out_channels)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv2 = nn.Conv2d(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
-        )
-        self.bn2 = nn.BatchNorm2d(out_channels)
-
-        self.skip_connection = None
-        if stride != 1 or in_channels != out_channels:
-            self.skip_connection = nn.Sequential(
-                nn.Conv2d(
-                    in_channels, out_channels, kernel_size=1, stride=stride, bias=False
-                ),
-                nn.BatchNorm2d(out_channels),
-            )
-
-    def forward(self, x):
-        identity = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.skip_connection is not None:
-            identity = self.skip_connection(x)
-
-        out += identity
-        out = self.relu(out)
-
-        return out
-
-
-MODULE_REGISTRY["single_resnet_block"] = {
-    "model_class": SingleResNetBlock,
-    "input_shapes": [(1, 64, 8, 8)],
-    "description": "Single ResNet block with skip connection",
-    "skip": True,
-}
-
-
-# -------------------------------------------------------------------------
-class TransformerBlock(nn.Module):
-    def __init__(self, embed_dim=256, num_heads=8, ff_dim=1024, dropout=0.1):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-
-        self.self_attn = nn.MultiheadAttention(
-            embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True
-        )
-
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.norm2 = nn.LayerNorm(embed_dim)
-
-        self.ffn = nn.Sequential(
-            nn.Linear(embed_dim, ff_dim),
-            nn.ReLU(),
-            nn.Dropout(dropout),
-            nn.Linear(ff_dim, embed_dim),
-            nn.Dropout(dropout),
-        )
-
-    def forward(self, x):
-        attn_output, _ = self.self_attn(x, x, x)
-        x = self.norm1(x + attn_output)
-        ff_output = self.ffn(x)
-        x = self.norm2(x + ff_output)
-        return x
-
-
-MODULE_REGISTRY["transformer_block"] = {
-    "model_class": TransformerBlock,
-    "input_shapes": [(4, 32, 256)],
-    "description": "Single transformer block with multi-head attention and FFN",
-    "skip": True,
-}
-
-
 # =============================================================================
 # Helper Functions
 # =============================================================================
@@ -954,7 +751,7 @@ def _test_module_output_consistency(
 
         def run_test_in_directory(test_dir: Path) -> None:
             """Run the actual test logic in the given directory."""
-            # Create model output directory: aoti_debug_data/<model_name>_<dtype_name>/
+            # Create model output directory: metal_backend_module_outputs/<model_name>_<dtype_name>/
             model_output_dir = test_dir / test_subdir_name
             model_output_dir.mkdir(parents=True, exist_ok=True)

From 08346599720e32ec8d58bc5a7050603694c97003 Mon Sep 17 00:00:00 2001
From: Manuel Candales
Date: Mon, 2 Feb 2026 17:26:39 -0500
Subject: [PATCH 6/6] Update

[ghstack-poisoned]
---
 backends/apple/metal/tests/test_modules.py | 38 ++++++++++++++++++----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/backends/apple/metal/tests/test_modules.py 
b/backends/apple/metal/tests/test_modules.py
index 6a5eaeb9a53..59904bb494d 100644
--- a/backends/apple/metal/tests/test_modules.py
+++ b/backends/apple/metal/tests/test_modules.py
@@ -88,6 +88,14 @@
 # - "rtol_<dtype>": float - Override relative tolerance for specific dtype (e.g., "rtol_bfloat16")
 # - "skip": bool or str - Skip all tests for this module (True to skip, or string with reason)
 # - "skip_<dtype>": bool or str - Skip tests for specific dtype (e.g., "skip_bfloat16")
+#
+# Model Parameter Initialization:
+# Model parameters are initialized with their default dtype (typically float32) when the
+# model class is instantiated. The parameters are then converted to the target dtype using
+# model.to(dtype). For example:
+# - nn.Parameter(torch.arange(20, dtype=torch.get_default_dtype())) creates float32 parameters
+# - These are converted to bfloat16 when model.to(torch.bfloat16) is called
+#
 MODULE_REGISTRY: Dict[str, Dict[str, Any]] = {}
 
 
@@ -129,7 +137,9 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 class MmWeightParam(nn.Module):
     def __init__(self):
         super().__init__()
-        self.weight = nn.Parameter(torch.arange(20, dtype=torch.float).reshape(4, 5))
+        self.weight = nn.Parameter(
+            torch.arange(20, dtype=torch.get_default_dtype()).reshape(4, 5)
+        )
 
     def forward(self, x: torch.Tensor):
         return x.mm(self.weight)
@@ -451,7 +461,18 @@ def should_skip_model(model_name: str, dtype: torch.dtype) -> Tuple[bool, str]:
 def get_model_and_inputs(
     model_name: str, dtype: torch.dtype = torch.float32
 ) -> Tuple[nn.Module, Tuple[torch.Tensor, ...]]:
-    """Get model and example inputs based on model name."""
+    """Get model and example inputs based on model name.
+
+    Note: Model parameters are initialized with their default dtype (typically float32)
+    during model instantiation, then converted to the target dtype using model.to(dtype).
+
+    Args:
+        model_name: Name of the model to create
+        dtype: Target data type for the model (default: torch.float32)
+
+    Returns:
+        Tuple of (model, example_inputs)
+    """
     if model_name not in MODULE_REGISTRY:
         available_models = ", ".join(MODULE_REGISTRY.keys())
         raise ValueError(
@@ -462,7 +483,10 @@ def get_model_and_inputs(
     model_class = model_config["model_class"]
     input_shapes = model_config["input_shapes"]
 
+    # Create model with default parameter dtypes (typically float32)
     model = model_class().eval()
+
+    # Convert model parameters to target dtype if specified
     if dtype is not None:
         model = model.to(dtype)
 
@@ -493,17 +517,17 @@ def export_model_to_metal(
     return executorch_program
 
 
-def export_model_to_files(
+def export_model_to_pte(
     model: nn.Module,
     example_inputs: Tuple[torch.Tensor, ...],
     output_dir: Path,
     model_name: str,
-) -> Tuple[Path, Path, torch.Tensor]:
+) -> Tuple[Path, torch.Tensor]:
     """
-    Export model to .pte and .ptd files, and compute expected output.
+    Export model to a .pte file and compute the expected output.
 
     Returns:
-        Tuple of (pte_path, ptd_path, expected_output)
+        Tuple of (pte_path, expected_output)
     """
     # Compute expected output using all-ones input (matching export_aoti_metal.py)
     all_ones_input = tuple(torch.ones_like(inp) for inp in example_inputs)
@@ -756,7 +780,7 @@ def run_test_in_directory(test_dir: Path) -> None:
             model_output_dir.mkdir(parents=True, exist_ok=True)
 
             # Export model and get expected output
-            pte_path, expected_output = export_model_to_files(
+            pte_path, expected_output = export_model_to_pte(
                 model, example_inputs, model_output_dir, model_name
             )
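
Editorial note on the bfloat16 decoding used in read_binary_output_file above: bfloat16
keeps only the upper 16 bits of an IEEE float32, so widening each raw uint16 word to
uint32 and shifting it left by 16 reconstructs the exact float32 value (with the low
mantissa bits zeroed). A minimal standalone sketch of that round trip, illustrative only
and not part of the patch (the tensor values are arbitrary):

    import numpy as np
    import torch

    # Simulate what the runtime writes for a bfloat16 output: raw bytes, 2 per element.
    t = torch.tensor([1.5, -2.25, 3.0], dtype=torch.bfloat16)
    raw = t.view(torch.uint16).numpy().tobytes()

    # Decode the same way read_binary_output_file does: uint16 -> uint32 << 16 -> float32.
    words = np.frombuffer(raw, dtype=np.uint16)
    decoded = (words.astype(np.uint32) << 16).view(np.float32)

    # The reconstruction is exact for values representable in bfloat16.
    assert np.allclose(decoded, t.float().numpy())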