30 changes: 24 additions & 6 deletions .ci/scripts/export_model_artifact.sh
@@ -5,16 +5,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export model to CUDA/Metal format with optional quantization
# Export model to CUDA/Metal/XNNPACK format with optional quantization

show_help() {
cat << EOF
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA/Metal format with optional quantization.
Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.

Arguments:
device cuda or metal (required)
device cuda, metal, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
@@ -28,6 +28,7 @@ Arguments:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-8da4w (XNNPACK only)

output_dir Output directory for artifacts (optional, default: current directory)

@@ -36,6 +37,7 @@ Examples:
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
EOF
}

@@ -64,9 +66,11 @@ case "$DEVICE" in
;;
metal)
;;
xnnpack)
;;
*)
echo "Error: Unsupported device '$DEVICE'"
echo "Supported devices: cuda, cuda-windows, metal"
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
exit 1
;;
esac
@@ -139,9 +143,16 @@ case "$QUANT_NAME" in
fi
EXTRA_ARGS="--qlinear_encoder 4w"
;;
quantized-8da4w)
if [ "$DEVICE" != "xnnpack" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack device"
exit 1
fi
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
exit 1
;;
esac
Expand All @@ -157,10 +168,17 @@ pip list
if [ "$MODEL_NAME" = "parakeet" ]; then
pip install -r examples/models/parakeet/install_requirements.txt

# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
if [ "$DEVICE" = "xnnpack" ]; then
DTYPE_ARG=""
else
DTYPE_ARG="--dtype bf16"
fi

python -m executorch.examples.models.parakeet.export_parakeet_tdt \
--backend "$DEVICE" \
--output-dir "${OUTPUT_DIR}" \
--dtype bf16 \
${DTYPE_ARG} \
${EXTRA_ARGS}

test -f "${OUTPUT_DIR}/model.pte"
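For reference, the `--qlinear 8da4w --qlinear_group_size 32` flags correspond to the torchao config built in `examples/models/parakeet/quantize.py` (see that file's diff below). A minimal sketch, assuming torchao's public `quantize_` API; the toy model here is illustrative:

```python
import torch
from torch import nn
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig

model = nn.Sequential(nn.Linear(256, 256))  # stand-in for a decoder linear layer

# "8da4w": 8-bit dynamically quantized activations, 4-bit weights,
# with per-group weight scales (matches --qlinear_group_size 32).
config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
)
quantize_(model, config)  # rewrites the linear layers in place
```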
19 changes: 13 additions & 6 deletions .ci/scripts/test_model_e2e.sh
@@ -5,16 +5,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
# Test CUDA/Metal/XNNPACK model end-to-end, need to run .ci/scripts/export_model_artifact.sh first

show_help() {
cat << EOF
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA/Metal models.
Build and run end-to-end tests for CUDA/Metal/XNNPACK models.

Arguments:
device cuda or metal (required)
device cuda, metal, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
@@ -28,6 +28,7 @@ Arguments:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-8da4w (XNNPACK only)

model_dir Directory containing model artifacts (optional, default: current directory)
Expected files: model.pte, aoti_cuda_blob.ptd (CUDA only)
@@ -37,6 +38,7 @@ Examples:
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
EOF
}

@@ -174,12 +176,17 @@ echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"

if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
exit 1
fi

MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
# Map device to make target (xnnpack uses cpu target which includes XNNPACK)
if [ "$DEVICE" = "xnnpack" ]; then
MAKE_TARGET="${RUNNER_PATH}-cpu"
else
MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
fi
make "${MAKE_TARGET}"
echo "::endgroup::"

33 changes: 33 additions & 0 deletions .github/workflows/pull.yml
@@ -163,6 +163,39 @@ jobs:
# Build and test ExecuTorch
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"

test-parakeet-xnnpack-linux:
name: test-parakeet-xnnpack-linux
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
runner: linux.4xlarge.memory
docker-image: ci-image:executorch-ubuntu-22.04-clang12
submodules: 'recursive'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 120
script: |
set -eux

# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

echo "::group::Setup ExecuTorch"
./install_executorch.sh
echo "::endgroup::"

echo "::group::Export Parakeet with XNNPACK"
bash .ci/scripts/export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
echo "::endgroup::"

echo "::group::Test Parakeet with XNNPACK"
bash .ci/scripts/test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
echo "::endgroup::"

test-llama-runner-linux:
# Test Both linux x86 and linux aarch64
name: test-llama-runner-linux
19 changes: 15 additions & 4 deletions examples/models/parakeet/CMakeLists.txt
@@ -24,13 +24,21 @@ find_package(gflags REQUIRED)
# Find executorch libraries
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)
get_target_property(_executorch_imported executorch IMPORTED)
if(NOT _executorch_imported)
executorch_target_link_options_shared_lib(executorch)
endif()

set(link_libraries executorch gflags)

# Common ops for all builds
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
if(TARGET optimized_native_cpu_ops_lib)
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
if(NOT _is_imported)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
endif()
endif()

# CPU-only builds need quantized and custom ops
if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
@@ -46,7 +54,10 @@ if(TARGET xnnpack_backend)
list(APPEND xnnpack_backend_libs kleidiai)
endif()
list(APPEND link_libraries ${xnnpack_backend_libs})
executorch_target_link_options_shared_lib(xnnpack_backend)
get_target_property(_xnnpack_imported xnnpack_backend IMPORTED)
if(NOT _xnnpack_imported)
executorch_target_link_options_shared_lib(xnnpack_backend)
endif()
endif()

# Needed for cpuinfo where it uses android specific log lib
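A hedged reading of the `IMPORTED` guards added in this file: `executorch_target_link_options_shared_lib` force-links a static library whole-archive, which only works on targets built by the current project; presumably CMake would reject those link options on `IMPORTED` targets coming from an installed `executorch` package, so the guard skips the call in that configuration.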
14 changes: 13 additions & 1 deletion examples/models/parakeet/README.md
@@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
| `8da4w` | 8-bit dynamic activation, 4-bit weight |
| `8da8w` | 8-bit dynamic activation, 8-bit weight |

#### Example: 4-bit Weight Quantization with Tile Packing
#### Example: Dynamic Quantization for XNNPACK

```bash
python export_parakeet_tdt.py \
--backend xnnpack \
--qlinear_encoder 8da4w \
--qlinear_encoder_group_size 32 \
--qlinear 8da4w \
--qlinear_group_size 32 \
--output-dir ./parakeet_quantized_xnnpack
```

#### Example: 4-bit Weight Quantization with Tile Packing for CUDA

```bash
python export_parakeet_tdt.py \
10 changes: 9 additions & 1 deletion examples/models/parakeet/export_parakeet_tdt.py
@@ -461,6 +461,7 @@ def export_all(
def _create_xnnpack_partitioners(programs):
"""Create XNNPACK partitioners for all programs except preprocessor."""
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
XnnpackDynamicallyQuantizedPartitioner,
XnnpackPartitioner,
)

@@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
if key == "preprocessor":
partitioner[key] = []
else:
partitioner[key] = [XnnpackPartitioner()]
# Use both partitioners:
# 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w)
# 2. XnnpackPartitioner for remaining ops
partitioner[key] = [
XnnpackDynamicallyQuantizedPartitioner(),
XnnpackPartitioner(),
]
return partitioner, programs


@@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
config=ExecutorchBackendConfig(
extract_delegate_segments=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
do_quant_fusion_and_const_prop=True,
),
)

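Why two partitioners, and in this order: below is a hedged sketch of how the list is consumed, assuming ExecuTorch's `to_edge_transform_and_lower` API (the toy module is illustrative). Partitioners are applied in sequence, so `XnnpackDynamicallyQuantizedPartitioner` gets first claim on the dynamic-quant linear patterns before the generic `XnnpackPartitioner` sweeps up the remaining ops:

```python
import torch
from torch.export import export
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
    XnnpackPartitioner,
)
from executorch.exir import to_edge_transform_and_lower

module = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
ep = export(module, (torch.randn(1, 16),))

edge = to_edge_transform_and_lower(
    ep,
    partitioner=[
        XnnpackDynamicallyQuantizedPartitioner(),  # claims 8da4w linears first
        XnnpackPartitioner(),                      # then the remaining supported ops
    ],
)
et_program = edge.to_executorch()
```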
1 change: 1 addition & 0 deletions examples/models/parakeet/quantize.py
@@ -89,6 +89,7 @@ def quantize_model_( # noqa: C901
config = Int8DynamicActivationIntxWeightConfig(
weight_dtype=torch.int4,
weight_granularity=granularity,
intx_choose_qparams_algorithm="hqq_scale_only",
)
elif qlinear_config == "8da8w":
config = Int8DynamicActivationIntxWeightConfig(
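For context on the added line: per torchao's naming, `intx_choose_qparams_algorithm="hqq_scale_only"` appears to switch the quantization-parameter search to an HQQ-style (Half-Quadratic Quantization) solver that optimizes only the scales while leaving zero-points fixed, an export-time accuracy tweak for the 4-bit weights with no runtime cost.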