diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index d5c1913619d..78e005950b7 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -5,16 +5,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# Export model to CUDA/Metal format with optional quantization
+# Export model to CUDA/Metal/XNNPACK format with optional quantization
 
 show_help() {
   cat << EOF
 Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
 
-Export a HuggingFace model to CUDA/Metal format with optional quantization.
+Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.
 
 Arguments:
-  device      cuda or metal (required)
+  device      cuda, metal, or xnnpack (required)
   hf_model    HuggingFace model ID (required)
 
 Supported models:
@@ -28,6 +28,7 @@ Arguments:
     - non-quantized
     - quantized-int4-tile-packed
     - quantized-int4-weight-only
+    - quantized-8da4w (XNNPACK only)
 
   output_dir  Output directory for artifacts (optional, default: current directory)
 
@@ -36,6 +37,7 @@ Examples:
   export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
   export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
   export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
+  export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
 
 EOF
 }
@@ -64,9 +66,11 @@ case "$DEVICE" in
     ;;
   metal)
     ;;
+  xnnpack)
+    ;;
   *)
     echo "Error: Unsupported device '$DEVICE'"
-    echo "Supported devices: cuda, cuda-windows, metal"
+    echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
     exit 1
     ;;
 esac
@@ -139,9 +143,16 @@ case "$QUANT_NAME" in
     fi
     EXTRA_ARGS="--qlinear_encoder 4w"
     ;;
+  quantized-8da4w)
+    if [ "$DEVICE" != "xnnpack" ]; then
+      echo "Error: quantized-8da4w is only supported with xnnpack device"
+      exit 1
+    fi
+    EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
+    ;;
   *)
     echo "Error: Unsupported quantization '$QUANT_NAME'"
-    echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
+    echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
     exit 1
     ;;
 esac
@@ -157,10 +168,17 @@ pip list
 
 if [ "$MODEL_NAME" = "parakeet" ]; then
   pip install -r examples/models/parakeet/install_requirements.txt
 
+  # Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
+  if [ "$DEVICE" = "xnnpack" ]; then
+    DTYPE_ARG=""
+  else
+    DTYPE_ARG="--dtype bf16"
+  fi
+
   python -m executorch.examples.models.parakeet.export_parakeet_tdt \
     --backend "$DEVICE" \
     --output-dir "${OUTPUT_DIR}" \
-    --dtype bf16 \
+    ${DTYPE_ARG} \
     ${EXTRA_ARGS}
   test -f "${OUTPUT_DIR}/model.pte"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index 2fa2f311bb6..711aff15111 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -5,16 +5,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
+# Test CUDA/Metal/XNNPACK model end-to-end; requires running .ci/scripts/export_model_artifact.sh first
 
 show_help() {
   cat << EOF
 Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]
 
-Build and run end-to-end tests for CUDA/Metal models.
+Build and run end-to-end tests for CUDA/Metal/XNNPACK models.
 
 Arguments:
-  device      cuda or metal (required)
+  device      cuda, metal, or xnnpack (required)
   hf_model    HuggingFace model ID (required)
 
 Supported models:
@@ -28,6 +28,7 @@ Arguments:
     - non-quantized
     - quantized-int4-tile-packed
    - quantized-int4-weight-only
+    - quantized-8da4w (XNNPACK only)
 
   model_dir   Directory containing model artifacts (optional, default: current directory)
               Expected files: model.pte, aoti_cuda_blob.ptd (CUDA only)
@@ -37,6 +38,7 @@ Examples:
   test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
   test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
   test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
+  test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
 
 EOF
 }
@@ -174,12 +176,17 @@ echo "::endgroup::"
 
 echo "::group::Build $MODEL_NAME Runner"
 
-if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ]; then
-  echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
+if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
+  echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
   exit 1
 fi
 
-MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
+# Map device to make target (xnnpack uses the cpu target, which includes the XNNPACK backend)
+if [ "$DEVICE" = "xnnpack" ]; then
+  MAKE_TARGET="${RUNNER_PATH}-cpu"
+else
+  MAKE_TARGET="${RUNNER_PATH}-${DEVICE}"
+fi
 
 make "${MAKE_TARGET}"
 echo "::endgroup::"
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index e0f377efeb0..2645be6478e 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -163,6 +163,39 @@ jobs:
       # Build and test ExecuTorch
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
 
+  test-parakeet-xnnpack-linux:
+    name: test-parakeet-xnnpack-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.4xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Export Parakeet with XNNPACK"
+        bash .ci/scripts/export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
+        echo "::endgroup::"
+
+        echo "::group::Test Parakeet with XNNPACK"
+        bash .ci/scripts/test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./parakeet_output"
+        echo "::endgroup::"
+
   test-llama-runner-linux: # Test Both linux x86 and linux aarch64
     name: test-llama-runner-linux
diff --git a/examples/models/parakeet/CMakeLists.txt b/examples/models/parakeet/CMakeLists.txt
index 5da7b4373b1..8c1eb547ccf 100644
--- a/examples/models/parakeet/CMakeLists.txt
+++ b/examples/models/parakeet/CMakeLists.txt
@@ -24,13 +24,21 @@ find_package(gflags REQUIRED)
 # Find executorch libraries
 list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
 find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
-executorch_target_link_options_shared_lib(executorch)
+get_target_property(_executorch_imported executorch IMPORTED)
+if(NOT _executorch_imported)
+  executorch_target_link_options_shared_lib(executorch)
+endif()
 
 set(link_libraries executorch gflags)
 
 # Common ops for all builds
-list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
-executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+if(TARGET optimized_native_cpu_ops_lib)
+  list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
+  get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
+  if(NOT _is_imported)
+    executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+  endif()
+endif()
 
 # CPU-only builds need quantized and custom ops
 if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
@@ -46,7 +54,10 @@ if(TARGET xnnpack_backend)
     list(APPEND xnnpack_backend_libs kleidiai)
   endif()
   list(APPEND link_libraries ${xnnpack_backend_libs})
-  executorch_target_link_options_shared_lib(xnnpack_backend)
+  get_target_property(_xnnpack_imported xnnpack_backend IMPORTED)
+  if(NOT _xnnpack_imported)
+    executorch_target_link_options_shared_lib(xnnpack_backend)
+  endif()
 endif()
 
 # Needed for cpuinfo where it uses android specific log lib
diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 23593d324d1..695e70ed472 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -57,7 +57,19 @@ The export script supports quantizing encoder and decoder linear layers using [t
 | `8da4w` | 8-bit dynamic activation, 4-bit weight |
 | `8da8w` | 8-bit dynamic activation, 8-bit weight |
 
-#### Example: 4-bit Weight Quantization with Tile Packing
+#### Example: Dynamic Quantization for XNNPACK
+
+```bash
+python export_parakeet_tdt.py \
+  --backend xnnpack \
+  --qlinear_encoder 8da4w \
+  --qlinear_encoder_group_size 32 \
+  --qlinear 8da4w \
+  --qlinear_group_size 32 \
+  --output-dir ./parakeet_quantized_xnnpack
+```
+
+#### Example: 4-bit Weight Quantization with Tile Packing for CUDA
 
 ```bash
 python export_parakeet_tdt.py \
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index 5a5b36c0549..fa21d4e3dc1 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -461,6 +461,7 @@ def export_all(
 def _create_xnnpack_partitioners(programs):
     """Create XNNPACK partitioners for all programs except preprocessor."""
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
+        XnnpackDynamicallyQuantizedPartitioner,
         XnnpackPartitioner,
     )
@@ -470,7 +471,13 @@ def _create_xnnpack_partitioners(programs):
         if key == "preprocessor":
             partitioner[key] = []
         else:
-            partitioner[key] = [XnnpackPartitioner()]
+            # Use both partitioners:
+            # 1. XnnpackDynamicallyQuantizedPartitioner for dynamic quantization (8da4w)
+            # 2. XnnpackPartitioner for remaining ops
+            partitioner[key] = [
+                XnnpackDynamicallyQuantizedPartitioner(),
+                XnnpackPartitioner(),
+            ]
 
     return partitioner, programs
@@ -584,6 +591,7 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
         config=ExecutorchBackendConfig(
             extract_delegate_segments=True,
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+            do_quant_fusion_and_const_prop=True,
         ),
     )
diff --git a/examples/models/parakeet/quantize.py b/examples/models/parakeet/quantize.py
index 3e540d84834..5d602d7a3e4 100644
--- a/examples/models/parakeet/quantize.py
+++ b/examples/models/parakeet/quantize.py
@@ -89,6 +89,7 @@ def quantize_model_(  # noqa: C901
         config = Int8DynamicActivationIntxWeightConfig(
             weight_dtype=torch.int4,
             weight_granularity=granularity,
+            intx_choose_qparams_algorithm="hqq_scale_only",
         )
     elif qlinear_config == "8da8w":
         config = Int8DynamicActivationIntxWeightConfig(
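Note for reviewers: setting the per-program dict aside, the partitioner change above boils down to the ordering sketched below. This is a minimal self-contained sketch, not the export script itself; `TinyLinear` and its example input are made-up stand-ins for one of the exported Parakeet programs.

```python
import torch

from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackDynamicallyQuantizedPartitioner,
    XnnpackPartitioner,
)
from executorch.exir import to_edge_transform_and_lower


# Toy module standing in for one of the exported Parakeet programs.
class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)


ep = torch.export.export(TinyLinear(), (torch.randn(1, 16),))

# Order matters: XnnpackDynamicallyQuantizedPartitioner claims the dynamically
# quantized (8da4w) linear subgraphs first; XnnpackPartitioner then delegates
# the remaining ops to XNNPACK.
edge = to_edge_transform_and_lower(
    ep,
    partitioner=[
        XnnpackDynamicallyQuantizedPartitioner(),
        XnnpackPartitioner(),
    ],
)
```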
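Similarly, a sketch of what the `--qlinear 8da4w --qlinear_group_size 32` flags resolve to in quantize.py, assuming a torchao version that exposes `intx_choose_qparams_algorithm`; the toy model and the concrete `PerGroup(32)` granularity are illustrative assumptions (the script derives `granularity` from the group-size flag).

```python
import torch
import torch.nn as nn

from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    quantize_,
)

# 8-bit dynamic activations, 4-bit grouped weights (group size 32), with the
# HQQ scale-only qparams search this diff enables for the 8da4w path.
config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
    intx_choose_qparams_algorithm="hqq_scale_only",
)

model = nn.Sequential(nn.Linear(64, 64))  # toy model for illustration
quantize_(model, config)  # rewrites Linear weights in place with quantized tensors
```

The HQQ scale-only algorithm searches for weight scales that minimize reconstruction error instead of using plain min/max ranges, which typically recovers some of the accuracy lost to 4-bit grouped quantization at a modest export-time cost.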