pytorch · jirioc · Feb 4, 2026
@@ -72,7 +72,9 @@ def verify_target(self, target: str):
                 f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`."
             )
 
-    def convert(self, tflite_model: bytes, target: str) -> bytes:
+    def convert(
+        self, tflite_model: bytes, target: str, fetch_constants_to_sram: bool
+    ) -> bytes:
         # Neutron converter crashes if we provide invalid target -> verify.
         self.verify_target(target)
 
@@ -82,6 +84,7 @@ def convert(self, tflite_model: bytes, target: str) -> bytes:
         cctx.compilationOpts.excludeGraphPasses = (
             "HoistSliceAboveTranspose,MergeTranspose"
         )
+        cctx.compilationOpts.fetchConstantsToSRAM = fetch_constants_to_sram
 
         # Try to use multiprocessing for isolation, but fall back to direct execution
         # if the environment doesn't support it (e.g., in sandcastle/build environments)

@@ -46,6 +46,7 @@ def __init__(self):
         self.operators_not_to_delegate: List[str] = []
         self.neutron_converter_flavor = None
         self.use_neutron_for_format_conversion = True
+        self.fetch_constants_to_sram = False
 
     def _replace_colons(self, operator: str) -> str:
         """
@@ -60,6 +61,7 @@ def neutron_compile_spec(
         extra_flags: Optional[str] = None,
         operators_not_to_delegate: Optional[List[str]] = None,
         use_neutron_for_format_conversion: bool = True,
+        fetch_constants_to_sram: bool = False,
     ):
         """
         Generate compile spec for Neutron NPU
@@ -73,6 +75,8 @@ def neutron_compile_spec(
             use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
                                                 ensure that the IO matches the executorch partition, which will be
                                                 delegated to Neutron.
+            fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights
+                                     from FLASH to SRAM. This should be used when the whole model does not fit into SRAM.
         """
 
         self.neutron_converter_flavor = neutron_converter_flavor
@@ -94,6 +98,8 @@ def neutron_compile_spec(
 
         self.use_neutron_for_format_conversion = use_neutron_for_format_conversion
 
+        self.fetch_constants_to_sram = fetch_constants_to_sram
+
         return self
 
     def build(self):
@@ -116,6 +122,10 @@ def build(self):
                     "use_neutron_for_format_conversion",
                     f"{self.use_neutron_for_format_conversion}".encode(),
                 ),
+                CompileSpec(
+                    "fetch_constants_to_sram",
+                    f"{self.fetch_constants_to_sram}".encode(),
+                ),
             ]
 
         return self.compile_spec
@@ -128,6 +138,7 @@ def generate_neutron_compile_spec(
     extra_flags: Optional[str] = None,
     operators_not_to_delegate: Optional[List[str]] = None,
     use_neutron_for_format_conversion: bool = True,
+    fetch_constants_to_sram: bool = False,
 ) -> List[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
@@ -137,6 +148,7 @@ def generate_neutron_compile_spec(
             extra_flags=extra_flags,
             operators_not_to_delegate=operators_not_to_delegate,
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
+            fetch_constants_to_sram=fetch_constants_to_sram,
         )
         .build()
     )
@@ -160,6 +172,7 @@ def preprocess(  # noqa C901
         target = ""
         neutron_converter_flavor = ""
         use_neutron_for_format_conversion = None
+        fetch_constants_to_sram = False
         for spec in compile_spec:
             if spec.key == "output_format":
                 output_format = spec.value.decode()
@@ -171,6 +184,8 @@ def preprocess(  # noqa C901
                 neutron_converter_flavor = spec.value.decode()
             if spec.key == "use_neutron_for_format_conversion":
                 use_neutron_for_format_conversion = spec.value.decode() == "True"
+            if spec.key == "fetch_constants_to_sram":
+                fetch_constants_to_sram = spec.value.decode() == "True"
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -209,7 +224,7 @@ def preprocess(  # noqa C901
             )
 
             neutron_model = NeutronConverterManager(neutron_converter_flavor).convert(
-                tflite_model, target
+                tflite_model, target, fetch_constants_to_sram
             )
 
             # Dump the tflite file if logging level is enabled

@@ -98,6 +98,7 @@ def to_quantized_edge_program(
     custom_delegation_options=CustomDelegationOptions(),  # noqa B008
     get_quantizer_fn=None,
     use_neutron_for_format_conversion=True,
+    fetch_constants_to_sram=False,
 ) -> EdgeProgramManager:
     _neutron_target_spec = NeutronTargetSpec(target, neutron_converter_flavor)
     if get_quantizer_fn is None:
@@ -125,6 +126,7 @@ def to_quantized_edge_program(
         operators_not_to_delegate=operators_not_to_delegate,
         neutron_converter_flavor=neutron_converter_flavor,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
+        fetch_constants_to_sram=fetch_constants_to_sram,
     )
     partitioners = [
         NeutronPartitioner(

@@ -1,4 +1,4 @@
-# Copyright 2024-2025 NXP
+# Copyright 2024-2026 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -14,7 +14,8 @@
     NeutronConverterManager,
 )
 from executorch.backends.nxp.backend.node_format_inference import NodeFormatInference
-from executorch.backends.nxp.tests.models import Conv2dModule
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.models import Conv2dModule, LinearModule
 
 
 def test_conv2d_neutron_conversion__default_flavor():
@@ -31,7 +32,7 @@ def test_conv2d_neutron_conversion__default_flavor():
     )
 
     neutron_converter_manager = NeutronConverterManager()
-    neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700")
+    neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700", False)
 
     assert len(
         neutron_model
@@ -52,8 +53,30 @@ def test__conv2d_neutron_conversion__invalid_flavor():
     )
 
     with pytest.raises(RuntimeError) as excinfo:
-        _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700")
+        _ = NeutronConverterManager("bad_flavor").convert(
+            tflite_model, "imxrt700", False
+        )
 
     assert "Neutron Converter module with flavor 'bad_flavor' not found." in str(
         excinfo
     )
+
+
+def test_conv2d_neutron_conversion__prefetching(mocker):  # Checks that `fetch_constants_to_sram` changes the converted model.
+    model = LinearModule(True)  # NOTE(review): test name says conv2d, but a LinearModule is converted - confirm naming.
+    input_shape = (1, 1, 32, 32)
+
+    converter_spy = mocker.spy(NeutronConverterManager, "convert")  # Spy keeps the real `convert` and records its return value.
+    _ = to_quantized_edge_program(
+        model, input_shape, fetch_constants_to_sram=True  # First run: flag enabled.
+    ).exported_program()
+    neutron_model_prefetch = converter_spy.spy_return  # Return value of the most recent `convert` call.
+
+    _ = to_quantized_edge_program(
+        model, input_shape, fetch_constants_to_sram=False  # Second run: same model and shape, flag disabled.
+    ).exported_program()
+    neutron_model_regular = converter_spy.spy_return  # Overwritten by the second conversion's result.
+
+    assert len(neutron_model_prefetch) != len(  # Only the sizes are compared, not the contents.
+        neutron_model_regular
+    ), "The weight prefetching flag does not make a difference!"
@@ -214,6 +214,13 @@ def get_model_and_inputs_from_name(model_name: str):
         help="The model (including the Neutron backend) will use the channels last dim order, which can result in faster "
         "inference. The inputs must also be provided in the channels last dim order.",
     )
+    parser.add_argument(
+        "--fetch_constants_to_sram",
+        required=False,
+        default=False,
+        action="store_true",
+        help="This feature allows running models which do not fit into SRAM by offloading them to an external memory.",
+    )
 
     args = parser.parse_args()
 
@@ -291,6 +298,7 @@ def get_model_and_inputs_from_name(model_name: str):
         args.target,
         operators_not_to_delegate=args.operators_not_to_delegate,
         neutron_converter_flavor=args.neutron_converter_flavor,
+        fetch_constants_to_sram=args.fetch_constants_to_sram,
     )
     partitioners = (
         [NeutronPartitioner(compile_spec, neutron_target_spec)] if args.delegate else []