From f370da062a8aa2c32fad309c43c566fb17d39bea Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 13:31:16 +0800
Subject: [PATCH 01/25] add counter classification pass

---
 include/TaskflowDialect/TaskflowOps.td        | 18 ++--
 include/TaskflowDialect/TaskflowPasses.h      |  1 +
 include/TaskflowDialect/TaskflowPasses.td     | 14 +++
 lib/TaskflowDialect/Transforms/CMakeLists.txt |  1 +
 .../Transforms/ClassifyCountersPass.cpp       | 92 +++++++++++++++++++
 .../ConstructHyperblockFromTaskPass.cpp       |  6 +-
 6 files changed, 122 insertions(+), 10 deletions(-)
 create mode 100644 lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp

diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index 2e6159af..fd8ebe87 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -151,24 +151,25 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     Represents a loop counter that generates iteration indices.
     The hardware counter produces a predicated index value.

-    Counter behavior:
-    - Top-level counter: increments unconditionally each cycle.
-    - Nested counter: increments only when the parent counter is valid.
+    Counter classification:
+    - "root": Top-level counter with no parent (drives entire loop nest)
+    - "relay": Intermediate counter with both parent and child counters
+    - "leaf": Innermost counter with no child counters (maps to CGRA tile array)

     Example:
-    // Top-level counter
+    // Root counter
     %i = taskflow.counter {
       lower_bound = 0 : index,
       upper_bound = 16 : index,
       step = 1 : index,
-      counter_name = "i"
+      counter_type = "root"
     } : index

-    // Nested counter
+    // Leaf counter
     %j = taskflow.counter parent(%i) {
       lower_bound = 0 : index,
       upper_bound = 8 : index,
       step = 1 : index,
-      counter_name = "j"
+      counter_type = "leaf"
     } : index
   }];

@@ -176,7 +177,8 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     Optional<AnyType>:$parent_index,
     IndexAttr:$lower_bound,
     IndexAttr:$upper_bound,
-    IndexAttr:$step
+    IndexAttr:$step,
+    OptionalAttr<StrAttr>:$counter_type
   );

   let results = (outs AnyType:$counter_index);
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 50f28d0e..c0007ce1 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -17,6 +17,7 @@ namespace taskflow {
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createCanonicalizeTaskPass();
+std::unique_ptr<mlir::Pass> createClassifyCountersPass();

 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 4728f138..4fc2137f 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -29,4 +29,18 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{
   }];
   let constructor = "taskflow::createCanonicalizeTaskPass()";
 }
+
+def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
+  let summary = "Classifies counters as root/relay/leaf";
+  let description = [{
+    Analyzes the counter hierarchy within taskflow.task operations and
+    classifies each counter:
+    - root: Top-level counter with no parent
+    - relay: Intermediate counter with both parent and child counters
+    - leaf: Innermost counter with no child counters
+
+    Leaf counters are mapped to CGRA tile arrays.
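+
+    For illustration (using the taskflow.counter syntax shown above; bounds
+    elided), a triply nested loop nest would be tagged as:
+      %i = taskflow.counter {..., counter_type = "root"} : index
+      %j = taskflow.counter parent(%i) {..., counter_type = "relay"} : index
+      %k = taskflow.counter parent(%j) {..., counter_type = "leaf"} : index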
+  }];
+  let constructor = "taskflow::createClassifyCountersPass()";
+}
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index ab118c89..e44401d8 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_mlir_library(MLIRTaskflowTransforms
   ConstructHyperblockFromTaskPass.cpp
   CanonicalizeTaskPass.cpp
+  ClassifyCountersPass.cpp

   DEPENDS
   MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
new file mode 100644
index 00000000..354ee7d7
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
@@ -0,0 +1,92 @@
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/TypeID.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+void classifyCountersInTask(TaskflowTaskOp task_op) {
+  // Collects all counters in the task.
+  SmallVector<TaskflowCounterOp> counters;
+  task_op.walk(
+      [&](TaskflowCounterOp counter_op) { counters.push_back(counter_op); });
+
+  if (counters.empty()) {
+    return;
+  }
+
+  // Builds parent-child relationships.
+  // Maps from counter results to counter ops.
+  DenseMap<Value, TaskflowCounterOp> value_to_counter;
+  for (TaskflowCounterOp counter_op : counters) {
+    value_to_counter[counter_op.getCounterIndex()] = counter_op;
+  }
+
+  // Finds which counters have children.
+  DenseSet<TaskflowCounterOp> counters_with_children;
+  for (TaskflowCounterOp counter_op : counters) {
+    if (auto parent_idx = counter_op.getParentIndex()) {
+      if (auto parent_counter = value_to_counter.lookup(parent_idx)) {
+        counters_with_children.insert(parent_counter);
+      }
+    }
+  }
+
+  // Classifies each counter.
+  OpBuilder builder(task_op.getContext());
+  for (TaskflowCounterOp counter_op : counters) {
+    bool has_parent = (counter_op.getParentIndex() != nullptr);
+    bool has_child = counters_with_children.contains(counter_op);
+    StringRef counter_type;
+    if (!has_parent && !has_child) {
+      // Single loop: treat as leaf counter (can be mapped to the CGRA tile
+      // array).
+      counter_type = "leaf";
+    } else if (!has_parent && has_child) {
+      // Root counter: top-level loop with nested loops.
+      counter_type = "root";
+    } else if (has_parent && has_child) {
+      // Relay counter: nested loop with further nested loops.
+      counter_type = "relay";
+    } else {
+      // Leaf counter: innermost loop.
+      counter_type = "leaf";
+    }
+
+    // Sets the counter type attribute.
+    counter_op.setCounterTypeAttr(builder.getStringAttr(counter_type));
+  }
+}
+
+struct ClassifyCountersPass
+    : public PassWrapper<ClassifyCountersPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ClassifyCountersPass)
+
+  StringRef getArgument() const override { return "classify-counters"; }
+  StringRef getDescription() const override {
+    return "Classify taskflow counters as root/relay/leaf.";
+  }
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    module.walk(
+        [&](TaskflowTaskOp task_op) { classifyCountersInTask(task_op); });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::taskflow::createClassifyCountersPass() {
+  return std::make_unique<ClassifyCountersPass>();
+}
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 763e6153..5680acf7 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -134,7 +134,8 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
         loc, builder.getIndexType(), parent_counter,
         builder.getIndexAttr(loop_info->lower_bound),
         builder.getIndexAttr(loop_info->upper_bound),
-        builder.getIndexAttr(loop_info->step));
+        builder.getIndexAttr(loop_info->step),
+        /*counter_type=*/nullptr);
     counter_index = counter_op.getCounterIndex();
   } else {
     // Top-level counter.
@@ -142,7 +143,8 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
         loc, builder.getIndexType(), /*parent_index=*/nullptr,
         builder.getIndexAttr(loop_info->lower_bound),
         builder.getIndexAttr(loop_info->upper_bound),
-        builder.getIndexAttr(loop_info->step));
+        builder.getIndexAttr(loop_info->step),
+        /*counter_type=*/nullptr);
     counter_index = counter_op.getCounterIndex();
   }

From fe987b8383f87025726cb668287b88b0205dc937 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 15:45:11 +0800
Subject: [PATCH 02/25] change the definition of taskflow.hyperblock.yield

---
 include/Conversion/ConversionPasses.h         |   2 +-
 include/Conversion/ConversionPasses.td        |  13 +
 include/NeuraDialect/NeuraOps.td              |  41 +-
 include/TaskflowDialect/TaskflowOps.td        |  10 +-
 lib/Conversion/CMakeLists.txt                 |   2 +
 lib/Conversion/TaskflowToNeura/CMakeLists.txt |  19 +
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 169 +++++++
 .../Transforms/IterMergePatternPass.cpp       | 445 ++++++++++--------
 .../Transforms/WrapLoopInKernelPass.cpp       |   4 +-
 .../kernel_with_yield/kernel_with_yield.mlir  |   5 +-
 10 files changed, 497 insertions(+), 213 deletions(-)
 create mode 100644 lib/Conversion/TaskflowToNeura/CMakeLists.txt
 create mode 100644 lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp

diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 0baf43f8..14e27a03 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -22,7 +22,7 @@ std::unique_ptr<mlir::Pass> createLowerAffineToNeuraPass();

 // TaskFlow Conversion Passes.
 std::unique_ptr<mlir::Pass> createConvertAffineToTaskflowPass();
-
+std::unique_ptr<mlir::Pass> createConvertTaskflowToNeuraPass();
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index a341d9fe..e2d727d2 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -74,4 +74,17 @@ def ConvertAffineToTaskflow : Pass<"convert-affine-to-taskflow", "ModuleOp">{
   ];
 }

+def ConvertTaskflowToNeura : Pass<"convert-taskflow-to-neura", "ModuleOp">{
+  let summary = "Convert taskflow.hyperblock to neura.kernel";
+  let description = [{
+    Converts taskflow.hyperblock operations with leaf counters into neura.kernel
+    operations suitable for CGRA tile array mapping.
+  }];
+  let constructor = "mlir::createConvertTaskflowToNeuraPass()";
+  let dependentDialects = [
+    "mlir::taskflow::TaskflowDialect",
+    "mlir::neura::NeuraDialect"
+  ];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 55bc155d..7971d6c6 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -10,7 +10,11 @@ include "mlir/IR/OpBase.td"
 // Defines neura kernel related operations.
 // ----------------------------------------------------
-def Neura_KernelOp : Op<Neura_Dialect, "kernel", [SingleBlockImplicitTerminator<"YieldOp">]> {
+def Neura_KernelOp : Op<Neura_Dialect, "kernel", [AttrSizedOperandSegments]> {
   let summary = "Marks a region for CGRA execution.";
   let description = [{
     Defines a computation region that should be offloaded to CGRA.
@@ -41,6 +45,7 @@ def Neura_KernelOp : Op
   let arguments = (ins
     Variadic<AnyType>:$inputs,           // Input operands from surrounding context.
+    Variadic<AnyType>:$iter_args_init,   // Initial values for loop carried variables.
     OptionalAttr<I32Attr>:$cgra_id,      // Target CGRA ID (for multi-CGRA systems).
     OptionalAttr<StrAttr>:$kernel_name,  // Name of the kernel (for identification).
     OptionalAttr<StrAttr>:$accelerator   // Target accelerator name.
@@ -52,6 +57,7 @@ def Neura_KernelOp : Op
   let results = (outs Variadic<AnyType>:$outputs);
   let regions = (region SizedRegion<1>:$body);

-def Neura_YieldOp : Op<Neura_Dialect, "yield", [Terminator, ReturnLike, Pure]> {
+def Neura_YieldOp : Op<Neura_Dialect, "yield", [Terminator, ReturnLike, Pure, AttrSizedOperandSegments]> {
   let summary = "Yield values from a neura.kernel or neura.fused_op region.";
   let description = [{
     Returns values from a neura.kernel or neura.fused_op region to the parent operation.
@@ -72,13 +78,17 @@ def Neura_YieldOp : Op {
     } : f32
   }];

-  let arguments = (ins Variadic<AnyType>:$values);
+  let arguments = (ins
+    Variadic<AnyType>:$iter_args_next,
+    Variadic<AnyType>:$results);

   let builders = [
-    OpBuilder<(ins), [{ build($_builder, $_state, ValueRange{}); }]>
+    OpBuilder<(ins), [{ build($_builder, $_state, ValueRange{}, ValueRange{}); }]>
   ];

-  let assemblyFormat = [{($values^ `:` type($values))? attr-dict}];
+  let assemblyFormat = [{
+    ($iter_args_next^ `:` type($iter_args_next))?
+    ($results^ `:` type($results))? attr-dict}];

   let hasVerifier = 1;
 }
@@ -770,6 +780,27 @@ def Neura_LoopControlOp : Op{
 //     " `(``parent_valid` `=` $parentValid `,` `start` `=` $start `,` `end` `=` $end `,` `step` `=` $step`)` attr-dict `:` type($parentValid) `,` type($start) `,` type($end) `,` type($step) `->` type($nextindex) `,` type($valid)";
 }

+// def Neura_CounterOp : Op<Neura_Dialect, "counter">{
+//   let summary = "Hardware loop counter for CGRA execution.";
+//   let description = [{
+//     Represents a hardware loop counter unit that generates loop indices.
+//     This maps directly to a counter FU on the CGRA.

+//     The counter produces:
+//     - current index: the current loop index value.
+
+//     Example:
+//       %current_idx = neura.counter () <{
+//         start_value = 0 : i64,
+//         end_value = 100 : i64,
+//         step_value = 1 : i64
+//       }> : -> !neura.data
+//   }];
+//   let arguments = (ins

+//   );
+// }
+
 // ----------------------------------------------------
 // Defines operations for steering-control based DFG execution.
 // ----------------------------------------------------
diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index fd8ebe87..094b31e2 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -235,6 +235,7 @@ def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [
   Terminator,
   Pure,
   ReturnLike,
+  AttrSizedOperandSegments,
   ParentOneOf<["TaskflowHyperblockOp"]>
 ]>{
   let summary = "Yield operation for Taskflow hyperblock";
@@ -243,15 +244,18 @@ def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [
     Terminates the hyperblock body.
   }];

-  let arguments = (ins Variadic<AnyType>:$outputs);
+  let arguments = (ins
+    Variadic<AnyType>:$iter_args_next,
+    Variadic<AnyType>:$results);

   let assemblyFormat = [{
-    (`outputs` `(` $outputs^ `:` type($outputs) `)`)?
+    (`iter_args_next` `(` $iter_args_next^ `:` type($iter_args_next) `)`)?
+    (`results` `(` $results^ `:` type($results) `)`)?
     attr-dict
   }];

   let builders = [
-    OpBuilder<(ins), [{build($_builder, $_state, ValueRange{});}]>
+    OpBuilder<(ins), [{build($_builder, $_state, ValueRange{}, ValueRange{});}]>
   ];
 }

diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index cf66d518..690dae25 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
 add_subdirectory(BuiltinToNeura)
 add_subdirectory(AffineToTaskflow)
+add_subdirectory(TaskflowToNeura)

 add_library(MLIRConversion INTERFACE)

@@ -23,5 +24,6 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRNeuraMemRefToNeuraPass
   MLIRNeuraBuiltinToNeuraPass
   MLIRAffineToTaskflowPass
+  MLIRTaskflowToNeuraPass
   ${dialect_libs}
 )
\ No newline at end of file
diff --git a/lib/Conversion/TaskflowToNeura/CMakeLists.txt b/lib/Conversion/TaskflowToNeura/CMakeLists.txt
new file mode 100644
index 00000000..7db3d92b
--- /dev/null
+++ b/lib/Conversion/TaskflowToNeura/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+add_mlir_conversion_library(MLIRTaskflowToNeuraPass
+  TaskflowToNeuraPass.cpp
+
+  DEPENDS
+  MLIRConversionIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRFuncDialect
+  MLIRLLVMDialect
+  MLIRTaskflow
+  MLIRIR
+  MLIRPass
+  MLIRTransforms
+  MLIRNeura
+  MLIRSupport
+)
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
new file mode 100644
index 00000000..460a5a15
--- /dev/null
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -0,0 +1,169 @@
+#include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
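+// Pipeline note (illustrative, not enforced by this pass): counters are
+// expected to be classified before this conversion runs, e.g.
+//   mlir-neura-opt --classify-counters --convert-taskflow-to-neura input.mlir
+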
+namespace {
+struct HyperblockToKernelPattern
+    : public OpRewritePattern<TaskflowHyperblockOp> {
+  using OpRewritePattern<TaskflowHyperblockOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowHyperblockOp hyperblock_op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = hyperblock_op.getLoc();
+
+    // Find the parent task to get access to task's block arguments.
+    auto taskOp = hyperblock_op->getParentOfType<TaskflowTaskOp>();
+    if (!taskOp)
+      return failure();
+
+    // Collect live-in values: values used in hyperblock but defined outside.
+    // These are the task's block arguments that the hyperblock body uses.
+    llvm::DenseSet<Value> liveInSet;
+    SmallVector<Value> liveInValues;
+
+    Block &hbBlock = hyperblock_op.getBody().front();
+    Block &taskBlock = taskOp.getBody().front();
+
+    // Walk hyperblock body to find uses of task block arguments.
+    hyperblock_op.walk([&](Operation *op) {
+      for (Value operand : op->getOperands()) {
+        // Check if operand is a task block argument.
+        if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
+          if (blockArg.getOwner() == &taskBlock) {
+            if (liveInSet.insert(operand).second) {
+              liveInValues.push_back(operand);
+            }
+          }
+        }
+      }
+    });
+
+    // Collect iter_args initial values.
+    SmallVector<Value> iterArgsInit(hyperblock_op.getIterArgs().begin(),
+                                    hyperblock_op.getIterArgs().end());
+
+    // Determine result types.
+    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes().begin(),
+                                  hyperblock_op.getResultTypes().end());
+
+    // Collect input types.
+    SmallVector<Type> inputTypes;
+    for (Value v : liveInValues) {
+      inputTypes.push_back(v.getType());
+    }
+
+    SmallVector<Type> iterArgsTypes;
+    for (Value v : iterArgsInit) {
+      iterArgsTypes.push_back(v.getType());
+    }
+
+    // Create neura.kernel.
+    auto kernelOp = rewriter.create<neura::KernelOp>(
+        loc, resultTypes, liveInValues, iterArgsInit,
+        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
+        /*kernel_name=*/rewriter.getStringAttr("kernel"),
+        /*accelerator=*/rewriter.getStringAttr("neura"));
+
+    // Create entry block for kernel.
+    Region &kernelRegion = kernelOp.getBody();
+    Block *entryBlock = rewriter.createBlock(&kernelRegion);
+
+    IRMapping mapping;
+
+    // Add block arguments for live-in values (inputs).
+    for (auto [idx, liveIn] : llvm::enumerate(liveInValues)) {
+      BlockArgument arg = entryBlock->addArgument(liveIn.getType(), loc);
+      mapping.map(liveIn, arg);
+    }
+
+    // Add block arguments for iter_args.
+    size_t numIndices = hyperblock_op.getIndices().size();
+    for (auto [idx, iterArg] : llvm::enumerate(iterArgsInit)) {
+      BlockArgument arg = entryBlock->addArgument(iterArg.getType(), loc);
+      // Map hyperblock's iter_arg block argument to kernel's block argument.
+      mapping.map(hbBlock.getArgument(numIndices + idx), arg);
+    }
+
+    // Map hyperblock's index arguments - these will be replaced by counters
+    // later. For now, create placeholder block arguments.
+    for (size_t i = 0; i < numIndices; ++i) {
+      BlockArgument hbArg = hbBlock.getArgument(i);
+      BlockArgument arg = entryBlock->addArgument(hbArg.getType(), loc);
+      mapping.map(hbArg, arg);
+    }
+
+    // Clone hyperblock body into kernel.
+    rewriter.setInsertionPointToEnd(entryBlock);
+    for (Operation &op : hbBlock.without_terminator()) {
+      rewriter.clone(op, mapping);
+    }
+
+    // Convert hyperblock.yield to neura.yield.
+    auto yieldOp = cast<TaskflowHyperblockYieldOp>(hbBlock.getTerminator());
+    SmallVector<Value> iterArgsNext;
+    SmallVector<Value> results;
+
+    for (Value out : yieldOp.getOutputs()) {
+      Value mapped = mapping.lookupOrDefault(out);
+      // For kernels with iter_args, output goes to both iter_args_next and
+      // results.
+      iterArgsNext.push_back(mapped);
+      results.push_back(mapped);
+    }
+
+    rewriter.create<neura::YieldOp>(loc, iterArgsNext, results);
+
+    // Replace hyperblock results with kernel results.
+    rewriter.replaceOp(hyperblock_op, kernelOp.getResults());
+
+    return success();
+  }
+};
+
+struct ConvertTaskflowToNeuraPass
+    : public PassWrapper<ConvertTaskflowToNeuraPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTaskflowToNeuraPass)
+
+  StringRef getArgument() const override { return "convert-taskflow-to-neura"; }
+  StringRef getDescription() const override {
+    return "Convert taskflow.hyperblock to neura.kernel";
+  }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<taskflow::TaskflowDialect>();
+    registry.insert<neura::NeuraDialect>();
+  }
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    MLIRContext *ctx = &getContext();
+
+    // Phase 1: Converts hyperblocks to kernels.
+    RewritePatternSet patterns(ctx);
+    patterns.add<HyperblockToKernelPattern>(ctx);
+
+    if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createConvertTaskflowToNeuraPass() {
+  return std::make_unique<ConvertTaskflowToNeuraPass>();
+}
\ No newline at end of file
diff --git a/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp b/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
index e9666bd6..6269ab0f 100644
--- a/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
+++ b/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
@@ -1,20 +1,20 @@
-#include "NeuraDialect/NeuraOps.h"
 #include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
 #include "NeuraDialect/Transforms/GraphMining/GraMi.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/IR/PatternMatch.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
-#include
-#include
+#include "llvm/Support/raw_ostream.h"
 #include
+#include
 #include
+#include

 using namespace mlir;

 #define GEN_PASS_DEF_INITPATTERN
 #include "NeuraDialect/NeuraPasses.h.inc"

-void printDFGStatistics(mlir::neura::DfgGraph* graph) {
+void printDFGStatistics(mlir::neura::DfgGraph *graph) {
   llvm::errs() << "DFG Statistics:\n";
   llvm::errs() << "---------------\n";
   llvm::errs() << "Number of nodes: " << graph->getNumNodes() << "\n";
   llvm::errs() << "Number of edges: " << graph->getNumEdges() << "\n\n";
-  
+
   std::map<std::string, int> op_type_counts;
-  for (auto* node : graph->getNodes()) {
+  for (auto *node : graph->getNodes()) {
     op_type_counts[node->getLabel()]++;
   }
-  
+
   llvm::errs() << "Operation types and their counts:\n";
-  for (const auto& pair : op_type_counts) {
+  for (const auto &pair : op_type_counts) {
     llvm::errs() << "  - " << pair.first << ": " << pair.second << "\n";
   }
   llvm::errs() << "\n";
 }

 // Finds a valid insertion point for the fused operation.
-Operation* findValidInsertionPoint(
-    const mlir::neura::PatternInstance& instance,
-    const llvm::DenseSet<Operation*>& pattern_ops,
-    const SmallVector<Value>& valid_inputs,
-    const SmallVector<Value>& valid_outputs) {
-  
-  if (instance.operations.empty()) return nullptr;
-  
-  Block* block = instance.operations.front()->getBlock();
-  if (!block) return nullptr;
-  
-  for (Operation* op : instance.operations) {
+Operation *
+findValidInsertionPoint(const mlir::neura::PatternInstance &instance,
+                        const llvm::DenseSet<Operation *> &pattern_ops,
+                        const SmallVector<Value> &valid_inputs,
+                        const SmallVector<Value> &valid_outputs) {
+
+  if (instance.operations.empty())
+    return nullptr;
+
+  Block *block = instance.operations.front()->getBlock();
+  if (!block)
+    return nullptr;
+
+  for (Operation *op : instance.operations) {
     if (op->getBlock() != block) {
-      return nullptr; 
+      return nullptr;
     }
   }
-  
-  Operation* earliest_point = nullptr;
-  
+
+  Operation *earliest_point = nullptr;
+
   for (Value input : valid_inputs) {
-    Operation* def_op = input.getDefiningOp();
+    Operation *def_op = input.getDefiningOp();
     if (!def_op) {
       continue;
     }
-    
+
     if (def_op->getBlock() != block) {
       continue;
     }
-    
+
     if (!earliest_point) {
       earliest_point = def_op;
     } else if (!def_op->isBeforeInBlock(earliest_point)) {
       earliest_point = def_op;
     }
   }
-  
+
   // Finds the latest position: before all external uses of outputs
-  Operation* latest_point = nullptr;
-  
-  for (Value output : valid_outputs ) {
-    for (OpOperand& use : output.getUses()) {
-      Operation* user = use.getOwner();
-      
+  Operation *latest_point = nullptr;
+
+  for (Value output : valid_outputs) {
+    for (OpOperand &use : output.getUses()) {
+      Operation *user = use.getOwner();
+
       if (pattern_ops.contains(user)) {
         continue;
       }
-      
+
       if (user->getBlock() != block) {
         continue;
       }
-      
+
       if (!latest_point) {
         latest_point = user;
       } else if (user->isBeforeInBlock(latest_point)) {
-        latest_point = user; 
+        latest_point = user;
       }
     }
   }
-  
+
   if (!earliest_point) {
     earliest_point = instance.operations.front();
-    for (Operation* op : instance.operations) {
+    for (Operation *op : instance.operations) {
       if (op->isBeforeInBlock(earliest_point)) {
         earliest_point = op;
       }
     }
   }
-  
+
   // [earliest_point, latest_point)
   if (latest_point) {
-    if (!earliest_point->isBeforeInBlock(latest_point) || earliest_point == latest_point) {
+    if (!earliest_point->isBeforeInBlock(latest_point) ||
+        earliest_point == latest_point) {
       return nullptr;
     }
   }
-  
+
   // Returns the valid insertion point (inserts after earliest_point)
   return earliest_point;
 }

-bool rewritePatternInstance(OpBuilder& builder, const mlir::neura::PatternInstance& instance, const mlir::neura::FrequentSubgraph& pattern) {
-  if (instance.operations.empty()) return false;
-  
-  for (Operation* op : instance.operations) {
+bool rewritePatternInstance(OpBuilder &builder,
+                            const mlir::neura::PatternInstance &instance,
+                            const mlir::neura::FrequentSubgraph &pattern) {
+  if (instance.operations.empty())
+    return false;
+
+  for (Operation *op : instance.operations) {
     if (!op || !op->getBlock()) {
       return false;
     }
   }
-  
-  llvm::DenseSet<Operation*> pattern_ops(instance.operations.begin(), instance.operations.end());
-  
+
+  llvm::DenseSet<Operation *> pattern_ops(instance.operations.begin(),
+                                          instance.operations.end());
+
   // First, collects inputs and outputs to determine valid insertion point
   llvm::SetVector<Value> input_set_for_check;
-  for (Operation* op : instance.operations) {
+  for (Operation *op : instance.operations) {
     for (Value operand : op->getOperands()) {
-      Operation* def_op = operand.getDefiningOp();
-      if (def_op && def_op->getName().getStringRef().str() == "neura.fused_op" && pattern_ops.contains(def_op)) {
+      Operation *def_op = operand.getDefiningOp();
+      if (def_op &&
+          def_op->getName().getStringRef().str() == "neura.fused_op" &&
+          pattern_ops.contains(def_op)) {
         continue;
       }
       if (!def_op || !pattern_ops.contains(def_op)) {
         input_set_for_check.insert(operand);
       }
     }
-    
-    if (op->getName().getStringRef().str() == "neura.fused_op" && op->getNumRegions() > 0) {
-      Region& region = op->getRegion(0);
+
+    if (op->getName().getStringRef().str() == "neura.fused_op" &&
+        op->getNumRegions() > 0) {
+      Region &region = op->getRegion(0);
       if (!region.empty()) {
-        Block& block = region.front();
-        llvm::DenseSet<Operation*> nested_pattern_ops;
-        
-        for (Operation& body_op : block.getOperations()) {
+        Block &block = region.front();
+        llvm::DenseSet<Operation *> nested_pattern_ops;
+
+        for (Operation &body_op : block.getOperations()) {
           if (body_op.getName().getStringRef().str() != "neura.yield") {
             nested_pattern_ops.insert(&body_op);
             for (Value operand : body_op.getOperands()) {
               if (mlir::isa<BlockArgument>(operand)) {
                 continue;
               }
-              
-              Operation* def_op = operand.getDefiningOp();
-              if (def_op && !nested_pattern_ops.contains(def_op) && !pattern_ops.contains(def_op)) {
+
+              Operation *def_op = operand.getDefiningOp();
+              if (def_op && !nested_pattern_ops.contains(def_op) &&
+                  !pattern_ops.contains(def_op)) {
                 input_set_for_check.insert(operand);
               } else if (!def_op) {
-                assert(false && "Value without defining op should not happen normally");
+                assert(false &&
+                       "Value without defining op should not happen normally");
               }
             }
           }
       }
     }
   }
   SmallVector<Value> valid_inputs = input_set_for_check.takeVector();
-  
+
   llvm::SetVector<Value> output_set_for_check;
-  for (Operation* op : instance.operations) {
+  for (Operation *op : instance.operations) {
     for (Value result : op->getResults()) {
       bool has_external_use = false;
-      for (OpOperand& use : result.getUses()) {
-        Operation* user = use.getOwner();
+      for (OpOperand &use : result.getUses()) {
+        Operation *user = use.getOwner();
         if (!pattern_ops.contains(user)) {
           has_external_use = true;
           break;
         }
       }
-      
+
       if (has_external_use) {
         output_set_for_check.insert(result);
       }
     }
   }
   SmallVector<Value> valid_outputs = output_set_for_check.takeVector();
-  
+
   // Finds a valid insertion point that avoids dominance issues
-  Operation* insertion_point = findValidInsertionPoint(instance, pattern_ops, valid_inputs, valid_outputs);
+  Operation *insertion_point = findValidInsertionPoint(
+      instance, pattern_ops, valid_inputs, valid_outputs);
   if (!insertion_point) {
     return false;
   }
-  
+
   builder.setInsertionPointAfter(insertion_point);
-  
+
   SmallVector<Type> output_types;
   for (Value output : valid_outputs) {
     output_types.push_back(output.getType());
   }

   auto pattern_op = builder.create<neura::FusedOp>(
-      insertion_point->getLoc(),
-      output_types,
-      valid_inputs,
+      insertion_point->getLoc(), output_types, valid_inputs,
       builder.getI64IntegerAttr(pattern.getId()),
       builder.getStringAttr(pattern.getPattern()),
-      builder.getI64IntegerAttr(pattern.getFrequency())
-  );
+      builder.getI64IntegerAttr(pattern.getFrequency()));

-  Region& body_region = pattern_op.getBody();
-  Block* body_block = new Block();
+  Region &body_region = pattern_op.getBody();
+  Block *body_block = new Block();
   body_region.push_back(body_block);
-  
+
   for (Value input : valid_inputs) {
     body_block->addArgument(input.getType(), input.getLoc());
   }
-  
+
   builder.setInsertionPointToStart(body_block);
   IRMapping mapping;
-  
+
   for (size_t i = 0; i < valid_inputs.size(); ++i) {
     mapping.map(valid_inputs[i], body_block->getArgument(i));
   }
-  
+
   llvm::DenseMap<Value, Value> original_to_cloned;
-  
-  Operation* cloned_op = nullptr;
-  for (Operation* op : instance.operations) {
+  Operation *cloned_op = nullptr;
+
+  for (Operation *op : instance.operations) {
     if (op->getName().getStringRef().str() == "neura.fused_op") {
       if (op->getNumRegions() > 0) {
-        Region& region = op->getRegion(0);
+        Region &region = op->getRegion(0);
         if (!region.empty()) {
-          Block& block = region.front();
-          
-          llvm::DenseSet<Operation*> nested_pattern_body_ops;
+          Block &block = region.front();
+
+          llvm::DenseSet<Operation *> nested_pattern_body_ops;
           llvm::SetVector<Value> nested_pattern_used_values;
-          
-          for (Operation& body_op : block.getOperations()) {
+
+          for (Operation &body_op : block.getOperations()) {
             if (body_op.getName().getStringRef().str() != "neura.yield") {
               nested_pattern_body_ops.insert(&body_op);
-              
+
               for (Value operand : body_op.getOperands()) {
                 if (mlir::isa<BlockArgument>(operand)) {
                   continue;
                 }
-                
-                Operation* def_op = operand.getDefiningOp();
+
+                Operation *def_op = operand.getDefiningOp();
                 if (def_op) {
                   if (nested_pattern_body_ops.contains(def_op)) {
                     continue;
                   }
                   nested_pattern_used_values.insert(operand);
                 }
               }
             }
           }
-          
-          for (size_t i = 0; i < op->getNumOperands() && i < block.getNumArguments(); ++i) {
+
+          for (size_t i = 0;
+               i < op->getNumOperands() && i < block.getNumArguments(); ++i) {
             Value pattern_input = op->getOperand(i);
             BlockArgument nested_arg = block.getArgument(i);
-            
+
             if (mapping.contains(pattern_input)) {
               mapping.map(nested_arg, mapping.lookup(pattern_input));
             } else {
               mapping.map(nested_arg, pattern_input);
             }
           }
-          
+
           for (Value used_val : nested_pattern_used_values) {
-            if (mlir::isa<BlockArgument>(used_val) || mapping.contains(used_val)) {
+            if (mlir::isa<BlockArgument>(used_val) ||
+                mapping.contains(used_val)) {
               continue;
             }
-            
-            Operation* def_op = used_val.getDefiningOp();
-            if (def_op && pattern_ops.contains(def_op) && original_to_cloned.count(used_val)) {
+
+            Operation *def_op = used_val.getDefiningOp();
+            if (def_op && pattern_ops.contains(def_op) &&
+                original_to_cloned.count(used_val)) {
               mapping.map(used_val, original_to_cloned[used_val]);
             } else {
               mapping.map(used_val, used_val);
             }
           }
-          
-          for (Operation& body_op : block.getOperations()) {
+
+          for (Operation &body_op : block.getOperations()) {
             if (body_op.getName().getStringRef().str() != "neura.yield") {
               cloned_op = builder.clone(body_op, mapping);
               for (size_t i = 0; i < body_op.getNumResults(); ++i) {
-                original_to_cloned[body_op.getResult(i)] = cloned_op->getResult(i);
+                original_to_cloned[body_op.getResult(i)] =
+                    cloned_op->getResult(i);
               }
             }
           }
-          
-          for (Operation& block_op : block.getOperations()) {
+
+          for (Operation &block_op : block.getOperations()) {
             if (block_op.getName().getStringRef().str() == "neura.yield") {
-              for (size_t i = 0; i < op->getNumResults() && i < block_op.getNumOperands(); ++i) {
+              for (size_t i = 0;
+                   i < op->getNumResults() && i < block_op.getNumOperands();
+                   ++i) {
                 Value yield_operand = block_op.getOperand(i);
                 if (original_to_cloned.count(yield_operand)) {
-                  original_to_cloned[op->getResult(i)] = original_to_cloned[yield_operand];
-                  mapping.map(op->getResult(i), original_to_cloned[yield_operand]);
+                  original_to_cloned[op->getResult(i)] =
+                      original_to_cloned[yield_operand];
+                  mapping.map(op->getResult(i),
+                              original_to_cloned[yield_operand]);
                 } else {
                   return false;
                 }
@@ -320,8 +338,9 @@ bool rewritePatternInstance(OpBuilder& builder, const mlir::neura::PatternInstan
       }
     } else {
       for (Value operand : op->getOperands()) {
-        Operation* def_op = operand.getDefiningOp();
-        if (def_op && def_op->getName().getStringRef().str() == "neura.fused_op" &&
+        Operation *def_op = operand.getDefiningOp();
+        if (def_op &&
+            def_op->getName().getStringRef().str() == "neura.fused_op" &&
             pattern_ops.contains(def_op) && original_to_cloned.count(operand)) {
           if (!mapping.contains(operand)) {
             mapping.map(operand, original_to_cloned[operand]);
           }
         }
       }
     }
   }
-  
+
   SmallVector<Value> yield_operands;
   for (size_t i = 0; i < valid_outputs.size(); ++i) {
     Value original_output = valid_outputs[i];
     if (original_to_cloned.count(original_output)) {
       yield_operands.push_back(original_to_cloned[original_output]);
     } else {
       return false;
     }
   }
-  
-  builder.create<neura::YieldOp>(insertion_point->getLoc(), yield_operands);
-  
+
+  builder.create<neura::YieldOp>(insertion_point->getLoc(), ValueRange{},
+                                 yield_operands);
+
   llvm::DenseSet<Value> replaced_outputs;
   for (size_t i = 0; i < valid_outputs.size(); ++i) {
     Value old_value = valid_outputs[i];
     Value new_value = pattern_op.getResult(i);
     old_value.replaceAllUsesWith(new_value);
     replaced_outputs.insert(old_value);
   }
-  
-  for (auto& pair : original_to_cloned) {
+
+  for (auto &pair : original_to_cloned) {
     Value old_value = pair.first;
     if (replaced_outputs.contains(old_value)) {
       continue;
     }
     Value new_value = pair.second;
     if (old_value != new_value) {
       old_value.replaceAllUsesWith(new_value);
     }
   }
-  
+
   original_to_cloned.clear();
-  
-  for (auto it = instance.operations.rbegin(); it != instance.operations.rend(); ++it) {
-    Operation* op = *it;
-    
+
+  for (auto it = instance.operations.rbegin(); it != instance.operations.rend();
+       ++it) {
+    Operation *op = *it;
+
     if (op->getName().getStringRef().str() == "neura.fused_op") {
-      Region& region = op->getRegion(0);
-      Block& block = region.front();
-      
-      for (Operation& body_op : block.getOperations()) {
+      Region &region = op->getRegion(0);
+      Block &block = region.front();
+
+      for (Operation &body_op : block.getOperations()) {
         for (Value result : body_op.getResults()) {
           if (!result.use_empty()) {
             result.dropAllUses();
           }
         }
       }
-      
+
       for (BlockArgument arg : block.getArguments()) {
         if (!arg.use_empty()) {
           arg.dropAllUses();
         }
       }
-      
+
       while (!block.empty()) {
-        Operation& body_op = block.back();
+        Operation &body_op = block.back();
         body_op.dropAllReferences();
         body_op.erase();
       }
     }
-    
+
     op->dropAllUses();
     op->erase();
   }
-  
+
   return true;
 }

-int rewritePatternsToRegions(mlir::neura::DfgGraph* dfg_graph, ModuleOp module_op, const std::vector& patterns_with_instances) {
+int rewritePatternsToRegions(
+    mlir::neura::DfgGraph *dfg_graph, ModuleOp module_op,
+    const std::vector
+        &patterns_with_instances) {
   int rewrite_count = 0;
   size_t total_critical = 0;
   size_t total_non_critical = 0;
-  MLIRContext* context = module_op.getContext();
+  MLIRContext *context = module_op.getContext();
   OpBuilder builder(context);
-  
-  for (const auto& pwsi : patterns_with_instances) {
-    if (pwsi.pattern.getNodes().size() < 2) continue;
+
+  for (const auto &pwsi : patterns_with_instances) {
+    if (pwsi.pattern.getNodes().size() < 2)
+      continue;
     total_critical += pwsi.critical_instances.size();
     total_non_critical += pwsi.non_critical_instances.size();
   }
-  
+
   size_t total_instances = total_critical + total_non_critical;
   if (total_instances == 0) {
     llvm::errs() << "  No valid instances to rewrite\n";
     return 0;
   }
-  
+
   std::set<std::string> attempted_patterns;
-  
+
   // Phase 1: Rewrites all critical path instances across all patterns
   llvm::errs() << "  Phase 1: Rewriting critical path instances...\n";
-  for (const auto& pwsi : patterns_with_instances) {
+  for (const auto &pwsi : patterns_with_instances) {
     if (pwsi.pattern.getNodes().size() < 2 || pwsi.critical_instances.empty()) {
       continue;
     }
-    
+
     attempted_patterns.insert(pwsi.pattern.getPattern());
-    
-    for (const auto& instance : pwsi.critical_instances) {
+
+    for (const auto &instance : pwsi.critical_instances) {
       rewritePatternInstance(builder, instance, pwsi.pattern);
     }
   }

   // Phase 2: Rewrites all non-critical path instances across all patterns
   llvm::errs() << "  Phase 2: Rewriting non-critical path instances...\n";
-  for (const auto& pwsi : patterns_with_instances) {
-    if (pwsi.pattern.getNodes().size() < 2 || pwsi.non_critical_instances.empty()) {
+  for (const auto &pwsi : patterns_with_instances) {
+    if (pwsi.pattern.getNodes().size() < 2 ||
+        pwsi.non_critical_instances.empty()) {
       continue;
     }
-    
+
     // Marks pattern as attempted before trying to fuse instances
     attempted_patterns.insert(pwsi.pattern.getPattern());
-    
-    for (const auto& instance : pwsi.non_critical_instances) {
+
+    for (const auto &instance : pwsi.non_critical_instances) {
       rewritePatternInstance(builder, instance, pwsi.pattern);
     }
   }

   // Marks all attempted patterns
-  for (const auto& pattern_str : attempted_patterns) {
+  for (const auto &pattern_str : attempted_patterns) {
     mlir::neura::GraMi::markPatternAsAttempted(pattern_str);
   }
-  
+
   return rewrite_count;
 }

@@ -478,78 +504,93 @@ struct IterMergePatternPass

   StringRef getArgument() const override { return "iter-merge-pattern"; }
   StringRef getDescription() const override {
-    return "Iteratively merge and identify common patterns in DFG using graph mining.";
+    return "Iteratively merge and identify common patterns in DFG using graph "
+           "mining.";
   }

   Option<int> min_support{
       *this, "min-support",
-      llvm::cl::desc("Minimum support threshold for pattern mining (default: 2)"),
+      llvm::cl::desc(
+          "Minimum support threshold for pattern mining (default: 2)"),
       llvm::cl::init(2)};

   Option<int> max_iter{
       *this, "max-iter",
-      llvm::cl::desc("Maximum number of iterations for pattern merging (default: 2)"),
+      llvm::cl::desc(
+          "Maximum number of iterations for pattern merging (default: 2)"),
       llvm::cl::init(2)};

   void runOnOperation() override {
-    
+
     ModuleOp module_op = getOperation();
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "IterMergePatternPass: Starting pattern mining\n";
-    llvm::errs() << "Minimum support threshold: " << min_support.getValue() << "\n";
+    llvm::errs() << "Minimum support threshold: " << min_support.getValue()
+                 << "\n";
     llvm::errs() << "========================================\n\n";
-    
+
     int iter = 0;
-    bool cleared_attempted = false;  // Tracks if it has cleared attempted marks once
+    bool cleared_attempted =
+        false; // Tracks if it has cleared attempted marks once
     while (iter < max_iter.getValue()) {
       llvm::errs() << "Iteration " << iter << "\n";
-      
-      // Re-collects critical path operations from all functions for this iteration
-      // Critical path may change after each iteration due to pattern fusion
-      llvm::DenseSet<Operation*> all_critical_ops;
+
+      // Re-collects critical path operations from all functions for this
+      // iteration Critical path may change after each iteration due to pattern
+      // fusion
+      llvm::DenseSet<Operation *> all_critical_ops;
       module_op.walk([&](func::FuncOp func) {
         auto critical_ops =
            mlir::neura::GraMi::collectCriticalPathOps(func);
-        for (Operation* op : critical_ops) {
+        for (Operation *op : critical_ops) {
           all_critical_ops.insert(op);
         }
       });
-      llvm::errs() << "  Collected " << all_critical_ops.size() << " critical path operations for iteration " << iter << "\n";
-      
+      llvm::errs() << "  Collected " << all_critical_ops.size()
+                   << " critical path operations for iteration " << iter
+                   << "\n";
+
       auto dfg_graph = mlir::neura::DfgExtractor::extractFromModule(module_op);
-      
+
       if (!dfg_graph) {
         llvm::errs() << "Error: Failed to extract DFG from module\n";
         signalPassFailure();
         return;
-      } 
-      
+      }
+
       printDFGStatistics(dfg_graph.get());

       mlir::neura::GraMi grami(dfg_graph.get(), min_support.getValue());
       grami.setCriticalPathOps(all_critical_ops);
-      std::vector patterns_with_instances = grami.mineFrequentSubgraphs();
-      
+      std::vector
+          patterns_with_instances = grami.mineFrequentSubgraphs();
+
       // If no patterns were fused and it hasn't cleared attempted marks yet,
-      // clears them and tries one more iteration (without incrementing iter count)
+      // clears them and tries one more iteration (without incrementing iter
+      // count)
       if (patterns_with_instances.empty() && !cleared_attempted) {
-        llvm::errs() << "  No patterns fused in this iteration. Clearing attempted marks and retrying...\n";
+        llvm::errs() << "  No patterns fused in this iteration. Clearing "
+                        "attempted marks and retrying...\n";
         mlir::neura::GraMi::clearAttemptedPatterns();
         cleared_attempted = true;
         // Retries this iteration with cleared marks (doesn't increment iter)
         continue;
       }

-      // If it cleared marks and still got 0, or if it has reached max iterations, stops
+      // If it cleared marks and still got 0, or if it has reached max
+      // iterations, stops
       if (patterns_with_instances.empty() && cleared_attempted) {
-        llvm::errs() << "  No patterns fused even after clearing attempted marks. Stopping.\n";
+        llvm::errs() << "  No patterns fused even after clearing attempted "
+                        "marks. Stopping.\n";
         break;
       }

-      int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op, patterns_with_instances);
-      llvm::errs() << "  - Rewrote " << rewrite_count << " pattern instances\n\n";
-      
+      int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op,
+                                                   patterns_with_instances);
+      llvm::errs() << "  - Rewrote " << rewrite_count
+                   << " pattern instances\n\n";
+
       iter++;
     }
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "IterMergePatternPass: Completed\n";
     llvm::errs() << "========================================\n\n";
@@ -571,43 +612,48 @@ struct InitPatternPass

   Option<int> min_support{
       *this, "min-support",
-      llvm::cl::desc("Minimum support threshold for pattern mining (default: 2)"),
+      llvm::cl::desc(
+          "Minimum support threshold for pattern mining (default: 2)"),
       llvm::cl::init(2)};

   void runOnOperation() override {
     ModuleOp module_op = getOperation();
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "InitPatternPass: Starting pattern mining\n";
-    llvm::errs() << "Minimum support threshold: " << min_support.getValue() << "\n";
+    llvm::errs() << "Minimum support threshold: " << min_support.getValue()
+                 << "\n";
     llvm::errs() << "========================================\n\n";
-    
+
     // Collects critical path operations from all functions
-    llvm::DenseSet<Operation*> all_critical_ops;
+    llvm::DenseSet<Operation *> all_critical_ops;
     module_op.walk([&](func::FuncOp func) {
       auto critical_ops = mlir::neura::GraMi::collectCriticalPathOps(func);
-      for (Operation* op : critical_ops) {
+      for (Operation *op : critical_ops) {
        all_critical_ops.insert(op);
      }
    });
-    llvm::errs() << "Collected " << all_critical_ops.size() << " critical path operations\n\n";
-    
+    llvm::errs() << "Collected " << all_critical_ops.size()
+                 << " critical path operations\n\n";
+
     auto dfg_graph = mlir::neura::DfgExtractor::extractFromModule(module_op);
-    
+
     if (!dfg_graph) {
       llvm::errs() << "Error: Failed to extract DFG from module\n";
       signalPassFailure();
       return;
-    } 
-    
+    }
+
     printDFGStatistics(dfg_graph.get());

     mlir::neura::GraMi grami(dfg_graph.get(), min_support.getValue());
     grami.setCriticalPathOps(all_critical_ops);
-    std::vector patterns_with_instances = grami.mineFrequentSubgraphs();
-    
-    int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op, patterns_with_instances);
+    std::vector
+        patterns_with_instances = grami.mineFrequentSubgraphs();
+
+    int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op,
+                                                 patterns_with_instances);
     llvm::errs() << " - Rewrote " << rewrite_count << " pattern instances\n\n";
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "InitPatternPass: Completed\n";
     llvm::errs() << "========================================\n\n";
@@ -625,4 +671,3 @@ std::unique_ptr createInitPatternPass() {
   return std::make_unique<InitPatternPass>();
 }
 } // namespace mlir::neura
-
diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
index 1a2c9391..ac664382 100644
--- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
+++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
@@ -72,10 +72,10 @@ static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
   if (has_outputs) {
     // If the loop has outputs, yield the loop results.
     SmallVector<Value> yield_operands(for_op.getResults());
-    builder.create<neura::YieldOp>(loc, yield_operands);
+    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
   } else {
     // If the loop has no outputs, create an empty yield.
-    builder.create<neura::YieldOp>(loc, ValueRange{});
+    builder.create<neura::YieldOp>(loc);
   }

   return success();
diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
index 458602bd..ad24eac4 100644
--- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
+++ b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
@@ -1,7 +1,8 @@
 // Wraps the innermost loop within neura.kernel operation.
 // RUN: mlir-neura-opt %s \
-// RUN: --wrap-loop-in-kernel \
-// RUN: | FileCheck %s
+// RUN:   --wrap-loop-in-kernel \
+// RUN:   -o %t-wrapped.mlir
+// RUN: FileCheck %s --input-file=%t-wrapped.mlir

 module attributes {} {
   func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {

From 5bb37772f8fd8c1ea049b52d46b55ec89b103239 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 19:01:38 +0800
Subject: [PATCH 03/25] change the definition of neura.kernel

---
 include/NeuraDialect/NeuraOps.td              |  38 ++---
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 155 ++++++++++--------
 .../Transforms/CanonicalizeTaskPass.cpp       |  15 +-
 .../ConstructHyperblockFromTaskPass.cpp       |   3 +-
 4 files changed, 121 insertions(+), 90 deletions(-)

diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 7971d6c6..71218450 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -53,11 +53,11 @@ def Neura_KernelOp : Op
   let results = (outs Variadic<AnyType>:$outputs);
-  let regions = (region SizedRegion<1>:$body);
+  let regions = (region AnyRegion:$body);

   let assemblyFormat = [{
-    (`ins` `(` $inputs^ `:` type($inputs) `)` )?
-    (`ins` `(` $iter_args_init^ `:` type($iter_args_init) `)` )?
+    (`inputs` `(` $inputs^ `:` type($inputs) `)` )?
+    (`iter_args_init` `(` $iter_args_init^ `:` type($iter_args_init) `)` )?
     attr-dict-with-keyword
     $body
     (`:` type($outputs)^)?
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
index 460a5a15..ea46d969 100644
--- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -18,6 +18,25 @@ using namespace mlir;
 using namespace mlir::taskflow;

 namespace {
+// Pattern to convert taskflow.hyperblock to neura.kernel.
+//
+// Hyperblock structure:
+//   %result = taskflow.hyperblock(%idx, %iter_init) {
+//   ^bb0(%idx_arg: index, %iter_arg: T):
+//     ... body ...
+//     taskflow.hyperblock.yield outputs(%next_iter : T)
+//   } : (index, T) -> T
+//
+// Kernel structure:
+//   %result = neura.kernel ins(%idx, %live_in...) iter_args(%iter_init) {
+//   ^bb0(%idx_arg: index, %live_in_args..., %iter_arg: T):
+//     ... body ...
+//     neura.yield iter_args(%next_iter) results(%next_iter)
+//   } -> T
+//
+// Block argument order must match:
+//   Hyperblock: [indices..., iter_args...]
+//   Kernel:     [inputs (indices + live_ins)..., iter_args...]
 struct HyperblockToKernelPattern
     : public OpRewritePattern<TaskflowHyperblockOp> {
   using OpRewritePattern<TaskflowHyperblockOp>::OpRewritePattern;

   LogicalResult matchAndRewrite(TaskflowHyperblockOp hyperblock_op,
                                 PatternRewriter &rewriter) const override {
     Location loc = hyperblock_op.getLoc();

-    // Find the parent task to get access to task's block arguments.
-    auto taskOp = hyperblock_op->getParentOfType<TaskflowTaskOp>();
-    if (!taskOp)
+    // Finds the parent task to access task's block arguments.
+    TaskflowTaskOp task_op = hyperblock_op->getParentOfType<TaskflowTaskOp>();
+    if (!task_op) {
       return failure();
+    }
+
+    Block &hb_block = hyperblock_op.getBody().front();
+    Block &task_block = task_op.getBody().front();

-    // Collect live-in values: values used in hyperblock but defined outside.
-    // These are the task's block arguments that the hyperblock body uses.
-    llvm::DenseSet<Value> liveInSet;
-    SmallVector<Value> liveInValues;
+    // Gets hyperblock operands.
+    SmallVector<Value> indices(hyperblock_op.getIndices());
+    SmallVector<Value> iter_args_init(hyperblock_op.getIterArgs());
+    size_t num_indices = indices.size();
+    size_t num_iter_args_init = iter_args_init.size();

-    Block &hbBlock = hyperblock_op.getBody().front();
-    Block &taskBlock = taskOp.getBody().front();
+    // Collects live-in values of the hyperblock: task block arguments used in
+    // the hyperblock body.
+    llvm::DenseSet<Value> live_in_set;
+    SmallVector<Value> live_in_values;

-    // Walk hyperblock body to find uses of task block arguments.
     hyperblock_op.walk([&](Operation *op) {
       for (Value operand : op->getOperands()) {
-        // Check if operand is a task block argument.
         if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
-          if (blockArg.getOwner() == &taskBlock) {
-            if (liveInSet.insert(operand).second) {
-              liveInValues.push_back(operand);
+          if (blockArg.getOwner() == &task_block) {
+            if (live_in_set.insert(operand).second) {
+              live_in_values.push_back(operand);
             }
           }
         }
+        assert(!operand.getDefiningOp() && "Unexpected non-block-arg operand");
       }
     });

-    // Collect iter_args initial values.
-    SmallVector<Value> iterArgsInit(hyperblock_op.getIterArgs().begin(),
-                                    hyperblock_op.getIterArgs().end());
+    // Builds the neura.kernel inputs: [indices..., live_ins...].
+    SmallVector<Value> kernel_inputs;
+    kernel_inputs.append(indices);
+    kernel_inputs.append(live_in_values);

-    // Determine result types.
-    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes().begin(),
-                                  hyperblock_op.getResultTypes().end());
+    // Result types from hyperblock.
+    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes());

-    // Collect input types.
-    SmallVector<Type> inputTypes;
-    for (Value v : liveInValues) {
-      inputTypes.push_back(v.getType());
-    }
-
-    SmallVector<Type> iterArgsTypes;
-    for (Value v : iterArgsInit) {
-      iterArgsTypes.push_back(v.getType());
-    }
-
-    // Create neura.kernel.
-    auto kernelOp = rewriter.create<neura::KernelOp>(
-        loc, resultTypes, liveInValues, iterArgsInit,
-        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
-        /*kernel_name=*/rewriter.getStringAttr("kernel"),
-        /*accelerator=*/rewriter.getStringAttr("neura"));
+    // Creates neura.kernel.
+    neura::KernelOp kernelOp = rewriter.create<neura::KernelOp>(
+        loc, resultTypes, kernel_inputs, iter_args_init,
+        /*Optional cgra_id*/ nullptr, /*Optional kernel_name*/ nullptr,
+        /*Optional accelerator*/ nullptr);

-    // Create entry block for kernel.
-    Region &kernelRegion = kernelOp.getBody();
-    Block *entryBlock = rewriter.createBlock(&kernelRegion);
+    // Creates the entry block for kernel.
+    Region &kernel_region = kernelOp.getBody();
+    Block *entry_block = rewriter.createBlock(&kernel_region);

     IRMapping mapping;

-    // Add block arguments for live-in values (inputs).
-    for (auto [idx, liveIn] : llvm::enumerate(liveInValues)) {
-      BlockArgument arg = entryBlock->addArgument(liveIn.getType(), loc);
-      mapping.map(liveIn, arg);
+    // Kernel block argument layout: [inputs..., iter_args...]
+    // Where inputs = [indices..., live_ins...]
+    //
+    // Hyperblock block argument layout: [indices..., iter_args...]
+
+    // 1. Adds block arguments for indices and map to hyperblock's index args.
+    for (size_t i = 0; i < num_indices; ++i) {
+      BlockArgument kernel_indices_arg =
+          entry_block->addArgument(indices[i].getType(), loc);
+      BlockArgument hb_arg = hb_block.getArgument(i);
+      mapping.map(hb_arg, kernel_indices_arg);
     }

-    // Add block arguments for iter_args.
-    size_t numIndices = hyperblock_op.getIndices().size();
-    for (auto [idx, iterArg] : llvm::enumerate(iterArgsInit)) {
-      BlockArgument arg = entryBlock->addArgument(iterArg.getType(), loc);
-      // Map hyperblock's iter_arg block argument to kernel's block argument.
-      mapping.map(hbBlock.getArgument(numIndices + idx), arg);
+    // 2. Adds block arguments for live-in values and map to task block args.
+    for (Value live_in : live_in_values) {
+      BlockArgument kernel_live_in_arg =
+          entry_block->addArgument(live_in.getType(), loc);
+      mapping.map(live_in, kernel_live_in_arg);
     }

-    // Map hyperblock's index arguments - these will be replaced by counters
-    // later. For now, create placeholder block arguments.
-    for (size_t i = 0; i < numIndices; ++i) {
-      BlockArgument hbArg = hbBlock.getArgument(i);
-      BlockArgument arg = entryBlock->addArgument(hbArg.getType(), loc);
-      mapping.map(hbArg, arg);
+    // 3. Adds block arguments for iter_args and map to hyperblock's iter_args.
+    for (size_t i = 0; i < num_iter_args_init; ++i) {
+      BlockArgument kernel_iter_arg =
+          entry_block->addArgument(iter_args_init[i].getType(), loc);
+      BlockArgument hb_arg = hb_block.getArgument(num_indices + i);
+      mapping.map(hb_arg, kernel_iter_arg);
     }

-    // Clone hyperblock body into kernel.
-    rewriter.setInsertionPointToEnd(entryBlock);
-    for (Operation &op : hbBlock.without_terminator()) {
+    // Clones hyperblock body into kernel.
+    rewriter.setInsertionPointToEnd(entry_block);
+    for (Operation &op : hb_block.without_terminator()) {
       rewriter.clone(op, mapping);
     }

-    // Convert hyperblock.yield to neura.yield.
-    auto yieldOp = cast<TaskflowHyperblockYieldOp>(hbBlock.getTerminator());
-    SmallVector<Value> iterArgsNext;
-    SmallVector<Value> results;
+    // Converts hyperblock.yield to neura.yield.
+    TaskflowHyperblockYieldOp hb_yield_op =
+        cast<TaskflowHyperblockYieldOp>(hb_block.getTerminator());
+
+    SmallVector<Value> iter_args_next;
+    SmallVector<Value> results;

-    for (Value out : yieldOp.getOutputs()) {
-      Value mapped = mapping.lookupOrDefault(out);
-      // For kernels with iter_args, output goes to both iter_args_next and
-      // results.
-      iterArgsNext.push_back(mapped);
-      results.push_back(mapped);
-    }
+    // Maps yield outputs.
+    for (Value out : hb_yield_op.getResults()) {
+      Value mapped = mapping.lookupOrDefault(out);
+      results.push_back(mapped);
+    }

-    rewriter.create<neura::YieldOp>(loc, iterArgsNext, results);
+    for (Value iter_arg : hb_yield_op.getIterArgsNext()) {
+      Value mapped = mapping.lookupOrDefault(iter_arg);
+      iter_args_next.push_back(mapped);
+    }
+
+    rewriter.create<neura::YieldOp>(loc, iter_args_next, results);

-    // Replace hyperblock results with kernel results.
+    // Replaces hyperblock with kernel.
     rewriter.replaceOp(hyperblock_op, kernelOp.getResults());

     return success();
diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
index 151226cf..4281fae2 100644
--- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
@@ -395,13 +395,18 @@ class AtomicTaskBuilder {

     if (auto yield =
             dyn_cast<TaskflowHyperblockYieldOp>(old_body->getTerminator())) {
-      SmallVector<Value> yield_ops;
-      for (Value v : yield.getOutputs()) {
-        yield_ops.push_back(mapping.lookupOrDefault(v));
+      SmallVector<Value> yield_results;
+      SmallVector<Value> yield_iter_args_next;
+      for (Value v : yield.getResults()) {
+        yield_results.push_back(mapping.lookupOrDefault(v));
       }
-      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc, yield_ops);
+      for (Value v : yield.getIterArgsNext()) {
+        yield_iter_args_next.push_back(mapping.lookupOrDefault(v));
+      }
+      hb_builder.create<TaskflowHyperblockYieldOp>(
+          this->loc, yield_iter_args_next, yield_results);
     } else {
-      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc, ValueRange{});
+      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc);
     }
   }

diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 5680acf7..690d3552 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -433,7 +433,8 @@ static TaskflowHyperblockOp createHyperblock(
         }

         // Creates hyperblock.yield with the mapped operands.
-        hyperblock_builder.create<TaskflowHyperblockYieldOp>(loc, yield_operands);
+        hyperblock_builder.create<TaskflowHyperblockYieldOp>(loc, yield_operands,
+                                                             yield_operands);
         has_terminator = true;
         continue;
       }

From db78bc76fd9bf2e4e616c7ef05b04cef9baf21d3 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 22:50:18 +0800
Subject: [PATCH 04/25] enable taskflow to neura conversion

---
 include/NeuraDialect/NeuraOps.td              |  26 ++--
 include/TaskflowDialect/TaskflowOps.td        |   5 +-
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 147 ++++++++++++++++--
 .../Transforms/ClassifyCountersPass.cpp       |   4 +
 .../ConstructHyperblockFromTaskPass.cpp       |   4 +-
 5 files changed, 164 insertions(+), 22 deletions(-)

diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 71218450..80006ce6 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -780,26 +780,26 @@ def Neura_LoopControlOp : Op{

-// def Neura_CounterOp : Op<Neura_Dialect, "counter">{
-//   let summary = "Hardware loop counter for CGRA execution.";
-//   let description = [{
-//     Represents a hardware loop counter unit that generates loop indices.
-//     This maps directly to a counter FU on the CGRA.
+def Neura_CounterOp : Op<Neura_Dialect, "counter">{
+  let summary = "Hardware loop counter for CGRA execution.";
+  let description = [{
+    Represents a hardware loop counter unit that generates loop indices.
+    This maps directly to a counter FU on the CGRA.

-//     The counter produces:
-//     - current index: the current loop index value.
+    The counter produces:
+    - current index: the current loop index value.
   Example:
-    %current_idx = neura.counter () <{
-      start_value = 0 : i64,
-      end_value = 100 : i64,
-      step_value = 1 : i64
-    }> : -> !neura.data
+    %idx = neura.counter {
+      lower_bound = 0 : index,
+      upper_bound = 32 : index,
+      step = 1 : index,
+      counter_type = "leaf"
+    } : index
   }];
 
   let arguments = (ins
-
+    IndexAttr:$lower_bound,
+    IndexAttr:$upper_bound,
+    IndexAttr:$step,
+    StrAttr:$counter_type,
+    I32Attr:$counter_id
   );
+
+  let results = (outs AnyType:$current_index);
+  let assemblyFormat = "attr-dict `:` type($current_index)";
 }
 
 // ----------------------------------------------------
diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index 094b31e2..a7ee4a6c 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -144,7 +144,7 @@ def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure, SameOperandsAndResultTy
 // Intra-Task Operations.
 //----------------------------------------------------------------------
 // Counter operation representing loop iteration control within a Taskflow task.
-def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
+def TaskflowCounterOp : TaskflowOpBase<"counter", []>{
 
   let summary = "Loop counter operation with hardware counter semantics";
   let description = [{
@@ -178,7 +178,8 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     IndexAttr:$lower_bound,
     IndexAttr:$upper_bound,
     IndexAttr:$step,
-    OptionalAttr<StrAttr>:$counter_type
+    OptionalAttr<StrAttr>:$counter_type,
+    OptionalAttr<I32Attr>:$counter_id
   );
 
   let results = (outs AnyType:$counter_index);
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
index ea46d969..fc34a545 100644
--- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -10,9 +10,12 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace mlir::taskflow;
@@ -56,7 +59,10 @@ struct HyperblockToKernelPattern
 
     // Gets hyperblock operands.
     SmallVector<Value> indices(hyperblock_op.getIndices());
+    DenseSet<Value> indices_set(indices.begin(), indices.end());
     SmallVector<Value> iter_args_init(hyperblock_op.getIterArgs());
+    DenseSet<Value> iter_args_init_set(iter_args_init.begin(),
+                                       iter_args_init.end());
     size_t num_indices = indices.size();
     size_t num_iter_args_init = iter_args_init.size();
 
@@ -69,12 +75,27 @@ struct HyperblockToKernelPattern
       for (Value operand : op->getOperands()) {
         if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
           if (blockArg.getOwner() == &task_block) {
+            if (iter_args_init_set.contains(operand) ||
+                indices_set.contains(operand)) {
+              // Skips iter args and indices.
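+              // (Indices arrive via counters and iter_args are threaded
+              // through the kernel's own argument list, so neither counts
+              // as a live-in.)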
+              continue;
+            }
             if (live_in_set.insert(operand).second) {
               live_in_values.push_back(operand);
             }
+          } else {
+            assert(blockArg.getOwner() == &hb_block &&
+                   "Unexpected block argument from other block");
           }
+        } else if (operand.getDefiningOp()) {
+          Operation *def_op = operand.getDefiningOp();
+          llvm::errs() << "[taskflow2neura] Operand from op: "
+                       << *(operand.getDefiningOp()) << "\n";
+          assert(((isa<TaskflowCounterOp>(def_op) &&
+                   def_op->getParentOp() == task_op) ||
+                  (hyperblock_op->isProperAncestor(def_op))) &&
+                 "Unexpected non-block-arg operand in hyperblock");
         }
-        assert(!operand.getDefiningOp() && "Unexpected non-block-arg operand");
       }
     });
 
@@ -89,9 +110,8 @@
     // Creates neura.kernel.
     neura::KernelOp kernelOp = rewriter.create<neura::KernelOp>(
         loc, resultTypes, kernel_inputs, iter_args_init,
-        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
-        /*kernel_name=*/rewriter.getStringAttr("kernel"),
-        /*accelerator=*/rewriter.getStringAttr("neura"));
+        /*Optional cgra_id*/ nullptr, /*Optional kernel_name*/ nullptr,
+        /*Optional accelerator*/ nullptr);
 
     // Creates the entry block for the kernel.
     Region &kernel_region = kernelOp.getBody();
@@ -160,6 +180,102 @@
   }
 };
 
+struct InternalizeCounterPattern
+    : public OpRewritePattern<neura::KernelOp> {
+  using OpRewritePattern<neura::KernelOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(neura::KernelOp kernel_op,
+                                PatternRewriter &rewriter) const override {
+    SmallVector<Value> inputs(kernel_op.getInputs());
+    SmallVector<Value> iter_args_init(kernel_op.getIterArgsInit());
+
+    // Finds counter inputs: inputs defined by taskflow.counter ops.
+    SmallVector<std::pair<size_t, TaskflowCounterOp>> counter_inputs;
+
+    for (size_t i = 0; i < inputs.size(); i++) {
+      if (TaskflowCounterOp counter_op =
+              inputs[i].getDefiningOp<TaskflowCounterOp>()) {
+        counter_inputs.push_back({i, counter_op});
+      }
+    }
+
+    // If there are no counter inputs, there is nothing to do.
+    if (counter_inputs.empty()) {
+      return failure();
+    }
+
+    Location loc = kernel_op.getLoc();
+    Block &old_block = kernel_op.getBody().front();
+
+    // Builds new inputs (excluding counter inputs).
+    DenseSet<size_t> counter_idx_set;
+    for (auto &[idx, _] : counter_inputs) {
+      counter_idx_set.insert(idx);
+    }
+    SmallVector<Value> new_inputs;
+    for (size_t i = 0; i < inputs.size(); i++) {
+      if (!counter_idx_set.contains(i)) {
+        new_inputs.push_back(inputs[i]);
+      }
+    }
+
+    // Creates new kernel with updated inputs.
+    SmallVector<Type> result_types(kernel_op.getResultTypes());
+    neura::KernelOp new_kernel_op = rewriter.create<neura::KernelOp>(
+        loc, result_types, new_inputs, iter_args_init,
+        /*cgra_id=*/kernel_op.getCgraIdAttr(),
+        /*kernel_name=*/kernel_op.getKernelNameAttr(),
+        /*accelerator=*/kernel_op.getAcceleratorAttr());
+
+    // Creates the entry block for the new kernel.
+    Region &new_region = new_kernel_op.getBody();
+    Block *new_block = rewriter.createBlock(&new_region);
+
+    IRMapping mapping;
+    // Maps non-counter input block arguments.
+    for (size_t i = 0; i < inputs.size(); i++) {
+      BlockArgument old_arg = old_block.getArgument(i);
+      if (!counter_idx_set.contains(i)) {
+        BlockArgument new_arg = new_block->addArgument(old_arg.getType(), loc);
+        mapping.map(old_arg, new_arg);
+      }
+    }
+
+    // Maps iter_args block arguments.
+    size_t num_inputs = inputs.size();
+    for (size_t i = 0; i < iter_args_init.size(); i++) {
+      BlockArgument old_arg = old_block.getArgument(num_inputs + i);
+      BlockArgument new_arg = new_block->addArgument(old_arg.getType(), loc);
+      mapping.map(old_arg, new_arg);
+    }
+
+    // Inserts neura.counter ops at the start of the new block.
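+    // For example (sketch; the exact printed form may differ), a kernel
+    // that consumed a counter as an input:
+    //   %i = taskflow.counter {lower_bound = 0 : index, upper_bound = 16 : index,
+    //                          step = 1 : index, counter_type = "leaf"} : index
+    //   neura.kernel inputs(%i : index) { ... }
+    // is rewritten so the counter lives inside the kernel body:
+    //   neura.kernel {
+    //     %i = neura.counter {lower_bound = 0 : index, upper_bound = 16 : index,
+    //                         step = 1 : index, counter_type = "leaf",
+    //                         counter_id = 0 : i32} : index
+    //     ...
+    //   }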
+    rewriter.setInsertionPointToStart(new_block);
+    for (auto &[old_idx, source_counter] : counter_inputs) {
+      BlockArgument old_counter_arg = old_block.getArgument(old_idx);
+
+      // Creates neura.counter op.
+      neura::CounterOp new_counter_op = rewriter.create<neura::CounterOp>(
+          source_counter.getLoc(), old_counter_arg.getType(),
+          source_counter.getLowerBoundAttr(),
+          source_counter.getUpperBoundAttr(), source_counter.getStepAttr(),
+          source_counter.getCounterTypeAttr(),
+          source_counter.getCounterIdAttr());
+      mapping.map(old_counter_arg, new_counter_op.getCurrentIndex());
+    }
+
+    // Clones the rest of the body.
+    rewriter.setInsertionPointToEnd(new_block);
+    for (Operation &op : old_block.getOperations()) {
+      rewriter.clone(op, mapping);
+    }
+
+    // Replaces the old kernel with the new kernel.
+    rewriter.replaceOp(kernel_op, new_kernel_op.getResults());
+
+    return success();
+  }
+};
+
 struct ConvertTaskflowToNeuraPass
     : public PassWrapper<ConvertTaskflowToNeuraPass, OperationPass<ModuleOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTaskflowToNeuraPass)
@@ -178,12 +294,25 @@ struct ConvertTaskflowToNeuraPass
     MLIRContext *ctx = &getContext();
 
     // Phase 1: Converts hyperblocks to kernels.
-    RewritePatternSet patterns(ctx);
-    patterns.add<HyperblockToKernelPattern>(ctx);
+    {
+      RewritePatternSet patterns(ctx);
+      patterns.add<HyperblockToKernelPattern>(ctx);
 
-    if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
-      signalPassFailure();
-      return;
+      if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+        signalPassFailure();
+        return;
+      }
+    }
+
+    // Phase 2: Internalizes counters into kernels.
+    {
+      RewritePatternSet patterns(ctx);
+      patterns.add<InternalizeCounterPattern>(ctx);
+
+      if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+        signalPassFailure();
+        return;
+      }
     }
   }
 };
diff --git a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
index 354ee7d7..8555f6de 100644
--- a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
@@ -44,6 +44,8 @@ void classifyCountersInTask(TaskflowTaskOp task_op) {
     }
   }
 
+  int global_counter_id = 0;
+
   // Classifies each counter.
   OpBuilder builder(task_op.getContext());
   for (TaskflowCounterOp counter_op : counters) {
@@ -67,6 +69,8 @@ void classifyCountersInTask(TaskflowTaskOp task_op) {
 
     // Sets the counter type attribute.
     counter_op.setCounterTypeAttr(builder.getStringAttr(counter_type));
+    // Sets the counter id attribute.
+    counter_op.setCounterIdAttr(builder.getI32IntegerAttr(global_counter_id++));
   }
 }
diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 690d3552..6955e29c 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -135,7 +135,7 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
           builder.getIndexAttr(loop_info->lower_bound),
           builder.getIndexAttr(loop_info->upper_bound),
           builder.getIndexAttr(loop_info->step),
-          /*Counter Type*/ nullptr);
+          /*Counter Type*/ nullptr, /*Counter ID*/ nullptr);
       counter_index = counter_op.getCounterIndex();
     } else {
       // Top-level counter.
@@ -144,7 +144,7 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc, builder.getIndexAttr(loop_info->lower_bound), builder.getIndexAttr(loop_info->upper_bound), builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr); + /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); counter_index = counter_op.getCounterIndex(); } From 9ef22162d94468cdb26223532225703f3b2c2fb5 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 13:17:01 +0800 Subject: [PATCH 05/25] assign accelerator for neura.kernel --- lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp index 0dbed531..11688539 100644 --- a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp @@ -1,4 +1,5 @@ #include "Common/AcceleratorAttrs.h" +#include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Builders.h" @@ -31,6 +32,12 @@ struct AssignAcceleratorPass func->setAttr(mlir::accel::kAcceleratorAttr, builder.getStringAttr(mlir::accel::kNeuraTarget)); } + } else if (neura::KernelOp kernel_op = dyn_cast(op)) { + // Handles neura.kernel ops as well. + if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + kernel_op->setAttr(mlir::accel::kAcceleratorAttr, + builder.getStringAttr(mlir::accel::kNeuraTarget)); + } } }); } From 3ff449ba6c0b084342235931becf8270919d4dbb Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 14:06:14 +0800 Subject: [PATCH 06/25] enable neura.kernel lowering in conversion passes --- .../ArithToNeura/ArithToNeuraPass.cpp | 69 ++++---- .../ArithToNeura/ArithToNeuraPatterns.td | 4 - lib/Conversion/ArithToNeura/CMakeLists.txt | 12 -- .../BuiltinToNeura/BuiltinToNeuraPass.cpp | 27 ++- lib/Conversion/LlvmToNeura/CMakeLists.txt | 13 -- .../LlvmToNeura/LlvmToNeuraPass.cpp | 165 ++++++++++-------- .../LlvmToNeura/LlvmToNeuraPatterns.td | 4 - .../MemRefToNeura/MemRefToNeuraPass.cpp | 30 +++- .../AffineToNeura/unsupported-affine-if.mlir | 4 +- test/mapping_quality/branch_for.mlir | 6 +- test/neura/ctrl/branch_for.mlir | 6 +- 11 files changed, 187 insertions(+), 153 deletions(-) delete mode 100644 lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td delete mode 100644 lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index 7241d7a8..a6e68ef9 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -5,22 +5,12 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/StringRef.h" -namespace mlir { -namespace neura { -// Uses arith2neura instead of llvm to avoid conflicts. 
-namespace arith2neura {
-
-#include "ArithToNeuraPatterns.inc"
-
-} // namespace arith2neura
-} // namespace neura
-} // namespace mlir
-
 using namespace mlir;
 using namespace mlir::func;
 using namespace mlir::neura;
@@ -96,7 +86,6 @@ struct ArithSubFToNeuraFSub : public OpRewritePattern<arith::SubFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-
     rewriter.replaceOpWithNewOp<neura::FSubOp>(op, result_type, lhs, rhs);
     return success();
   }
@@ -126,7 +115,6 @@ struct ArithMulFToNeuraFMul : public OpRewritePattern<arith::MulFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-
     rewriter.replaceOpWithNewOp<neura::FMulOp>(op, result_type, lhs, rhs);
     return success();
   }
@@ -171,8 +159,7 @@ struct ArithRemSIToNeuraOp : public OpRewritePattern<arith::RemSIOp> {
     Location loc = op.getLoc();
 
     // Converts arith RemSIOp to basic Neura Op.
-    Value div =
-        rewriter.create<neura::DivOp>(loc, result_type, lhs, rhs);
+    Value div = rewriter.create<neura::DivOp>(loc, result_type, lhs, rhs);
     Value mul = rewriter.create<neura::MulOp>(loc, result_type, rhs, div);
     Value rem = rewriter.create<neura::SubOp>(loc, result_type, lhs, mul);
 
@@ -244,7 +231,8 @@ struct ArithSelectToNeuraSel : public OpRewritePattern<arith::SelectOp> {
     Value false_value = op.getFalseValue();
     Type result_type = op.getType();
 
-    // Converts arith SelectOp to Neura SelOp with consistent order: (cond, ifTrue, ifFalse).
+    // Converts arith SelectOp to Neura SelOp with consistent order: (cond,
+    // ifTrue, ifFalse).
     rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, condition,
                                               true_value, false_value);
     return success();
@@ -261,8 +249,8 @@ struct ArithExtUIToNeuraCast : public OpRewritePattern<arith::ExtUIOp> {
 
     // Converts arith ExtUIOp to Neura cast operation.
-    rewriter.replaceOpWithNewOp<neura::CastOp>(
-        op, result_type, input, rewriter.getStringAttr("extui"));
+    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
+                                               rewriter.getStringAttr("extui"));
     return success();
   }
 };
@@ -277,8 +265,8 @@ struct ArithExtfToNeuraCast : public OpRewritePattern<arith::ExtFOp> {
 
     // Converts arith ExtFOp to Neura cast operation.
-    rewriter.replaceOpWithNewOp<neura::CastOp>(
-        op, result_type, input, rewriter.getStringAttr("extf"));
+    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
+                                               rewriter.getStringAttr("extf"));
     return success();
   }
 };
@@ -326,26 +314,47 @@ struct LowerArithToNeuraPass
     registry.insert();
   }
 
+  RewritePatternSet populateArithToNeuraPatterns(MLIRContext *context) {
+    RewritePatternSet patterns(context);
+    patterns
+        .add<ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
+             ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp, ArithSelectToNeuraSel,
+             ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
+             ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
+             ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
+             ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+    return patterns;
+  }
+
   void runOnOperation() override {
     ModuleOp module_op = getOperation();
     MLIRContext *context = &getContext();
+
    module_op.walk([&](func::FuncOp func_op) {
      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
        auto target =
            func_op->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
        if (target && target.getValue() == mlir::accel::kNeuraTarget) {
-          RewritePatternSet patterns(&getContext());
-          mlir::neura::arith2neura::populateWithGenerated(patterns);
-          patterns.add<
-              ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
-              ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp, ArithSelectToNeuraSel,
-              ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
-              ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
-              ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
-              ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+          RewritePatternSet patterns = populateArithToNeuraPatterns(context);
           // Applies patterns to the function, not the entire module.
+          if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) {
+            signalPassFailure();
+          }
+        }
+      }
+    });
+
+    // Applies patterns to the neura.kernel regions.
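+    // neura.kernel ops carry their own accelerator attribute (assigned by
+    // the assign-accelerator pass), so they are matched here independently
+    // of whether their parent function is annotated.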
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateArithToNeuraPatterns(context); if (failed( - applyPatternsGreedily(func_op, std::move(patterns)))) { + applyPatternsGreedily(kernel_region, std::move(patterns)))) { signalPassFailure(); } } diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td b/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td deleted file mode 100644 index 7715f90f..00000000 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td +++ /dev/null @@ -1,4 +0,0 @@ -include "mlir/IR/OpBase.td" -include "mlir/IR/PatternBase.td" -include "mlir/Dialect/Arith/IR/ArithOps.td" -include "NeuraDialect/NeuraOps.td" diff --git a/lib/Conversion/ArithToNeura/CMakeLists.txt b/lib/Conversion/ArithToNeura/CMakeLists.txt index 4ace588e..c5397d26 100644 --- a/lib/Conversion/ArithToNeura/CMakeLists.txt +++ b/lib/Conversion/ArithToNeura/CMakeLists.txt @@ -1,20 +1,9 @@ -set(LLVM_TARGET_DEFINITIONS ${CMAKE_CURRENT_SOURCE_DIR}/ArithToNeuraPatterns.td) -mlir_tablegen(ArithToNeuraPatterns.inc - -gen-rewriters - -I ${MLIR_SOURCE_DIR}/include - -I ${MLIR_BINARY_DIR}/include - -I ${CMAKE_SOURCE_DIR}/include - -I ${CMAKE_CURRENT_SOURCE_DIR} -) -add_public_tablegen_target(MLIRNeuraArithToNeuraIncGen) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_mlir_conversion_library(MLIRNeuraArithToNeuraPass ArithToNeuraPass.cpp DEPENDS - MLIRNeuraArithToNeuraIncGen MLIRConversionIncGen LINK_LIBS PUBLIC @@ -22,5 +11,4 @@ add_mlir_conversion_library(MLIRNeuraArithToNeuraPass MLIRPass MLIRSupport MLIRTransforms - # MLIRNeura ) \ No newline at end of file diff --git a/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp index 78550c77..e8d148b5 100644 --- a/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp +++ b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp @@ -57,22 +57,45 @@ struct LowerBuiltinToNeuraPass registry.insert(); } + RewritePatternSet populateBuiltinToNeuraPatterns(MLIRContext *context) { + RewritePatternSet patterns(context); + patterns.add(context); + return patterns; + } + void runOnOperation() override { ModuleOp module_op = getOperation(); MLIRContext *context = &getContext(); - RewritePatternSet patterns(&getContext()); - patterns.add(context); + module_op.walk([&](func::FuncOp func_op) { if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { auto target = func_op->getAttrOfType(mlir::accel::kAcceleratorAttr); if (target && target.getValue() == mlir::accel::kNeuraTarget) { + RewritePatternSet patterns = populateBuiltinToNeuraPatterns(context); if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { return signalPassFailure(); } } } }); + + // Applies patterns to the neura.kernel regions. 
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateBuiltinToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/lib/Conversion/LlvmToNeura/CMakeLists.txt b/lib/Conversion/LlvmToNeura/CMakeLists.txt index 7ced65aa..1c04e922 100644 --- a/lib/Conversion/LlvmToNeura/CMakeLists.txt +++ b/lib/Conversion/LlvmToNeura/CMakeLists.txt @@ -1,22 +1,9 @@ -set(LLVM_TARGET_DEFINITIONS - ${CMAKE_CURRENT_SOURCE_DIR}/LlvmToNeuraPatterns.td -) -mlir_tablegen(LlvmToNeuraPatterns.inc - -gen-rewriters - -I ${MLIR_SOURCE_DIR}/include - -I ${MLIR_BINARY_DIR}/include - -I ${CMAKE_SOURCE_DIR}/include - -I ${CMAKE_CURRENT_SOURCE_DIR} -) -add_public_tablegen_target(MLIRLlvmToNeuraPatternIncGen) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_mlir_conversion_library(MLIRNeuraLlvmToNeuraPass LlvmToNeuraPass.cpp DEPENDS - MLIRLlvmToNeuraPatternIncGen MLIRConversionIncGen LINK_LIBS PUBLIC diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index 959b015b..c28e50db 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -9,17 +9,6 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -namespace mlir { -namespace neura { -// Uses llvm2neura instead of llvm to avoid conflicts. -namespace llvm2neura { - -#include "LlvmToNeuraPatterns.inc" - -} // namespace llvm2neura -} // namespace neura -} // namespace mlir - using namespace mlir; using namespace mlir::neura; @@ -158,8 +147,8 @@ struct LlvmMaxNumToNeuraFMax : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("maxnum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("maxnum")); return success(); } }; @@ -177,8 +166,8 @@ struct LlvmMaximumToNeuraFMax : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("maximum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("maximum")); return success(); } }; @@ -196,8 +185,8 @@ struct LlvmMinNumToNeuraFMin : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("minnum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("minnum")); return success(); } }; @@ -215,8 +204,8 @@ struct LlvmMinimumToNeuraFMin : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("minimum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("minimum")); return success(); } }; @@ -248,8 +237,8 @@ struct LlvmFPToSIToNeuraCast : public OpRewritePattern { Type result_type = op.getType(); // Creates a cast operation with "fptosi" as the cast type. 
- rewriter.replaceOpWithNewOp(op, result_type, input, - rewriter.getStringAttr("fptosi")); + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("fptosi")); return success(); } }; @@ -264,14 +253,16 @@ struct LlvmSelectToNeuraSel : public OpRewritePattern { Value false_value = op.getFalseValue(); Type result_type = op.getType(); - // neura.sel now follows the same order as llvm.select: (cond, ifTrue, ifFalse) - rewriter.replaceOpWithNewOp(op, result_type, - cond, true_value, false_value); + // neura.sel now follows the same order as llvm.select: (cond, ifTrue, + // ifFalse) + rewriter.replaceOpWithNewOp(op, result_type, cond, true_value, + false_value); return success(); } }; -struct LlvmFMulAddToNeuraFMulFAdd : public OpRewritePattern { +struct LlvmFMulAddToNeuraFMulFAdd + : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(mlir::LLVM::FMulAddOp op, @@ -301,12 +292,12 @@ struct LlvmMemsetToNeuraOps : public OpRewritePattern { auto value = op.getVal(); auto len = op.getLen(); auto is_volatile = op.getIsVolatile(); - + // Creates neura.memset operation with full semantics. // Passes all operands to the hardware-specific operation. // The RTL layer can implement this as appropriate for the target hardware. - rewriter.replaceOpWithNewOp(op, dest, value, len, - is_volatile); + rewriter.replaceOpWithNewOp(op, dest, value, len, + is_volatile); return success(); } }; @@ -398,11 +389,12 @@ struct LlvmVectorReduceAddToNeuraVectorReduceAdd : public RewritePattern { // Checks that we have exactly one operand and one result. if (op->getNumOperands() != 1 || op->getNumResults() != 1) return failure(); - + Value input = op->getOperand(0); Type result_type = op->getResult(0).getType(); - rewriter.replaceOpWithNewOp(op, result_type, input); + rewriter.replaceOpWithNewOp(op, result_type, + input); return success(); } }; @@ -511,10 +503,10 @@ struct LlvmCondBrToNeuraCondBr : public OpRewritePattern { auto new_op = rewriter.create( op.getLoc(), // Location op.getCondition(), // Condition - true_operands, // True destination operands - false_operands, // False destination operands - true_dest, // True destination block - false_dest // False destination block + true_operands, // True destination operands + false_operands, // False destination operands + true_dest, // True destination block + false_dest // False destination block ); // Replaces the old op with the new one. @@ -590,27 +582,30 @@ struct LlvmSubToNeuraSub : public OpRewritePattern { // TODO: Implements LlvmAndToNeuraMul. Used in ADPCM coder and MVT kernels. // llvm.and operations appear in: // - adpcm_coder-kernel.mlir (lines 55, 94: bitwise AND operations) -// - mvt-kernel.mlir (lines 44, 47, 50, 53: vector and scalar AND operations) -// Implementation: and(a, b) = mul(a, b) for boolean values. +// - mvt-kernel.mlir (lines 44, 47, 50, 53: vector and scalar AND +// operations) Implementation: and(a, b) = mul(a, b) for boolean values. // TODO: Implements LlvmAllocaToNeuraOps. Used in DTW kernel. // llvm.alloca operations appear in: // - dtw-kernel-O0.mlir (lines 19-23: multiple stack allocations) -// Implementation: For CGRA, erases alloca or converts to register allocation. +// Implementation: For CGRA, erases alloca or converts to register +// allocation. -// TODO: Implements LlvmLShrToNeuraShl. Used in ADPCM coder/decoder and FFT kernels. +// TODO: Implements LlvmLShrToNeuraShl. Used in ADPCM coder/decoder and FFT +// kernels. 
// llvm.lshr operations appear in: // - adpcm_coder-kernel.mlir (line 54: %42 = llvm.lshr %40, %7 : i32) // - adpcm_decoder-kernel.ll (line 35: %30 = lshr i32 %29, 4) // - fft_kernel.mlir (line 67: %49 = llvm.lshr %7, %1 : i32) -// Implementation: Needs proper logical right shift (lshr(x,n) != shl(x,-n)). +// Implementation: Needs proper logical right shift (lshr(x,n) != +// shl(x,-n)). // TODO: Implements LlvmAShrToNeuraAShr. Used in ADPCM coder/decoder kernels. // llvm.ashr operations appear in: // - adpcm_coder-kernel.mlir (lines 57, 63, 70: multiple ashr operations) // - adpcm_decoder-kernel.ll (lines 49, 56, 61: ashr i32 %20, 3/1/2) -// Implementation: Needs proper arithmetic right shift (preserves sign bit). - +// Implementation: Needs proper arithmetic right shift (preserves sign +// bit). struct LlvmSMaxToNeuraSMax : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -624,10 +619,9 @@ struct LlvmSMaxToNeuraSMax : public OpRewritePattern { Location loc = op.getLoc(); // Implements smax(a, b) = a >= b ? a : b. - auto cmp = rewriter.create(loc, rewriter.getI1Type(), - lhs, rhs, - rewriter.getStringAttr("sge")); - + auto cmp = rewriter.create( + loc, rewriter.getI1Type(), lhs, rhs, rewriter.getStringAttr("sge")); + // Selects: a >= b ? a : b. rewriter.replaceOpWithNewOp(op, result_type, cmp, lhs, rhs); return success(); @@ -716,9 +710,9 @@ struct LlvmTruncToNeuraCast : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::TruncOp op, PatternRewriter &rewriter) const override { // Trunc is a simple cast operation. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getArg(), - rewriter.getStringAttr("trunc")); + auto result = + rewriter.create(op.getLoc(), op.getType(), op.getArg(), + rewriter.getStringAttr("trunc")); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -730,8 +724,8 @@ struct LlvmUDivToNeuraDiv : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::UDivOp op, PatternRewriter &rewriter) const override { // UDiv is unsigned division. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getLhs(), op.getRhs()); + auto result = rewriter.create(op.getLoc(), op.getType(), + op.getLhs(), op.getRhs()); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -743,8 +737,8 @@ struct LlvmURemToNeuraRem : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::URemOp op, PatternRewriter &rewriter) const override { // URem is unsigned remainder. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getLhs(), op.getRhs()); + auto result = rewriter.create(op.getLoc(), op.getType(), + op.getLhs(), op.getRhs()); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -792,7 +786,7 @@ struct LlvmFuncToNeuraFunc : public OpRewritePattern { // Converts LLVMFunctionType to FunctionType. auto llvm_func_type = op.getFunctionType(); auto func_type = rewriter.getFunctionType(llvm_func_type.getParams(), - llvm_func_type.getReturnType()); + llvm_func_type.getReturnType()); // Creates the new func.func operation using OperationState to have full // control. @@ -811,11 +805,9 @@ struct LlvmFuncToNeuraFunc : public OpRewritePattern { } state.addAttributes(attrs); - // Adds the function body region. state.addRegion(); - auto new_func = cast(rewriter.create(state)); // Moves the function body. @@ -854,7 +846,6 @@ struct LlvmCallToFuncCall : public OpRewritePattern { // Gets the result types from the function signature. 
     auto result_types = func_op.getFunctionType().getResults();
-
     // Converts the call to func.call.
     auto new_call = rewriter.create<func::CallOp>(
         op.getLoc(), result_types, callee.value(), op.getArgOperands());
@@ -886,10 +877,8 @@ struct LowerLlvmToNeuraPass
     registry.insert();
   }
 
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    // Adds DRR patterns.
-    mlir::neura::llvm2neura::populateWithGenerated(patterns);
+  RewritePatternSet populateLlvmToNeuraPatterns(MLIRContext *context) {
+    RewritePatternSet patterns(context);
     patterns.add(&getContext());
     // Vector operations must be registered before scalar operations
     // to ensure vector types are matched first.
     patterns.add(&getContext());
@@ -939,26 +928,34 @@ struct LowerLlvmToNeuraPass
     patterns.add(&getContext());
     patterns.add(&getContext());
     // TODO: Adds more LLVM to Neura conversion patterns as needed.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder + FFT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder + MVT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in DTW kernel.
-    // TODO: Fixes right shift implementations. Current implementations are incorrect.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder/decoder + FFT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder/decoder kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder kernel.
-
-    FrozenRewritePatternSet frozen(std::move(patterns));
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder + FFT kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder + MVT kernels.
+    // patterns.add(&getContext()); // TODO: Used in DTW kernel.
+    // TODO: Fixes right shift implementations. Current implementations are
+    // incorrect.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder/decoder + FFT kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder/decoder kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder kernel.
+    return patterns;
+  }
 
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
     ModuleOp module_op = getOperation();
 
-    // Performs function-level conversions.
-    if (failed(applyPatternsGreedily(module_op, frozen))) {
+    // Performs the llvm.func -> func.func conversion first.
+    RewritePatternSet func_patterns(context);
+    func_patterns.add<LlvmFuncToNeuraFunc>(context);
+    func_patterns.add<LlvmCallToFuncCall>(context);
+
+    if (failed(applyPatternsGreedily(module_op, std::move(func_patterns)))) {
       signalPassFailure();
-      return;
     }
 
-    // Performs operation-level conversions.
+    // Performs operation-level conversions for func::FuncOp.
     // Applies to every region inside the module (regardless of func type,
     // e.g., mlir func or llvm func).
     module_op.walk([&](FunctionOpInterface func) {
@@ -967,13 +964,31 @@ struct LowerLlvmToNeuraPass
           func->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
       if (target && target.getValue() == mlir::accel::kNeuraTarget) {
         for (Region &region : func->getRegions()) {
-          if (failed(applyPatternsGreedily(region, frozen))) {
+          RewritePatternSet patterns = populateLlvmToNeuraPatterns(context);
+          if (failed(applyPatternsGreedily(region, std::move(patterns)))) {
             signalPassFailure();
           }
         }
       }
     });
+
+    // Applies patterns to the neura.kernel regions.
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateLlvmToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td deleted file mode 100644 index 1b99a47c..00000000 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td +++ /dev/null @@ -1,4 +0,0 @@ -include "mlir/IR/OpBase.td" -include "mlir/IR/PatternBase.td" -include "mlir/Dialect/LLVMIR/LLVMOps.td" -include "NeuraDialect/NeuraOps.td" diff --git a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp index c7157120..c8f3501f 100644 --- a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp +++ b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp @@ -83,26 +83,46 @@ struct LowerMemRefToNeuraPass registry.insert(); } - void runOnOperation() override { - ModuleOp module_op = getOperation(); - MLIRContext *context = &getContext(); - RewritePatternSet patterns(&getContext()); - + RewritePatternSet populateMemRefToNeuraPatterns(MLIRContext *context) { + RewritePatternSet patterns(context); patterns.add(context); patterns.add(context); patterns.add(context); + return patterns; + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + MLIRContext *context = &getContext(); module_op.walk([&](func::FuncOp func_op) { if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { auto target = func_op->getAttrOfType(mlir::accel::kAcceleratorAttr); if (target && target.getValue() == mlir::accel::kNeuraTarget) { + RewritePatternSet patterns = populateMemRefToNeuraPatterns(context); if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { return signalPassFailure(); } } } }); + + module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateMemRefToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/test/Conversion/AffineToNeura/unsupported-affine-if.mlir b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir index 1095a239..8c799ee2 100644 --- a/test/Conversion/AffineToNeura/unsupported-affine-if.mlir +++ b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir @@ -1,6 +1,6 @@ // RUN: mlir-neura-opt %s --lower-affine | FileCheck %s --check-prefix=CHECK-SCF // RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm | FileCheck %s --check-prefix=CHECK-LLVM -// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm --lower-llvm-to-neura | FileCheck %s --check-prefix=CHECK-NEURA-BR +// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --assign-accelerator 
--convert-func-to-llvm --lower-llvm-to-neura | FileCheck %s --check-prefix=CHECK-NEURA-BR // This test demonstrates the complete multi-stage lowering chain for conditionals. // Note: Direct lowering affine.if to Neura is not supported. @@ -54,7 +54,7 @@ module { // CHECK-LLVM: %{{.*}} = llvm.icmp "sge" %{{.*}}, %{{.*}} : i64 // CHECK-LLVM: llvm.cond_br %{{.*}}, ^bb3, ^bb4 -// CHECK-NEURA-BR-LABEL: llvm.func @affine_if_example +// CHECK-NEURA-BR-LABEL: func.func @affine_if_example // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = -5 : index}> : () -> i64 // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 1 : index}> : () -> i64 // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 10 : index}> : () -> i64 diff --git a/test/mapping_quality/branch_for.mlir b/test/mapping_quality/branch_for.mlir index 07db3866..f78a1be1 100644 --- a/test/mapping_quality/branch_for.mlir +++ b/test/mapping_quality/branch_for.mlir @@ -103,9 +103,9 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: %7 = "neura.fadd"(%6, %3) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %8 = "neura.add"(%5, %2) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else %7 : !neura.data to ^bb2 -// CHECK-NEXT: ^bb2(%10: !neura.data): // pred: ^bb1 -// CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () +// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else to ^bb2 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: "neura.return"(%7) : (!neura.data) -> () // CHECK-NEXT: } // CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index 6ea1910d..bbb06ed9 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -110,9 +110,9 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: %7 = "neura.fadd"(%6, %3) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %8 = "neura.add"(%5, %2) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else %7 : !neura.data to ^bb2 -// CHECK-NEXT: ^bb2(%10: !neura.data): // pred: ^bb1 -// CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () +// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else to ^bb2 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: "neura.return"(%7) : (!neura.data) -> () // CHECK-NEXT: } // CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { From f41691a07be506e16802b34df499b97fbaf6e0be Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 15:02:54 +0800 Subject: [PATCH 07/25] enable promote func/kernel arguments to constant --- include/NeuraDialect/NeuraPasses.h | 2 +- include/NeuraDialect/NeuraPasses.td | 8 +- lib/NeuraDialect/NeuraPasses.cpp | 2 +- lib/NeuraDialect/Transforms/CMakeLists.txt | 2 +- ...ass.cpp => PromoteInputArgToConstPass.cpp} | 75 +++++++++++++++++-- .../bert/bert_node1/bert_node1.mlir | 2 +- .../bert/bert_node28/bert_node28.mlir | 2 +- test/c2llvm2mlir/nested_loop/test.mlir | 4 +- test/c2llvm2mlir/simple_loop/test.mlir | 4 +- .../complex_nested/complex_nested.mlir | 2 +- .../non_perfect_nested.mlir 
| 2 +- .../perfect_nested/perfect_nested.mlir | 4 +- .../perfect_reduction/perfect_reduction.mlir | 2 +- .../simple_loop/simple_loop.mlir | 8 +- .../simple_loop_reduction.mlir | 8 +- test/e2e/axpy/axpy_kernel.mlir | 2 +- test/e2e/bicg/bicg_kernel.mlir | 8 +- test/e2e/fir/fir_kernel.mlir | 4 +- test/e2e/fir/fir_kernel_vec.mlir | 2 +- test/e2e/gemm/gemm_kernel.mlir | 2 +- test/e2e/gemv/gemv_kernel.mlir | 2 +- test/e2e/histogram/histogram_kernel.mlir | 4 +- test/e2e/relu/relu_kernel.mlir | 2 +- test/e2e/spmv/spmv_kernel.mlir | 2 +- test/honor_arch/fir_removed_tiles_test.mlir | 2 +- test/neura/ctrl/branch.mlir | 4 +- test/neura/ctrl/branch_for.mlir | 12 +-- .../ctrl/branch_with_and_without_arg.mlir | 4 +- test/neura/ctrl/branch_without_arg.mlir | 4 +- test/neura/ctrl/for_with_if.mlir | 2 +- test/neura/ctrl/nested_branch.mlir | 4 +- test/neura/for_loop/kernel_test.mlir | 6 +- test/neura/for_loop/relu_test.mlir | 6 +- test/neura/fusion/test.mlir | 20 ++--- test/neura/steer_ctrl/for_with_if.mlir | 2 +- .../steer_ctrl/loop_with_return_value.mlir | 4 +- .../steer_ctrl/loop_without_return_value.mlir | 2 +- .../constant_folding/simple_loop.mlir | 2 +- 38 files changed, 144 insertions(+), 85 deletions(-) rename lib/NeuraDialect/Transforms/{PromoteFuncArgToConstPass.cpp => PromoteInputArgToConstPass.cpp} (58%) diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 95aa70c8..0b77521d 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -27,7 +27,7 @@ std::unique_ptr createMapToAcceleratorPass(); std::unique_ptr createGenerateCodePass(); std::unique_ptr createCanonicalizeReturnPass(); std::unique_ptr createCanonicalizeLiveInPass(); -std::unique_ptr createPromoteFuncArgToConstPass(); +std::unique_ptr createPromoteInputArgToConstPass(); std::unique_ptr createTransformToSteerControlPass(); std::unique_ptr createRemovePredicatedTypePass(); std::unique_ptr createWrapLoopInKernelPass(); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index ec0df60b..fc6cec1e 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -97,12 +97,12 @@ def CanonicalizeLiveIn : Pass<"canonicalize-live-in", "ModuleOp"> { let constructor = "neura::createCanonicalizeLiveInPass()"; } -def PromoteFuncArgToConst : Pass<"promote-func-arg-to-const", "ModuleOp"> { - let summary = "Promotes function arguments to neura constant operations"; +def PromoteInputArgToConst : Pass<"promote-input-arg-to-const", "ModuleOp"> { + let summary = "Promotes input arguments of functions or neura.kernels to neura constant operations"; let description = [{ - This pass promotes function arguments to neura constant operations. + This pass promotes input arguments of functions or neura.kernels to neura constant operations. 
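+
+    For example (a sketch; the exact printed form of the constant may
+    differ), a kernel input
+      neura.kernel inputs(%arg0 : i64) { ... }
+    is rewritten so the body defines a placeholder constant
+      %input0 = "neura.constant"() <{value = "%input0"}> : () -> i64
+    while iter_args are left to be handled by later passes.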
}]; - let constructor = "neura::createPromoteFuncArgToConstPass()"; + let constructor = "neura::createPromoteInputArgToConstPass()"; } def CanonicalizeCast : Pass<"canonicalize-cast", "ModuleOp"> { diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp index 26c1b6f2..80b6a6f1 100644 --- a/lib/NeuraDialect/NeuraPasses.cpp +++ b/lib/NeuraDialect/NeuraPasses.cpp @@ -31,7 +31,7 @@ void mlir::neura::registerNeuraConversionPassPipeline() { pm.addPass(mlir::neura::createCanonicalizeReturnPass()); pm.addPass(mlir::neura::createCanonicalizeCastPass()); - pm.addPass(mlir::neura::createPromoteFuncArgToConstPass()); + pm.addPass(mlir::neura::createPromoteInputArgToConstPass()); pm.addPass(mlir::neura::createFoldConstantPass()); pm.addPass(mlir::neura::createCanonicalizeLiveInPass()); pm.addPass(mlir::neura::createLeveragePredicatedValuePass()); diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 85200b48..da7056fb 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -13,7 +13,7 @@ add_mlir_library( CanonicalizeReturnPass.cpp CanonicalizeLiveInPass.cpp CanonicalizeCastPass.cpp - PromoteFuncArgToConstPass.cpp + PromoteInputArgToConstPass.cpp IterMergePatternPass.cpp TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp diff --git a/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp similarity index 58% rename from lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp rename to lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp index 8db54b2e..7889922c 100644 --- a/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp +++ b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/NeuraPasses.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Block.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" @@ -15,7 +16,7 @@ using namespace mlir; -#define GEN_PASS_DEF_PROMOTEFUNCARGTOCONST +#define GEN_PASS_DEF_PROMOTEINPUTARGTOCONST #include "NeuraDialect/NeuraPasses.h.inc" namespace { @@ -73,13 +74,58 @@ LogicalResult promoteFunctionArgsToConstants(Region ®ion) { return success(); } -struct PromoteFuncArgToConstPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteFuncArgToConstPass) +LogicalResult promoteKernelArgsToConstants(neura::KernelOp kernel_op) { + Region &kernel_region = kernel_op.getBody(); + if (kernel_region.empty()) { + return success(); + } + + Block &entry_block = kernel_region.front(); + OpBuilder builder(&entry_block, entry_block.begin()); + + // Gets the number of inputs and iter_args from kernel operands. + size_t num_inputs = kernel_op.getInputs().size(); + size_t num_iter_args = kernel_op.getIterArgsInit().size(); + + // Verifies block arguments layout: [inputs..., iter_args...] + SmallVector args(entry_block.getArguments().begin(), + entry_block.getArguments().end()); + + assert(args.size() == num_inputs + num_iter_args && + "Kernel block arguments size mismatch"); + + // Only promotes input arguments (not iter_args). + // Block arguments layout: [input0, input1, ..., iter_arg0, iter_arg1, ...] + for (size_t i = 0; i < num_inputs; ++i) { + BlockArgument input_arg = args[i]; + + // Creates a constant for this input. 
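+      // The constant carries the symbolic placeholder name "%input<i>" as
+      // its value, mirroring the function-argument promotion above.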
+ std::string const_name = "%input" + std::to_string(i); + auto const_op = builder.create( + input_arg.getLoc(), input_arg.getType(), + builder.getStringAttr(const_name)); + + // Replaces all uses of this input argument with the constant. + input_arg.replaceAllUsesWith(const_op.getResult()); + } + + // Note: iter_args (args[num_inputs] to args[num_inputs + num_iter_args - 1]) + // are NOT promoted here. They will be handled in transform-ctrl-to-data-flow + // pass. - StringRef getArgument() const override { return "promote-func-arg-to-const"; } + return success(); +} + +struct PromoteInputArgToConstPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteInputArgToConstPass) + + StringRef getArgument() const override { + return "promote-input-arg-to-const"; + } StringRef getDescription() const override { - return "Promotes function arguments to constants."; + return "Promotes input arguments of functions or neura.kernels to neura " + "constant operations."; } void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); @@ -118,12 +164,25 @@ struct PromoteFuncArgToConstPass return; } }); + + // Processes neura.kernel input arguments. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } + if (failed(promoteKernelArgsToConstants(kernel_op))) { + signalPassFailure(); + return; + } + }); } }; } // namespace namespace mlir::neura { -std::unique_ptr createPromoteFuncArgToConstPass() { - return std::make_unique(); +std::unique_ptr createPromoteInputArgToConstPass() { + return std::make_unique(); } } // namespace mlir::neura \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir index 64610dfd..70132cdd 100644 --- a/test/affine2neura/bert/bert_node1/bert_node1.mlir +++ b/test/affine2neura/bert/bert_node1/bert_node1.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir index 65494bb4..75f66c35 100644 --- a/test/affine2neura/bert/bert_node28/bert_node28.mlir +++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/c2llvm2mlir/nested_loop/test.mlir b/test/c2llvm2mlir/nested_loop/test.mlir index 924e15fc..3bf536ff 100644 --- a/test/c2llvm2mlir/nested_loop/test.mlir +++ b/test/c2llvm2mlir/nested_loop/test.mlir @@ -3,7 +3,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -14,7 +14,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: 
--promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/c2llvm2mlir/simple_loop/test.mlir b/test/c2llvm2mlir/simple_loop/test.mlir index 09285607..2af2d0c6 100644 --- a/test/c2llvm2mlir/simple_loop/test.mlir +++ b/test/c2llvm2mlir/simple_loop/test.mlir @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -36,7 +36,7 @@ // Test with mapping table dump enabled // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/complex_nested/complex_nested.mlir b/test/controflow_fuse/complex_nested/complex_nested.mlir index 0fa153b4..77a4eb2f 100644 --- a/test/controflow_fuse/complex_nested/complex_nested.mlir +++ b/test/controflow_fuse/complex_nested/complex_nested.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir index 598ac289..95765b42 100644 --- a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir +++ b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/controflow_fuse/perfect_nested/perfect_nested.mlir b/test/controflow_fuse/perfect_nested/perfect_nested.mlir index a664bb16..bbc5877e 100644 --- a/test/controflow_fuse/perfect_nested/perfect_nested.mlir +++ b/test/controflow_fuse/perfect_nested/perfect_nested.mlir @@ -23,7 +23,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -37,7 +37,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir b/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir index 77c41c7b..d009ea04 100644 --- a/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir +++ b/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir @@ -24,7 +24,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: 
--leverage-predicated-value \ diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir index 13b2e91f..e9c04f7c 100644 --- a/test/controflow_fuse/simple_loop/simple_loop.mlir +++ b/test/controflow_fuse/simple_loop/simple_loop.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in | FileCheck %s --check-prefix=CANONICALIZE // RUN: mlir-neura-opt %t-llvm.mlir \ @@ -28,7 +28,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -42,7 +42,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -60,7 +60,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir index 3619db45..ace0dd26 100644 --- a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir +++ b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s --check-prefix=CANONICALIZE @@ -29,7 +29,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -43,7 +43,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -61,7 +61,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/axpy/axpy_kernel.mlir b/test/e2e/axpy/axpy_kernel.mlir index fc07dcef..8d3e9fba 100644 --- a/test/e2e/axpy/axpy_kernel.mlir +++ b/test/e2e/axpy/axpy_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir index 2824b6fc..d353ec1f 100644 --- 
a/test/e2e/bicg/bicg_kernel.mlir +++ b/test/e2e/bicg/bicg_kernel.mlir @@ -13,7 +13,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: -o %t-before-canonicalize.mlir // RUN: FileCheck %s --input-file=%t-before-canonicalize.mlir -check-prefix=BEFORE_CANONICALIZE @@ -21,7 +21,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: -o %t-after-canonicalize.mlir @@ -30,7 +30,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -338,7 +338,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/fir/fir_kernel.mlir b/test/e2e/fir/fir_kernel.mlir index 6b476a36..f7049b62 100644 --- a/test/e2e/fir/fir_kernel.mlir +++ b/test/e2e/fir/fir_kernel.mlir @@ -8,7 +8,7 @@ // RUN: cd %t.dir && mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -192,7 +192,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/fir/fir_kernel_vec.mlir b/test/e2e/fir/fir_kernel_vec.mlir index 2c0e8207..366feba8 100644 --- a/test/e2e/fir/fir_kernel_vec.mlir +++ b/test/e2e/fir/fir_kernel_vec.mlir @@ -6,7 +6,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/gemm/gemm_kernel.mlir b/test/e2e/gemm/gemm_kernel.mlir index 674a027e..3376fe0a 100644 --- a/test/e2e/gemm/gemm_kernel.mlir +++ b/test/e2e/gemm/gemm_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/gemv/gemv_kernel.mlir b/test/e2e/gemv/gemv_kernel.mlir index fc6d862d..9f8f1317 100644 --- a/test/e2e/gemv/gemv_kernel.mlir +++ b/test/e2e/gemv/gemv_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git 
a/test/e2e/histogram/histogram_kernel.mlir b/test/e2e/histogram/histogram_kernel.mlir index ca045ef3..9f2d6f23 100644 --- a/test/e2e/histogram/histogram_kernel.mlir +++ b/test/e2e/histogram/histogram_kernel.mlir @@ -8,7 +8,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -154,7 +154,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir index cf09e451..b5c46f98 100644 --- a/test/e2e/relu/relu_kernel.mlir +++ b/test/e2e/relu/relu_kernel.mlir @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/spmv/spmv_kernel.mlir b/test/e2e/spmv/spmv_kernel.mlir index 9e871ed4..32a50da9 100644 --- a/test/e2e/spmv/spmv_kernel.mlir +++ b/test/e2e/spmv/spmv_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/honor_arch/fir_removed_tiles_test.mlir b/test/honor_arch/fir_removed_tiles_test.mlir index a1d94188..23e4009d 100644 --- a/test/honor_arch/fir_removed_tiles_test.mlir +++ b/test/honor_arch/fir_removed_tiles_test.mlir @@ -8,7 +8,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/ctrl/branch.mlir b/test/neura/ctrl/branch.mlir index eba379e0..d56813d6 100644 --- a/test/neura/ctrl/branch.mlir +++ b/test/neura/ctrl/branch.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -10,7 +10,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index bbb06ed9..a626575e 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s -check-prefix=CANONICALIZE @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %s \ // RUN: 
--assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -26,7 +26,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -38,7 +38,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -51,7 +51,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -66,7 +66,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/ctrl/branch_with_and_without_arg.mlir b/test/neura/ctrl/branch_with_and_without_arg.mlir index d861d1d5..87e6b61b 100644 --- a/test/neura/ctrl/branch_with_and_without_arg.mlir +++ b/test/neura/ctrl/branch_with_and_without_arg.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/branch_without_arg.mlir b/test/neura/ctrl/branch_without_arg.mlir index e505afda..726f8e1a 100644 --- a/test/neura/ctrl/branch_without_arg.mlir +++ b/test/neura/ctrl/branch_without_arg.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/for_with_if.mlir b/test/neura/ctrl/for_with_if.mlir index ad8ba343..0f93ace2 100644 --- a/test/neura/ctrl/for_with_if.mlir +++ b/test/neura/ctrl/for_with_if.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/nested_branch.mlir b/test/neura/ctrl/nested_branch.mlir index 92fe6975..5af809b4 
100644 --- a/test/neura/ctrl/nested_branch.mlir +++ b/test/neura/ctrl/nested_branch.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -10,7 +10,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/for_loop/kernel_test.mlir b/test/neura/for_loop/kernel_test.mlir index 1c00b1d5..298a9426 100644 --- a/test/neura/for_loop/kernel_test.mlir +++ b/test/neura/for_loop/kernel_test.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -27,7 +27,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir index 366083d6..a34e4fd7 100644 --- a/test/neura/for_loop/relu_test.mlir +++ b/test/neura/for_loop/relu_test.mlir @@ -5,14 +5,14 @@ // RUN: mlir-neura-opt %t-relu.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s // RUN: mlir-neura-opt %t-relu.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt %t-relu.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir index 35643cdf..0e6a3dce 100644 --- a/test/neura/fusion/test.mlir +++ b/test/neura/fusion/test.mlir @@ -2,7 +2,7 @@ // RUN: mlir-translate --import-llvm %t-kernel.ll -o %t-kernel.mlir // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: 
--promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -38,7 +38,7 @@ // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -53,27 +53,27 @@ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = neura.phi_start %61, %arg5 : !neura.data, !neura.data -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %62 : !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %62 : !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data) -> (!neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN: %16:2 = "neura.fused_op"(%4, %13, %15) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data, %arg7: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = neura.phi_start %arg5, %arg6 : !neura.data, !neura.data -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = "neura.gep"(%61, %arg7) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %63 = "neura.load"(%62) : (!neura.data) -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %63 : !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %63 : !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data, !neura.data, !neura.data) -> (!neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN: %17:3 = "neura.fused_op"(%2, %12, %15) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data, %arg7: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = neura.phi_start %arg5, %arg6 : !neura.data, !neura.data -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = "neura.gep"(%61, %arg7) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %63 = "neura.load"(%62) : (!neura.data) -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %62, %63 : !neura.data, !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %62, %63 : !neura.data, !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data, !neura.data, !neura.data) -> (!neura.data, !neura.data, !neura.data) // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -88,19 +88,19 @@ // CHECK-INIT-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data): // CHECK-INIT-PATTERN-NEXT: %72 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // 
CHECK-INIT-PATTERN-NEXT: %73 = "neura.load"(%72) : (!neura.data) -> !neura.data -// CHECK-INIT-PATTERN-NEXT: neura.yield %72, %73 : !neura.data, !neura.data +// CHECK-INIT-PATTERN-NEXT: neura.yield results(%72, %73 : !neura.data, !neura.data) // CHECK-INIT-PATTERN-NEXT: }) : (!neura.data, !neura.data) -> (!neura.data, !neura.data) // CHECK-INIT-PATTERN-NEXT: %22 = "neura.fused_op"(%18, %20) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({ // CHECK-INIT-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data): // CHECK-INIT-PATTERN-NEXT: %72 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-INIT-PATTERN-NEXT: %73 = "neura.load"(%72) : (!neura.data) -> !neura.data -// CHECK-INIT-PATTERN-NEXT: neura.yield %73 : !neura.data +// CHECK-INIT-PATTERN-NEXT: neura.yield results(%73 : !neura.data) // CHECK-INIT-PATTERN-NEXT: }) : (!neura.data, !neura.data) -> !neura.data // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/steer_ctrl/for_with_if.mlir b/test/neura/steer_ctrl/for_with_if.mlir index fe145f99..1fb72aac 100644 --- a/test/neura/steer_ctrl/for_with_if.mlir +++ b/test/neura/steer_ctrl/for_with_if.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/steer_ctrl/loop_with_return_value.mlir b/test/neura/steer_ctrl/loop_with_return_value.mlir index e9e1bf11..1104a7a7 100644 --- a/test/neura/steer_ctrl/loop_with_return_value.mlir +++ b/test/neura/steer_ctrl/loop_with_return_value.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -22,7 +22,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/steer_ctrl/loop_without_return_value.mlir b/test/neura/steer_ctrl/loop_without_return_value.mlir index 55e28f08..7311c543 100644 --- a/test/neura/steer_ctrl/loop_without_return_value.mlir +++ b/test/neura/steer_ctrl/loop_without_return_value.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/optimization/constant_folding/simple_loop.mlir b/test/optimization/constant_folding/simple_loop.mlir index 483a042c..4df60b58 100644 --- a/test/optimization/constant_folding/simple_loop.mlir +++ b/test/optimization/constant_folding/simple_loop.mlir @@ -1,5 +1,5 @@ // RUN: 
mlir-neura-opt %s \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: | FileCheck %s -check-prefix=FOLD From 3907f67cc5fb513cc8fb979d0f8b34aabcdf9a34 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 15:59:37 +0800 Subject: [PATCH 08/25] enable canonicalize-return for neura.kernel --- .../Transforms/AssignAcceleratorPass.cpp | 33 +- .../Transforms/CanonicalizeReturnPass.cpp | 321 +++++++++++++++--- 2 files changed, 302 insertions(+), 52 deletions(-) diff --git a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp index 11688539..47f5771b 100644 --- a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp @@ -12,6 +12,16 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { +// Checks if a function contains any neura.kernel operations. +static bool containsNeuraKernelOp(FunctionOpInterface func_op) { + bool has_kernel = false; + func_op.walk([&](neura::KernelOp kernel_op) { + has_kernel = true; + return WalkResult::interrupt(); + }); + return has_kernel; +} + struct AssignAcceleratorPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AssignAcceleratorPass) @@ -25,19 +35,28 @@ struct AssignAcceleratorPass ModuleOp module = getOperation(); Builder builder(&getContext()); + // Firstly assigns accelerator to all neura.kernel ops. + module.walk([&](neura::KernelOp kernel_op) { + // Handles neura.kernel ops. + if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + kernel_op->setAttr(mlir::accel::kAcceleratorAttr, + builder.getStringAttr(mlir::accel::kNeuraTarget)); + } + }); + + // Secondly assigns accelerator to functions. + // Skips functions that: + // 1. Are named "main"; + // 2. Already have accelerator attribute; + // 3. Contain neura.kernel operations. module.walk([&](Operation *op) { if (auto func = dyn_cast(op)) { if (func.getName() != "main" && !func.isExternal() && - !func->hasAttr(mlir::accel::kAcceleratorAttr)) { + !func->hasAttr(mlir::accel::kAcceleratorAttr) && + !containsNeuraKernelOp(func)) { func->setAttr(mlir::accel::kAcceleratorAttr, builder.getStringAttr(mlir::accel::kNeuraTarget)); } - } else if (neura::KernelOp kernel_op = dyn_cast(op)) { - // Handles neura.kernel ops as well. - if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { - kernel_op->setAttr(mlir::accel::kAcceleratorAttr, - builder.getStringAttr(mlir::accel::kNeuraTarget)); - } } }); } diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 46c5407e..22e3869c 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -20,8 +20,9 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -// Return type attribute values. +// Return/Yield type attribute values. constexpr const char *kReturnTypeAttr = "return_type"; +constexpr const char *kYieldTypeAttr = "yield_type"; constexpr const char *kReturnTypeVoid = "void"; constexpr const char *kReturnTypeValue = "value"; @@ -58,6 +59,197 @@ static void processReturns(Region ®ion, OpBuilder &builder) { } } +// Processes neura.yield operations in kernel regions. 
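+// Three situations are distinguished below: a yield that carries results
+// is tagged yield_type = "value"; an empty yield in a kernel that has a
+// counter is rebuilt with the counter index as its trigger result; an
+// empty yield with no counter is tagged "void" and handled like a void
+// return. Illustrative sketch (assumed kernel IR, not copied from a test):
+//   neura.yield results(%sum : !neura.data)   // tagged "value"
+//   neura.yield                               // empty: trigger or "void"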
+static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { + SmallVector empty_yields; + + kernel_op.walk([&](neura::YieldOp yield_op) { + llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; + llvm::errs() << yield_op << "\n"; + + // Case 1: yield has results - mark as value type. + if (yield_op.getResults().size() > 0) { + llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; + yield_op->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeValue)); + return; + } + + // Case 2 & 3: yield has no results. + empty_yields.push_back(yield_op); + }); + + // Processes empty yields. + for (neura::YieldOp yield_op : empty_yields) { + llvm::errs() << "[canonicalize] Processing empty neura.yield...\n"; + + // Searches for counters in the kernel. + neura::CounterOp root_counter = nullptr; + neura::CounterOp any_counter = nullptr; + + kernel_op.walk([&](neura::CounterOp counter_op) { + any_counter = counter_op; + + if (counter_op.getCounterTypeAttr() && + counter_op.getCounterTypeAttr().getValue() == "root") { + root_counter = counter_op; + } + }); + + // Case 2: Has counter - uses counter as trigger. + if (root_counter || any_counter) { + Value trigger_value = root_counter ? root_counter.getCurrentIndex() + : any_counter.getCurrentIndex(); + + llvm::errs() << "[canonicalize] Using " + << (root_counter ? "root" : "leaf") + << " counter as trigger.\n"; + + // Creates new yield with trigger value as result. + builder.setInsertionPoint(yield_op); + + SmallVector iter_args_next(yield_op.getIterArgsNext()); + SmallVector results = {trigger_value}; + + auto new_yield = builder.create(yield_op.getLoc(), + iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeVoid)); + + yield_op.erase(); + } else { + // Case 3: No counter - mark for void processing (similar to return). + llvm::errs() + << "[canonicalize] No counter found, marking as void yield\n"; + yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + } + } +} + +// Processes empty yield void blocks (similar to processEmptyReturnVoidBlock). +static void processEmptyYieldVoidBlock(Block *yield_block, + neura::YieldOp void_yield_op, + OpBuilder &builder) { + SmallVector predecessor_blocks(yield_block->getPredecessors()); + + // Entry block with yield_void is unreachable; no action needed. + if (predecessor_blocks.empty()) { + llvm::errs() + << "[canonicalize] Entry block with void yield is unreachable\n"; + return; + } + + // Separates predecessor blocks into cond_br and br blocks. + SmallVector cond_br_preds; + SmallVector br_preds; + + for (Block *pred_block : predecessor_blocks) { + Operation *terminator = pred_block->getTerminator(); + if (isa(terminator)) { + cond_br_preds.push_back(pred_block); + } else if (isa(terminator)) { + br_preds.push_back(pred_block); + } + } + + // Handles br_preds: copy yield_void to pred_block with a trigger value. + for (Block *pred_block : br_preds) { + neura::Br br = cast(pred_block->getTerminator()); + + // Finds a suitable trigger value in the predecessor block. 
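+    // The search below scans the block in reverse and takes the first
+    // result-producing op (skipping the branch itself) as the trigger;
+    // if the block defines no values, an error is reported and the block
+    // is left unchanged.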
+ Value trigger_value = nullptr; + + for (Operation &op : llvm::reverse(*pred_block)) { + if (&op == br) { + continue; + } + + if (op.getNumResults() > 0) { + trigger_value = op.getResult(0); + break; + } + } + + if (!trigger_value) { + llvm::errs() << "[canonicalize] Error: No suitable value found in " + "predecessor block\n"; + return; + } + + builder.setInsertionPoint(br); + + SmallVector iter_args_next(void_yield_op.getIterArgsNext()); + SmallVector results = {trigger_value}; + + auto new_yield = + builder.create(br.getLoc(), iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + br.erase(); + } + + // If there are no cond_br predecessors, remove the yield_void block. + if (cond_br_preds.empty()) { + void_yield_op.erase(); + yield_block->erase(); + return; + } + + // Handles cond_preds: add a block argument for the trigger value. + BlockArgument trigger_arg = + yield_block->addArgument(builder.getI1Type(), void_yield_op.getLoc()); + + // Updates each cond_pred block's terminator to pass the trigger value. + for (Block *pred_block : cond_br_preds) { + neura::CondBr cond_br = cast(pred_block->getTerminator()); + Value cond = cond_br.getCondition(); + Value trigger_value = nullptr; + + bool is_true_branch = (cond_br.getTrueDest() == yield_block); + bool is_false_branch = (cond_br.getFalseDest() == yield_block); + + if (is_true_branch && !is_false_branch) { + trigger_value = cond; + } else if (!is_true_branch && is_false_branch) { + builder.setInsertionPoint(cond_br); + Value negated_cond = + builder.create(cond_br.getLoc(), cond.getType(), cond); + trigger_value = negated_cond; + } else { + llvm::errs() << "[canonicalize] Error: Both branches lead to yield\n"; + return; + } + + if (trigger_value) { + SmallVector true_args(cond_br.getTrueArgs()); + SmallVector false_args(cond_br.getFalseArgs()); + + if (is_true_branch) { + true_args.push_back(trigger_value); + } + if (is_false_branch) { + false_args.push_back(trigger_value); + } + + builder.setInsertionPoint(cond_br); + builder.create( + cond_br.getLoc(), cond_br.getCondition(), true_args, false_args, + cond_br.getTrueDest(), cond_br.getFalseDest()); + cond_br.erase(); + } + } + + // Updates the yield_void operation to use the block argument as trigger. + builder.setInsertionPoint(void_yield_op); + + SmallVector iter_args_next(void_yield_op.getIterArgsNext()); + SmallVector results = {trigger_arg}; + + auto new_yield = builder.create(void_yield_op.getLoc(), + iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + void_yield_op.erase(); +} + static void processEmptyReturnVoidBlock(Block *ret_block, neura::ReturnOp void_ret_op, OpBuilder &builder) { @@ -177,7 +369,7 @@ static void processEmptyReturnVoidBlock(Block *ret_block, } struct CanonicalizeReturnPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeReturnPass) StringRef getArgument() const override { return "canonicalize-return"; } @@ -190,58 +382,97 @@ struct CanonicalizeReturnPass } void runOnOperation() override { - func::FuncOp func_op = getOperation(); - // Checks for neura accelerator attribute. - auto accel_attr = - func_op->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr) { - return; - } - - Region ®ion = func_op.getBody(); - if (region.empty()) { - return; - } + ModuleOp module_op = getOperation(); + OpBuilder builder(module_op.getContext()); + + // Processes all functions. 
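+    // (Previously this pass was anchored on func::FuncOp; running on the
+    // module lets one invocation canonicalize both function bodies and
+    // the neura.kernel regions handled in the second walk below.)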
+ module_op.walk([&](func::FuncOp func_op) { + // Checks for neura accelerator attribute. + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr) { + return; + } - OpBuilder builder(func_op.getContext()); + Region ®ion = func_op.getBody(); + if (region.empty()) { + return; + } - // Step 1: Marks empty returns with "void" attribute. - processReturns(region, builder); + // Step 1: Marks empty returns with "void" attribute. + processReturns(region, builder); - if (!isVoidFunction(func_op)) { - llvm::errs() << "[ctrl2data] Function is not void, no further action " - "needed.\n"; - return; - } + if (!isVoidFunction(func_op)) { + llvm::errs() << "[ctrl2data] Function is not void, no further action " + "needed.\n"; + return; + } - // Step 2: Collects all return operations with "is_void" attribute. - SmallVector ret_void_ops; - region.walk([&](neura::ReturnOp ret_op) { - if (ret_op->hasAttr(kReturnTypeAttr)) { - if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)).getValue() == - kReturnTypeVoid) { - ret_void_ops.push_back(ret_op); + // Step 2: Collects all return operations with "is_void" attribute. + SmallVector ret_void_ops; + region.walk([&](neura::ReturnOp ret_op) { + if (ret_op->hasAttr(kReturnTypeAttr)) { + if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)) + .getValue() == kReturnTypeVoid) { + ret_void_ops.push_back(ret_op); + } + } + }); + + // Step 3: Processes each return_void block. + for (neura::ReturnOp ret_void_op : ret_void_ops) { + Block *ret_block = ret_void_op->getBlock(); + + // Checks if ret_block only contains the return_void operation. + bool is_empty_block = (ret_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); + } else { + // TODO: Handle non-empty return blocks. + // The basic idea is to create a new block that only contains the + // return_void operation, and redirect the original return block to + // this new block. + assert(false && "Unsupported case: return block is not empty."); } } }); - // Step 3: Processes each return_void block. - for (neura::ReturnOp ret_void_op : ret_void_ops) { - Block *ret_block = ret_void_op->getBlock(); - - // Checks if ret_block only contains the return_void operation. - bool is_empty_block = (ret_block->getOperations().size() == 1); - - if (is_empty_block) { - processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); - } else { - // TODO: Handle non-empty return blocks. - // The basic idea is to create a new block that only contains the - // return_void operation, and redirect the original return block to this - // new block. - assert(false && "Unsupported case: return block is not empty."); + // Processes all neura.kernel operations. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr) { + return; } - } + + // Step 1: Processes yields (handles cases 1 & 2) + processYields(kernel_op, builder); + + // Step 2: Collects void yields without trigger values (case 3). 
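+      // Case 3 arises when the kernel has no counter: the void yield
+      // carries no trigger operand, so one must be borrowed from a
+      // predecessor block, mirroring the return_void handling above.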
+ SmallVector yield_void_ops; + kernel_op.walk([&](neura::YieldOp yield_op) { + if (yield_op->hasAttr(kYieldTypeAttr)) { + if (dyn_cast(yield_op->getAttr(kYieldTypeAttr)) + .getValue() == kReturnTypeVoid && + yield_op.getResults().size() == 0) { + yield_void_ops.push_back(yield_op); + } + } + }); + + // Step 3: Processes each yield_void block (case 3) + for (neura::YieldOp yield_void_op : yield_void_ops) { + Block *yield_block = yield_void_op->getBlock(); + bool is_empty_block = (yield_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyYieldVoidBlock(yield_block, yield_void_op, builder); + } else { + assert(false && "Unsupported case: yield block is not empty."); + } + } + }); } }; } // namespace From 8ae81872d611252613fc11d8a5a9c605fb2dfa55 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 16:40:30 +0800 Subject: [PATCH 09/25] enable leverage-predicated-values for neura.kernel --- .../Transforms/CanonicalizeLiveInPass.cpp | 26 +++++ .../LeveragePredicatedValuePass.cpp | 99 ++++++++++++------- 2 files changed, 91 insertions(+), 34 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp index 56f72a06..e02ebcc8 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp @@ -794,6 +794,8 @@ struct CanonicalizeLiveInPass void runOnOperation() override { ModuleOp module_op = getOperation(); + + // Processes functions. module_op.walk([&](Operation *op) { Region *region = nullptr; if (auto func_op = dyn_cast(op)) { @@ -827,6 +829,30 @@ struct CanonicalizeLiveInPass return; } }); + + // Processes neura.kernel operations. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } + + Region &kernel_region = kernel_op.getBody(); + if (kernel_region.empty()) { + return; + } + + // Creates dominance info for the kernel region. + DominanceInfo dom_info(kernel_op); + PostDominanceInfo post_dom_info(kernel_op); + + if (failed(promoteLiveInValuesToBlockArgs(kernel_region, dom_info, + post_dom_info))) { + signalPassFailure(); + return; + } + }); } }; } // namespace diff --git a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp index 56516c0e..54bc73b5 100644 --- a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp +++ b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp @@ -7,6 +7,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" using namespace mlir; @@ -39,50 +40,80 @@ struct LeveragePredicatedValuePass if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { return; } - // Converts block argument types to predicated values. - func.walk([&](Block *block) { - // skips the entry (first) block of the function. 
- if (block == &block->getParent()->front()) { - return; - } - - for (BlockArgument arg : block->getArguments()) { - Type orig_type = arg.getType(); + if (failed(processRegion(func.getFunctionBody()))) { + llvm::errs() << "Failed to process function: " << func.getName() + << "\n"; + signalPassFailure(); + return; + } + }); - // Avoid double-wrapping if already predicated - if (llvm::isa(orig_type)) { - continue; - } + // Processes each neura.kernel operation. + module.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } - auto predicated_type = neura::PredicatedValue::get( - func.getContext(), orig_type, - IntegerType::get(func.getContext(), 1)); - arg.setType(predicated_type); - } - }); - - // Gets operations in topological order (operands before users). - SmallVector orderedOps; - getOperationsInTopologicalOrder(func, orderedOps); - - // Processes each operation in order. - for (Operation *op : orderedOps) { - if (failed(applyPredicatedDataType(op))) { - llvm::errs() << "Failed to convert op to predicated form: " << *op - << "\n"; - signalPassFailure(); - return; - } + if (failed(processRegion(kernel_op.getBody()))) { + llvm::errs() << "Failed to process neura.kernel operation: " + << *kernel_op << "\n"; + signalPassFailure(); + return; } }); } private: + // Processes a region (function body or kernel body). + LogicalResult processRegion(Region ®ion) { + if (region.empty()) { + return success(); + } + + for (Block &block : region) { + // Skips the entry (first) block of the function. + if (&block == ®ion.front()) { + continue; + } + + for (BlockArgument arg : block.getArguments()) { + Type orig_type = arg.getType(); + + // Avoids double-wrapping if already predicated. + if (llvm::isa(orig_type)) { + continue; + } + + auto predicated_type = neura::PredicatedValue::get( + region.getContext(), orig_type, + IntegerType::get(region.getContext(), 1)); + arg.setType(predicated_type); + } + } + + // Gets operations in topological order (operands before users). + SmallVector ordered_ops; + getOperationsInTopologicalOrder(region, ordered_ops); + + // Processes each operation in order. + for (Operation *op : ordered_ops) { + if (failed(applyPredicatedDataType(op))) { + llvm::errs() << "Failed to convert op to predicated form: " << *op + << "\n"; + return failure(); + } + } + + return success(); + } + // Gets operations in topological order. - void getOperationsInTopologicalOrder(FunctionOpInterface func, + void getOperationsInTopologicalOrder(Region ®ion, SmallVector &ordered) { DenseSet visited; - func.walk([&](Operation *op) { + region.walk([&](Operation *op) { // Uses standard DFS to build topological order. 
if (visited.contains(op)) { return; From deffa0a7f6aba0d00b5509df393885fbfc46750a Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 13:30:35 +0800 Subject: [PATCH 10/25] enable kernel with counters dataflow lowering --- include/NeuraDialect/NeuraOps.td | 21 + .../Transforms/CanonicalizeReturnPass.cpp | 261 +++---------- .../Transforms/PromoteInputArgToConstPass.cpp | 54 +-- .../TransformCtrlToDataFlowPass.cpp | 360 ++++++++++++++++-- test/neura/ctrl/branch_for.mlir | 3 +- 5 files changed, 440 insertions(+), 259 deletions(-) diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 80006ce6..7aa0d783 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -780,6 +780,7 @@ def Neura_LoopControlOp : Op{ // " `(``parent_valid` `=` $parentValid `,` `start` `=` $start `,` `end` `=` $end `,` `step` `=` $step`)` attr-dict `:` type($parentValid) `,` type($start) `,` type($end) `,` type($step) `->` type($nextindex) `,` type($valid)"; } +// Defines an operation for hardware loop counters. def Neura_CounterOp : Op{ let summary = "Hardware loop counter for CGRA execution."; let description = [{ @@ -809,6 +810,26 @@ def Neura_CounterOp : Op{ let assemblyFormat = "attr-dict `:` type($current_index)"; } +// Defines an operation to extract the predicate bit from a predicated value. +def Neura_ExtractPredicateOp : Op{ + let summary = "Extracts the predicate bit from a predicated value."; + let description = [{ + Extracts the predicate bit from a predicated value, + producing a boolean predicated value: !neura.predicated. + + Example: + %counter = neura.counter {bound = 16} : !neura.predicated + %is_valid = neura.extract_predicate %counter : !neura.predicated -> !neura.predicated + + // Use for gating final results: + %gated = neura.grant_predicate %result, %is_valid + }]; + + let arguments = (ins AnyType:$input); + let results = (outs AnyType:$predicate); + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($predicate)"; +} + // ---------------------------------------------------- // Defines operations for steering-control based DFG execution. // ---------------------------------------------------- diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 22e3869c..1ce7fe9b 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -39,6 +39,16 @@ static bool isVoidFunction(func::FuncOp func_op) { return false; } +// Checks if kernel has any counter. +static bool kernelHasCounter(neura::KernelOp kernel_op) { + bool has_counter = false; + kernel_op.walk([&](neura::CounterOp counter_op) { + has_counter = true; + return WalkResult::interrupt(); + }); + return has_counter; +} + // Marks empty returns with "is_void" attribute and adds trigger values. static void processReturns(Region ®ion, OpBuilder &builder) { SmallVector empty_returns; @@ -59,195 +69,65 @@ static void processReturns(Region ®ion, OpBuilder &builder) { } } -// Processes neura.yield operations in kernel regions. -static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { - SmallVector empty_yields; - - kernel_op.walk([&](neura::YieldOp yield_op) { - llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; - llvm::errs() << yield_op << "\n"; - - // Case 1: yield has results - mark as value type. 
- if (yield_op.getResults().size() > 0) { - llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; - yield_op->setAttr(kYieldTypeAttr, - builder.getStringAttr(kReturnTypeValue)); - return; - } - - // Case 2 & 3: yield has no results. - empty_yields.push_back(yield_op); - }); - - // Processes empty yields. - for (neura::YieldOp yield_op : empty_yields) { - llvm::errs() << "[canonicalize] Processing empty neura.yield...\n"; +// Converts yields to returns (for kernels without counters). +static void convertYieldsToReturns(neura::KernelOp kernel_op, + OpBuilder &builder) { + SmallVector yields_to_convert; - // Searches for counters in the kernel. - neura::CounterOp root_counter = nullptr; - neura::CounterOp any_counter = nullptr; - - kernel_op.walk([&](neura::CounterOp counter_op) { - any_counter = counter_op; - - if (counter_op.getCounterTypeAttr() && - counter_op.getCounterTypeAttr().getValue() == "root") { - root_counter = counter_op; - } - }); + // Collects all yields in kernel. + kernel_op.walk( + [&](neura::YieldOp yield_op) { yields_to_convert.push_back(yield_op); }); - // Case 2: Has counter - uses counter as trigger. - if (root_counter || any_counter) { - Value trigger_value = root_counter ? root_counter.getCurrentIndex() - : any_counter.getCurrentIndex(); + for (neura::YieldOp yield_op : yields_to_convert) { + llvm::errs() << "[canonicalize] Converting yield to return: " << yield_op + << "\n"; - llvm::errs() << "[canonicalize] Using " - << (root_counter ? "root" : "leaf") - << " counter as trigger.\n"; + builder.setInsertionPoint(yield_op); - // Creates new yield with trigger value as result. - builder.setInsertionPoint(yield_op); - - SmallVector iter_args_next(yield_op.getIterArgsNext()); - SmallVector results = {trigger_value}; - - auto new_yield = builder.create(yield_op.getLoc(), - iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, - builder.getStringAttr(kReturnTypeVoid)); - - yield_op.erase(); + if (yield_op.getResults().size() > 0) { + // Yield with results → return with values + llvm::errs() << "[canonicalize] Yield has results\n"; + builder.create(yield_op.getLoc(), yield_op.getResults()); } else { - // Case 3: No counter - mark for void processing (similar to return). - llvm::errs() - << "[canonicalize] No counter found, marking as void yield\n"; - yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + // Yield without results → return without operands (empty return) + llvm::errs() << "[canonicalize] Yield is void\n"; + builder.create(yield_op.getLoc(), ValueRange{}); } - } -} - -// Processes empty yield void blocks (similar to processEmptyReturnVoidBlock). -static void processEmptyYieldVoidBlock(Block *yield_block, - neura::YieldOp void_yield_op, - OpBuilder &builder) { - SmallVector predecessor_blocks(yield_block->getPredecessors()); - // Entry block with yield_void is unreachable; no action needed. - if (predecessor_blocks.empty()) { - llvm::errs() - << "[canonicalize] Entry block with void yield is unreachable\n"; - return; + yield_op.erase(); } +} - // Separates predecessor blocks into cond_br and br blocks. - SmallVector cond_br_preds; - SmallVector br_preds; - - for (Block *pred_block : predecessor_blocks) { - Operation *terminator = pred_block->getTerminator(); - if (isa(terminator)) { - cond_br_preds.push_back(pred_block); - } else if (isa(terminator)) { - br_preds.push_back(pred_block); - } - } - - // Handles br_preds: copy yield_void to pred_block with a trigger value. 
- for (Block *pred_block : br_preds) { - neura::Br br = cast(pred_block->getTerminator()); - - // Finds a suitable trigger value in the predecessor block. - Value trigger_value = nullptr; - - for (Operation &op : llvm::reverse(*pred_block)) { - if (&op == br) { - continue; - } - - if (op.getNumResults() > 0) { - trigger_value = op.getResult(0); - break; +// Processes neura.yield operations in kernel regions. +static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { + // Checks if kernel has counter. + bool has_counter = kernelHasCounter(kernel_op); + + if (has_counter) { + // Case 1: kernel has counter -> keep yields, just marks them. + kernel_op.walk([&](neura::YieldOp yield_op) { + llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; + llvm::errs() << yield_op << "\n"; + + // yield has results - mark as value type. + if (yield_op.getResults().size() > 0) { + llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; + yield_op->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeValue)); + return; } - } - - if (!trigger_value) { - llvm::errs() << "[canonicalize] Error: No suitable value found in " - "predecessor block\n"; - return; - } - builder.setInsertionPoint(br); - - SmallVector iter_args_next(void_yield_op.getIterArgsNext()); - SmallVector results = {trigger_value}; - - auto new_yield = - builder.create(br.getLoc(), iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); - br.erase(); - } - - // If there are no cond_br predecessors, remove the yield_void block. - if (cond_br_preds.empty()) { - void_yield_op.erase(); - yield_block->erase(); - return; - } - - // Handles cond_preds: add a block argument for the trigger value. - BlockArgument trigger_arg = - yield_block->addArgument(builder.getI1Type(), void_yield_op.getLoc()); - - // Updates each cond_pred block's terminator to pass the trigger value. - for (Block *pred_block : cond_br_preds) { - neura::CondBr cond_br = cast(pred_block->getTerminator()); - Value cond = cond_br.getCondition(); - Value trigger_value = nullptr; - - bool is_true_branch = (cond_br.getTrueDest() == yield_block); - bool is_false_branch = (cond_br.getFalseDest() == yield_block); - - if (is_true_branch && !is_false_branch) { - trigger_value = cond; - } else if (!is_true_branch && is_false_branch) { - builder.setInsertionPoint(cond_br); - Value negated_cond = - builder.create(cond_br.getLoc(), cond.getType(), cond); - trigger_value = negated_cond; - } else { - llvm::errs() << "[canonicalize] Error: Both branches lead to yield\n"; - return; - } - - if (trigger_value) { - SmallVector true_args(cond_br.getTrueArgs()); - SmallVector false_args(cond_br.getFalseArgs()); - - if (is_true_branch) { - true_args.push_back(trigger_value); - } - if (is_false_branch) { - false_args.push_back(trigger_value); - } + // yield has NO results, marks as "void" type. + yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + }); + } else { + // Case 2: kernel has NO counter -> converts yields to direct returns. + llvm::errs() + << "[canonicalize] No counter -> converting yields to returns\n"; + convertYieldsToReturns(kernel_op, builder); - builder.setInsertionPoint(cond_br); - builder.create( - cond_br.getLoc(), cond_br.getCondition(), true_args, false_args, - cond_br.getTrueDest(), cond_br.getFalseDest()); - cond_br.erase(); - } + // No marks the returns we converted. } - - // Updates the yield_void operation to use the block argument as trigger. 
- builder.setInsertionPoint(void_yield_op); - - SmallVector iter_args_next(void_yield_op.getIterArgsNext()); - SmallVector results = {trigger_arg}; - - auto new_yield = builder.create(void_yield_op.getLoc(), - iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); - void_yield_op.erase(); } static void processEmptyReturnVoidBlock(Block *ret_block, @@ -439,6 +319,9 @@ struct CanonicalizeReturnPass }); // Processes all neura.kernel operations. + // There are two cases to handle: + // 1) kernel with counters - the return process is triggered by the counter. + // 2) kernel without counters - same logic as function return. module_op.walk([&](neura::KernelOp kernel_op) { auto accel_attr = kernel_op->getAttrOfType(accel::kAcceleratorAttr); @@ -446,32 +329,8 @@ struct CanonicalizeReturnPass return; } - // Step 1: Processes yields (handles cases 1 & 2) + // Step 1: Processes yields. processYields(kernel_op, builder); - - // Step 2: Collects void yields without trigger values (case 3). - SmallVector yield_void_ops; - kernel_op.walk([&](neura::YieldOp yield_op) { - if (yield_op->hasAttr(kYieldTypeAttr)) { - if (dyn_cast(yield_op->getAttr(kYieldTypeAttr)) - .getValue() == kReturnTypeVoid && - yield_op.getResults().size() == 0) { - yield_void_ops.push_back(yield_op); - } - } - }); - - // Step 3: Processes each yield_void block (case 3) - for (neura::YieldOp yield_void_op : yield_void_ops) { - Block *yield_block = yield_void_op->getBlock(); - bool is_empty_block = (yield_block->getOperations().size() == 1); - - if (is_empty_block) { - processEmptyYieldVoidBlock(yield_block, yield_void_op, builder); - } else { - assert(false && "Unsupported case: yield block is not empty."); - } - } }); } }; diff --git a/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp index 7889922c..39edd185 100644 --- a/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp +++ b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp @@ -20,36 +20,9 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -/** - * @brief Specializes a region by "internalizing" its input arguments as - * constants. - * - * This function performs a redirection of the dataflow. It identifies all - * input arguments of the entry block, creates a corresponding - * `neura::ConstantOp` for each, and re-links all internal operations to use - * these constants instead of the original block parameters. - * - * ### Example Transformation: - * * **Before:** - * @code - * func.func @compute(%arg0: i32) { - * %0 = arith.addi %arg0, %arg0 : i32 - * return %0 : i32 - * } - * @endcode - * * **After:** - * @code - * func.func @compute(%arg0: i32) { - * %0 = "neura.constant"() {value = "%arg0"} : () -> i32 - * %1 = arith.addi %0, %0 : i32 // Uses replaced - * return %1 : i32 - * } - * @endcode - * - * @param region The MLIR Region (typically a function body) to transform. - * @return Success if the transformation was applied (even if the region was - * empty). - */ +// Attribute name to mark iter_arg init constants. +constexpr const char *kIterArgInitAttr = "is_iter_arg_init"; + LogicalResult promoteFunctionArgsToConstants(Region ®ion) { if (region.empty()) { return success(); @@ -94,7 +67,7 @@ LogicalResult promoteKernelArgsToConstants(neura::KernelOp kernel_op) { assert(args.size() == num_inputs + num_iter_args && "Kernel block arguments size mismatch"); - // Only promotes input arguments (not iter_args). 
+  // Step 1: promotes input arguments (not iter_args).
   // Block arguments layout: [input0, input1, ..., iter_arg0, iter_arg1, ...]
   for (size_t i = 0; i < num_inputs; ++i) {
     BlockArgument input_arg = args[i];
@@ -109,9 +82,22 @@
     input_arg.replaceAllUsesWith(const_op.getResult());
   }
 
-  // Note: iter_args (args[num_inputs] to args[num_inputs + num_iter_args - 1])
-  // are NOT promoted here. They will be handled in transform-ctrl-to-data-flow
-  // pass.
+  // Step 2: promotes iter_args_init values to constants with a special
+  // attribute.
+  for (size_t i = 0; i < num_iter_args; i++) {
+    BlockArgument iter_arg = args[num_inputs + i];
+
+    // Creates a constant for this iter_arg_init value.
+    std::string const_name = "%iter_arg_init" + std::to_string(i);
+    auto const_op =
+        builder.create<neura::ConstantOp>(iter_arg.getLoc(), iter_arg.getType(),
+                                          builder.getStringAttr(const_name));
+
+    // Marks this constant as an iter_arg init value.
+    const_op->setAttr(kIterArgInitAttr, builder.getBoolAttr(true));
+
+    // Replaces all uses of this iter_arg argument with the constant.
+    iter_arg.replaceAllUsesWith(const_op.getResult());
+  }
 
   return success();
 }
diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 9f8f2a86..6847d04d 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -24,9 +24,67 @@ using namespace mlir;
 
 #define GEN_PASS_DEF_TRANSFORMCTRLTODATAFLOW
 #include "NeuraDialect/NeuraPasses.h.inc"
 
+// Attribute name to mark iter_arg init constants.
+constexpr const char *kIterArgInitAttr = "is_iter_arg_init";
+
+//---------------------------------------------------------------------------
+// Checks if the task (kernel) contains any counter.
+//---------------------------------------------------------------------------
+bool taskHasCounter(neura::KernelOp kernel_op) {
+  if (!kernel_op) {
+    return false;
+  }
+
+  bool found_counter = false;
+  kernel_op.walk([&](neura::CounterOp counter_op) {
+    if (counter_op) {
+      found_counter = true;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  return found_counter;
+}
+
+//---------------------------------------------------------------------------
+// Finds the root counter in the neura.kernel operation, falling back to a
+// leaf counter if no root exists.
+//---------------------------------------------------------------------------
+neura::CounterOp findRootCounterInKernel(neura::KernelOp kernel_op) {
+  if (!kernel_op) {
+    return nullptr;
+  }
+
+  neura::CounterOp root_counter = nullptr;
+  neura::CounterOp leaf_counter = nullptr;
+
+  // Walks through the kernel body to find counters.
+  kernel_op.walk([&](neura::CounterOp counter_op) {
+    StringRef counter_type = counter_op.getCounterType();
+
+    if (counter_type == "root") {
+      root_counter = counter_op;
+    } else if (counter_type == "leaf") {
+      leaf_counter = counter_op;
+    }
+  });
+
+  if (root_counter) {
+    return root_counter;
+  } else if (leaf_counter) {
+    return leaf_counter;
+  }
+
+  return nullptr;
+}
+
 // Inserts `grant_once` for every predicated value defined in the entry block
 // that is used outside of the block (i.e., a live-out).
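+// (Rationale, as an inference from the early return added below: a
+// counter-driven kernel already produces its first valid token from the
+// hardware counter, so entry-block live-outs need no grant_once seeding.)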
-void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder) { +void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder, + bool has_task_counter = false) { + if (has_task_counter) { + return; + } SmallVector live_out_arg_values; SmallVector live_out_non_arg_values; @@ -86,6 +144,183 @@ void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder) { } } +//--------------------------------------------------------------------------- +// Iter_args handling (always grant_once for now). +//--------------------------------------------------------------------------- +void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block, + OpBuilder &builder, + SmallVector &iter_arg_final_values) { + llvm::errs() << "[iter_args] Handling kernel iter_args...\n"; + + SmallVector iter_arg_init_ops; + for (Operation &op : kernel_op.getOps()) { + if (auto const_op = dyn_cast(op)) { + if (const_op->hasAttr(kIterArgInitAttr) && + const_op->getAttrOfType(kIterArgInitAttr).getValue()) { + iter_arg_init_ops.push_back(const_op); + } + } + } + + if (iter_arg_init_ops.empty()) { + llvm::errs() << "[iter_args] No iter_args\n"; + return; + } + + neura::YieldOp yield_op = nullptr; + for (Operation &op : kernel_op.getOps()) { + if (auto yld = dyn_cast(op)) { + yield_op = yld; + break; + } + } + + if (!yield_op || yield_op.getIterArgsNext().empty()) { + llvm::errs() << "[iter_args] No iter_args_next in yield\n"; + return; + } + + for (size_t i = 0; i < iter_arg_init_ops.size(); ++i) { + neura::ConstantOp init_const = iter_arg_init_ops[i]; + Value feedback_value = yield_op.getIterArgsNext()[i]; + + llvm::errs() << "[iter_args] Processing iter_arg " << i << "\n"; + + // Grants once the init. + builder.setInsertionPointAfter(init_const); + neura::GrantOnceOp granted_init = builder.create( + init_const.getLoc(), init_const.getType(), init_const.getResult()); + + // Creates reserve for feedback value. + builder.setInsertionPointAfter(granted_init); + neura::ReserveOp reserve_op = builder.create( + init_const.getLoc(), init_const.getType()); + + // Creates phi for init and feedback. + builder.setInsertionPointAfter(reserve_op); + neura::PhiOp phi = builder.create( + init_const.getLoc(), init_const.getType(), + ValueRange{granted_init.getResult(), reserve_op.getResult()}); + + // Replaces uses. + init_const.getResult().replaceUsesWithIf( + phi.getResult(), [&](OpOperand &use) { + Operation *user = use.getOwner(); + return user != granted_init && !isa(user); + }); + + // Creates ctrl_mov. + builder.setInsertionPoint(yield_op); + builder.create(yield_op.getLoc(), feedback_value, + reserve_op.getResult()); + + iter_arg_final_values.push_back(feedback_value); + llvm::errs() << "[iter_args] Created iter_arg with grant_once\n"; + } + + llvm::errs() << "[iter_args] Iter_args complete\n\n"; +} + +//--------------------------------------------------------------------------- +// Handles kernel yield with counter-based gating. 
+//--------------------------------------------------------------------------- +void handleKernelYieldTermination( + neura::KernelOp kernel_op, Block *entry_block, OpBuilder &builder, + bool has_task_counter, const SmallVector &iter_arg_final_values) { + llvm::errs() << "[yield] ========================================\n"; + llvm::errs() << "[yield] Handling Yield Termination\n"; + llvm::errs() << "[yield] ========================================\n"; + + neura::YieldOp yield_op = nullptr; + for (Operation &op : kernel_op.getOps()) { + if (auto yld = dyn_cast(op)) { + yield_op = yld; + break; + } + } + + if (!yield_op) { + llvm::errs() << "[yield] No yield operation found\n"; + return; + } + + builder.setInsertionPoint(yield_op); + + if (!yield_op->hasAttr("yield_type")) { + llvm::errs() << "[yield] No yield_type attribute\n"; + yield_op.erase(); + return; + } + + StringRef yield_type = + yield_op->getAttrOfType("yield_type").getValue(); + + //-------------------------------------------------------------------------- + // Case 1: VALUE yield + //-------------------------------------------------------------------------- + if (yield_type == "value") { + llvm::errs() << "[yield] Processing VALUE yield\n"; + + if (has_task_counter) { + llvm::errs() + << "[yield] Has counter → Gate with NOT(counter predicate)\n"; + + // Finds counter in kernel that defines the predicate. + neura::CounterOp counter_op = findRootCounterInKernel(kernel_op); + + assert(counter_op && + "Kernel has outer task counter but no neura::CounterOp found."); + + // Extracts predicate and negates it. + Value counter_value = counter_op.getCurrentIndex(); + + auto pred_type = builder.getType( + builder.getI1Type(), builder.getI1Type()); + + auto extract_pred = builder.create( + counter_op.getLoc(), pred_type, counter_value); + + Value counter_pred = extract_pred.getPredicate(); + + // When the counter predicate is false, we want to trigger the return. + auto not_op = builder.create( + counter_op.getLoc(), counter_pred.getType(), counter_pred); + + Value return_gate = not_op.getResult(); + + llvm::errs() << "[yield] Extracted counter predicate\n"; + llvm::errs() << "[yield] Created NOT gate for return\n"; + + // Gates all results with NOT (counter predicate). + SmallVector gated_results; + for (Value result : yield_op.getResults()) { + auto gated = builder.create( + yield_op.getLoc(), result.getType(), result, return_gate); + gated_results.push_back(gated.getResult()); + + llvm::errs() << "[yield] Gated result with NOT(counter_pred)\n"; + } + + auto return_val = builder.create(yield_op.getLoc(), + gated_results); + llvm::errs() << "[yield] Created return_value with counter gating\n"; + builder.setInsertionPointAfter(return_val); + builder.create(builder.getUnknownLoc()); + + } else { + llvm::errs() << "[yield] No counter, handled as normal case.\n"; + } + yield_op.erase(); + } + //-------------------------------------------------------------------------- + // Case 2: VOID yield + //-------------------------------------------------------------------------- + else if (yield_type == "void") { + llvm::errs() << "[yield] Processing VOID yield\n"; + } + llvm::errs() << "[yield] ========================================\n\n"; +} + // Control flow struct. 
struct ControlFlowInfo { struct Edge { @@ -264,6 +499,8 @@ void buildControlFlowInfo(Region ®ion, ControlFlowInfo &ctrl_info, } else if (auto rt = dyn_cast(terminator)) { llvm::errs() << "[ctrl2data] ReturnOp found: " << *rt << "\n"; + } else if (auto yield = dyn_cast(terminator)) { + llvm::errs() << "[ctrl2data] YieldOp found: " << *yield << "\n"; } else { assert(false && "Unknown terminator operation in control flow graph."); } @@ -480,8 +717,8 @@ void createReserveAndPhiOps( // Transforms control flow into data flow. void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, - DominanceInfo &dom_info, - OpBuilder &builder) { + DominanceInfo &dom_info, OpBuilder &builder, + bool is_kernel = false) { // Asserts that all live-out values are dominated by block arguments. assertLiveOutValuesDominatedByBlockArgs(region); @@ -572,6 +809,10 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, block->erase(); } + if (is_kernel) { + return; + } + // Converts neura.return to return_void or return_value. SmallVector return_ops; for (Operation &op : llvm::make_early_inc_range(*entry_block)) { @@ -710,30 +951,103 @@ struct TransformCtrlToDataFlowPass void runOnOperation() override { ModuleOp module = getOperation(); - module.walk([&](Operation *op) { - Region *region = nullptr; - DominanceInfo domInfo; - OpBuilder builder(op->getContext()); - - if (auto func = dyn_cast(op)) { - auto accel_attr = - func->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { - return; - } - region = &func.getBody(); - domInfo = DominanceInfo(func); - GrantPredicateInEntryBlock(®ion->front(), builder); - assertLiveOutValuesDominatedByBlockArgs(*region); - } else { + + // Step 1: Processes each function with neura target in the module. + module.walk([&](func::FuncOp func_op) { + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { return; } - ControlFlowInfo ctrlInfo; - buildControlFlowInfo(*region, ctrlInfo, domInfo); - transformControlFlowToDataFlow(*region, ctrlInfo, domInfo, builder); + + Region ®ion = func_op.getBody(); + DominanceInfo dom_info(func_op); + OpBuilder builder(func_op.getContext()); + GrantPredicateInEntryBlock(®ion.front(), builder); + assertLiveOutValuesDominatedByBlockArgs(region); + + ControlFlowInfo ctrl_info; + buildControlFlowInfo(region, ctrl_info, dom_info); + transformControlFlowToDataFlow(region, ctrl_info, dom_info, builder); // Converts phi operations to phi_start operations. - convertPhiToPhiStart(*region, builder); + convertPhiToPhiStart(region, builder); + }); + + // Step 2: Processes neura.kernel operation. 
+  // For neura.kernel operations, we need to handle three cases:
+  // Case 1: outer task has counter, no return value
+  //  - Skips grant predicate in entry block
+  //  - Outer counter (root counter) gates the return
+  // Case 2: outer task has counter, with return value
+  //  - Skips grant predicate in entry block
+  //  - Outer counter (root counter) gates the return
+  //  - Inserts extract_predicate from outer counter (root counter) to gate
+  //  return values
+  // Case 3: outer task has no counter, with/without return value
+  //  - Normal grant predicate in entry block
+  //  - Normal transform-ctrl-to-data-flow process
+  module.walk([&](neura::KernelOp kernel_op) {
+    auto accel_attr =
+        kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+    if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+      return;
+    }
+
+    llvm::errs()
+        << "\n[ctrl2data] ========================================\n";
+    llvm::errs() << "[ctrl2data] Processing KERNEL\n";
+    llvm::errs() << "[ctrl2data] ========================================\n";
+
+    Region &kernel_region = kernel_op.getBody();
+    Block *entry_block = &kernel_region.front();
+    OpBuilder builder(kernel_op.getContext());
+    DominanceInfo dom_info(kernel_op);
+
+    // STEP 0: Checks if the kernel has a root counter.
+    bool has_task_counter = taskHasCounter(kernel_op);
+
+    llvm::errs() << "[ctrl2data] Task has counter: "
+                 << (has_task_counter ? "YES" : "NO") << "\n\n";
+
+    SmallVector<Value> iter_arg_final_values;
+
+    // STEP 1: Handles iter_args of the neura.kernel.
+    llvm::errs() << "[ctrl2data] === STEP 1: Handle iter_args ===\n";
+    handleKernelIterArgs(kernel_op, entry_block, builder,
+                         iter_arg_final_values);
+
+    // STEP 2: Grants predicates (only if NO task counter).
+    llvm::errs() << "[ctrl2data] === STEP 2: Grant predicates ===\n";
+    GrantPredicateInEntryBlock(entry_block, builder, has_task_counter);
+
+    // STEP 3: Transforms control flow (if multi-block).
+    if (kernel_region.getBlocks().size() > 1) {
+      llvm::errs() << "[ctrl2data] === STEP 3: Transform control flow ===\n";
+      assertLiveOutValuesDominatedByBlockArgs(kernel_region);
+      ControlFlowInfo ctrl_info;
+      buildControlFlowInfo(kernel_region, ctrl_info, dom_info);
+      transformControlFlowToDataFlow(kernel_region, ctrl_info, dom_info,
+                                     builder, true);
+    } else {
+      llvm::errs() << "[ctrl2data] === STEP 3: Single block (skip) ===\n";
+    }
+    convertPhiToPhiStart(kernel_region, builder);
+
+    // STEP 4: Handles yield termination in neura.kernel.
+ llvm::errs() << "[ctrl2data] === STEP 4: Handle yield ===\n"; + handleKernelYieldTermination(kernel_op, entry_block, builder, + has_task_counter, iter_arg_final_values); + + kernel_op->setAttr(neura::attr::kDataflowMode, + StringAttr::get(kernel_op.getContext(), + neura::attr::val::kModePredicate)); + + llvm::errs() << "[ctrl2data] ========================================\n"; + llvm::errs() << "[ctrl2data] ✅ KERNEL Complete\n"; + llvm::errs() + << "[ctrl2data] ========================================\n\n"; + llvm::errs() << "transformed kernel op:\n" << kernel_op << "\n"; }); } }; diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index a626575e..0a7d6031 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -21,7 +21,8 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: | FileCheck %s -check-prefix=CTRL2DATA +// RUN: -o %t-transformed.mlir +// RU: | FileCheck %s -check-prefix=CTRL2DATA // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ From c5e42ebe53335378df708736131ee2a04541d274 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 13:55:00 +0800 Subject: [PATCH 11/25] enable kernel without counters dataflow lowering --- .../Transforms/CanonicalizeReturnPass.cpp | 41 +++++++++ .../TransformCtrlToDataFlowPass.cpp | 84 +++++++++++++------ 2 files changed, 99 insertions(+), 26 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 1ce7fe9b..d17bf786 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -127,6 +127,8 @@ static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { convertYieldsToReturns(kernel_op, builder); // No marks the returns we converted. + Region &kernel_region = kernel_op.getBody(); + processReturns(kernel_region, builder); } } @@ -248,6 +250,38 @@ static void processEmptyReturnVoidBlock(Block *ret_block, void_ret_op.erase(); } +// Processes void returns in kernel (same logic as function). +static void processVoidReturnsInKernel(neura::KernelOp kernel_op, + OpBuilder &builder) { + Region &kernel_region = kernel_op.getBody(); + + // Collects all return operations with "void" attribute. + SmallVector ret_void_ops; + kernel_region.walk([&](neura::ReturnOp ret_op) { + if (ret_op->hasAttr(kReturnTypeAttr)) { + if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)).getValue() == + kReturnTypeVoid) { + ret_void_ops.push_back(ret_op); + } + } + }); + + llvm::errs() << "[canonicalize] Found " << ret_void_ops.size() + << " void returns in kernel\n"; + + // Processes each return_void block. + for (neura::ReturnOp ret_void_op : ret_void_ops) { + Block *ret_block = ret_void_op->getBlock(); + bool is_empty_block = (ret_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); + } else { + assert(false && "Unsupported case: return block is not empty."); + } + } +} + struct CanonicalizeReturnPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeReturnPass) @@ -331,6 +365,13 @@ struct CanonicalizeReturnPass // Step 1: Processes yields. processYields(kernel_op, builder); + + // Step 2: If yields are converted to returns, processes void returns. 
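For a kernel without a counter, the net effect of processYields here can be sketched as follows (generic syntax; the attribute strings are assumed spellings of the kYieldTypeAttr and kReturnTypeAttr constants):

    "neura.yield"() {yield_type = "void"} : () -> ()     // before canonicalize-return
    "neura.return"() {return_type = "void"} : () -> ()   // after; lowered like a plain function return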
+ bool has_counter = kernelHasCounter(kernel_op); + if (!has_counter) { + llvm::errs() << "[canonicalize] Processing void returns in kernel\n"; + processVoidReturnsInKernel(kernel_op, builder); + } }); } }; diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 6847d04d..556d6181 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -718,7 +718,7 @@ void createReserveAndPhiOps( // Transforms control flow into data flow. void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, DominanceInfo &dom_info, OpBuilder &builder, - bool is_kernel = false) { + bool has_task_counter = false) { // Asserts that all live-out values are dominated by block arguments. assertLiveOutValuesDominatedByBlockArgs(region); @@ -809,7 +809,7 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, block->erase(); } - if (is_kernel) { + if (has_task_counter) { return; } @@ -851,29 +851,24 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, builder.create(builder.getUnknownLoc()); // Sets the "dataflow_mode" attribute to "predicate" for the parent - // function. - if (auto func = dyn_cast(region.getParentOp())) { - if (!func->hasAttr(neura::attr::kDataflowMode)) { - func->setAttr( - neura::attr::kDataflowMode, - StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); - llvm::errs() - << "[ctrl2data] Set dataflow mode to predicate for function: " - << func.getName() << "\n"; - } else { - llvm::errs() << "[ctrl2data] Function " << func.getName() - << " already has dataflow_mode set to " - << func->getAttrOfType( - neura::attr::kDataflowMode) - .getValue() - << "\n"; - func->setAttr( - neura::attr::kDataflowMode, - StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); - } + // function/kernel. + Operation *parent_op = region.getParentOp(); + llvm::errs() << "[ctrl2data] Parent operation: " << *parent_op << "\n"; + if (auto func = dyn_cast(parent_op)) { + func->setAttr( + neura::attr::kDataflowMode, + StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); + llvm::errs() << "[ctrl2data] Set dataflow mode to predicate for function: " + << func.getName() << "\n"; + } else if (auto kernel = dyn_cast(parent_op)) { + // Parent is a kernel. + kernel->setAttr( + neura::attr::kDataflowMode, + StringAttr::get(kernel.getContext(), neura::attr::val::kModePredicate)); + llvm::errs() << "[ctrl2data] Set dataflow mode to predicate for kernel.\n"; } else { - assert(false && - "[ctrl2data] Warning: Parent operation is not a func::FuncOp.\n"); + assert(false && "[ctrl2data] Warning: Parent operation is neither a " + "func::FuncOp nor a neura::KernelOp.\n"); } } @@ -982,8 +977,8 @@ struct TransformCtrlToDataFlowPass // Case 2: outer task has counter, with return value // - Skips grant predicate in entry block // - Outer counter (root counter) gates the return - // - Inserts extract_predicate from outer counter (root counter) to gate - // return values + // - Inserts extract_predicate from outer counter (root counter) to + // gate return values // Case 3: outer task has no counter, with/without return value // - Normal grant predicate in entry block // - Normal transfrom-ctrl-to-data-flow process @@ -1009,6 +1004,43 @@ struct TransformCtrlToDataFlowPass llvm::errs() << "[ctrl2data] Task has counter: " << (has_task_counter ? 
"YES" : "NO") << "\n\n"; + if (!has_task_counter) { + llvm::errs() << "[ctrl2data] === Kernel WITHOUT counter ===\n"; + llvm::errs() << "[ctrl2data] Using standard function lowering flow\n\n"; + + // Step 1: Grant predicates in entry block + llvm::errs() << "[ctrl2data] STEP 1: Grant predicates\n"; + GrantPredicateInEntryBlock(entry_block, builder, false); + + // Step 2: Assert live-out values + llvm::errs() << "[ctrl2data] STEP 2: Assert live-out values\n"; + assertLiveOutValuesDominatedByBlockArgs(kernel_region); + + // Step 3: Build control flow info + llvm::errs() << "[ctrl2data] STEP 3: Build control flow info\n"; + ControlFlowInfo ctrl_info; + buildControlFlowInfo(kernel_region, ctrl_info, dom_info); + + // Step 4: Transform control flow to data flow + llvm::errs() << "[ctrl2data] STEP 4: Transform control flow\n"; + transformControlFlowToDataFlow(kernel_region, ctrl_info, dom_info, + builder, + false); // ✅ false = use function logic + + // Step 5: Convert phi to phi_start + llvm::errs() << "[ctrl2data] STEP 5: Convert phi to phi_start\n"; + convertPhiToPhiStart(kernel_region, builder); + + llvm::errs() << "[ctrl2data] ✅ Kernel WITHOUT counter complete\n"; + llvm::errs() + << "[ctrl2data] ========================================\n\n"; + + // Set dataflow mode attribute + kernel_op->setAttr(neura::attr::kDataflowMode, + StringAttr::get(kernel_op.getContext(), + neura::attr::val::kModePredicate)); + return; + } SmallVector iter_arg_final_values; From 2842fae364b7e6fbb951e29fea5f3e2c0b3ea102 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 15:14:17 +0800 Subject: [PATCH 12/25] enable kenrel mapping --- include/NeuraDialect/Mapping/mapping_util.h | 12 +- lib/NeuraDialect/Mapping/mapping_util.cpp | 63 +-- .../Transforms/GraphMining/GraMi.cpp | 475 ++++++++++-------- .../Transforms/InsertDataMovPass.cpp | 112 +++-- .../Transforms/MapToAcceleratorPass.cpp | 403 +++++++++------ .../interpreter/lower_and_interpret.mlir | 2 +- .../interpreter/lower_and_interpret_subf.mlir | 2 +- 7 files changed, 642 insertions(+), 427 deletions(-) diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index 0a36d476..ee6fcefc 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -30,13 +30,13 @@ struct RecurrenceCycle { }; // Collects recurrence cycles rooted at reserve and closed by ctrl_mov. -SmallVector collectRecurrenceCycles(Operation *func_op); +SmallVector collectRecurrenceCycles(Region ®ion); // Calculates ResMII: ceil(#ops / #tiles). -int calculateResMii(Operation *func_op, const Architecture &architecture); +int calculateResMii(Region ®ion, const Architecture &architecture); -// Returns topologically sorted operations in func_op. -std::vector getTopologicallySortedOps(Operation *func_op); +// Returns topologically sorted operations in region. +std::vector getTopologicallySortedOps(Region ®ion); // Given the sorted operations, returns a vector of pairs where each pair // contains a vector of operations at the same ALAP (as late as possible) @@ -82,8 +82,8 @@ bool tryRouteBackwardMove(Operation *mov_op, MappingLoc src_loc, // ctrl_mov users found. llvm::SmallVector getCtrlMovUsers(Operation *op); -// Identifies operations on the critical path (i.e., operations with zero slack). -// Returns pair of: (critical_ops_set, asap_level_map) +// Identifies operations on the critical path (i.e., operations with zero +// slack). 
Returns pair of: (critical_ops_set, asap_level_map) std::pair, llvm::DenseMap> identifyCriticalPathOps(const std::vector &sorted_ops); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index f5b7a86d..814c59a3 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -191,10 +191,10 @@ void traverseAlongPath(Operation *op, Value reserve_value, } // namespace SmallVector -mlir::neura::collectRecurrenceCycles(Operation *func_op) { +mlir::neura::collectRecurrenceCycles(Region ®ion) { SmallVector recurrence_cycles; - func_op->walk([&](neura::CtrlMovOp ctrl_mov_op) { + region.walk([&](neura::CtrlMovOp ctrl_mov_op) { Value target = ctrl_mov_op.getTarget(); auto reserve_op = target.getDefiningOp(); if (!reserve_op) { @@ -226,12 +226,12 @@ mlir::neura::collectRecurrenceCycles(Operation *func_op) { return recurrence_cycles; } -int mlir::neura::calculateResMii(Operation *func_op, +int mlir::neura::calculateResMii(Region ®ion, const Architecture &architecture) { int num_ops = 0; // Count all "compute" operations (non-terminators, non-block ops). - func_op->walk([&](Operation *op) { + region.walk([&](Operation *op) { // Skips non-materialized ops. if (isa(op) || isa(op)) { @@ -254,13 +254,13 @@ int mlir::neura::calculateResMii(Operation *func_op, } std::vector -mlir::neura::getTopologicallySortedOps(Operation *func_op) { +mlir::neura::getTopologicallySortedOps(Region ®ion) { std::vector sorted_ops; llvm::DenseMap pending_deps; std::deque ready_queue; // Collects recurrence cycle ops. - auto recurrence_cycles = collectRecurrenceCycles(func_op); + auto recurrence_cycles = collectRecurrenceCycles(region); llvm::DenseSet recurrence_ops; for (const auto &cycle : recurrence_cycles) { for (Operation *op : cycle.operations) { @@ -268,10 +268,10 @@ mlir::neura::getTopologicallySortedOps(Operation *func_op) { } } // Counts unresolved dependencies for each op. - func_op->walk([&](Operation *op) { - if (op == func_op) { - return; - } + region.walk([&](Operation *op) { + // if (op == func_op) { + // return; + // } int dep_count = 0; for (Value operand : op->getOperands()) { if (operand.getDefiningOp()) { @@ -417,14 +417,14 @@ std::vector> mlir::neura::flatten_level_buckets( const std::pair &b_pair) { Operation *a = a_pair.first; Operation *b = b_pair.first; - + bool a_is_critical = critical_ops.count(a) > 0; bool b_is_critical = critical_ops.count(b) > 0; - + // Priority 1: Critical ops come first (within same ALAP level). if (a_is_critical != b_is_critical) return a_is_critical > b_is_critical; - + // Priority 2: Degree (connectivity) - higher degree first. int degree_a = a->getNumOperands(); int degree_b = b->getNumOperands(); @@ -438,7 +438,7 @@ std::vector> mlir::neura::flatten_level_buckets( } if (degree_a != degree_b) return degree_a > degree_b; - + // Priority 3: Original index (stability tie-breaker). return a_pair.second < b_pair.second; }); @@ -1036,18 +1036,22 @@ mlir::neura::calculateAward(Operation *op, std::set &critical_ops, } float in_ratio = (total_in > 0) ? (float)occupied_in / total_in : 0; - float out_ratio = (total_out > 0) ? (float)occupied_out / total_out : 0; - + float out_ratio = + (total_out > 0) ? 
(float)occupied_out / total_out : 0; + // Adaptive penalty strategy: - // - Use very strong penalty (60) only for high fan-in ops (>= 3 producers) + // - Use very strong penalty (60) only for high fan-in ops (>= 3 + // producers) // - Use weak penalty (15) for low fan-in ops - // This optimizes fuse-pattern (II=11 target) without breaking iter-merge + // This optimizes fuse-pattern (II=11 target) without breaking + // iter-merge int base_penalty_coeff = (producers.size() >= 3) ? kStrongCongestionPenalty : kWeakCongestionPenalty; - - int congestion_penalty = static_cast(in_ratio * in_ratio * base_penalty_coeff) + - static_cast(out_ratio * out_ratio * base_penalty_coeff); + + int congestion_penalty = + static_cast(in_ratio * in_ratio * base_penalty_coeff) + + static_cast(out_ratio * out_ratio * base_penalty_coeff); int total_award = tile_award + time_bonus - congestion_penalty; updateAward(locs_with_award, tile_loc_candidate, total_award); @@ -1062,15 +1066,14 @@ mlir::neura::calculateAward(Operation *op, std::set &critical_ops, // Sorts by award (descending). Use stable sort/tie-breaker logic // to minimize noise in mapping results. - std::sort( - locs_award_vec.begin(), locs_award_vec.end(), - [](const std::pair &a, - const std::pair &b) { - if (a.second != b.second) - return a.second > b.second; - // Tie-breaker: earlier time step first. - return a.first.time_step < b.first.time_step; - }); + std::sort(locs_award_vec.begin(), locs_award_vec.end(), + [](const std::pair &a, + const std::pair &b) { + if (a.second != b.second) + return a.second > b.second; + // Tie-breaker: earlier time step first. + return a.first.time_step < b.first.time_step; + }); // TODO: Needs to handle tie case and prioritize lower resource utilization, // however, compiled II becomes worse after adding this tie-breaker: // https://github.com/coredac/dataflow/issues/59. 
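To make the penalty concrete: with 3 of 4 output ports occupied, out_ratio = 0.75, so a high fan-in op (coefficient 60) pays static_cast<int>(0.75 * 0.75 * 60) = 33, while a low fan-in op (coefficient 15) pays only 8; squaring the ratio keeps nearly-full tiles far less attractive than half-full ones.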
diff --git a/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp b/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp index 908d68a4..04cbaf24 100644 --- a/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp +++ b/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp @@ -1,12 +1,12 @@ -#include "Common/AcceleratorAttrs.h" -#include "NeuraDialect/NeuraAttributes.h" #include "NeuraDialect/Transforms/GraphMining/GraMi.h" +#include "Common/AcceleratorAttrs.h" #include "NeuraDialect/Mapping/mapping_util.h" +#include "NeuraDialect/NeuraAttributes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Value.h" #include "mlir/IR/Block.h" +#include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" +#include "mlir/IR/Value.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -17,14 +17,14 @@ using namespace mlir::neura; // Static member definition for tracking attempted patterns std::set GraMi::attempted_patterns_; -DfgNode* DfgGraph::addNode(mlir::Operation* op, const std::string& label) { +DfgNode *DfgGraph::addNode(mlir::Operation *op, const std::string &label) { auto node = new DfgNode(next_node_id_++, op, label); nodes_.push_back(node); op_to_node_[op] = node; return node; } -DfgEdge* DfgGraph::addEdge(DfgNode* from, DfgNode* to, mlir::Value value) { +DfgEdge *DfgGraph::addEdge(DfgNode *from, DfgNode *to, mlir::Value value) { auto edge = new DfgEdge(next_edge_id_++, from, to, value); edges_.push_back(edge); from->addOutgoingEdge(edge); @@ -32,14 +32,14 @@ DfgEdge* DfgGraph::addEdge(DfgNode* from, DfgNode* to, mlir::Value value) { return edge; } -DfgNode* DfgGraph::getNode(DfgNode::NodeId id) const { +DfgNode *DfgGraph::getNode(DfgNode::NodeId id) const { if (id < nodes_.size()) { return nodes_[id]; } return nullptr; } -DfgEdge* DfgGraph::getEdge(DfgEdge::EdgeId id) const { +DfgEdge *DfgGraph::getEdge(DfgEdge::EdgeId id) const { if (id < edges_.size()) { return edges_[id]; } @@ -47,10 +47,10 @@ DfgEdge* DfgGraph::getEdge(DfgEdge::EdgeId id) const { } void DfgGraph::clear() { - for (auto* node : nodes_) { + for (auto *node : nodes_) { delete node; } - for (auto* edge : edges_) { + for (auto *edge : edges_) { delete edge; } nodes_.clear(); @@ -60,14 +60,14 @@ void DfgGraph::clear() { next_edge_id_ = 0; } -std::string DfgExtractor::getOperationLabel(mlir::Operation* op) { +std::string DfgExtractor::getOperationLabel(mlir::Operation *op) { std::string op_name = op->getName().getStringRef().str(); - + size_t dot_pos = op_name.find('.'); if (dot_pos != std::string::npos) { op_name = op_name.substr(dot_pos + 1); } - + if (op->getNumResults() > 0) { Type result_type = op->getResult(0).getType(); if (auto int_type = mlir::dyn_cast(result_type)) { @@ -76,36 +76,38 @@ std::string DfgExtractor::getOperationLabel(mlir::Operation* op) { op_name += "_f" + std::to_string(float_type.getWidth()); } } - + return op_name; } -// Excludes operations that are not part of the DFG since they don't involve computation and will not be mapped onto the functional units. -bool DfgExtractor::shouldIncludeOperation(mlir::Operation* op) { +// Excludes operations that are not part of the DFG since they don't involve +// computation and will not be mapped onto the functional units. 
+bool DfgExtractor::shouldIncludeOperation(mlir::Operation *op) { if (op->getName().getStringRef().contains("func.") || op->getName().getStringRef().contains("module") || - op->getName().getStringRef().contains("return") || - op->getName().getStringRef().contains("data_mov") || + op->getName().getStringRef().contains("return") || + op->getName().getStringRef().contains("data_mov") || op->getName().getStringRef().contains("ctrl_mov") || - op->getName().getStringRef().contains("reserve") || + op->getName().getStringRef().contains("reserve") || op->getName().getStringRef().contains("alloca") || op->getName().getStringRef().contains("yield")) { return false; } - + if (op->getDialect()->getNamespace() == "neura") { return true; } - + if (op->getDialect()->getNamespace() == "llvm") { return false; } - + if (op->getDialect()->getNamespace() == "arith") { return false; } - - llvm::errs() << "Excluding operation: " << op->getName().getStringRef() << "\n"; + + llvm::errs() << "Excluding operation: " << op->getName().getStringRef() + << "\n"; return false; } @@ -113,100 +115,108 @@ bool DfgExtractor::shouldIncludeOperation(mlir::Operation* op) { // Extracts the data flow graph from the module. std::unique_ptr DfgExtractor::extractFromModule(ModuleOp module) { auto graph = std::make_unique(); - + module.walk([&](func::FuncOp func) { llvm::errs() << "Extracting DFG from function: " << func.getName() << "\n"; auto func_graph = extractFromFunction(func); if (func_graph) { - for (auto* node : func_graph->getNodes()) { + for (auto *node : func_graph->getNodes()) { graph->addNode(node->getOperation(), node->getLabel()); } - for (auto* edge : func_graph->getEdges()) { + for (auto *edge : func_graph->getEdges()) { graph->addEdge(edge->getFrom(), edge->getTo(), edge->getValue()); } } }); - + return graph; } // Extracts the data flow graph from the function. std::unique_ptr DfgExtractor::extractFromFunction(func::FuncOp func) { auto graph = std::make_unique(); - - func.walk([&](Block* block) { + + func.walk([&](Block *block) { auto block_graph = extractFromBlock(block); if (block_graph) { - for (auto* node : block_graph->getNodes()) { + for (auto *node : block_graph->getNodes()) { graph->addNode(node->getOperation(), node->getLabel()); } - for (auto* edge : block_graph->getEdges()) { + for (auto *edge : block_graph->getEdges()) { graph->addEdge(edge->getFrom(), edge->getTo(), edge->getValue()); } } }); - + return graph; } // Extracts the data flow graph from the block. 
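As an example of the labeling above: a hypothetical neura.add producing an i32 is labeled "add_i32", an edge from it into a neura.mul producing i32 yields the 2-node pattern string "add_i32->mul_i32", and that string is what the miner below counts and deduplicates (the op names are illustrative, not a fixed op inventory).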
-std::unique_ptr DfgExtractor::extractFromBlock(mlir::Block* block) { +std::unique_ptr DfgExtractor::extractFromBlock(mlir::Block *block) { auto graph = std::make_unique(); - llvm::DenseMap value_to_node; - - for (auto& op : block->getOperations()) { + llvm::DenseMap value_to_node; + + for (auto &op : block->getOperations()) { if (shouldIncludeOperation(&op)) { std::string label = getOperationLabel(&op); - DfgNode* node = graph->addNode(&op, label); - + DfgNode *node = graph->addNode(&op, label); + for (mlir::Value result : op.getResults()) { value_to_node[result] = node; } } } - - for (auto& op : block->getOperations()) { + + for (auto &op : block->getOperations()) { if (shouldIncludeOperation(&op)) { - DfgNode* current_node = nullptr; - + DfgNode *current_node = nullptr; + for (mlir::Value result : op.getResults()) { if (value_to_node.count(result)) { current_node = value_to_node[result]; break; } } - - if (!current_node) continue; - + + if (!current_node) + continue; + for (mlir::Value operand : op.getOperands()) { if (value_to_node.count(operand)) { - DfgNode* source_node = value_to_node[operand]; + DfgNode *source_node = value_to_node[operand]; graph->addEdge(source_node, current_node, operand); } } } } - + return graph; } // Mines the frequent subgraphs from the data flow graph. // Algorithm: // 1. Collects all 2-node patterns from the graph -// 2. For each pattern, separates instances into critical path vs non-critical path -// 3. For each pattern, performs MWIS with higher weight for critical path instances +// 2. For each pattern, separates instances into critical path vs non-critical +// path +// 3. For each pattern, performs MWIS with higher weight for critical path +// instances // 4. Performs inter-pattern analysis with critical path conflict priority std::vector GraMi::mineFrequentSubgraphs() { std::vector frequent_subgraphs; - + // Map from pattern string to (critical instances, non-critical instances) - std::map, std::vector>> pattern_instances; - - auto derive_label = [](mlir::Operation* op, const std::string& fallback_label) -> std::string { - if (!op) return fallback_label; + std::map, + std::vector>> + pattern_instances; + + auto derive_label = [](mlir::Operation *op, + const std::string &fallback_label) -> std::string { + if (!op) + return fallback_label; auto name = op->getName().getStringRef(); - if (name.ends_with(attr::val::kOpFused) || name.contains(attr::val::kNeuraFusedOp)) { + if (name.ends_with(attr::val::kOpFused) || + name.contains(attr::val::kNeuraFusedOp)) { if (auto attr = op->getAttr("pattern_name")) { if (auto str_attr = mlir::dyn_cast(attr)) { return std::string("fused_op:") + str_attr.getValue().str(); @@ -217,29 +227,38 @@ std::vector GraMi::mineFrequentSubgraphs() { return fallback_label; }; - llvm::errs() << "[GraMi] Critical path ops count: " << critical_path_ops_.size() << "\n"; + llvm::errs() << "[GraMi] Critical path ops count: " + << critical_path_ops_.size() << "\n"; // Step 1: Collects all 2-node patterns and classifies instances - for (auto* edge : graph_->getEdges()) { - DfgNode* from = edge->getFrom(); - DfgNode* to = edge->getTo(); - - auto* from_op = from->getOperation(); - auto* to_op = to->getOperation(); + for (auto *edge : graph_->getEdges()) { + DfgNode *from = edge->getFrom(); + DfgNode *to = edge->getTo(); + auto *from_op = from->getOperation(); + auto *to_op = to->getOperation(); // Skips operations inside fused_op - if (from_op->getParentRegion()->getParentOp()->getName().getStringRef().str() == "neura.fused_op" || 
to_op->getParentRegion()->getParentOp()->getName().getStringRef().str() == "neura.fused_op") { + if (from_op->getParentRegion() + ->getParentOp() + ->getName() + .getStringRef() + .str() == "neura.fused_op" || + to_op->getParentRegion() + ->getParentOp() + ->getName() + .getStringRef() + .str() == "neura.fused_op") { continue; } std::string from_label = derive_label(from_op, from->getLabel()); std::string to_label = derive_label(to_op, to->getLabel()); std::string pattern = from_label + "->" + to_label; - + PatternInstance instance; instance.frequency = 1; - + if (from_op->isBeforeInBlock(to_op)) { instance.operations.push_back(from_op); instance.operations.push_back(to_op); @@ -249,11 +268,11 @@ std::vector GraMi::mineFrequentSubgraphs() { instance.operations.push_back(from_op); instance.last_op = from_op; } - - llvm::DenseSet pattern_ops; + + llvm::DenseSet pattern_ops; pattern_ops.insert(from_op); pattern_ops.insert(to_op); - + llvm::SetVector input_set; for (mlir::Value operand : from_op->getOperands()) { input_set.insert(operand); @@ -263,29 +282,31 @@ std::vector GraMi::mineFrequentSubgraphs() { input_set.insert(operand); } } - instance.inputs = std::vector(input_set.begin(), input_set.end()); - + instance.inputs = + std::vector(input_set.begin(), input_set.end()); + llvm::SetVector output_set; - for (mlir::Operation* op : instance.operations) { + for (mlir::Operation *op : instance.operations) { for (mlir::Value result : op->getResults()) { bool has_external_use = false; - for (mlir::OpOperand& use : result.getUses()) { - mlir::Operation* user = use.getOwner(); + for (mlir::OpOperand &use : result.getUses()) { + mlir::Operation *user = use.getOwner(); if (!pattern_ops.contains(user)) { has_external_use = true; break; } } - + if (has_external_use) { output_set.insert(result); } } } - instance.outputs = std::vector(output_set.begin(), output_set.end()); - + instance.outputs = + std::vector(output_set.begin(), output_set.end()); + instance.is_on_critical_path = isInstanceOnCriticalPath(instance); - + if (instance.is_on_critical_path) { pattern_instances[pattern].first.push_back(instance); } else { @@ -295,45 +316,55 @@ std::vector GraMi::mineFrequentSubgraphs() { // Step 2: Processes frequent patterns and performs per-pattern MWIS std::vector candidates; - - for (auto& [pattern, instances_pair] : pattern_instances) { - auto& [critical_instances, non_critical_instances] = instances_pair; - size_t total_count = critical_instances.size() + non_critical_instances.size(); - + + for (auto &[pattern, instances_pair] : pattern_instances) { + auto &[critical_instances, non_critical_instances] = instances_pair; + size_t total_count = + critical_instances.size() + non_critical_instances.size(); + // Skips patterns that have been attempted for fusion if (hasPatternBeenAttempted(pattern)) { continue; } - + if (total_count >= min_support_) { size_t pattern_idx = frequent_subgraphs.size(); std::string from_label = pattern.substr(0, pattern.find("->")); std::string to_label = pattern.substr(pattern.find("->") + 2); - FrequentSubgraph subgraph(pattern, total_count, static_cast(pattern_idx)); + FrequentSubgraph subgraph(pattern, total_count, + static_cast(pattern_idx)); subgraph.addNode(0, from_label); subgraph.addNode(1, to_label); subgraph.addEdge(0, 0, 1); frequent_subgraphs.push_back(subgraph); - - for (auto& inst : critical_instances) { + + for (auto &inst : critical_instances) { inst.pattern_id = static_cast(pattern_idx); } - for (auto& inst : non_critical_instances) { + for (auto &inst : 
non_critical_instances) { inst.pattern_id = static_cast(pattern_idx); } - - auto [selected_critical, selected_non_critical] = selectMWISForPattern(critical_instances, non_critical_instances, 10.0); - + + auto [selected_critical, selected_non_critical] = selectMWISForPattern( + critical_instances, non_critical_instances, 10.0); + // Creates PatternWithSelectedInstances PatternWithSelectedInstances pwsi(subgraph); pwsi.critical_instances = selected_critical; pwsi.non_critical_instances = selected_non_critical; - pwsi.selected_instances.insert(pwsi.selected_instances.end(), selected_critical.begin(), selected_critical.end()); - pwsi.selected_instances.insert(pwsi.selected_instances.end(), selected_non_critical.begin(), selected_non_critical.end()); - + pwsi.selected_instances.insert(pwsi.selected_instances.end(), + selected_critical.begin(), + selected_critical.end()); + pwsi.selected_instances.insert(pwsi.selected_instances.end(), + selected_non_critical.begin(), + selected_non_critical.end()); + candidates.push_back(pwsi); - - llvm::errs() << "[GraMi] Pattern #" << pattern_idx << " after intra-pattern MWIS: " << selected_critical.size() << " critical, " << selected_non_critical.size() << " non-critical selected\n"; + + llvm::errs() << "[GraMi] Pattern #" << pattern_idx + << " after intra-pattern MWIS: " << selected_critical.size() + << " critical, " << selected_non_critical.size() + << " non-critical selected\n"; } } @@ -341,43 +372,46 @@ std::vector GraMi::mineFrequentSubgraphs() { llvm::errs() << "[GraMi] No frequent patterns found\n"; return {}; } - + // Step 3: Performs inter-pattern analysis with critical path priority - std::vector result = selectPatternsWithCriticalPriority(candidates, min_support_); - + std::vector result = + selectPatternsWithCriticalPriority(candidates, min_support_); + llvm::errs() << "[GraMi] Final result: " << result.size() << " patterns\n"; // Prints summary size_t total_critical = 0, total_non_critical = 0; - for (const auto& p : result) { + for (const auto &p : result) { total_critical += p.critical_instances.size(); total_non_critical += p.non_critical_instances.size(); } - llvm::errs() << "[GraMi] Summary: " << result.size() << " patterns, " << total_critical << " critical instances, " << total_non_critical << " non-critical instances\n"; + llvm::errs() << "[GraMi] Summary: " << result.size() << " patterns, " + << total_critical << " critical instances, " + << total_non_critical << " non-critical instances\n"; return result; } // Checks if the candidate pattern is frequent using the threshold min_support_. -bool GraMi::isFrequent(const FrequentSubgraph& candidate) { +bool GraMi::isFrequent(const FrequentSubgraph &candidate) { size_t support = countSupport(candidate); return support >= min_support_; } // Counts the support of the pattern in the data flow graph. 
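A worked instance of the selection: a critical instance carries weight 10.0, so if it conflicts with one available non-critical instance it scores 10.0 / (1 + 1) = 5.0 against the neighbor's 1.0 / (1 + 1) = 0.5; the greedy loop in selectMWISForPattern below therefore keeps the critical instance and marks the neighbor unavailable.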
-size_t GraMi::countSupport(const FrequentSubgraph& pattern) { +size_t GraMi::countSupport(const FrequentSubgraph &pattern) { std::map node_counts; - for (const auto& pair : pattern.getNodes()) { + for (const auto &pair : pattern.getNodes()) { node_counts[pair.second]++; } - + std::map graph_node_counts; - for (auto* node : graph_->getNodes()) { + for (auto *node : graph_->getNodes()) { graph_node_counts[node->getLabel()]++; } - + size_t min_count = SIZE_MAX; - for (const auto& pair : node_counts) { + for (const auto &pair : node_counts) { size_t graph_count = graph_node_counts[pair.first]; size_t required_count = pair.second; if (graph_count < required_count) { @@ -385,77 +419,82 @@ size_t GraMi::countSupport(const FrequentSubgraph& pattern) { } min_count = std::min(min_count, graph_count / required_count); } - + return min_count; } // Generates a string representation of the pattern. -std::string GraMi::generatePatternString(const FrequentSubgraph& subgraph) { +std::string GraMi::generatePatternString(const FrequentSubgraph &subgraph) { std::ostringstream oss; oss << "Pattern: "; - + oss << "Nodes["; - for (const auto& pair : subgraph.getNodes()) { + for (const auto &pair : subgraph.getNodes()) { oss << pair.first << ":" << pair.second << " "; } oss << "] "; - + oss << "Edges["; - for (const auto& pair : subgraph.getEdges()) { - oss << pair.first << ":" << pair.second.first << "->" << pair.second.second << " "; + for (const auto &pair : subgraph.getEdges()) { + oss << pair.first << ":" << pair.second.first << "->" << pair.second.second + << " "; } oss << "] "; - + oss << "Support: " << subgraph.getFrequency(); - + return oss.str(); } // Collects critical path operations from the function. // Critical paths are recurrence cycles with maximum length. 
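For example, if collectRecurrenceCycles returns cycles of lengths 3, 5, and 5, then max_length is 5 and the operations of both length-5 cycles are inserted into critical_ops, i.e., multiple equally long critical paths are all kept.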
-llvm::DenseSet GraMi::collectCriticalPathOps(mlir::func::FuncOp func) { - llvm::DenseSet critical_ops; - +llvm::DenseSet +GraMi::collectCriticalPathOps(mlir::func::FuncOp func) { + llvm::DenseSet critical_ops; + // Collects all recurrence cycles - auto recurrence_cycles = collectRecurrenceCycles(func); - + auto recurrence_cycles = collectRecurrenceCycles(func.getBody()); + if (recurrence_cycles.empty()) { llvm::errs() << "[GraMi] No recurrence cycles found\n"; return critical_ops; } - + // Finds the maximum recurrence length int max_length = 0; - for (const auto& cycle : recurrence_cycles) { + for (const auto &cycle : recurrence_cycles) { max_length = std::max(max_length, cycle.length); } - + llvm::errs() << "[GraMi] Maximum recurrence length: " << max_length << "\n"; - + // Collects operations from all cycles with maximum length int critical_cycle_count = 0; - for (const auto& cycle : recurrence_cycles) { + for (const auto &cycle : recurrence_cycles) { if (cycle.length == max_length) { critical_cycle_count++; - for (mlir::Operation* op : cycle.operations) { + for (mlir::Operation *op : cycle.operations) { critical_ops.insert(op); } - llvm::errs() << "[GraMi] Critical path cycle (length " << cycle.length << "):\n"; - for (mlir::Operation* op : cycle.operations) { + llvm::errs() << "[GraMi] Critical path cycle (length " << cycle.length + << "):\n"; + for (mlir::Operation *op : cycle.operations) { llvm::errs() << " " << *op << "\n"; } } } - - llvm::errs() << "[GraMi] Found " << critical_cycle_count << " critical path(s) with " - << critical_ops.size() << " total operations\n"; - + + llvm::errs() << "[GraMi] Found " << critical_cycle_count + << " critical path(s) with " << critical_ops.size() + << " total operations\n"; + return critical_ops; } -// Checks if an instance is on critical path (all operations of the instance must be on critical path) -bool GraMi::isInstanceOnCriticalPath(const PatternInstance& instance) const { - for (mlir::Operation* op : instance.operations) { +// Checks if an instance is on critical path (all operations of the instance +// must be on critical path) +bool GraMi::isInstanceOnCriticalPath(const PatternInstance &instance) const { + for (mlir::Operation *op : instance.operations) { if (!critical_path_ops_.contains(op)) { return false; } @@ -463,20 +502,25 @@ bool GraMi::isInstanceOnCriticalPath(const PatternInstance& instance) const { return true; } -// Checks if the two instances conflict. Conflict occurs if the two instances have the same operation. -bool GraMi::instancesConflict(const PatternInstance& a, const PatternInstance& b) { - for (mlir::Operation* op_a : a.operations) { - for (mlir::Operation* op_b : b.operations) { - if (op_a == op_b) return true; +// Checks if the two instances conflict. Conflict occurs if the two instances +// have the same operation. +bool GraMi::instancesConflict(const PatternInstance &a, + const PatternInstance &b) { + for (mlir::Operation *op_a : a.operations) { + for (mlir::Operation *op_b : b.operations) { + if (op_a == op_b) + return true; } } return false; } -// Checks if the two patterns conflict. If any instance in the two patterns conflict, the patterns conflict. -bool GraMi::patternsConflict(const PatternWithInstances& a, const PatternWithInstances& b) { - for (const auto& inst_a : a.instances) { - for (const auto& inst_b : b.instances) { +// Checks if the two patterns conflict. If any instance in the two patterns +// conflict, the patterns conflict. 
+bool GraMi::patternsConflict(const PatternWithInstances &a, + const PatternWithInstances &b) { + for (const auto &inst_a : a.instances) { + for (const auto &inst_b : b.instances) { if (instancesConflict(inst_a, inst_b)) { return true; } @@ -486,10 +530,10 @@ bool GraMi::patternsConflict(const PatternWithInstances& a, const PatternWithIns } // Checks if two patterns have conflicting critical path instances -bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances& a, - const PatternWithSelectedInstances& b) { - for (const auto& inst_a : a.critical_instances) { - for (const auto& inst_b : b.critical_instances) { +bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances &a, + const PatternWithSelectedInstances &b) { + for (const auto &inst_a : a.critical_instances) { + for (const auto &inst_b : b.critical_instances) { if (instancesConflict(inst_a, inst_b)) { return true; } @@ -500,73 +544,81 @@ bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances& a, // Selects maximum weighted independent set for a single pattern // Critical path instances have higher weight -std::pair, std::vector> GraMi::selectMWISForPattern(const std::vector& critical_instances, const std::vector& non_critical_instances, double critical_weight_multiplier) { - +std::pair, std::vector> +GraMi::selectMWISForPattern( + const std::vector &critical_instances, + const std::vector &non_critical_instances, + double critical_weight_multiplier) { + // Combines all instances with their weights std::vector> weighted_instances; - - for (const auto& inst : critical_instances) { + + for (const auto &inst : critical_instances) { weighted_instances.push_back({inst, critical_weight_multiplier}); } - for (const auto& inst : non_critical_instances) { + for (const auto &inst : non_critical_instances) { weighted_instances.push_back({inst, 1.0}); } - + if (weighted_instances.empty()) { return {{}, {}}; } - + size_t n = weighted_instances.size(); - + // Builds conflict graph std::vector> conflicts(n); for (size_t i = 0; i < n; ++i) { for (size_t j = i + 1; j < n; ++j) { - if (instancesConflict(weighted_instances[i].first, weighted_instances[j].first)) { + if (instancesConflict(weighted_instances[i].first, + weighted_instances[j].first)) { conflicts[i].push_back(j); conflicts[j].push_back(i); } } } - + // Greedy MWIS selection: prioritizes by weight / (degree + 1) std::vector selected_indices; std::vector available(n, true); - + while (true) { size_t best_idx = n; double best_score = -1.0; - + for (size_t i = 0; i < n; ++i) { - if (!available[i]) continue; - + if (!available[i]) + continue; + size_t active_degree = 0; for (size_t neighbor : conflicts[i]) { - if (available[neighbor]) active_degree++; + if (available[neighbor]) + active_degree++; } - + double score = weighted_instances[i].second / (active_degree + 1); - + if (score > best_score) { best_score = score; best_idx = i; } } - - if (best_idx == n) break; - + + if (best_idx == n) + break; + selected_indices.push_back(best_idx); available[best_idx] = false; - + for (size_t neighbor : conflicts[best_idx]) { available[neighbor] = false; } } - + // Separates selected instances into critical and non-critical std::vector selected_critical; std::vector selected_non_critical; - + size_t critical_count = critical_instances.size(); for (size_t idx : selected_indices) { if (idx < critical_count) { @@ -575,7 +627,7 @@ std::pair, std::vector> GraMi::sel selected_non_critical.push_back(weighted_instances[idx].first); } } - + return {selected_critical, 
selected_non_critical}; } @@ -583,48 +635,56 @@ std::pair, std::vector> GraMi::sel // Rules: // - If two patterns have conflicting critical instances, they cannot coexist // Chooses the pattern with more critical instances -// - Non-critical vs non-critical or non-critical vs critical conflicts are allowed -std::vector GraMi::selectPatternsWithCriticalPriority(std::vector& candidates, size_t min_support) { - - if (candidates.empty()) return {}; - - // Sorts candidates by number of critical instances (descending), then by total instances +// - Non-critical vs non-critical or non-critical vs critical conflicts are +// allowed +std::vector +GraMi::selectPatternsWithCriticalPriority( + std::vector &candidates, size_t min_support) { + + if (candidates.empty()) + return {}; + + // Sorts candidates by number of critical instances (descending), then by + // total instances std::sort(candidates.begin(), candidates.end(), - [](const PatternWithSelectedInstances& a, const PatternWithSelectedInstances& b) { - if (a.critical_instances.size() != b.critical_instances.size()) { - return a.critical_instances.size() > b.critical_instances.size(); - } - return a.selected_instances.size() > b.selected_instances.size(); - }); - + [](const PatternWithSelectedInstances &a, + const PatternWithSelectedInstances &b) { + if (a.critical_instances.size() != b.critical_instances.size()) { + return a.critical_instances.size() > + b.critical_instances.size(); + } + return a.selected_instances.size() > b.selected_instances.size(); + }); + std::vector result; - + for (size_t i = 0; i < candidates.size(); ++i) { // Checks for critical instance conflicts with already selected patterns bool has_critical_conflict = false; - for (const auto& selected : result) { + for (const auto &selected : result) { if (criticalInstancesConflict(candidates[i], selected)) { has_critical_conflict = true; break; } } - + if (has_critical_conflict) { continue; } - + result.push_back(candidates[i]); } - - // Now handles non-critical conflicts: removes conflicting non-critical instances + + // Now handles non-critical conflicts: removes conflicting non-critical + // instances for (size_t i = 0; i < result.size(); ++i) { for (size_t j = i + 1; j < result.size(); ++j) { // Finds non-critical instances in pattern j that conflict with pattern i std::vector remaining_non_critical; - for (const auto& inst_j : result[j].non_critical_instances) { + for (const auto &inst_j : result[j].non_critical_instances) { bool conflicts_with_i = false; // Checks conflict with pattern i - for (const auto& inst_i : result[i].selected_instances) { + for (const auto &inst_i : result[i].selected_instances) { if (instancesConflict(inst_i, inst_j)) { conflicts_with_i = true; break; @@ -635,26 +695,31 @@ std::vector GraMi::selectPatternsWithCriticalPrior } } result[j].non_critical_instances = remaining_non_critical; - + // Updates selected_instances result[j].selected_instances.clear(); - result[j].selected_instances.insert(result[j].selected_instances.end(), result[j].critical_instances.begin(), result[j].critical_instances.end()); - result[j].selected_instances.insert(result[j].selected_instances.end(), result[j].non_critical_instances.begin(), result[j].non_critical_instances.end()); + result[j].selected_instances.insert(result[j].selected_instances.end(), + result[j].critical_instances.begin(), + result[j].critical_instances.end()); + result[j].selected_instances.insert( + result[j].selected_instances.end(), + result[j].non_critical_instances.begin(), + 
result[j].non_critical_instances.end()); } } - + return result; } // Gets the label of the operation. -std::string GraMi::getOperationLabel(mlir::Operation* op) { +std::string GraMi::getOperationLabel(mlir::Operation *op) { std::string op_name = op->getName().getStringRef().str(); - + size_t dot_pos = op_name.find('.'); if (dot_pos != std::string::npos) { op_name = op_name.substr(dot_pos + 1); } - + if (op->getNumResults() > 0) { Type result_type = op->getResult(0).getType(); if (auto int_type = mlir::dyn_cast(result_type)) { @@ -663,17 +728,17 @@ std::string GraMi::getOperationLabel(mlir::Operation* op) { op_name += "_f" + std::to_string(float_type.getWidth()); } } - + return op_name; } // Checks if a pattern has been attempted for fusion -bool GraMi::hasPatternBeenAttempted(const std::string& pattern) { +bool GraMi::hasPatternBeenAttempted(const std::string &pattern) { return attempted_patterns_.find(pattern) != attempted_patterns_.end(); } // Marks a pattern as attempted for fusion -void GraMi::markPatternAsAttempted(const std::string& pattern) { +void GraMi::markPatternAsAttempted(const std::string &pattern) { attempted_patterns_.insert(pattern); llvm::errs() << "[GraMi] Marked pattern as attempted: " << pattern << "\n"; } diff --git a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp index ae7ef859..1c887a67 100644 --- a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp +++ b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp @@ -28,7 +28,7 @@ struct InsertDataMovForNeuraOps : public RewritePattern { return failure(); } - // Skip operations inside fused_op regions + // Skips operations inside fused_op regions. Operation *parent_op = op->getParentOp(); while (parent_op) { if (isa(parent_op)) { @@ -36,7 +36,6 @@ struct InsertDataMovForNeuraOps : public RewritePattern { } parent_op = parent_op->getParentOp(); } - bool all_inputs_are_mov_except_reserve = llvm::all_of(op->getOperands(), [](Value v) { @@ -86,15 +85,16 @@ struct InsertDataMovForNeuraOps : public RewritePattern { return failure(); // do not rewrite } - // Wraps operands in mov, but skip those already wrapped or from reserve. SmallVector new_operands; bool any_change = false; for (Value operand : op->getOperands()) { Operation *producer = operand.getDefiningOp(); - // Skips adding mov for any operand that comes from a reserve op or already from data_mov. - if (producer && (isa(producer) || isa(producer))) { + // Skips adding mov for any operand that comes from a reserve op or + // already from data_mov. + if (producer && (isa(producer) || + isa(producer))) { new_operands.push_back(operand); continue; } @@ -129,7 +129,8 @@ struct InsertDataMovForNeuraOps : public RewritePattern { } }; -// Wraps all fused_op's inputs and outputs with data_mov operations in the module. +// Wraps all fused_op's inputs and outputs with data_mov operations in the +// module. 
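A minimal sketch of the wrapping implemented below, in generic op syntax (the neura.add consumer and the i64 types are illustrative placeholders):

    // Before insert-data-mov:
    %r = "neura.fused_op"(%a, %b) ({ ... }) : (i64, i64) -> i64
    %u = "neura.add"(%r, %c) : (i64, i64) -> i64

    // After: each input is wrapped once, and each external user of each
    // result gets its own data_mov (nested data_mov chains are avoided):
    %a0 = "neura.data_mov"(%a) : (i64) -> i64
    %b0 = "neura.data_mov"(%b) : (i64) -> i64
    %r  = "neura.fused_op"(%a0, %b0) ({ ... }) : (i64, i64) -> i64
    %r0 = "neura.data_mov"(%r) : (i64) -> i64
    %u  = "neura.add"(%r0, %c) : (i64, i64) -> i64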
void wrapFusedOpsWithDataMov(ModuleOp module_op) { SmallVector fused_ops_to_process; module_op.walk([&](neura::FusedOp fused_op) { @@ -145,13 +146,14 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { SmallVector new_operands; for (Value operand : fused_op->getOperands()) { Operation *producer = operand.getDefiningOp(); - + // Skip if already wrapped in data_mov or from reserve if (isa_and_nonnull(producer) || isa_and_nonnull(producer)) { new_operands.push_back(operand); } else { - auto mov = rewriter.create(loc, operand.getType(), operand); + auto mov = + rewriter.create(loc, operand.getType(), operand); new_operands.push_back(mov); } } @@ -161,9 +163,9 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { for (size_t i = 0; i < fused_op->getNumOperands(); ++i) { mapper.map(fused_op->getOperand(i), new_operands[i]); } - + Operation *new_fused_op = rewriter.clone(*fused_op.getOperation(), mapper); - + // Update the operands of the cloned operation for (size_t i = 0; i < new_operands.size(); ++i) { new_fused_op->setOperand(i, new_operands[i]); @@ -171,33 +173,35 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { // Wrap outputs with data_mov - create separate data_mov for each user rewriter.setInsertionPointAfter(new_fused_op); - + // For each result of the fused_op, create a separate data_mov for each user - for (size_t result_idx = 0; result_idx < fused_op->getNumResults(); ++result_idx) { + for (size_t result_idx = 0; result_idx < fused_op->getNumResults(); + ++result_idx) { Value old_result = fused_op->getResult(result_idx); Value new_result = new_fused_op->getResult(result_idx); - + // Collect all users first (to avoid iterator invalidation) - SmallVector users_to_update; + SmallVector users_to_update; for (OpOperand &use : old_result.getUses()) { users_to_update.push_back(&use); } - + // Create a separate data_mov for each user for (OpOperand *use : users_to_update) { Operation *user_op = use->getOwner(); - - // If the user is already a data_mov (created by another fused_op's input wrapping), - // just update its operand to avoid nested data_mov + + // If the user is already a data_mov (created by another fused_op's + // input wrapping), just update its operand to avoid nested data_mov if (auto existing_mov = llvm::dyn_cast(user_op)) { if (use->getOperandNumber() == 0) { // data_mov only has one operand existing_mov->setOperand(0, new_result); continue; } } - + // Otherwise, create a new data_mov for this user - auto mov = rewriter.create(loc, new_result.getType(), new_result); + auto mov = rewriter.create(loc, new_result.getType(), + new_result); use->set(mov); } } @@ -226,19 +230,69 @@ struct InsertDataMovPass ModuleOp module_op = getOperation(); - // First, handle fused_op operations specially + // Step 1, handles fused_op operations specially. wrapFusedOpsWithDataMov(module_op); - // Then applies patterns to every region inside the module, excluding fused_op regions. - module_op.walk([&](Operation *op) { - if (!op->getRegions().empty() && !llvm::isa(op)) { - for (Region ®ion : op->getRegions()) { - if (failed(applyPatternsGreedily(region, frozen))) { - signalPassFailure(); - } - } + // Then applies patterns to every region inside the module, excluding + // fused_op regions. 
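[Editor's note] The disabled walk kept below for reference was superseded by the attribute-gated walks in Steps 2 and 3, which repeat the same skip-unless-neura gate. A generic helper could factor that gate out; this is a sketch only, with a hypothetical name and the attribute constants this pass already uses:

template <typename OpType>
static void forEachNeuraTarget(ModuleOp module_op,
                               llvm::function_ref<void(OpType)> fn) {
  module_op.walk([&](OpType op) {
    // Skips any op that is not explicitly assigned to the neura target.
    auto accel_attr =
        op->template getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
    if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget)
      return;
    fn(op);
  });
}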
+    // module_op.walk([&](Operation *op) {
+    //   if (!op->getRegions().empty() && !llvm::isa<neura::FusedOp>(op)) {
+    //     for (Region &region : op->getRegions()) {
+    //       if (failed(applyPatternsGreedily(region, frozen))) {
+    //         signalPassFailure();
+    //       }
+    //     }
+    //   }
+    // });
+
+    // Step 2: Processes functions with the neura accelerator attribute.
+    llvm::errs() << "[InsertDataMovPass] Processing functions...\n";
+    module_op.walk([&](func::FuncOp func_op) {
+      auto accel_attr =
+          func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        llvm::errs() << "[InsertDataMovPass] Skipping function: "
+                     << func_op.getName() << " (not neura target)\n";
+        return;
+      }
+
+      llvm::errs() << "[InsertDataMovPass] Processing function: "
+                   << func_op.getName() << "\n";
+
+      Region &func_region = func_op.getBody();
+      if (failed(applyPatternsGreedily(func_region, frozen))) {
+        llvm::errs() << "[InsertDataMovPass] ❌ Failed to apply patterns\n";
+        signalPassFailure();
+      } else {
+        llvm::errs() << "[InsertDataMovPass] ✅ Successfully processed\n";
+      }
+    });
+
+    // Step 3: Processes kernels with the neura accelerator attribute.
+    llvm::errs() << "[InsertDataMovPass] Processing kernels...\n";
+    module_op.walk([&](neura::KernelOp kernel_op) {
+      auto accel_attr =
+          kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        llvm::errs()
+            << "[InsertDataMovPass] Skipping kernel (not neura target)\n";
+        return;
+      }
+
+      llvm::errs() << "[InsertDataMovPass] Processing kernel...\n";
+
+      Region &kernel_region = kernel_op.getBody();
+      if (failed(applyPatternsGreedily(kernel_region, frozen))) {
+        llvm::errs() << "[InsertDataMovPass] ❌ Failed to apply patterns\n";
+        signalPassFailure();
+      } else {
+        llvm::errs() << "[InsertDataMovPass] ✅ Successfully processed\n";
+      }
+    });
+
+    llvm::errs() << "[InsertDataMovPass] ✅ Pass complete\n";
   }
 };
 } // namespace
diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index 700c2b4d..d8b7ef57 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -157,12 +157,11 @@ struct MapToAcceleratorPass
   }
 
   // Assigns unique dfg_id to all operations in SSA topological order.
-  void assignDfgIds(func::FuncOp func) {
+  void assignDfgIdsInRegion(Region &region, int &next_id) {
     // Uses existing topological sort to get all operations in order.
-    std::vector<Operation *> sorted_ops = getTopologicallySortedOps(func);
+    std::vector<Operation *> sorted_ops = getTopologicallySortedOps(region);
 
-    auto ctx = func.getContext();
-    int next_id = 0;
+    auto ctx = region.getContext();
 
     // Assigns ID to each operation in topological order.
     for (Operation *op : sorted_ops) {
@@ -177,6 +176,169 @@ struct MapToAcceleratorPass
              << " dfg_id(s) in total\n";
   }
 
+  // Generic mapping routine that works for both function and kernel mapping.
+  template <typename OpType>
+  bool mapRegion(OpType op, Region &region, Architecture &architecture,
+                 Mapping *mapping_strategy, bool is_spatial_only,
+                 int max_ctrl_mem_items,
+                 const std::string &resolved_mapping_mode,
+                 const std::string &resolved_mapping_strategy) {
+    // Checks steering mode compatibility with the architecture.
+    auto dataflow_mode_attr =
+        op->template getAttrOfType<StringAttr>(attr::kDataflowMode);
+    bool is_steering_mode =
+        (dataflow_mode_attr &&
+         dataflow_mode_attr.getValue() == attr::val::kModeSteering);
+    if (is_steering_mode) {
+      if (!is_spatial_only) {
+        op.emitError()
+            << "Steering mode mapping only supports spatial-only mapping mode.";
+        return false;
+      }
+    }
+
+    // Collects and reports recurrence cycles found in the region.
+    auto recurrence_cycles = collectRecurrenceCycles(region);
+    std::set<Operation *> critical_ops;
+    RecurrenceCycle *longest = nullptr;
+    int rec_mii = 1;
+    for (auto &cycle : recurrence_cycles) {
+      llvm::outs() << "[DEBUG] Recurrence cycle (length " << cycle.length
+                   << "):\n";
+      for (Operation *op : cycle.operations) {
+        critical_ops.insert(op);
+        llvm::outs() << "  " << *op << "\n";
+      }
+      if (!longest || cycle.length > longest->length) {
+        longest = &cycle;
+      }
+    }
+
+    if (longest) {
+      llvm::outs() << "[MapToAcceleratorPass] Longest recurrence cycle (length "
+                   << longest->length << "):\n";
+      for (Operation *op : longest->operations) {
+        op->print(llvm::outs()), llvm::outs() << "\n";
+      }
+      rec_mii = longest->length;
+    } else {
+      rec_mii = 1; // No recurrence cycles found, set MII to 1.
+    }
+
+    int res_mii = calculateResMii(region, architecture);
+
+    const int possible_min_ii = std::max(rec_mii, res_mii);
+    const int max_ii =
+        max_ctrl_mem_items; // Uses the YAML config (defaults to 20 if unspecified).
+
+    std::vector<Operation *> topologically_sorted_ops =
+        getTopologicallySortedOps(region);
+    if (topologically_sorted_ops.empty()) {
+      assert(false && "Mapping aborted due to empty op list.");
+    }
+
+    // Filters out operations inside fused_op regions.
+    // Maps only the fused_op itself, not the operations within its region.
+    std::vector<Operation *> filtered_ops;
+    int skipped_count = 0;
+    for (Operation *op : topologically_sorted_ops) {
+      Operation *parent_op = op->getParentOp();
+      // Checks if the parent is a fused_op by its operation name.
+      if (parent_op &&
+          parent_op->getName().getStringRef().contains(attr::val::kOpFused)) {
+        // Skips operations inside the fused_op region.
+        llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: "
+                     << *op << "\n";
+        skipped_count++;
+        continue;
+      }
+      filtered_ops.push_back(op);
+    }
+    topologically_sorted_ops = std::move(filtered_ops);
+
+    if (skipped_count > 0) {
+      llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count
+                   << " operations inside fused_op regions\n";
+    }
+
+    for (Operation *op : topologically_sorted_ops) {
+      llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " << *op
+                   << "\n";
+    }
+    std::vector<std::vector<Operation *>> level_buckets =
+        getOpsInAlapLevels(topologically_sorted_ops, critical_ops);
+    for (int level = 0; level < static_cast<int>(level_buckets.size());
+         ++level) {
+      llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level
+                   << ": " << level_buckets[level].size() << " ops\n";
+      for (Operation *op : level_buckets[level]) {
+        llvm::outs() << "  " << *op << "\n";
+      }
+    }
+    std::vector<std::pair<Operation *, int>> sorted_ops_with_alap_levels =
+        flatten_level_buckets(level_buckets, critical_ops);
+    for (const auto &[op, level] : sorted_ops_with_alap_levels) {
+      llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op
+                   << " (ALAP level: " << level << ")\n";
+    }
+    for (int ii = possible_min_ii; ii <= max_ii; ++ii) {
+      llvm::errs() << "[MapToAcceleratorPass] Start mapping with target II of "
+                   << ii << "\n";
+      // Creates a mapping state for the current II.
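[Editor's note] Worked example of the II search that follows: with rec_mii = 2 and res_mii = 1, possible_min_ii = max(2, 1) = 2, so the first MappingState is built for II = 2; each failed attempt retries at II + 1 until max_ii (max_ctrl_mem_items) is exhausted. The MAPPED FileCheck lines of the fir test later in this series record exactly this outcome: rec_mii = 2, res_mii = 1, compiled_ii = 2.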
+ MappingState mapping_state(architecture, ii, is_spatial_only); + if (mapping_strategy->map(sorted_ops_with_alap_levels, critical_ops, + architecture, mapping_state)) { + // success + if (dumpMappingTable) { + // logs to stderr + mapping_state.dumpOpToLocs(); + } + mapping_state.encodeMappingState(); + + // Assigns unique dfg_id to all operations in SSA topological order. + int next_id = 0; + assignDfgIdsInRegion(region, next_id); + + // Sets the mapping_info attribute on the function. + auto ctx = op->getContext(); + SmallVector mapping_attrs; + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kXTiles), + IntegerAttr::get(IntegerType::get(ctx, 32), + architecture.getPerCgraColumns()))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kYTiles), + IntegerAttr::get(IntegerType::get(ctx, 32), + architecture.getPerCgraRows()))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kMappingStrategy), + StringAttr::get(ctx, resolved_mapping_strategy))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kMappingMode), + StringAttr::get(ctx, resolved_mapping_mode))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kCompiledII), + IntegerAttr::get(IntegerType::get(ctx, 32), ii))); + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(ctx, attr::kRecMII), + IntegerAttr::get(IntegerType::get(ctx, 32), rec_mii))); + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(ctx, attr::kResMII), + IntegerAttr::get(IntegerType::get(ctx, 32), res_mii))); + DictionaryAttr mapping_info = DictionaryAttr::get(ctx, mapping_attrs); + + op->setAttr(attr::kMappingInfo, mapping_info); + return true; + } + llvm::errs() << "[MapToAcceleratorPass] Mapping failed for target II of " + << ii << "\n"; + mapping_state.dumpOpToLocs(); + } + llvm::errs() + << "[MapToAcceleratorPass] Mapping failed for all target II values.\n"; + return false; + } + void runOnOperation() override { ModuleOp module = getOperation(); llvm::errs() << "[MapToAcceleratorPass] Starting mapping pass...\n"; @@ -193,173 +356,103 @@ struct MapToAcceleratorPass const Architecture &architecture = mlir::neura::getArchitecture(); - module.walk([&](func::FuncOp func) { - // Skips functions not targeting the neura accelerator. - auto accel_attr = - func->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + std::string architecture_spec_file = mlir::neura::getArchitectureSpecFile(); + int multi_cgra_rows = kMultiCgraDefaultRows; + int multi_cgra_columns = kMultiCgraDefaultColumns; + int per_cgra_rows = kPerCgraDefaultRows; + int per_cgra_columns = kPerCgraDefaultColumns; + int max_ctrl_mem_items = kDefaultMaxCtrlMemItems; + mlir::neura::TileDefaults tile_defaults; + std::vector tile_overrides; + mlir::neura::LinkDefaults link_defaults; + std::vector link_overrides; + mlir::neura::BaseTopology multi_cgra_base_topology = + mlir::neura::BaseTopology::MESH; + mlir::neura::BaseTopology per_cgra_base_topology = + mlir::neura::BaseTopology::MESH; + + if (!architecture_spec_file.empty()) { + + // Use LLVM YAML parser to validate the YAML syntax (no mapping yet) + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(architecture_spec_file); + if (!buffer_or_err) { + llvm::errs() << "[MapToAcceleratorPass] Failed to open architecture " + "specification file: " + << architecture_spec_file << "\n"; return; } - // Checks the dataflow IR mode. 
- auto dataflow_mode_attr = - func->getAttrOfType(attr::kDataflowMode); - bool is_steering_mode = - (dataflow_mode_attr && - dataflow_mode_attr.getValue() == attr::val::kModeSteering); - - // If steering mode, enforce spatial-only mapping. - if (is_steering_mode) { - if (!is_spatial_only) { - func.emitError() << "Steering IR mode requires spatial-only mapping, " - << "but got mapping mode: " << resolved_mapping_mode; - signalPassFailure(); - return; - } - llvm::errs() << "[MapToAcceleratorPass] Using spatial-only mapping for " - "steering mode function: " - << func.getName() << "\n"; - } + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); - // Collects and reports recurrence cycles found in the function. - auto recurrence_cycles = collectRecurrenceCycles(func); - std::set critical_ops; - RecurrenceCycle *longest = nullptr; - int rec_mii = 1; - for (auto &cycle : recurrence_cycles) { - llvm::outs() << "[DEBUG] Recurrence cycle (length " << cycle.length - << "):\n"; - for (Operation *op : cycle.operations) { - critical_ops.insert(op); - llvm::outs() << " " << *op << "\n"; - } - if (!longest || cycle.length > longest->length) { - longest = &cycle; - } + bool parse_failed = false; + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + (void)yaml_doc; // ensure document is created + if (yaml_stream.failed()) { + parse_failed = true; } - if (longest) { - llvm::outs() - << "[MapToAcceleratorPass] Longest recurrence cycle (length " - << longest->length << "):\n"; - for (Operation *op : longest->operations) { - op->print(llvm::outs()), llvm::outs() << "\n"; - } - rec_mii = longest->length; - } else if (!longest) { - rec_mii = 1; // No recurrence cycles found, set MII to 1. + if (parse_failed) { + llvm::errs() << "[MapToAcceleratorPass] YAML parse error in: " + << architecture_spec_file << "\n"; + return; } - int res_mii = calculateResMii(func, architecture); + // Parses YAML configuration. + if (!parseArchitectureYaml( + yaml_doc, multi_cgra_rows, multi_cgra_columns, + multi_cgra_base_topology, per_cgra_rows, per_cgra_columns, + per_cgra_base_topology, max_ctrl_mem_items, tile_defaults, + tile_overrides, link_defaults, link_overrides)) { + return; + } + } else { + llvm::errs() << "[MapToAcceleratorPass] No architecture specification " + "file provided.\n"; + } - const int possible_min_ii = std::max(rec_mii, res_mii); - const int max_allowed_ii = architecture.getMaxCtrlMemItems(); + // Creates architecture. + Architecture architecture( + multi_cgra_rows, multi_cgra_columns, multi_cgra_base_topology, + per_cgra_rows, per_cgra_columns, per_cgra_base_topology, tile_defaults, + tile_overrides, link_defaults, link_overrides); - std::vector topologically_sorted_ops = - getTopologicallySortedOps(func); - if (topologically_sorted_ops.empty()) { - llvm::errs() - << "[MapToAcceleratorPass] No operations to map in function " - << func.getName() << "\n"; - assert(false && "Mapping aborted due to empty op list."); + // Maps kernels. 
+ module.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; } - // Filter out operations inside fused_op regions - // Only map the fused_op itself, not the operations within its region - std::vector filtered_ops; - int skipped_count = 0; - for (Operation *op : topologically_sorted_ops) { - Operation *parent_op = op->getParentOp(); - // Check if parent is a fused_op by checking operation name - if (parent_op && - parent_op->getName().getStringRef().contains(attr::val::kOpFused)) { - // Skip operations inside fused_op region - llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: " - << *op << "\n"; - skipped_count++; - continue; - } - filtered_ops.push_back(op); + Region &kernel_region = kernel_op.getBody(); + if (!mapRegion(kernel_op, kernel_region, architecture, + mapping_strategy.get(), is_spatial_only, + max_ctrl_mem_items, resolved_mapping_mode, + resolved_mapping_strategy)) { + llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n"; + signalPassFailure(); } - topologically_sorted_ops = std::move(filtered_ops); + }); - if (skipped_count > 0) { - llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count - << " operations inside fused_op regions\n"; + // Maps functions. + module.walk([&](func::FuncOp func_op) { + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; } - for (Operation *op : topologically_sorted_ops) { - llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " - << *op << "\n"; - } - std::vector> level_buckets = - getOpsInAlapLevels(topologically_sorted_ops, critical_ops); - for (int level = 0; level < static_cast(level_buckets.size()); - ++level) { - llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level - << ": " << level_buckets[level].size() << " ops\n"; - for (Operation *op : level_buckets[level]) { - llvm::outs() << " " << *op << "\n"; - } - } - std::vector> sorted_ops_with_alap_levels = - flatten_level_buckets(level_buckets, critical_ops); - for (const auto &[op, level] : sorted_ops_with_alap_levels) { - llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op - << " (ALAP level: " << level << ")\n"; - } - // assert(false); - for (int ii = possible_min_ii; ii <= max_allowed_ii; ++ii) { - llvm::errs() - << "[MapToAcceleratorPass] Start mapping with target II of " << ii - << "\n"; - // Creates a mapping state for the current II. - MappingState mapping_state(architecture, ii, is_spatial_only); - if (mapping_strategy->map(sorted_ops_with_alap_levels, critical_ops, - architecture, mapping_state)) { - // success - if (dumpMappingTable) { - // logs to stderr - mapping_state.dumpOpToLocs(); - } - mapping_state.encodeMappingState(); - - // Assigns unique dfg_id to all operations in SSA topological order. - assignDfgIds(func); - - // Sets the mapping_info attribute on the function. 
- auto ctx = func.getContext(); - SmallVector mapping_attrs; - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kXTiles), - IntegerAttr::get(IntegerType::get(ctx, 32), - architecture.getPerCgraColumns()))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kYTiles), - IntegerAttr::get(IntegerType::get(ctx, 32), - architecture.getPerCgraRows()))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kMappingStrategy), - StringAttr::get(ctx, resolved_mapping_strategy))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kMappingMode), - StringAttr::get(ctx, resolved_mapping_mode))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kCompiledII), - IntegerAttr::get(IntegerType::get(ctx, 32), ii))); - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kRecMII), - IntegerAttr::get(IntegerType::get(ctx, 32), rec_mii))); - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kResMII), - IntegerAttr::get(IntegerType::get(ctx, 32), res_mii))); - DictionaryAttr mapping_info = DictionaryAttr::get(ctx, mapping_attrs); - - func->setAttr(attr::kMappingInfo, mapping_info); - break; - } - llvm::errs() << "[DEBUG] mapping failed for II = " << ii << "\n"; - mapping_state.dumpOpToLocs(); // logs to stderr + Region &func_region = func_op.getBody(); + + if (!mapRegion(func_op, func_region, architecture, mapping_strategy.get(), + is_spatial_only, max_ctrl_mem_items, resolved_mapping_mode, + resolved_mapping_strategy)) { + llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n"; + signalPassFailure(); } }); } diff --git a/test/neura/interpreter/lower_and_interpret.mlir b/test/neura/interpreter/lower_and_interpret.mlir index 9d50c317..2559b89b 100644 --- a/test/neura/interpreter/lower_and_interpret.mlir +++ b/test/neura/interpreter/lower_and_interpret.mlir @@ -19,7 +19,7 @@ // RUN: %t-out.bin > %t-dumped_output.txt -// RUN: mlir-neura-opt --lower-arith-to-neura --insert-data-mov %s \ +// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --insert-data-mov %s \ // RUN: -o %t-neura.mlir // RUN: neura-interpreter %t-neura.mlir >> %t-dumped_output.txt diff --git a/test/neura/interpreter/lower_and_interpret_subf.mlir b/test/neura/interpreter/lower_and_interpret_subf.mlir index a91bed9d..9670adeb 100644 --- a/test/neura/interpreter/lower_and_interpret_subf.mlir +++ b/test/neura/interpreter/lower_and_interpret_subf.mlir @@ -19,7 +19,7 @@ // RUN: %t-out.bin > %t-dumped_output.txt -// RUN: mlir-neura-opt --lower-arith-to-neura --insert-data-mov %s \ +// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --insert-data-mov %s \ // RUN: -o %t-neura.mlir // RUN: neura-interpreter %t-neura.mlir >> %t-dumped_output.txt From 6f7084d0d722f1b9ffba5046f8de6740f97bc851 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 16:03:05 +0800 Subject: [PATCH 13/25] enable kernel mapping --- .../Transforms/CanonicalizeCastPass.cpp | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp index 18bde2b3..5b06f085 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp @@ -111,31 +111,39 @@ struct CanonicalizeCastPass void runOnOperation() override { auto module_op = getOperation(); - module_op.walk([&](Operation *op) { - Region *region = nullptr; - if (auto 
func_op = dyn_cast<func::FuncOp>(op)) {
-        auto accel_attr =
-            func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
-        if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
-          return;
-        }
-        region = &func_op.getBody();
-      } else if (auto llvm_func = dyn_cast<LLVM::LLVMFuncOp>(op)) {
-        auto accel_attr =
-            llvm_func->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
-        if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
-          return;
-        }
-        region = &llvm_func.getBody();
-      } else {
+    // Processes functions.
+    module_op.walk([&](func::FuncOp func_op) {
+      auto accel_attr =
+          func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        return;
+      }
+      Region &func_region = func_op.getBody();
+
+      if (func_region.empty()) {
+        return;
+      }
+
+      if (failed(canonicalizeCast(func_region))) {
+        signalPassFailure();
+        return;
+      }
+    });
+
+    // Processes neura.kernel ops.
+    module_op.walk([&](neura::KernelOp kernel_op) {
+      auto accel_attr =
+          kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
         return;
       }
+      Region &kernel_region = kernel_op.getBody();
 
-      if (!region || region->empty()) {
+      if (kernel_region.empty()) {
         return;
       }
 
-      if (failed(canonicalizeCast(*region))) {
+      if (failed(canonicalizeCast(kernel_region))) {
         signalPassFailure();
         return;
       }

From 7a06474ada76788e2289740c7d65139d2cfb1ccb Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 24 Jan 2026 19:14:19 +0800
Subject: [PATCH 14/25] distinguish iter_arg_init in fold-constant pass

---
 include/NeuraDialect/NeuraPasses.h            |   1 -
 .../Transforms/MapToAcceleratorPass.cpp       |   3 +
 .../HwAgnosticOpt/FoldConstantPass.cpp        |  17 +-
 .../TransformCtrlToDataFlowPass.cpp           |   2 +
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ------------
 test/multi-cgra/kernel_mapping/fir/fir.mlir   | 202 ++++++++++++++++++
 .../kernel_with_yield/kernel_with_yield.mlir  |  38 ----
 .../kernel_without_yield.mlir                 |  30 ---
 .../multi-kernel/multi-kernel.mlir            |  89 --------
 .../irregular-loop/irregular-loop.mlir        |   6 +-
 10 files changed, 226 insertions(+), 304 deletions(-)
 delete mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
 create mode 100644 test/multi-cgra/kernel_mapping/fir/fir.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir

diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 0b77521d..75ddbd24 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -30,7 +30,6 @@ std::unique_ptr<Pass> createCanonicalizeLiveInPass();
 std::unique_ptr<Pass> createPromoteInputArgToConstPass();
 std::unique_ptr<Pass> createTransformToSteerControlPass();
 std::unique_ptr<Pass> createRemovePredicatedTypePass();
-std::unique_ptr<Pass> createWrapLoopInKernelPass();
 
 // ====================================
 // Optimization Passes
diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index d8b7ef57..cfe14543 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -225,6 +225,9 @@ struct MapToAcceleratorPass
       rec_mii = 1; // No recurrence cycles found, set MII to 1.
} + llvm::errs() << "[MapToAcceleratorPass] Calculated Recurrence MII: " + << rec_mii << "\n"; + int res_mii = calculateResMii(region, architecture); const int possible_min_ii = std::max(rec_mii, res_mii); diff --git a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp index c50519cf..105f3635 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp +++ b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp @@ -1,6 +1,6 @@ +#include "NeuraDialect/NeuraAttributes.h" #include "NeuraDialect/NeuraOps.h" #include "NeuraDialect/NeuraTypes.h" -#include "NeuraDialect/NeuraAttributes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -22,6 +22,8 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { +// Attribute name to mark iter_arg init constants. +constexpr const char *kIterArgInitAttr = "is_iter_arg_init"; // ========================================= // Helper Functions @@ -35,6 +37,16 @@ bool isOriginConstantOp(Value value) { return false; } + // Skips constants marked as iter_arg_init. + if (def_op->hasAttr(kIterArgInitAttr)) { + if (auto bool_attr = def_op->getAttrOfType(kIterArgInitAttr)) { + if (bool_attr.getValue()) { + // This constant is an iter_arg_init, should not be folded. + return false; + } + } + } + // Checks if the result type is the original type or the predicated type. Type result_type = value.getType(); if (isa(result_type)) { @@ -434,7 +446,8 @@ struct FuseStoreIndexedConstantPattern LogicalResult matchAndRewrite(neura::StoreIndexedOp op, PatternRewriter &rewriter) const override { // Checks if already folded. 
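[Editor's note] On the is_iter_arg_init guard introduced above: the intended protocol is that whichever pass materializes a loop-carried init constant tags it, and this folder then refuses to treat the tagged op as a plain foldable constant. In sketch form (the attribute name is kIterArgInitAttr as defined above; both call sites are illustrative, though the removeAttr side matches the TransformCtrlToDataFlowPass hunk later in this patch):

// Producer side: tags the init constant so fold-constant leaves it alone.
init_const->setAttr(kIterArgInitAttr, builder.getBoolAttr(true));
// Consumer side: drops the marker once the iter_arg has been wired through
// reserve/phi/ctrl_mov.
init_const->removeAttr(kIterArgInitAttr);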
- if (op->hasAttr(neura::attr::kLhsValue) || op->hasAttr(neura::attr::kRhsValue)) { + if (op->hasAttr(neura::attr::kLhsValue) || + op->hasAttr(neura::attr::kRhsValue)) { return failure(); } diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 556d6181..14257c8c 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -215,6 +215,8 @@ void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block, reserve_op.getResult()); iter_arg_final_values.push_back(feedback_value); + + init_const->removeAttr(kIterArgInitAttr); llvm::errs() << "[iter_args] Created iter_arg with grant_once\n"; } diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp deleted file mode 100644 index ac664382..00000000 --- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "NeuraDialect/NeuraDialect.h" -#include "NeuraDialect/NeuraOps.h" -#include "NeuraDialect/NeuraPasses.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Types.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/TypeID.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/STLExtras.h" -#include - -using namespace mlir; - -namespace { - -static bool isInnermostLoop(affine::AffineForOp for_op) { - bool has_nested_loops = false; - for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; }); - return !has_nested_loops; -} - -// Wraps an innermost affine for loop in a neura.kernel operation. -static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op, - OpBuilder &builder, - unsigned &kernel_id) { - Location loc = for_op.getLoc(); - - // Collects values that need to be captured by the kernel. - llvm::SetVector captured_values; - getUsedValuesDefinedAbove(for_op.getRegion(), captured_values); - - // Checks if the loop has output values. - bool has_outputs = !for_op.getResults().empty(); - - // Creates the neura.kernel operation. - builder.setInsertionPoint(for_op); - SmallVector inputs(captured_values.begin(), captured_values.end()); - SmallVector input_types; - for (Value val : inputs) { - input_types.push_back(val.getType()); - } - - neura::KernelOp kernel_op = builder.create( - loc, /*output_types=*/for_op->getResultTypes(), - /*inputs=*/inputs); - - // Sets kernel name. - std::string kernel_name = "kernel_" + std::to_string(kernel_id++); - kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name)); - - // Creats the kernel body block with arguments for captured values. - Block *kernel_body = new Block(); - kernel_op.getBody().push_back(kernel_body); - - // Replaces uses of the original loop's results with kernel results. - if (has_outputs) { - for (auto [orig_result, kernel_result] : - llvm::zip(for_op->getResults(), kernel_op.getResults())) { - orig_result.replaceAllUsesWith(kernel_result); - } - } - - // Moves the loop directly in to the kernel body. - builder.setInsertionPointToStart(kernel_body); - for_op->moveBefore(kernel_body, kernel_body->end()); - - builder.setInsertionPointToEnd(kernel_body); - // Adds yield operation with proper operands. - if (has_outputs) { - // If the loop has outputs, yield the loop results. 
- SmallVector yield_operands(for_op.getResults()); - builder.create(loc, ValueRange{}, yield_operands); - } else { - // If the loop has no outputs, create an empty yield. - builder.create(loc); - } - - return success(); -} - -struct WrapLoopInKernelPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass) - - StringRef getArgument() const override { return "wrap-loop-in-kernel"; } - StringRef getDescription() const override { - return "Wraps loops in Neura kernel operations."; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - // Skips if function already has kerenls. - bool has_kernels = false; - func_op.walk([&](neura::KernelOp) { has_kernels = true; }); - if (has_kernels) { - return; - } - - // Skips main function. - if (func_op.getName() == "main") { - return; - } - - // Collects all innermost affine for loops in the function. - // TODO: Support more kernel wrapping strategies. - SmallVector innermost_loops; - func_op.walk([&](affine::AffineForOp for_op) { - if (isInnermostLoop(for_op)) { - innermost_loops.push_back(for_op); - } - }); - - if (innermost_loops.empty()) { - return; - } - - // Wraps each innermost affine for loop in a neura.kernel operation. - // TODO: Support more kernel wrapping strategies. - OpBuilder builder(func_op->getContext()); - unsigned kernel_id = 0; - for (affine::AffineForOp loop : innermost_loops) { - if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) { - signalPassFailure(); - return; - } - } - } -}; -} // namespace - -std::unique_ptr mlir::neura::createWrapLoopInKernelPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir new file mode 100644 index 00000000..5924f46c --- /dev/null +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -0,0 +1,202 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: -o %t.taskflow.mlir +// RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: -o %t.canonicalized.mlir +// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: 
--convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + + + +module attributes {} { + func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = affine.for %arg3 = 0 to 32 iter_args(%arg4 = %c0_i32) -> (i32) { + %1 = affine.load %arg0[%arg3] : memref + %2 = affine.load %arg2[%arg3] : memref + %3 = arith.muli %1, %2 : i32 + %4 = arith.addi %arg4, %3 : i32 + affine.yield %4 : i32 + } + return %0 : i32 + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { +// TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: %3 = arith.muli %1, %2 : i32 +// TASKFLOW-NEXT: %4 = arith.addi %arg7, %3 : i32 +// TASKFLOW-NEXT: affine.yield %4 : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// TASKFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// TASKFLOW-NEXT: return %value_outputs : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// CANONICALIZE: module { +// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 +// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : 
index, step = 1 : index, upper_bound = 32 : index} : index +// CANONICALIZE-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: i32): +// CANONICALIZE-NEXT: %2 = memref.load %arg3[%arg6] : memref +// CANONICALIZE-NEXT: %3 = memref.load %arg4[%arg6] : memref +// CANONICALIZE-NEXT: %4 = arith.muli %2, %3 : i32 +// CANONICALIZE-NEXT: %5 = arith.addi %arg7, %4 : i32 +// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) +// CANONICALIZE-NEXT: }) : (index, i32) -> i32 +// CANONICALIZE-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> i32 +// CANONICALIZE-NEXT: return %value_outputs : i32 +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: } + +// KERNEL: module { +// KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 +// KERNEL-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { +// KERNEL-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// KERNEL-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %3 = memref.load %arg6[%2] : memref +// KERNEL-NEXT: %4 = memref.load %arg7[%2] : memref +// KERNEL-NEXT: %5 = arith.muli %3, %4 : i32 +// KERNEL-NEXT: %6 = arith.addi %arg8, %5 : i32 +// KERNEL-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) +// KERNEL-NEXT: } : i32 +// KERNEL-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// KERNEL-NEXT: }) : (memref, memref, i32) -> i32 +// KERNEL-NEXT: return %value_outputs : i32 +// KERNEL-NEXT: } +// KERNEL-NEXT: } + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// NEURA-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %3 = neura.load_indexed %arg6[%2 : index] memref : i32 +// NEURA-NEXT: %4 = neura.load_indexed %arg7[%2 : index] memref : i32 +// NEURA-NEXT: %5 = "neura.mul"(%3, %4) : (i32, i32) -> i32 +// NEURA-NEXT: %6 = "neura.add"(%arg8, %5) : (i32, i32) -> i32 +// NEURA-NEXT: neura.yield 
iter_args_next(%6 : i32) results(%6 : i32) +// NEURA-NEXT: } : i32 +// NEURA-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: return %value_outputs : i32 +// NEURA-NEXT: } +// NEURA-NEXT: } + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %4 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %5 = "neura.mul"(%3, %4) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %6 = "neura.add"(%5) {lhs_value = "%iter_arg_init0"} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %7 = neura.extract_predicate %2 : !neura.data -> !neura.data +// DATAFLOW-NEXT: %8 = "neura.not"(%7) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %9 = neura.grant_predicate %6, %8 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %9 : !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir deleted file mode 100644 index ad24eac4..00000000 --- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir +++ /dev/null @@ -1,38 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. 
-// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: -o %t-wrapped.mlir -// RUN: FileCheck %s --input-file=%t-wrapped.mlir - -module attributes {} { - func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { - %c0_i32 = arith.constant 0 : i32 - %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { - %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { - %2 = affine.load %arg0[%arg1, %arg3] : memref - %3 = arith.addi %arg4, %2 : i32 - affine.yield %3 : i32 - } - affine.yield %1 : i32 - } - return %0 : i32 - } -} - - // CHECK: module { - // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 - // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { - // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { - // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref - // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32 - // CHECK-NEXT: affine.yield %4 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: neura.yield %2 : i32 - // CHECK-NEXT: } : i32 - // CHECK-NEXT: affine.yield %1 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: return %0 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: } diff --git a/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir b/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir deleted file mode 100644 index 0775cf19..00000000 --- a/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir +++ /dev/null @@ -1,30 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. -// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: | FileCheck %s - -module attributes {} { - func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { - affine.for %arg2 = 0 to 128 { - affine.for %arg3 = 0 to 128 { - %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref - affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref - } - } - return - } -} - - // CHECK: module { - // CHECK-NEXT: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: affine.for %arg2 = 0 to 128 { - // CHECK-NEXT: neura.kernel ins(%arg0, %arg1, %arg2 : memref, memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: affine.for %arg3 = 0 to 128 { - // CHECK-NEXT: %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref - // CHECK-NEXT: affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT: } - // CHECK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir b/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir deleted file mode 100644 index 12e2846e..00000000 --- a/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir +++ /dev/null @@ -1,89 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. -// This function is a convolution followed by ReLU activation. 
- -// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: | FileCheck %s - -module attributes {} { - func.func @_Z17conv3x3_then_reluPA32_A32_KfPA3_A3_A3_S_PS_PA30_A30_fSA_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) attributes {llvm.linkage = #llvm.linkage} { - %cst = arith.constant 0.000000e+00 : f32 - affine.for %arg5 = 0 to 64 { - affine.for %arg6 = 0 to 30 { - affine.for %arg7 = 0 to 30 { - %0 = affine.load %arg2[%arg5] : memref - %1 = affine.for %arg8 = 0 to 3 iter_args(%arg9 = %0) -> (f32) { - %2 = affine.for %arg10 = 0 to 3 iter_args(%arg11 = %arg9) -> (f32) { - %3 = affine.for %arg12 = 0 to 3 iter_args(%arg13 = %arg11) -> (f32) { - %4 = affine.load %arg0[%arg8, %arg6 + %arg10, %arg7 + %arg12] : memref - %5 = affine.load %arg1[%arg5, %arg8, %arg10, %arg12] : memref - %6 = arith.mulf %4, %5 : f32 - %7 = arith.addf %arg13, %6 : f32 - affine.yield %7 : f32 - } - affine.yield %3 : f32 - } - affine.yield %2 : f32 - } - affine.store %1, %arg3[%arg5, %arg6, %arg7] : memref - } - } - } - affine.for %arg5 = 0 to 64 { - affine.for %arg6 = 0 to 30 { - affine.for %arg7 = 0 to 30 { - %0 = affine.load %arg3[%arg5, %arg6, %arg7] : memref - %1 = arith.cmpf ogt, %0, %cst : f32 - %2 = arith.select %1, %0, %cst : f32 - affine.store %2, %arg4[%arg5, %arg6, %arg7] : memref - } - } - } - return - } -} - - - // CHECK: module { - // CHECK-NEXT: func.func @_Z17conv3x3_then_reluPA32_A32_KfPA3_A3_A3_S_PS_PA30_A30_fSA_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 - // CHECK-NEXT: affine.for %arg5 = 0 to 64 { - // CHECK-NEXT: affine.for %arg6 = 0 to 30 { - // CHECK-NEXT: affine.for %arg7 = 0 to 30 { - // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref - // CHECK-NEXT: %1 = affine.for %arg8 = 0 to 3 iter_args(%arg9 = %0) -> (f32) { - // CHECK-NEXT: %2 = affine.for %arg10 = 0 to 3 iter_args(%arg11 = %arg9) -> (f32) { - // CHECK-NEXT: %3 = neura.kernel ins(%arg0, %arg8, %arg6, %arg10, %arg7, %arg1, %arg5 : memref, index, index, index, index, memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: %4 = affine.for %arg12 = 0 to 3 iter_args(%arg13 = %arg11) -> (f32) { - // CHECK-NEXT: %5 = affine.load %arg0[%arg8, %arg6 + %arg10, %arg7 + %arg12] : memref - // CHECK-NEXT: %6 = affine.load %arg1[%arg5, %arg8, %arg10, %arg12] : memref - // CHECK-NEXT: %7 = arith.mulf %5, %6 : f32 - // CHECK-NEXT: %8 = arith.addf %arg13, %7 : f32 - // CHECK-NEXT: affine.yield %8 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: neura.yield %4 : f32 - // CHECK-NEXT: } : f32 - // CHECK-NEXT: affine.yield %3 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: affine.yield %2 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: affine.store %1, %arg3[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: affine.for %arg5 = 0 to 64 { - // CHECK-NEXT: affine.for %arg6 = 0 to 30 { - // CHECK-NEXT: neura.kernel ins(%arg3, %arg5, %arg6, %cst, %arg4 : memref, index, index, f32, memref) attributes {kernel_name = "kernel_1"} { - // CHECK-NEXT: affine.for %arg7 = 0 to 30 { - // CHECK-NEXT: %0 = affine.load %arg3[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: %1 = arith.cmpf ogt, %0, %cst : f32 - // CHECK-NEXT: %2 = arith.select %1, %0, %cst : f32 - // CHECK-NEXT: affine.store %2, %arg4[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT: } - // 
CHECK-NEXT: } - diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 6ce8e5e6..9d1e6f46 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -107,7 +107,7 @@ module attributes {} { // HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): // HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 // HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield outputs(%4 : i32) +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) // HYPERBLOCK-NEXT: }) : (index, i32) -> i32 // HYPERBLOCK-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () // HYPERBLOCK-NEXT: }) : (i32) -> i32 @@ -151,6 +151,8 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: } + + // CANONICALIZE: module { // CANONICALIZE-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { // CANONICALIZE-NEXT: %c2_i32 = arith.constant 2 : i32 @@ -165,7 +167,7 @@ module attributes {} { // CANONICALIZE-NEXT: ^bb0(%arg1: index, %arg2: i32): // CANONICALIZE-NEXT: %3 = arith.index_cast %arg1 : index to i32 // CANONICALIZE-NEXT: %4 = arith.addi %arg2, %3 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield outputs(%4 : i32) +// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) // CANONICALIZE-NEXT: }) : (index, i32) -> i32 // CANONICALIZE-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () // CANONICALIZE-NEXT: }) : (i32) -> i32 From 8948aee1e00a7c2932241a61f2623ba8b9687f3e Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 19:41:01 +0800 Subject: [PATCH 15/25] add tests for e2e taskflow2neura test --- .../NeuraDialect/Architecture/Architecture.h | 10 +- .../Architecture/ArchitectureSpec.h | 11 +- test/arch_spec/architecture.yaml | 2 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 66 +++- .../loop-in-kernel/loop-in-kernel.mlir | 203 +++++++++++++ test/multi-cgra/kernel_mapping/relu/relu.mlir | 286 ++++++++++++++++++ 6 files changed, 560 insertions(+), 18 deletions(-) create mode 100644 test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir create mode 100644 test/multi-cgra/kernel_mapping/relu/relu.mlir diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 7dcbad9e..a27af7e2 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -88,7 +88,10 @@ enum OperationKind { // Data movement operations. IReserve = 38, IDataMov = 39, - ICtrlMov = 40 + ICtrlMov = 40, + // Counter operations. + ICounter = 41, + IExtractPredicate = 42 }; // Maps hardware resource names to their supported operations. @@ -135,7 +138,10 @@ static const std::map> // Predicate operations. {"grant", {IGrantPredicate, IGrantOnce, IGrantAlways}}, -}; + + // Counter operations. + {"counter", {ICounter}}, + {"extract_predicate", {IExtractPredicate}}}; //===----------------------------------------------------------------------===// // BasicResource: abstract base class for Tile, Link, etc. 
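[Editor's note] The hunk above registers the two new operation kinds (ICounter, IExtractPredicate) and exposes them through the resource table, so a tile advertises counter support simply by listing "counter" among its fu_types. A minimal sketch of the lookup this enables (the table's identifier is cut off in this diff, so resource_operations_map below is a stand-in name; the helper itself is illustrative, not part of the patch):

static bool tileSupports(const std::vector<std::string> &fu_types,
                         OperationKind kind) {
  for (const std::string &fu : fu_types) {
    auto it = resource_operations_map.find(fu);
    if (it == resource_operations_map.end())
      continue;
    // E.g. fu == "counter" now lists ICounter.
    if (llvm::is_contained(it->second, kind))
      return true;
  }
  return false;
}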
diff --git a/include/NeuraDialect/Architecture/ArchitectureSpec.h b/include/NeuraDialect/Architecture/ArchitectureSpec.h index 70ee0033..9cd4bff9 100644 --- a/include/NeuraDialect/Architecture/ArchitectureSpec.h +++ b/include/NeuraDialect/Architecture/ArchitectureSpec.h @@ -21,11 +21,12 @@ struct TileDefaults { // Default function unit types - include all supported function units // types for newbie convenience. std::vector function_units = { - "add", "mul", "div", "fadd", "fmul", - "fdiv", "logic", "cmp", "sel", "type_conv", - "shift", "vfmul", "fadd_fadd", "fmul_fadd", "grant", - "loop_control", "phi", "constant", "mem", "return", - "mem_indexed", "alloca"}; + "add", "mul", "div", "fadd", + "fmul", "fdiv", "logic", "cmp", + "sel", "type_conv", "shift", "vfmul", + "fadd_fadd", "fmul_fadd", "grant", "loop_control", + "phi", "constant", "mem", "return", + "mem_indexed", "alloca", "counter", "extract_predicate"}; }; // Structure for holding memory configuration. diff --git a/test/arch_spec/architecture.yaml b/test/arch_spec/architecture.yaml index 31235dca..cf0730b5 100644 --- a/test/arch_spec/architecture.yaml +++ b/test/arch_spec/architecture.yaml @@ -15,7 +15,7 @@ per_cgra_defaults: tile_defaults: num_registers: 32 - fu_types: ["add", "mul", "div", "fadd", "fmul", "fdiv", "logic", "cmp", "sel", "type_conv", "vfmul", "fadd_fadd", "fmul_fadd", "grant", "loop_control", "phi", "constant", "mem", "return", "mem_indexed", "alloca", "shift"] + fu_types: ["add", "mul", "div", "fadd", "fmul", "fdiv", "logic", "cmp", "sel", "type_conv", "vfmul", "fadd_fadd", "fmul_fadd", "grant", "loop_control", "phi", "constant", "mem", "return", "mem_indexed", "alloca", "shift", "counter", "extract_predicate"] link_defaults: latency: 1 diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index 5924f46c..46f62a2c 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -75,6 +75,7 @@ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ +// RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ // RUN: -o %t.mapped.mlir @@ -184,19 +185,64 @@ module attributes {} { // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { // DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// DATAFLOW-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data -// DATAFLOW-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input0"} : !neura.data -// DATAFLOW-NEXT: %4 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input1"} : !neura.data -// DATAFLOW-NEXT: %5 = "neura.mul"(%3, %4) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %6 = "neura.add"(%5) {lhs_value = "%iter_arg_init0"} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %7 = neura.extract_predicate %2 : !neura.data -> !neura.data -// DATAFLOW-NEXT: %8 = "neura.not"(%7) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %9 = neura.grant_predicate %6, %8 : !neura.data, !neura.data -> !neura.data -// 
DATAFLOW-NEXT: neura.return_value %9 : !neura.data +// DATAFLOW-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %4 = neura.phi_start %2, %3 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %5 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %6 = neura.load_indexed [%5 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %7 = neura.load_indexed [%5 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %8 = "neura.mul"(%6, %7) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.add"(%4, %8) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %9 -> %3 : !neura.data !neura.data +// DATAFLOW-NEXT: %10 = neura.extract_predicate %5 : !neura.data -> !neura.data +// DATAFLOW-NEXT: %11 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 // DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () // DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } -// DATAFLOW-NEXT: } \ No newline at end of file +// DATAFLOW-NEXT: } + +// MAPPED: module { +// MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %3 = neura.reserve {dfg_id = 1 : i32} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 42 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// 
MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}, {id = 16 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 20 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 21 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : 
(!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 15 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 17 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = "neura.data_mov"(%16) {dfg_id = 22 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 256 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 23 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} +// MAPPED-NEXT: } : i32 +// MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: return %value_outputs : i32 +// MAPPED-NEXT: } +// MAPPED-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir new file mode 100644 index 00000000..f926d548 --- /dev/null +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -0,0 +1,203 @@ +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: 
--canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-cast \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --insert-data-mov \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + +module { + func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): + %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { + ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): + %0 = affine.for %arg9 = 0 to 32 iter_args(%arg10 = %arg8) -> (i32) { + %1 = affine.load %arg6[%arg9] : memref + %2 = affine.load %arg7[%arg9] : memref + %3 = arith.muli %1, %2 : i32 + %4 = arith.addi %arg10, %3 : i32 + affine.yield %4 : i32 + } + neura.yield results(%0 : i32) + } : i32 + "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () + }) : (memref, memref, i32) -> i32 + return %value_outputs : i32 + } +} + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// NEURA-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// NEURA-NEXT: %2 = "neura.constant"() <{value = 32 : index}> : () -> index +// NEURA-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// NEURA-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64 +// NEURA-NEXT: neura.br %4, %arg8 : i64, i32 to ^bb1 +// NEURA-NEXT: ^bb1(%5: i64, %6: i32): // 2 preds: ^bb0, ^bb2 +// NEURA-NEXT: %7 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index +// NEURA-NEXT: %8 = "neura.icmp"(%7, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// NEURA-NEXT: neura.cond_br %8 : i1 then to ^bb2 else to ^bb3 +// NEURA-NEXT: ^bb2: // pred: ^bb1 +// NEURA-NEXT: %9 = neura.load_indexed %arg6[%7 : index] memref : i32 +// NEURA-NEXT: %10 = neura.load_indexed %arg7[%7 : index] memref : i32 +// NEURA-NEXT: %11 = "neura.mul"(%9, %10) : (i32, i32) -> i32 +// NEURA-NEXT: %12 = "neura.add"(%6, %11) : 
(i32, i32) -> i32 +// NEURA-NEXT: %13 = "neura.add"(%7, %1) : (index, index) -> index +// NEURA-NEXT: %14 = "neura.cast"(%13) <{cast_type = "index_to_int"}> : (index) -> i64 +// NEURA-NEXT: neura.br %14, %12 : i64, i32 to ^bb1 +// NEURA-NEXT: ^bb3: // pred: ^bb1 +// NEURA-NEXT: neura.yield results(%6 : i32) +// NEURA-NEXT: } : i32 +// NEURA-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: return %value_outputs : i32 +// NEURA-NEXT: } +// NEURA-NEXT: } + + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data +// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %14 : !neura.data +// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT:} + + +// MAPPED: module { +// MAPPED-NEXT: 
func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// MAPPED-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> {dfg_id = 0 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 1 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %3 = neura.reserve {dfg_id = 2 : i32} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%1) {dfg_id = 5 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 7 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %6 = neura.reserve {dfg_id = 3 : i32} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%2) {dfg_id = 6 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.phi_start %7, %6 {dfg_id = 8 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%8) {dfg_id = 12 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {dfg_id = 13 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = 32 : index} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 11 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 1 : i32}, {id = 33 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 16 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : 
i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = neura.grant_predicate %11, %12 {dfg_id = 19 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 10 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%10) {dfg_id = 15 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 160 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = neura.grant_predicate %14, %15 {dfg_id = 18 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 64 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = "neura.not"(%17) {dfg_id = 17 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%5) {dfg_id = 9 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.data_mov"(%18) {dfg_id = 20 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 2 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = neura.grant_predicate %19, %20 {dfg_id = 25 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%21) {dfg_id = 29 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 12 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %22 : !neura.data {dfg_id = 33 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: %23 = "neura.data_mov"(%13) 
{dfg_id = 24 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %24 = neura.load_indexed [%23 : !neura.data] {dfg_id = 28 : i32, lhs_value = "%input0", mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %25 = "neura.data_mov"(%13) {dfg_id = 23 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %26 = neura.load_indexed [%25 : !neura.data] {dfg_id = 27 : i32, lhs_value = "%input1", mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %27 = "neura.data_mov"(%24) {dfg_id = 32 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %28 = "neura.data_mov"(%26) {dfg_id = 31 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %29 = "neura.mul"(%27, %28) {dfg_id = 34 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %30 = "neura.data_mov"(%16) {dfg_id = 21 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 160 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %31 = "neura.data_mov"(%29) {dfg_id = 35 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %32 = "neura.add"(%30, %31) {dfg_id = 36 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %33 = "neura.data_mov"(%13) {dfg_id = 22 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %34 = "neura.add"(%33) {dfg_id = 26 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = 1 : index} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %34 -> %6 {dfg_id = 30 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, 
resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: neura.ctrl_mov %32 -> %3 {dfg_id = 37 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: neura.yield {dfg_id = 4 : i32} +// MAPPED-NEXT: } : i32 +// MAPPED-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: return %value_outputs : i32 +// MAPPED-NEXT: } +// MAPPED-NEXT: } + + + diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir new file mode 100644 index 00000000..ebede17a --- /dev/null +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -0,0 +1,286 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: -o %t.taskflow.mlir +// RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: -o %t.canonicalized.mlir +// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: 
--leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --insert-data-mov \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + +module attributes {} { + func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + affine.for %arg2 = 0 to 32 { + %0 = affine.load %arg0[%arg2] : memref + %1 = arith.cmpi sgt, %0, %c0_i32 : i32 + scf.if %1 { + %2 = affine.load %arg0[%arg2] : memref + %3 = affine.load %arg1[%arg2] : memref + %4 = arith.addi %3, %2 : i32 + affine.store %4, %arg1[%arg2] : memref + } else { + %2 = affine.load %arg1[%arg2] : memref + affine.store %2, %arg1[%arg2] : memref + } + } + return + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// TASKFLOW-NEXT: affine.for %arg5 = 0 to 32 { +// TASKFLOW-NEXT: %0 = affine.load %arg2[%arg5] : memref +// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg4 : i32 +// TASKFLOW-NEXT: scf.if %1 { +// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg5] : memref +// TASKFLOW-NEXT: %3 = affine.load %arg3[%arg5] : memref +// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg3[%arg5] : memref +// TASKFLOW-NEXT: } else { +// TASKFLOW-NEXT: %2 = affine.load %arg3[%arg5] : memref +// TASKFLOW-NEXT: affine.store %2, %arg3[%arg5] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// TASKFLOW-NEXT: }) : (memref, memref, i32) -> memref +// TASKFLOW-NEXT: return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// CANONICALIZE: module { +// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 +// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg5: index): +// CANONICALIZE-NEXT: %1 = memref.load %arg2[%arg5] : memref +// CANONICALIZE-NEXT: %2 = arith.cmpi sgt, %1, %arg4 : i32 +// CANONICALIZE-NEXT: scf.if %2 { +// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg5] : memref +// CANONICALIZE-NEXT: %4 = memref.load %arg3[%arg5] : memref +// CANONICALIZE-NEXT: %5 = arith.addi %4, %3 : i32 +// CANONICALIZE-NEXT: memref.store %5, %arg3[%arg5] : memref +// CANONICALIZE-NEXT: } else { +// CANONICALIZE-NEXT: %3 = memref.load %arg3[%arg5] : memref +// CANONICALIZE-NEXT: memref.store %3, %arg3[%arg5] : memref +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: taskflow.hyperblock.yield +// CANONICALIZE-NEXT: }) : (index) -> () +// CANONICALIZE-NEXT: 
"taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> memref +// CANONICALIZE-NEXT: return +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: } + +// KERNEL: module { +// KERNEL-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 +// KERNEL-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) { +// KERNEL-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// KERNEL-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %2 = memref.load %arg5[%1] : memref +// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg6 : i32 +// KERNEL-NEXT: scf.if %3 { +// KERNEL-NEXT: %4 = memref.load %arg5[%1] : memref +// KERNEL-NEXT: %5 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: %6 = arith.addi %5, %4 : i32 +// KERNEL-NEXT: memref.store %6, %arg7[%1] : memref +// KERNEL-NEXT: } else { +// KERNEL-NEXT: %4 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: memref.store %4, %arg7[%1] : memref +// KERNEL-NEXT: } +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// KERNEL-NEXT: }) : (memref, memref, i32) -> memref +// KERNEL-NEXT: return +// KERNEL-NEXT: } +// KERNEL-NEXT: } + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// NEURA-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %2 = neura.load_indexed %arg5[%1 : index] memref : i32 +// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg6) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// NEURA-NEXT: neura.cond_br %3 : i1 then to ^bb1 else to ^bb2 +// NEURA-NEXT: ^bb1: // pred: ^bb0 +// NEURA-NEXT: %4 = neura.load_indexed %arg5[%1 : index] memref : i32 +// NEURA-NEXT: %5 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %6 = "neura.add"(%5, %4) : (i32, i32) -> i32 +// NEURA-NEXT: neura.store_indexed %6 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.br to ^bb3 +// NEURA-NEXT: ^bb2: // pred: ^bb0 +// NEURA-NEXT: %7 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %7 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.br to ^bb3 +// 
NEURA-NEXT: ^bb3: // 2 preds: ^bb1, ^bb2 +// NEURA-NEXT: neura.yield +// NEURA-NEXT: } +// NEURA-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> memref +// NEURA-NEXT: return +// NEURA-NEXT: } +// NEURA-NEXT: } + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// DATAFLOW-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %2 = neura.load_indexed [%1 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %3 = "neura.icmp"(%2) <{cmpType = "sgt"}> {rhs_value = "%input1"} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = neura.grant_predicate %1, %3 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %5 = "neura.not"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %6 = neura.grant_predicate %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.load_indexed [%6 : !neura.data] {lhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: neura.store_indexed %7 to [%6 : !neura.data] {rhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: %8 = neura.load_indexed [%4 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %9 = neura.load_indexed [%4 : !neura.data] {lhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: %10 = "neura.add"(%9, %8) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.store_indexed %10 to [%4 : !neura.data] {rhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: neura.yield {yield_type = "void"} +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> memref +// DATAFLOW-NEXT: return +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } + +// MAPPED: module { +// MAPPED-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// 
MAPPED-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// MAPPED-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 0 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %2 = "neura.data_mov"(%1) {dfg_id = 2 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {dfg_id = 5 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%3) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = "neura.icmp"(%4) <{cmpType = "sgt"}> {dfg_id = 7 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = "%input1"} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %6 = "neura.data_mov"(%1) {dfg_id = 3 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}, {id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%5) {dfg_id = 9 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.grant_predicate %6, %7 {dfg_id = 11 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%5) {dfg_id = 8 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = "neura.not"(%9) {dfg_id = 10 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%1) {dfg_id = 4 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}, {id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 
3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 12 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = neura.grant_predicate %11, %12 {dfg_id = 16 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%13) {dfg_id = 20 : i32, mapping_locs = [{id = 129 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = neura.load_indexed [%14 : !neura.data] {dfg_id = 23 : i32, lhs_value = "%input2", mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %16 = "neura.data_mov"(%15) {dfg_id = 25 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%13) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.store_indexed %16 to [%17 : !neura.data] {dfg_id = 27 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}], rhs_value = "%input2"} : !neura.data +// MAPPED-NEXT: %18 = "neura.data_mov"(%8) {dfg_id = 15 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 3 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %19 = neura.load_indexed [%18 : !neura.data] {dfg_id = 18 : i32, lhs_value = "%input0", mapping_locs = [{id = 2 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %20 = "neura.data_mov"(%8) {dfg_id = 14 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = neura.load_indexed [%20 : !neura.data] {dfg_id = 17 : i32, lhs_value = "%input2", mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%21) {dfg_id = 21 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 96 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = "neura.data_mov"(%19) {dfg_id = 22 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 1 
: i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %24 = "neura.add"(%22, %23) {dfg_id = 24 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %25 = "neura.data_mov"(%24) {dfg_id = 26 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %26 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 18 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}, {id = 224 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.store_indexed %25 to [%26 : !neura.data] {dfg_id = 28 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}], rhs_value = "%input2"} : !neura.data +// MAPPED-NEXT: neura.yield {dfg_id = 1 : i32, yield_type = "void"} +// MAPPED-NEXT: } +// MAPPED-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> memref +// MAPPED-NEXT: return +// MAPPED-NEXT: } +// MAPPED-NEXT: } + + From b86894d0a16f6a479d3139a9016abe7953c5a001 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Thu, 22 Jan 2026 15:45:11 +0800 Subject: [PATCH 16/25] change the definition of taskflow.hyperblock.yield --- .../Transforms/WrapLoopInKernelPass.cpp | 142 ++++++++++++++++++ .../kernel_with_yield/kernel_with_yield.mlir | 38 +++++ 2 files changed, 180 insertions(+) create mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp create mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp new file mode 100644 index 00000000..ac664382 --- /dev/null +++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp @@ -0,0 +1,142 @@ +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/TypeID.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/STLExtras.h" +#include + +using namespace mlir; + +namespace { + +static bool isInnermostLoop(affine::AffineForOp for_op) { + bool has_nested_loops = false; + for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; }); + return !has_nested_loops; +} + +// Wraps an innermost affine for loop in a neura.kernel operation. +static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op, + OpBuilder &builder, + unsigned &kernel_id) { + Location loc = for_op.getLoc(); + + // Collects values that need to be captured by the kernel. 
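+  // getUsedValuesDefinedAbove gathers every value that is defined
+  // outside the loop's body region but referenced inside it (e.g.
+  // memrefs and induction variables of enclosing loops); these become
+  // the kernel's explicit inputs.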
+  llvm::SetVector captured_values;
+  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
+
+  // Checks if the loop has output values.
+  bool has_outputs = !for_op.getResults().empty();
+
+  // Creates the neura.kernel operation.
+  builder.setInsertionPoint(for_op);
+  SmallVector inputs(captured_values.begin(), captured_values.end());
+  SmallVector input_types;
+  for (Value val : inputs) {
+    input_types.push_back(val.getType());
+  }
+
+  neura::KernelOp kernel_op = builder.create(
+      loc, /*output_types=*/for_op->getResultTypes(),
+      /*inputs=*/inputs);
+
+  // Sets kernel name.
+  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
+  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
+
+  // Creates the kernel body block with arguments for captured values.
+  Block *kernel_body = new Block();
+  kernel_op.getBody().push_back(kernel_body);
+
+  // Replaces uses of the original loop's results with kernel results.
+  if (has_outputs) {
+    for (auto [orig_result, kernel_result] :
+         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
+      orig_result.replaceAllUsesWith(kernel_result);
+    }
+  }
+
+  // Moves the loop directly into the kernel body.
+  builder.setInsertionPointToStart(kernel_body);
+  for_op->moveBefore(kernel_body, kernel_body->end());
+
+  builder.setInsertionPointToEnd(kernel_body);
+  // Adds yield operation with proper operands.
+  if (has_outputs) {
+    // If the loop has outputs, yield the loop results.
+    SmallVector yield_operands(for_op.getResults());
+    builder.create(loc, ValueRange{}, yield_operands);
+  } else {
+    // If the loop has no outputs, create an empty yield.
+    builder.create(loc);
+  }
+
+  return success();
+}
+
+struct WrapLoopInKernelPass
+    : public PassWrapper> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
+
+  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
+  StringRef getDescription() const override {
+    return "Wraps loops in Neura kernel operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+
+    // Skips if function already has kernels.
+    bool has_kernels = false;
+    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
+    if (has_kernels) {
+      return;
+    }
+
+    // Skips main function.
+    if (func_op.getName() == "main") {
+      return;
+    }
+
+    // Collects all innermost affine for loops in the function.
+    // TODO: Support more kernel wrapping strategies.
+    SmallVector innermost_loops;
+    func_op.walk([&](affine::AffineForOp for_op) {
+      if (isInnermostLoop(for_op)) {
+        innermost_loops.push_back(for_op);
+      }
+    });
+
+    if (innermost_loops.empty()) {
+      return;
+    }
+
+    // Wraps each innermost affine for loop in a neura.kernel operation.
+    // TODO: Support more kernel wrapping strategies.
+ OpBuilder builder(func_op->getContext()); + unsigned kernel_id = 0; + for (affine::AffineForOp loop : innermost_loops) { + if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace + +std::unique_ptr mlir::neura::createWrapLoopInKernelPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir new file mode 100644 index 00000000..ad24eac4 --- /dev/null +++ b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir @@ -0,0 +1,38 @@ +// Wraps the innermost loop within neura.kernel operation. +// RUN: mlir-neura-opt %s \ +// RUN: --wrap-loop-in-kernel \ +// RUN: -o %t-wrapped.mlir +// RUN: FileCheck %s --input-file=%t-wrapped.mlir + +module attributes {} { + func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { + %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { + %2 = affine.load %arg0[%arg1, %arg3] : memref + %3 = arith.addi %arg4, %2 : i32 + affine.yield %3 : i32 + } + affine.yield %1 : i32 + } + return %0 : i32 + } +} + + // CHECK: module { + // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 + // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { + // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref, index) attributes {kernel_name = "kernel_0"} { + // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { + // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref + // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32 + // CHECK-NEXT: affine.yield %4 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: neura.yield %2 : i32 + // CHECK-NEXT: } : i32 + // CHECK-NEXT: affine.yield %1 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: return %0 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: } From 10b1076baa3c4490d8821c41fdcde2b6b52ea488 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Mon, 26 Jan 2026 11:33:01 +0800 Subject: [PATCH 17/25] [clean] remove redundant code --- lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 14257c8c..258a4be5 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -86,7 +86,6 @@ void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder, return; } SmallVector live_out_arg_values; - SmallVector live_out_non_arg_values; // Step 1: Collects all live-out values first. 
   for (Operation &op : *entry_block) {

From 2ce803119ea280a768e2589490208eb811a2dcf6 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Mon, 26 Jan 2026 11:45:05 +0800
Subject: [PATCH 18/25] [clean] remove redundant files

---
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ------------------
 .../kernel_with_yield/kernel_with_yield.mlir  |  38 -----
 2 files changed, 180 deletions(-)
 delete mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir

diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
deleted file mode 100644
index ac664382..00000000
--- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "NeuraDialect/NeuraDialect.h"
-#include "NeuraDialect/NeuraOps.h"
-#include "NeuraDialect/NeuraPasses.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Types.h"
-#include "mlir/IR/Value.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/TypeID.h"
-#include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/STLExtras.h"
-#include <memory>
-
-using namespace mlir;
-
-namespace {
-
-static bool isInnermostLoop(affine::AffineForOp for_op) {
-  bool has_nested_loops = false;
-  for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; });
-  return !has_nested_loops;
-}
-
-// Wraps an innermost affine for loop in a neura.kernel operation.
-static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
-                                               OpBuilder &builder,
-                                               unsigned &kernel_id) {
-  Location loc = for_op.getLoc();
-
-  // Collects values that need to be captured by the kernel.
-  llvm::SetVector<Value> captured_values;
-  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
-
-  // Checks if the loop has output values.
-  bool has_outputs = !for_op.getResults().empty();
-
-  // Creates the neura.kernel operation.
-  builder.setInsertionPoint(for_op);
-  SmallVector<Value> inputs(captured_values.begin(), captured_values.end());
-  SmallVector<Type> input_types;
-  for (Value val : inputs) {
-    input_types.push_back(val.getType());
-  }
-
-  neura::KernelOp kernel_op = builder.create<neura::KernelOp>(
-      loc, /*output_types=*/for_op->getResultTypes(),
-      /*inputs=*/inputs);
-
-  // Sets kernel name.
-  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
-  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
-
-  // Creates the kernel body block with arguments for captured values.
-  Block *kernel_body = new Block();
-  kernel_op.getBody().push_back(kernel_body);
-
-  // Replaces uses of the original loop's results with kernel results.
-  if (has_outputs) {
-    for (auto [orig_result, kernel_result] :
-         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
-      orig_result.replaceAllUsesWith(kernel_result);
-    }
-  }
-
-  // Moves the loop directly into the kernel body.
-  builder.setInsertionPointToStart(kernel_body);
-  for_op->moveBefore(kernel_body, kernel_body->end());
-
-  builder.setInsertionPointToEnd(kernel_body);
-  // Adds yield operation with proper operands.
-  if (has_outputs) {
-    // If the loop has outputs, yield the loop results.
-    SmallVector<Value> yield_operands(for_op.getResults());
-    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
-  } else {
-    // If the loop has no outputs, create an empty yield.
-    builder.create<neura::YieldOp>(loc);
-  }
-
-  return success();
-}
-
-struct WrapLoopInKernelPass
-    : public PassWrapper<WrapLoopInKernelPass, OperationPass<func::FuncOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
-
-  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
-  StringRef getDescription() const override {
-    return "Wraps loops in Neura kernel operations.";
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<neura::NeuraDialect, affine::AffineDialect,
-                    func::FuncDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp func_op = getOperation();
-
-    // Skips if the function already has kernels.
-    bool has_kernels = false;
-    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
-    if (has_kernels) {
-      return;
-    }
-
-    // Skips main function.
-    if (func_op.getName() == "main") {
-      return;
-    }
-
-    // Collects all innermost affine for loops in the function.
-    // TODO: Support more kernel wrapping strategies.
-    SmallVector<affine::AffineForOp> innermost_loops;
-    func_op.walk([&](affine::AffineForOp for_op) {
-      if (isInnermostLoop(for_op)) {
-        innermost_loops.push_back(for_op);
-      }
-    });
-
-    if (innermost_loops.empty()) {
-      return;
-    }
-
-    // Wraps each innermost affine for loop in a neura.kernel operation.
-    // TODO: Support more kernel wrapping strategies.
-    OpBuilder builder(func_op->getContext());
-    unsigned kernel_id = 0;
-    for (affine::AffineForOp loop : innermost_loops) {
-      if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) {
-        signalPassFailure();
-        return;
-      }
-    }
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::neura::createWrapLoopInKernelPass() {
-  return std::make_unique<WrapLoopInKernelPass>();
-}
\ No newline at end of file
diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
deleted file mode 100644
index ad24eac4..00000000
--- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
+++ /dev/null
@@ -1,38 +0,0 @@
-// Wraps the innermost loop within neura.kernel operation.
-// RUN: mlir-neura-opt %s \
-// RUN:   --wrap-loop-in-kernel \
-// RUN:   -o %t-wrapped.mlir
-// RUN: FileCheck %s --input-file=%t-wrapped.mlir
-
-module attributes {} {
-  func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) {
-      %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) {
-        %2 = affine.load %arg0[%arg1, %arg3] : memref<?x128xi32>
-        %3 = arith.addi %arg4, %2 : i32
-        affine.yield %3 : i32
-      }
-      affine.yield %1 : i32
-    }
-    return %0 : i32
-  }
-}
-
-  // CHECK: module {
-  // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
-  // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
-  // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) {
-  // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref<?x128xi32>, index) attributes {kernel_name = "kernel_0"} {
-  // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) {
-  // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref<?x128xi32>
-  // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32
-  // CHECK-NEXT: affine.yield %4 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: neura.yield %2 : i32
-  // CHECK-NEXT: } : i32
-  // CHECK-NEXT: affine.yield %1 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: return %0 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: }

From d8e7c0fb30b8bbbeab9687ec2a4844a072404aca Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Tue, 27 Jan 2026 11:45:46 +0800
Subject: [PATCH 19/25] sync with main

---
 include/NeuraDialect/NeuraPasses.td        | 8 --------
 lib/NeuraDialect/Transforms/CMakeLists.txt | 1 -
 test/neura/fusion/test.mlir                | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index fc6cec1e..123bf1c8 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -167,14 +167,6 @@ def InitPattern : Pass<"init-pattern", "ModuleOp"> {
   let constructor = "neura::createInitPatternPass()";
 }
 
-def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
-  let summary = "Wrap loops in neura.kernel operations";
-  let description = [{
-    This pass wraps loops in neura.kernel operations to encapsulate loop bodies.
-  }];
-  let constructor = "neura::createWrapLoopInKernelPass()";
-}
-
 def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
   let summary = "Merge and optimize hardware units for pattern execution";
   let description = [{
diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt
index da7056fb..010fc3c7 100644
--- a/lib/NeuraDialect/Transforms/CMakeLists.txt
+++ b/lib/NeuraDialect/Transforms/CMakeLists.txt
@@ -19,7 +19,6 @@ add_mlir_library(
   RemovePredicatedTypePass.cpp
   HardwareMergePass.cpp
   GraphMining/HardwareTemplate.cpp
-  WrapLoopInKernelPass.cpp
 
   DEPENDS
   MLIRNeuraTransformsIncGen
diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir
index 0e6a3dce..63881151 100644
--- a/test/neura/fusion/test.mlir
+++ b/test/neura/fusion/test.mlir
@@ -117,7 +117,7 @@
 // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \
 // RUN:   --assign-accelerator \
 // RUN:   --lower-llvm-to-neura \
-// RUN:   --promote-func-arg-to-const \
+// RUN:   --promote-input-arg-to-const \
 // RUN:   --canonicalize-return \
 // RUN:   --canonicalize-cast \
 // RUN:   --canonicalize-live-in \

From b454a8d03b5eb6db76e7c9afb226733c819b1359 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:20:54 +0800
Subject: [PATCH 20/25] sync with main

---
 .../Transforms/MapToAcceleratorPass.cpp | 75 +------------------
 1 file changed, 4 insertions(+), 71 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index cfe14543..9b5ee423 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -178,9 +178,8 @@ struct MapToAcceleratorPass
 
   // Generic mapping function works for both function and kernel mapping.
   template <typename OpType>
-  bool mapRegion(OpType op, Region &region, Architecture &architecture,
+  bool mapRegion(OpType op, Region &region, const Architecture &architecture,
                  Mapping *mapping_strategy, bool is_spatial_only,
-                 int max_ctrl_mem_items,
                  const std::string &resolved_mapping_mode,
                  const std::string &resolved_mapping_strategy) {
     // Checks steering mode compatibility with architecture.
@@ -231,8 +230,7 @@ struct MapToAcceleratorPass
     int res_mii = calculateResMii(region, architecture);
     const int possible_min_ii = std::max(rec_mii, res_mii);
 
-    const int max_ii =
-        max_ctrl_mem_items; // Use YAML config (default 20 if not specified)
+    const int max_ii = architecture.getMaxCtrlMemItems();
 
     std::vector<Operation *> topologically_sorted_ops =
         getTopologicallySortedOps(region);
@@ -359,70 +357,6 @@ struct MapToAcceleratorPass
 
     const Architecture &architecture = mlir::neura::getArchitecture();
 
-    std::string architecture_spec_file = mlir::neura::getArchitectureSpecFile();
-    int multi_cgra_rows = kMultiCgraDefaultRows;
-    int multi_cgra_columns = kMultiCgraDefaultColumns;
-    int per_cgra_rows = kPerCgraDefaultRows;
-    int per_cgra_columns = kPerCgraDefaultColumns;
-    int max_ctrl_mem_items = kDefaultMaxCtrlMemItems;
-    mlir::neura::TileDefaults tile_defaults;
-    std::vector<mlir::neura::TileOverride> tile_overrides;
-    mlir::neura::LinkDefaults link_defaults;
-    std::vector<mlir::neura::LinkOverride> link_overrides;
-    mlir::neura::BaseTopology multi_cgra_base_topology =
-        mlir::neura::BaseTopology::MESH;
-    mlir::neura::BaseTopology per_cgra_base_topology =
-        mlir::neura::BaseTopology::MESH;
-
-    if (!architecture_spec_file.empty()) {
-
-      // Use LLVM YAML parser to validate the YAML syntax (no mapping yet)
-      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> buffer_or_err =
-          llvm::MemoryBuffer::getFile(architecture_spec_file);
-      if (!buffer_or_err) {
-        llvm::errs() << "[MapToAcceleratorPass] Failed to open architecture "
-                        "specification file: "
-                     << architecture_spec_file << "\n";
-        return;
-      }
-
-      llvm::SourceMgr sm;
-      sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc());
-      llvm::yaml::Stream yaml_stream(
-          sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm);
-
-      bool parse_failed = false;
-      llvm::yaml::Document &yaml_doc = *yaml_stream.begin();
-      (void)yaml_doc; // ensure document is created
-      if (yaml_stream.failed()) {
-        parse_failed = true;
-      }
-
-      if (parse_failed) {
-        llvm::errs() << "[MapToAcceleratorPass] YAML parse error in: "
-                     << architecture_spec_file << "\n";
-        return;
-      }
-
-      // Parses YAML configuration.
-      if (!parseArchitectureYaml(
-              yaml_doc, multi_cgra_rows, multi_cgra_columns,
-              multi_cgra_base_topology, per_cgra_rows, per_cgra_columns,
-              per_cgra_base_topology, max_ctrl_mem_items, tile_defaults,
-              tile_overrides, link_defaults, link_overrides)) {
-        return;
-      }
-    } else {
-      llvm::errs() << "[MapToAcceleratorPass] No architecture specification "
-                      "file provided.\n";
-    }
-
-    // Creates architecture.
-    Architecture architecture(
-        multi_cgra_rows, multi_cgra_columns, multi_cgra_base_topology,
-        per_cgra_rows, per_cgra_columns, per_cgra_base_topology, tile_defaults,
-        tile_overrides, link_defaults, link_overrides);
-
     // Maps kernels.
     module.walk([&](neura::KernelOp kernel_op) {
       auto accel_attr =
@@ -434,8 +368,7 @@ struct MapToAcceleratorPass
         Region &kernel_region = kernel_op.getBody();
         if (!mapRegion(kernel_op, kernel_region, architecture,
                        mapping_strategy.get(), is_spatial_only,
-                       max_ctrl_mem_items, resolved_mapping_mode,
-                       resolved_mapping_strategy)) {
+                       resolved_mapping_mode, resolved_mapping_strategy)) {
           llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n";
           signalPassFailure();
         }
@@ -452,7 +385,7 @@ struct MapToAcceleratorPass
       Region &func_region = func_op.getBody();
       if (!mapRegion(func_op, func_region, architecture, mapping_strategy.get(),
-                     is_spatial_only, max_ctrl_mem_items, resolved_mapping_mode,
+                     is_spatial_only, resolved_mapping_mode,
                      resolved_mapping_strategy)) {
         llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n";
         signalPassFailure();

From 75cbdfe01ae39259cfd3c135ff2899b01abe9201 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:35:13 +0800
Subject: [PATCH 21/25] recover WrapLoopInKernel pass

---
 include/NeuraDialect/NeuraPasses.h            |   1 +
 include/NeuraDialect/NeuraPasses.td           |   8 +
 lib/NeuraDialect/Transforms/CMakeLists.txt    |   1 +
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ++++++++++++++++++
 test/compiler_e2e/visualize/test.mlir         |   2 +-
 test/visualize/test2.mlir                     |   2 +-
 6 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp

diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 75ddbd24..803cc589 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -37,6 +37,7 @@
 std::unique_ptr<Pass> createRemovePredicatedTypePass();
 
 // Hardware specific optimization passes
 std::unique_ptr<Pass> createFuseLoopControlPass();
 std::unique_ptr<Pass> createFusePatternPass();
+std::unique_ptr<Pass> createWrapLoopInKernelPass();
 
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index 123bf1c8..fc6cec1e 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -167,6 +167,14 @@ def InitPattern : Pass<"init-pattern", "ModuleOp"> {
   let constructor = "neura::createInitPatternPass()";
 }
 
+def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
+  let summary = "Wrap loops in neura.kernel operations";
+  let description = [{
+    Wraps each innermost affine.for loop in a neura.kernel operation so that
+    the loop body can be mapped to an accelerator. Functions named "main" and
+    functions that already contain kernels are skipped.
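+
+    Example (illustrative; abridged from the kernel_with_yield test, with
+    placeholder SSA names):
+
+      // Before: an innermost affine loop carrying a reduction.
+      %sum = affine.for %j = 0 to 128 iter_args(%acc = %init) -> (i32) { ... }
+
+      // After: the loop is moved into a neura.kernel region and its result
+      // is forwarded through neura.yield.
+      %sum = neura.kernel ins(%A, %i : memref<?x128xi32>, index)
+          attributes {kernel_name = "kernel_0"} {
+        %0 = affine.for %j = 0 to 128 iter_args(%acc = %init) -> (i32) { ... }
+        neura.yield %0 : i32
+      } : i32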
+  }];
+  let constructor = "neura::createWrapLoopInKernelPass()";
+}
+
 def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
   let summary = "Merge and optimize hardware units for pattern execution";
   let description = [{
diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt
index 010fc3c7..01df4219 100644
--- a/lib/NeuraDialect/Transforms/CMakeLists.txt
+++ b/lib/NeuraDialect/Transforms/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_library(
   TransformToSteerControlPass.cpp
   RemovePredicatedTypePass.cpp
   HardwareMergePass.cpp
+  WrapLoopInKernelPass.cpp
   GraphMining/HardwareTemplate.cpp
 
   DEPENDS
diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
new file mode 100644
index 00000000..ac664382
--- /dev/null
+++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
@@ -0,0 +1,142 @@
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "NeuraDialect/NeuraPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Types.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/TypeID.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include <memory>
+
+using namespace mlir;
+
+namespace {
+
+static bool isInnermostLoop(affine::AffineForOp for_op) {
+  bool has_nested_loops = false;
+  for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; });
+  return !has_nested_loops;
+}
+
+// Wraps an innermost affine for loop in a neura.kernel operation.
+static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
+                                               OpBuilder &builder,
+                                               unsigned &kernel_id) {
+  Location loc = for_op.getLoc();
+
+  // Collects values that need to be captured by the kernel.
+  llvm::SetVector<Value> captured_values;
+  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
+
+  // Checks if the loop has output values.
+  bool has_outputs = !for_op.getResults().empty();
+
+  // Creates the neura.kernel operation.
+  builder.setInsertionPoint(for_op);
+  SmallVector<Value> inputs(captured_values.begin(), captured_values.end());
+  SmallVector<Type> input_types;
+  for (Value val : inputs) {
+    input_types.push_back(val.getType());
+  }
+
+  neura::KernelOp kernel_op = builder.create<neura::KernelOp>(
+      loc, /*output_types=*/for_op->getResultTypes(),
+      /*inputs=*/inputs);
+
+  // Sets kernel name.
+  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
+  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
+
+  // Creates the kernel body block with arguments for captured values.
+  Block *kernel_body = new Block();
+  kernel_op.getBody().push_back(kernel_body);
+
+  // Replaces uses of the original loop's results with kernel results.
+  if (has_outputs) {
+    for (auto [orig_result, kernel_result] :
+         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
+      orig_result.replaceAllUsesWith(kernel_result);
+    }
+  }
+
+  // Moves the loop directly into the kernel body.
+  builder.setInsertionPointToStart(kernel_body);
+  for_op->moveBefore(kernel_body, kernel_body->end());
+
+  builder.setInsertionPointToEnd(kernel_body);
+  // Adds yield operation with proper operands.
+  if (has_outputs) {
+    // If the loop has outputs, yield the loop results.
+    SmallVector<Value> yield_operands(for_op.getResults());
+    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
+  } else {
+    // If the loop has no outputs, create an empty yield.
+    builder.create<neura::YieldOp>(loc);
+  }
+
+  return success();
+}
+
+struct WrapLoopInKernelPass
+    : public PassWrapper<WrapLoopInKernelPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
+
+  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
+  StringRef getDescription() const override {
+    return "Wraps loops in Neura kernel operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<neura::NeuraDialect, affine::AffineDialect,
+                    func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+
+    // Skips if the function already has kernels.
+    bool has_kernels = false;
+    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
+    if (has_kernels) {
+      return;
+    }
+
+    // Skips main function.
+    if (func_op.getName() == "main") {
+      return;
+    }
+
+    // Collects all innermost affine for loops in the function.
+    // TODO: Support more kernel wrapping strategies.
+    SmallVector<affine::AffineForOp> innermost_loops;
+    func_op.walk([&](affine::AffineForOp for_op) {
+      if (isInnermostLoop(for_op)) {
+        innermost_loops.push_back(for_op);
+      }
+    });
+
+    if (innermost_loops.empty()) {
+      return;
+    }
+
+    // Wraps each innermost affine for loop in a neura.kernel operation.
+    // TODO: Support more kernel wrapping strategies.
+    OpBuilder builder(func_op->getContext());
+    unsigned kernel_id = 0;
+    for (affine::AffineForOp loop : innermost_loops) {
+      if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) {
+        signalPassFailure();
+        return;
+      }
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::neura::createWrapLoopInKernelPass() {
+  return std::make_unique<WrapLoopInKernelPass>();
+}
\ No newline at end of file
diff --git a/test/compiler_e2e/visualize/test.mlir b/test/compiler_e2e/visualize/test.mlir
index a07cc79f..50b1c32a 100644
--- a/test/compiler_e2e/visualize/test.mlir
+++ b/test/compiler_e2e/visualize/test.mlir
@@ -162,4 +162,4 @@ func.func @test_print_op_graph(%a: f32, %b: f32) -> f32 {
 // CHECK-GRAPH: label = "neura.fmul : (!neura.data)\n\nrhs_value: 2.000000e+00 : f32", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.data_mov : (!neura.data)\n", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.return_value : ()\n", shape = ellipse, style = filled];
-// CHECK-GRAPH: label = "neura.yield : ()\n", shape = ellipse, style = filled];
+// CHECK-GRAPH: label = "neura.yield : ()\n\noperandSegmentSizes: array<i32: 0, 0>", shape = ellipse, style = filled];
diff --git a/test/visualize/test2.mlir b/test/visualize/test2.mlir
index 7a686e52..69a3756d 100644
--- a/test/visualize/test2.mlir
+++ b/test/visualize/test2.mlir
@@ -29,4 +29,4 @@ func.func @test_print_op_graph(%a: f32, %b: f32) -> f32 {
 // CHECK-GRAPH: label = "neura.fmul : (!neura.data)\n\nrhs_value: 2.000000e+00 : f32", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.data_mov : (!neura.data)\n", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.return_value : ()\n", shape = ellipse, style = filled];
-// CHECK-GRAPH: label = "neura.yield : ()\n", shape = ellipse, style = filled];
+// CHECK-GRAPH: label = "neura.yield : ()\n\noperandSegmentSizes: array<i32: 0, 0>", shape = ellipse, style = filled];

From 2898f1557d1f778921c56d8f0a0a688b1914ee72 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:42:32 +0800
Subject: [PATCH 22/25] [fix] fix bugs in iter_args handling

---
 .../TransformCtrlToDataFlowPass.cpp         | 23 ++++----
 test/multi-cgra/kernel_mapping/fir/fir.mlir | 52 +++++++++----------
 2 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 258a4be5..5fedbec3 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -148,7 +148,7 @@
 //---------------------------------------------------------------------------
 void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block,
                           OpBuilder &builder,
-                          SmallVector<Value> &iter_arg_final_values) {
+                          SmallVector<Value> &iter_arg_phi_values) {
   llvm::errs() << "[iter_args] Handling kernel iter_args...\n";
 
   SmallVector<Operation *> iter_arg_init_ops;
@@ -213,7 +213,7 @@
       builder.create<neura::CtrlMovOp>(yield_op.getLoc(), feedback_value,
                                        reserve_op.getResult());
 
-      iter_arg_final_values.push_back(feedback_value);
+      iter_arg_phi_values.push_back(phi.getResult());
 
      init_const->removeAttr(kIterArgInitAttr);
      llvm::errs() << "[iter_args] Created iter_arg with grant_once\n";
@@ -227,7 +227,7 @@
 //---------------------------------------------------------------------------
 void handleKernelYieldTermination(
     neura::KernelOp kernel_op, Block *entry_block, OpBuilder &builder,
-    bool has_task_counter, const SmallVector<Value> &iter_arg_final_values) {
+    bool has_task_counter, const SmallVector<Value> &iter_arg_phi_values) {
   llvm::errs() << "[yield] ========================================\n";
   llvm::errs() << "[yield] Handling Yield Termination\n";
   llvm::errs() << "[yield] ========================================\n";
@@ -294,9 +294,12 @@
 
   // Gates all results with NOT (counter predicate).
   SmallVector<Value> gated_results;
-  for (Value result : yield_op.getResults()) {
+  for (size_t i = 0; i < yield_op.getResults().size(); ++i) {
+    Value result_to_gate = iter_arg_phi_values[i];
+
     auto gated = builder.create<neura::GrantPredicateOp>(
-        yield_op.getLoc(), result.getType(), result, return_gate);
+        yield_op.getLoc(), result_to_gate.getType(), result_to_gate,
+        return_gate);
     gated_results.push_back(gated.getResult());
 
     llvm::errs() << "[yield] Gated result with NOT(counter_pred)\n";
@@ -1043,12 +1046,12 @@
       return;
     }
 
-    SmallVector<Value> iter_arg_final_values;
+    SmallVector<Value> iter_arg_phi_values;
 
    // STEP 1: Handles iter_args of the neura.kernel.
    llvm::errs() << "[ctrl2data] === STEP 1: Handle iter_args ===\n";
    handleKernelIterArgs(kernel_op, entry_block, builder,
-                         iter_arg_final_values);
+                         iter_arg_phi_values);
 
    // STEP 2: Grants predicates (only if NO task counter).
    llvm::errs() << "[ctrl2data] === STEP 2: Grant predicates ===\n";
@@ -1065,12 +1068,12 @@
    } else {
      llvm::errs() << "[ctrl2data] === STEP 3: Single block (skip) ===\n";
    }
-    convertPhiToPhiStart(kernel_region, builder);
-
    // STEP 4: Handles yield termination in neura.kernel.
llvm::errs() << "[ctrl2data] === STEP 4: Handle yield ===\n"; handleKernelYieldTermination(kernel_op, entry_block, builder, - has_task_counter, iter_arg_final_values); + has_task_counter, iter_arg_phi_values); + + convertPhiToPhiStart(kernel_region, builder); kernel_op->setAttr(neura::attr::kDataflowMode, StringAttr::get(kernel_op.getContext(), diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index 46f62a2c..cc2bf924 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -196,7 +196,7 @@ module attributes {} { // DATAFLOW-NEXT: neura.ctrl_mov %9 -> %3 : !neura.data !neura.data // DATAFLOW-NEXT: %10 = neura.extract_predicate %5 : !neura.data -> !neura.data // DATAFLOW-NEXT: %11 = "neura.not"(%10) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %12 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %4, %11 : !neura.data, !neura.data -> !neura.data // DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 @@ -212,33 +212,33 @@ module attributes {} { // MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data // MAPPED-NEXT: %3 = neura.reserve {dfg_id = 1 : i32} : !neura.data -// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 42 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} 
: !neura.data, !neura.data -> !neura.data -// MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data -// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data -// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data -// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}, {id = 16 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 20 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 21 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data -// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = 
"register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data -> !neura.data -// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 15 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 17 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %21 = "neura.data_mov"(%16) {dfg_id = 22 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 256 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 23 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 24 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 0 : i32}, {id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, 
invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 14 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 15 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 17 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 13 : i32, mapping_locs = [{id = 29 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 160 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 19 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 21 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 23 : i32, mapping_locs = [{id = 16 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 
: i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 20 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 22 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 2 : i32}]} // MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} // MAPPED-NEXT: } : i32 // MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () From 565c4fd61f99b255e895652617d9db10f5228fc7 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 15:07:03 +0800 Subject: [PATCH 23/25] sync with main --- include/NeuraDialect/NeuraPasses.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 803cc589..b88bec1f 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -30,6 +30,7 @@ std::unique_ptr createCanonicalizeLiveInPass(); std::unique_ptr createPromoteInputArgToConstPass(); std::unique_ptr createTransformToSteerControlPass(); std::unique_ptr createRemovePredicatedTypePass(); +std::unique_ptr createWrapLoopInKernelPass(); // ==================================== // Optimization Passes From 616da9d3028db3fd0c85d351db54ed781c41fbcc Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 15:45:19 +0800 Subject: [PATCH 24/25] revert the github workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a7a73b93..1703ce3d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: run: | mkdir -p ${{ env.CCACHE_DIR }} git --version - git clone --depth 1 --filter=blob:none --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git + git clone --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git cd llvm-project mkdir build && cd build cmake -G Ninja ../llvm \ From 67ea96b4d948f2ae230fcf4a1ee2313c9cf3d791 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 16:48:48 +0800 Subject: [PATCH 25/25] modify the git clone cmd --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1703ce3d..a7a73b93 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: run: | mkdir -p ${{ env.CCACHE_DIR }} git --version - git clone 
--revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git + git clone --depth 1 --filter=blob:none --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git cd llvm-project mkdir build && cd build cmake -G Ninja ../llvm \