From f370da062a8aa2c32fad309c43c566fb17d39bea Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 13:31:16 +0800
Subject: [PATCH 01/25] add counter classification pass

---
 include/TaskflowDialect/TaskflowOps.td        | 18 ++--
 include/TaskflowDialect/TaskflowPasses.h      |  1 +
 include/TaskflowDialect/TaskflowPasses.td     | 14 +++
 lib/TaskflowDialect/Transforms/CMakeLists.txt |  1 +
 .../Transforms/ClassifyCountersPass.cpp       | 92 +++++++++++++++++++
 .../ConstructHyperblockFromTaskPass.cpp       |  6 +-
 6 files changed, 122 insertions(+), 10 deletions(-)
 create mode 100644 lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp

diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index 2e6159af..fd8ebe87 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -151,24 +151,25 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     Represents a loop counter that generates iteration indices.
     The hardware counter produces a predicated index value.

-    Counter behavior:
-    - Top-level counter: increments unconditionally each cycle.
-    - Nested counter: increments only when the parent counter is valid.
+    Counter classification:
+    - "root": Top-level counter with no parent (drives entire loop nest)
+    - "relay": Intermediate counter with both parent and child counters
+    - "leaf": Innermost counter with no child counters (maps to CGRA tile array)

     Example:
-    // Top-level counter
+    // Root counter
     %i = taskflow.counter {
       lower_bound = 0 : index,
       upper_bound = 16 : index,
       step = 1 : index,
-      counter_name = "i"
+      counter_type = "root"
     } : index

-    // Nested counter
+    // Leaf counter
     %j = taskflow.counter parent(%i) {
       lower_bound = 0 : index,
       upper_bound = 8 : index,
       step = 1 : index,
-      counter_name = "j"
+      counter_type = "leaf"
     } : index
   }];

@@ -176,7 +177,8 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     Optional<AnyType>:$parent_index,
     IndexAttr:$lower_bound,
     IndexAttr:$upper_bound,
-    IndexAttr:$step
+    IndexAttr:$step,
+    OptionalAttr<StrAttr>:$counter_type
   );

   let results = (outs AnyType:$counter_index);
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index 50f28d0e..c0007ce1 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -17,6 +17,7 @@ namespace taskflow {
 #include "TaskflowDialect/TaskflowPasses.h.inc"
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 std::unique_ptr<mlir::Pass> createCanonicalizeTaskPass();
+std::unique_ptr<mlir::Pass> createClassifyCountersPass();

 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 4728f138..4fc2137f 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -29,4 +29,18 @@ def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{
   }];
   let constructor = "taskflow::createCanonicalizeTaskPass()";
 }
+
+def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{
+  let summary = "Classifies counters as root/relay/leaf";
+  let description = [{
+    Analyzes the counter hierarchy within taskflow.task operations and
+    classifies each counter:
+    - root: Top-level counter with no parent
+    - relay: Intermediate counter with both parent and child counters
+    - leaf: Innermost counter with no child counters
+
+    Leaf counters are mapped to CGRA tile arrays.
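+
+    For illustration (using the taskflow.counter syntax shown above; bounds
+    elided), a triply nested loop nest would be tagged as:
+      %i = taskflow.counter {..., counter_type = "root"} : index
+      %j = taskflow.counter parent(%i) {..., counter_type = "relay"} : index
+      %k = taskflow.counter parent(%j) {..., counter_type = "leaf"} : index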
+  }];
+  let constructor = "taskflow::createClassifyCountersPass()";
+}
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index ab118c89..e44401d8 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_mlir_library(MLIRTaskflowTransforms
   ConstructHyperblockFromTaskPass.cpp
   CanonicalizeTaskPass.cpp
+  ClassifyCountersPass.cpp

   DEPENDS
   MLIRTaskflowTransformsIncGen
diff --git a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
new file mode 100644
index 00000000..354ee7d7
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
@@ -0,0 +1,92 @@
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/TypeID.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+void classifyCountersInTask(TaskflowTaskOp task_op) {
+  // Collects all counters in the task.
+  SmallVector<TaskflowCounterOp> counters;
+  task_op.walk(
+      [&](TaskflowCounterOp counter_op) { counters.push_back(counter_op); });
+
+  if (counters.empty()) {
+    return;
+  }
+
+  // Builds parent-child relationships.
+  // Maps from counter results to counter ops.
+  DenseMap<Value, TaskflowCounterOp> value_to_counter;
+  for (TaskflowCounterOp counter_op : counters) {
+    value_to_counter[counter_op.getCounterIndex()] = counter_op;
+  }
+
+  // Finds which counters have children.
+  DenseSet<TaskflowCounterOp> counters_with_children;
+  for (TaskflowCounterOp counter_op : counters) {
+    if (auto parent_idx = counter_op.getParentIndex()) {
+      if (auto parent_counter = value_to_counter.lookup(parent_idx)) {
+        counters_with_children.insert(parent_counter);
+      }
+    }
+  }
+
+  // Classifies each counter.
+  OpBuilder builder(task_op.getContext());
+  for (TaskflowCounterOp counter_op : counters) {
+    bool has_parent = (counter_op.getParentIndex() != nullptr);
+    bool has_child = counters_with_children.contains(counter_op);
+    StringRef counter_type;
+    if (!has_parent && !has_child) {
+      // Single loop: treat as leaf counter (can be mapped to the CGRA tile
+      // array).
+      counter_type = "leaf";
+    } else if (!has_parent && has_child) {
+      // Root counter: top-level loop with nested loops.
+      counter_type = "root";
+    } else if (has_parent && has_child) {
+      // Relay counter: nested loop with further nested loops.
+      counter_type = "relay";
+    } else {
+      // Leaf counter: innermost loop.
+      counter_type = "leaf";
+    }
+
+    // Sets the counter type attribute.
+    counter_op.setCounterTypeAttr(builder.getStringAttr(counter_type));
+  }
+}
+
+struct ClassifyCountersPass
+    : public PassWrapper<ClassifyCountersPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ClassifyCountersPass)
+
+  StringRef getArgument() const override { return "classify-counters"; }
+  StringRef getDescription() const override {
+    return "Classify taskflow counters as root/relay/leaf.";
+  }
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    module.walk(
+        [&](TaskflowTaskOp task_op) { classifyCountersInTask(task_op); });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::taskflow::createClassifyCountersPass() {
+  return std::make_unique<ClassifyCountersPass>();
+}
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 763e6153..5680acf7 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -134,7 +134,8 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
         loc, builder.getIndexType(), parent_counter,
         builder.getIndexAttr(loop_info->lower_bound),
         builder.getIndexAttr(loop_info->upper_bound),
-        builder.getIndexAttr(loop_info->step));
+        builder.getIndexAttr(loop_info->step),
+        /*counter_type=*/nullptr);
     counter_index = counter_op.getCounterIndex();
   } else {
     // Top-level counter.
@@ -142,7 +143,8 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
         loc, builder.getIndexType(), /*parent_index=*/nullptr,
         builder.getIndexAttr(loop_info->lower_bound),
         builder.getIndexAttr(loop_info->upper_bound),
-        builder.getIndexAttr(loop_info->step));
+        builder.getIndexAttr(loop_info->step),
+        /*counter_type=*/nullptr);
     counter_index = counter_op.getCounterIndex();
   }

From fe987b8383f87025726cb668287b88b0205dc937 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 15:45:11 +0800
Subject: [PATCH 02/25] change the definition of taskflow.hyperblock.yield

---
 include/Conversion/ConversionPasses.h         |   2 +-
 include/Conversion/ConversionPasses.td        |  13 +
 include/NeuraDialect/NeuraOps.td              |  41 +-
 include/TaskflowDialect/TaskflowOps.td        |  10 +-
 lib/Conversion/CMakeLists.txt                 |   2 +
 lib/Conversion/TaskflowToNeura/CMakeLists.txt |  19 +
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 169 +++++++
 .../Transforms/IterMergePatternPass.cpp       | 445 ++++++++++--------
 .../Transforms/WrapLoopInKernelPass.cpp       |   4 +-
 .../kernel_with_yield/kernel_with_yield.mlir  |   5 +-
 10 files changed, 497 insertions(+), 213 deletions(-)
 create mode 100644 lib/Conversion/TaskflowToNeura/CMakeLists.txt
 create mode 100644 lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp

diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 0baf43f8..14e27a03 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -22,7 +22,7 @@ std::unique_ptr<mlir::Pass> createLowerAffineToNeuraPass();

 // TaskFlow Conversion Passes.
 std::unique_ptr<mlir::Pass> createConvertAffineToTaskflowPass();
-
+std::unique_ptr<mlir::Pass> createConvertTaskflowToNeuraPass();
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index a341d9fe..e2d727d2 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -74,4 +74,17 @@ def ConvertAffineToTaskflow : Pass<"convert-affine-to-taskflow", "ModuleOp">{
   ];
 }

+def ConvertTaskflowToNeura : Pass<"convert-taskflow-to-neura", "ModuleOp">{
+  let summary = "Convert taskflow.hyperblock to neura.kernel";
+  let description = [{
+    Converts taskflow.hyperblock operations with leaf counters into neura.kernel
+    operations suitable for CGRA tile array mapping.
+  }];
+  let constructor = "mlir::createConvertTaskflowToNeuraPass()";
+  let dependentDialects = [
+    "mlir::taskflow::TaskflowDialect",
+    "mlir::neura::NeuraDialect"
+  ];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 55bc155d..7971d6c6 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -10,7 +10,11 @@ include "mlir/IR/OpBase.td"
 // Defines neura kernel related operations.
 // ----------------------------------------------------
-def Neura_KernelOp : Op<Neura_Dialect, "kernel", [SingleBlockImplicitTerminator<"YieldOp">]> {
+def Neura_KernelOp : Op<Neura_Dialect, "kernel", [AttrSizedOperandSegments]> {
   let summary = "Marks a region for CGRA execution.";
   let description = [{
     Defines a computation region that should be offloaded to CGRA.
@@ -41,6 +45,7 @@ def Neura_KernelOp : Op
   let arguments = (ins
     Variadic<AnyType>:$inputs,           // Input operands from surrounding context.
+    Variadic<AnyType>:$iter_args_init,   // Initial values for loop carried variables.
     OptionalAttr<I32Attr>:$cgra_id,      // Target CGRA ID (for multi-CGRA systems).
     OptionalAttr<StrAttr>:$kernel_name,  // Name of the kernel (for identification).
     OptionalAttr<StrAttr>:$accelerator   // Target accelerator name.
@@ -52,6 +57,7 @@ def Neura_KernelOp : Op
   let results = (outs Variadic<AnyType>:$outputs);
   let regions = (region SizedRegion<1>:$body);

-def Neura_YieldOp : Op<Neura_Dialect, "yield", [Terminator, ReturnLike, Pure]> {
+def Neura_YieldOp : Op<Neura_Dialect, "yield", [Terminator, ReturnLike, Pure, AttrSizedOperandSegments]> {
   let summary = "Yield values from a neura.kernel or neura.fused_op region.";
   let description = [{
     Returns values from a neura.kernel or neura.fused_op region to the parent operation.
@@ -72,13 +78,17 @@ def Neura_YieldOp : Op {
     } : f32
   }];

-  let arguments = (ins Variadic<AnyType>:$values);
+  let arguments = (ins
+    Variadic<AnyType>:$iter_args_next,
+    Variadic<AnyType>:$results);

   let builders = [
-    OpBuilder<(ins), [{ build($_builder, $_state, ValueRange{}); }]>
+    OpBuilder<(ins), [{ build($_builder, $_state, ValueRange{}, ValueRange{}); }]>
   ];

-  let assemblyFormat = [{($values^ `:` type($values))? attr-dict}];
+  let assemblyFormat = [{
+    ($iter_args_next^ `:` type($iter_args_next))?
+    ($results^ `:` type($results))? attr-dict}];

   let hasVerifier = 1;
 }
@@ -770,6 +780,27 @@ def Neura_LoopControlOp : Op{
 //     " `(``parent_valid` `=` $parentValid `,` `start` `=` $start `,` `end` `=` $end `,` `step` `=` $step`)` attr-dict `:` type($parentValid) `,` type($start) `,` type($end) `,` type($step) `->` type($nextindex) `,` type($valid)";
 }

+// def Neura_CounterOp : Op<Neura_Dialect, "counter">{
+//   let summary = "Hardware loop counter for CGRA execution.";
+//   let description = [{
+//     Represents a hardware loop counter unit that generates loop indices.
+//     This maps directly to a counter FU on the CGRA.

+//     The counter produces:
+//     - current index: the current loop index value.
+
+//     Example:
+//       %current_idx = neura.counter () <{
+//         start_value = 0 : i64,
+//         end_value = 100 : i64,
+//         step_value = 1 : i64
+//       }> : -> !neura.data
+//   }];
+//   let arguments = (ins

+//   );
+// }
+
 // ----------------------------------------------------
 // Defines operations for steering-control based DFG execution.
 // ----------------------------------------------------
diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index fd8ebe87..094b31e2 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -235,6 +235,7 @@ def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [
   Terminator,
   Pure,
   ReturnLike,
+  AttrSizedOperandSegments,
   ParentOneOf<["TaskflowHyperblockOp"]>
 ]>{
   let summary = "Yield operation for Taskflow hyperblock";
@@ -243,15 +244,18 @@ def TaskflowHyperblockYieldOp : TaskflowOpBase<"hyperblock.yield", [
     Terminates the hyperblock body.
   }];

-  let arguments = (ins Variadic<AnyType>:$outputs);
+  let arguments = (ins
+    Variadic<AnyType>:$iter_args_next,
+    Variadic<AnyType>:$results);

   let assemblyFormat = [{
-    (`outputs` `(` $outputs^ `:` type($outputs) `)`)?
+    (`iter_args_next` `(` $iter_args_next^ `:` type($iter_args_next) `)`)?
+    (`results` `(` $results^ `:` type($results) `)`)?
     attr-dict
   }];

   let builders = [
-    OpBuilder<(ins), [{build($_builder, $_state, ValueRange{});}]>
+    OpBuilder<(ins), [{build($_builder, $_state, ValueRange{}, ValueRange{});}]>
   ];
 }

diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index cf66d518..690dae25 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
 add_subdirectory(BuiltinToNeura)
 add_subdirectory(AffineToTaskflow)
+add_subdirectory(TaskflowToNeura)

 add_library(MLIRConversion INTERFACE)

@@ -23,5 +24,6 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRNeuraMemRefToNeuraPass
   MLIRNeuraBuiltinToNeuraPass
   MLIRAffineToTaskflowPass
+  MLIRTaskflowToNeuraPass
   ${dialect_libs}
 )
\ No newline at end of file
diff --git a/lib/Conversion/TaskflowToNeura/CMakeLists.txt b/lib/Conversion/TaskflowToNeura/CMakeLists.txt
new file mode 100644
index 00000000..7db3d92b
--- /dev/null
+++ b/lib/Conversion/TaskflowToNeura/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+add_mlir_conversion_library(MLIRTaskflowToNeuraPass
+  TaskflowToNeuraPass.cpp
+
+  DEPENDS
+  MLIRConversionIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRFuncDialect
+  MLIRLLVMDialect
+  MLIRTaskflow
+  MLIRIR
+  MLIRPass
+  MLIRTransforms
+  MLIRNeura
+  MLIRSupport
+)
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
new file mode 100644
index 00000000..460a5a15
--- /dev/null
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -0,0 +1,169 @@
+#include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
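+// Pipeline note (illustrative, not enforced by this pass): counters are
+// expected to be classified before this conversion runs, e.g.
+//   mlir-neura-opt --classify-counters --convert-taskflow-to-neura input.mlir
+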
+namespace {
+struct HyperblockToKernelPattern
+    : public OpRewritePattern<TaskflowHyperblockOp> {
+  using OpRewritePattern<TaskflowHyperblockOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TaskflowHyperblockOp hyperblock_op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = hyperblock_op.getLoc();
+
+    // Find the parent task to get access to task's block arguments.
+    auto taskOp = hyperblock_op->getParentOfType<TaskflowTaskOp>();
+    if (!taskOp)
+      return failure();
+
+    // Collect live-in values: values used in hyperblock but defined outside.
+    // These are the task's block arguments that the hyperblock body uses.
+    llvm::DenseSet<Value> liveInSet;
+    SmallVector<Value> liveInValues;
+
+    Block &hbBlock = hyperblock_op.getBody().front();
+    Block &taskBlock = taskOp.getBody().front();
+
+    // Walk hyperblock body to find uses of task block arguments.
+    hyperblock_op.walk([&](Operation *op) {
+      for (Value operand : op->getOperands()) {
+        // Check if operand is a task block argument.
+        if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
+          if (blockArg.getOwner() == &taskBlock) {
+            if (liveInSet.insert(operand).second) {
+              liveInValues.push_back(operand);
+            }
+          }
+        }
+      }
+    });
+
+    // Collect iter_args initial values.
+    SmallVector<Value> iterArgsInit(hyperblock_op.getIterArgs().begin(),
+                                    hyperblock_op.getIterArgs().end());
+
+    // Determine result types.
+    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes().begin(),
+                                  hyperblock_op.getResultTypes().end());
+
+    // Collect input types.
+    SmallVector<Type> inputTypes;
+    for (Value v : liveInValues) {
+      inputTypes.push_back(v.getType());
+    }
+
+    SmallVector<Type> iterArgsTypes;
+    for (Value v : iterArgsInit) {
+      iterArgsTypes.push_back(v.getType());
+    }
+
+    // Create neura.kernel.
+    auto kernelOp = rewriter.create<neura::KernelOp>(
+        loc, resultTypes, liveInValues, iterArgsInit,
+        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
+        /*kernel_name=*/rewriter.getStringAttr("kernel"),
+        /*accelerator=*/rewriter.getStringAttr("neura"));
+
+    // Create entry block for kernel.
+    Region &kernelRegion = kernelOp.getBody();
+    Block *entryBlock = rewriter.createBlock(&kernelRegion);
+
+    IRMapping mapping;
+
+    // Add block arguments for live-in values (inputs).
+    for (auto [idx, liveIn] : llvm::enumerate(liveInValues)) {
+      BlockArgument arg = entryBlock->addArgument(liveIn.getType(), loc);
+      mapping.map(liveIn, arg);
+    }
+
+    // Add block arguments for iter_args.
+    size_t numIndices = hyperblock_op.getIndices().size();
+    for (auto [idx, iterArg] : llvm::enumerate(iterArgsInit)) {
+      BlockArgument arg = entryBlock->addArgument(iterArg.getType(), loc);
+      // Map hyperblock's iter_arg block argument to kernel's block argument.
+      mapping.map(hbBlock.getArgument(numIndices + idx), arg);
+    }
+
+    // Map hyperblock's index arguments - these will be replaced by counters
+    // later. For now, create placeholder block arguments.
+    for (size_t i = 0; i < numIndices; ++i) {
+      BlockArgument hbArg = hbBlock.getArgument(i);
+      BlockArgument arg = entryBlock->addArgument(hbArg.getType(), loc);
+      mapping.map(hbArg, arg);
+    }
+
+    // Clone hyperblock body into kernel.
+    rewriter.setInsertionPointToEnd(entryBlock);
+    for (Operation &op : hbBlock.without_terminator()) {
+      rewriter.clone(op, mapping);
+    }
+
+    // Convert hyperblock.yield to neura.yield.
+    auto yieldOp = cast<TaskflowHyperblockYieldOp>(hbBlock.getTerminator());
+    SmallVector<Value> iterArgsNext;
+    SmallVector<Value> results;
+
+    for (Value out : yieldOp.getOutputs()) {
+      Value mapped = mapping.lookupOrDefault(out);
+      // For kernels with iter_args, output goes to both iter_args_next and
+      // results.
+      iterArgsNext.push_back(mapped);
+      results.push_back(mapped);
+    }
+
+    rewriter.create<neura::YieldOp>(loc, iterArgsNext, results);
+
+    // Replace hyperblock results with kernel results.
+    rewriter.replaceOp(hyperblock_op, kernelOp.getResults());
+
+    return success();
+  }
+};
+
+struct ConvertTaskflowToNeuraPass
+    : public PassWrapper<ConvertTaskflowToNeuraPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTaskflowToNeuraPass)
+
+  StringRef getArgument() const override { return "convert-taskflow-to-neura"; }
+  StringRef getDescription() const override {
+    return "Convert taskflow.hyperblock to neura.kernel";
+  }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<taskflow::TaskflowDialect>();
+    registry.insert<neura::NeuraDialect>();
+  }
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    MLIRContext *ctx = &getContext();
+
+    // Phase 1: Converts hyperblocks to kernels.
+    RewritePatternSet patterns(ctx);
+    patterns.add<HyperblockToKernelPattern>(ctx);
+
+    if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createConvertTaskflowToNeuraPass() {
+  return std::make_unique<ConvertTaskflowToNeuraPass>();
+}
\ No newline at end of file
diff --git a/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp b/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
index e9666bd6..6269ab0f 100644
--- a/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
+++ b/lib/NeuraDialect/Transforms/IterMergePatternPass.cpp
@@ -1,20 +1,20 @@
-#include "NeuraDialect/NeuraOps.h"
 #include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
 #include "NeuraDialect/Transforms/GraphMining/GraMi.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/IR/PatternMatch.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
-#include
-#include
+#include "llvm/Support/raw_ostream.h"
 #include
+#include
 #include
+#include

 using namespace mlir;

 #define GEN_PASS_DEF_INITPATTERN
 #include "NeuraDialect/NeuraPasses.h.inc"

-void printDFGStatistics(mlir::neura::DfgGraph* graph) {
+void printDFGStatistics(mlir::neura::DfgGraph *graph) {
   llvm::errs() << "DFG Statistics:\n";
   llvm::errs() << "---------------\n";
   llvm::errs() << "Number of nodes: " << graph->getNumNodes() << "\n";
   llvm::errs() << "Number of edges: " << graph->getNumEdges() << "\n\n";
-  
+
   std::map<std::string, int> op_type_counts;
-  for (auto* node : graph->getNodes()) {
+  for (auto *node : graph->getNodes()) {
     op_type_counts[node->getLabel()]++;
   }
-  
+
   llvm::errs() << "Operation types and their counts:\n";
-  for (const auto& pair : op_type_counts) {
+  for (const auto &pair : op_type_counts) {
     llvm::errs() << "  - " << pair.first << ": " << pair.second << "\n";
   }
   llvm::errs() << "\n";
 }

 // Finds a valid insertion point for the fused operation.
-Operation* findValidInsertionPoint(
-    const mlir::neura::PatternInstance& instance,
-    const llvm::DenseSet<Operation*>& pattern_ops,
-    const SmallVector<Value>& valid_inputs,
-    const SmallVector<Value>& valid_outputs) {
-  
-  if (instance.operations.empty()) return nullptr;
-  
-  Block* block = instance.operations.front()->getBlock();
-  if (!block) return nullptr;
-  
-  for (Operation* op : instance.operations) {
+Operation *
+findValidInsertionPoint(const mlir::neura::PatternInstance &instance,
+                        const llvm::DenseSet<Operation *> &pattern_ops,
+                        const SmallVector<Value> &valid_inputs,
+                        const SmallVector<Value> &valid_outputs) {
+
+  if (instance.operations.empty())
+    return nullptr;
+
+  Block *block = instance.operations.front()->getBlock();
+  if (!block)
+    return nullptr;
+
+  for (Operation *op : instance.operations) {
     if (op->getBlock() != block) {
-      return nullptr; 
+      return nullptr;
     }
   }
-  
-  Operation* earliest_point = nullptr;
-  
+
+  Operation *earliest_point = nullptr;
+
   for (Value input : valid_inputs) {
-    Operation* def_op = input.getDefiningOp();
+    Operation *def_op = input.getDefiningOp();
     if (!def_op) {
       continue;
     }
-    
+
     if (def_op->getBlock() != block) {
       continue;
     }
-    
+
     if (!earliest_point) {
       earliest_point = def_op;
     } else if (!def_op->isBeforeInBlock(earliest_point)) {
       earliest_point = def_op;
     }
   }
-  
+
   // Finds the latest position: before all external uses of outputs
-  Operation* latest_point = nullptr;
-  
-  for (Value output : valid_outputs ) {
-    for (OpOperand& use : output.getUses()) {
-      Operation* user = use.getOwner();
-      
+  Operation *latest_point = nullptr;
+
+  for (Value output : valid_outputs) {
+    for (OpOperand &use : output.getUses()) {
+      Operation *user = use.getOwner();
+
       if (pattern_ops.contains(user)) {
         continue;
       }
-      
+
       if (user->getBlock() != block) {
         continue;
       }
-      
+
       if (!latest_point) {
         latest_point = user;
       } else if (user->isBeforeInBlock(latest_point)) {
-        latest_point = user; 
+        latest_point = user;
       }
     }
   }
-  
+
   if (!earliest_point) {
     earliest_point = instance.operations.front();
-    for (Operation* op : instance.operations) {
+    for (Operation *op : instance.operations) {
       if (op->isBeforeInBlock(earliest_point)) {
         earliest_point = op;
       }
     }
   }
-  
+
   // [earliest_point, latest_point)
   if (latest_point) {
-    if (!earliest_point->isBeforeInBlock(latest_point) || earliest_point == latest_point) {
+    if (!earliest_point->isBeforeInBlock(latest_point) ||
+        earliest_point == latest_point) {
       return nullptr;
     }
   }
-  
+
   // Returns the valid insertion point (inserts after earliest_point)
   return earliest_point;
 }

-bool rewritePatternInstance(OpBuilder& builder, const mlir::neura::PatternInstance& instance, const mlir::neura::FrequentSubgraph& pattern) {
-  if (instance.operations.empty()) return false;
-  
-  for (Operation* op : instance.operations) {
+bool rewritePatternInstance(OpBuilder &builder,
+                            const mlir::neura::PatternInstance &instance,
+                            const mlir::neura::FrequentSubgraph &pattern) {
+  if (instance.operations.empty())
+    return false;
+
+  for (Operation *op : instance.operations) {
     if (!op || !op->getBlock()) {
       return false;
     }
   }
-  
-  llvm::DenseSet<Operation*> pattern_ops(instance.operations.begin(), instance.operations.end());
-  
+
+  llvm::DenseSet<Operation *> pattern_ops(instance.operations.begin(),
+                                          instance.operations.end());
+
   // First, collects inputs and outputs to determine valid insertion point
   llvm::SetVector<Value> input_set_for_check;
-  for (Operation* op : instance.operations) {
+  for (Operation *op : instance.operations) {
     for (Value operand : op->getOperands()) {
-      Operation* def_op = operand.getDefiningOp();
-      if (def_op && def_op->getName().getStringRef().str() == "neura.fused_op" && pattern_ops.contains(def_op)) {
+      Operation *def_op = operand.getDefiningOp();
+      if (def_op &&
+          def_op->getName().getStringRef().str() == "neura.fused_op" &&
+          pattern_ops.contains(def_op)) {
         continue;
       }
       if (!def_op || !pattern_ops.contains(def_op)) {
         input_set_for_check.insert(operand);
       }
     }
-    
-    if (op->getName().getStringRef().str() == "neura.fused_op" && op->getNumRegions() > 0) {
-      Region& region = op->getRegion(0);
+
+    if (op->getName().getStringRef().str() == "neura.fused_op" &&
+        op->getNumRegions() > 0) {
+      Region &region = op->getRegion(0);
       if (!region.empty()) {
-        Block& block = region.front();
-        llvm::DenseSet<Operation*> nested_pattern_ops;
-        
-        for (Operation& body_op : block.getOperations()) {
+        Block &block = region.front();
+        llvm::DenseSet<Operation *> nested_pattern_ops;
+
+        for (Operation &body_op : block.getOperations()) {
           if (body_op.getName().getStringRef().str() != "neura.yield") {
             nested_pattern_ops.insert(&body_op);
             for (Value operand : body_op.getOperands()) {
               if (mlir::isa<BlockArgument>(operand)) {
                 continue;
               }
-              
-              Operation* def_op = operand.getDefiningOp();
-              if (def_op && !nested_pattern_ops.contains(def_op) && !pattern_ops.contains(def_op)) {
+
+              Operation *def_op = operand.getDefiningOp();
+              if (def_op && !nested_pattern_ops.contains(def_op) &&
+                  !pattern_ops.contains(def_op)) {
                 input_set_for_check.insert(operand);
               } else if (!def_op) {
-                assert(false && "Value without defining op should not happen normally");
+                assert(false &&
+                       "Value without defining op should not happen normally");
               }
             }
           }
       }
     }
   }
   SmallVector<Value> valid_inputs = input_set_for_check.takeVector();
-  
+
   llvm::SetVector<Value> output_set_for_check;
-  for (Operation* op : instance.operations) {
+  for (Operation *op : instance.operations) {
     for (Value result : op->getResults()) {
       bool has_external_use = false;
-      for (OpOperand& use : result.getUses()) {
-        Operation* user = use.getOwner();
+      for (OpOperand &use : result.getUses()) {
+        Operation *user = use.getOwner();
         if (!pattern_ops.contains(user)) {
           has_external_use = true;
           break;
         }
       }
-      
+
       if (has_external_use) {
         output_set_for_check.insert(result);
       }
     }
   }
   SmallVector<Value> valid_outputs = output_set_for_check.takeVector();
-  
+
   // Finds a valid insertion point that avoids dominance issues
-  Operation* insertion_point = findValidInsertionPoint(instance, pattern_ops, valid_inputs, valid_outputs);
+  Operation *insertion_point = findValidInsertionPoint(
+      instance, pattern_ops, valid_inputs, valid_outputs);
   if (!insertion_point) {
     return false;
   }
-  
+
   builder.setInsertionPointAfter(insertion_point);
-  
+
   SmallVector<Type> output_types;
   for (Value output : valid_outputs) {
     output_types.push_back(output.getType());
   }

   auto pattern_op = builder.create<neura::FusedOp>(
-      insertion_point->getLoc(),
-      output_types,
-      valid_inputs,
+      insertion_point->getLoc(), output_types, valid_inputs,
       builder.getI64IntegerAttr(pattern.getId()),
       builder.getStringAttr(pattern.getPattern()),
-      builder.getI64IntegerAttr(pattern.getFrequency())
-  );
+      builder.getI64IntegerAttr(pattern.getFrequency()));

-  Region& body_region = pattern_op.getBody();
-  Block* body_block = new Block();
+  Region &body_region = pattern_op.getBody();
+  Block *body_block = new Block();
   body_region.push_back(body_block);
-  
+
   for (Value input : valid_inputs) {
     body_block->addArgument(input.getType(), input.getLoc());
   }
-  
+
   builder.setInsertionPointToStart(body_block);
   IRMapping mapping;
-  
+
   for (size_t i = 0; i < valid_inputs.size(); ++i) {
     mapping.map(valid_inputs[i], body_block->getArgument(i));
   }
-  
+
   llvm::DenseMap<Value, Value> original_to_cloned;
-  
-  Operation* cloned_op = nullptr;
-  for (Operation* op : instance.operations) {
+  Operation *cloned_op = nullptr;
+
+  for (Operation *op : instance.operations) {
     if (op->getName().getStringRef().str() == "neura.fused_op") {
       if (op->getNumRegions() > 0) {
-        Region& region = op->getRegion(0);
+        Region &region = op->getRegion(0);
         if (!region.empty()) {
-          Block& block = region.front();
-          
-          llvm::DenseSet<Operation*> nested_pattern_body_ops;
+          Block &block = region.front();
+
+          llvm::DenseSet<Operation *> nested_pattern_body_ops;
           llvm::SetVector<Value> nested_pattern_used_values;
-          
-          for (Operation& body_op : block.getOperations()) {
+
+          for (Operation &body_op : block.getOperations()) {
             if (body_op.getName().getStringRef().str() != "neura.yield") {
               nested_pattern_body_ops.insert(&body_op);
-              
+
               for (Value operand : body_op.getOperands()) {
                 if (mlir::isa<BlockArgument>(operand)) {
                   continue;
                 }
-                
-                Operation* def_op = operand.getDefiningOp();
+
+                Operation *def_op = operand.getDefiningOp();
                 if (def_op) {
                   if (nested_pattern_body_ops.contains(def_op)) {
                     continue;
                   }
                   nested_pattern_used_values.insert(operand);
                 }
               }
             }
           }
-          
-          for (size_t i = 0; i < op->getNumOperands() && i < block.getNumArguments(); ++i) {
+
+          for (size_t i = 0;
+               i < op->getNumOperands() && i < block.getNumArguments(); ++i) {
             Value pattern_input = op->getOperand(i);
             BlockArgument nested_arg = block.getArgument(i);
-            
+
             if (mapping.contains(pattern_input)) {
               mapping.map(nested_arg, mapping.lookup(pattern_input));
             } else {
               mapping.map(nested_arg, pattern_input);
             }
           }
-          
+
           for (Value used_val : nested_pattern_used_values) {
-            if (mlir::isa<BlockArgument>(used_val) || mapping.contains(used_val)) {
+            if (mlir::isa<BlockArgument>(used_val) ||
+                mapping.contains(used_val)) {
               continue;
             }
-            
-            Operation* def_op = used_val.getDefiningOp();
-            if (def_op && pattern_ops.contains(def_op) && original_to_cloned.count(used_val)) {
+
+            Operation *def_op = used_val.getDefiningOp();
+            if (def_op && pattern_ops.contains(def_op) &&
+                original_to_cloned.count(used_val)) {
               mapping.map(used_val, original_to_cloned[used_val]);
             } else {
               mapping.map(used_val, used_val);
             }
           }
-          
-          for (Operation& body_op : block.getOperations()) {
+
+          for (Operation &body_op : block.getOperations()) {
             if (body_op.getName().getStringRef().str() != "neura.yield") {
               cloned_op = builder.clone(body_op, mapping);
               for (size_t i = 0; i < body_op.getNumResults(); ++i) {
-                original_to_cloned[body_op.getResult(i)] = cloned_op->getResult(i);
+                original_to_cloned[body_op.getResult(i)] =
+                    cloned_op->getResult(i);
               }
             }
           }
-          
-          for (Operation& block_op : block.getOperations()) {
+
+          for (Operation &block_op : block.getOperations()) {
             if (block_op.getName().getStringRef().str() == "neura.yield") {
-              for (size_t i = 0; i < op->getNumResults() && i < block_op.getNumOperands(); ++i) {
+              for (size_t i = 0;
+                   i < op->getNumResults() && i < block_op.getNumOperands();
+                   ++i) {
                 Value yield_operand = block_op.getOperand(i);
                 if (original_to_cloned.count(yield_operand)) {
-                  original_to_cloned[op->getResult(i)] = original_to_cloned[yield_operand];
-                  mapping.map(op->getResult(i), original_to_cloned[yield_operand]);
+                  original_to_cloned[op->getResult(i)] =
+                      original_to_cloned[yield_operand];
+                  mapping.map(op->getResult(i),
+                              original_to_cloned[yield_operand]);
                 } else {
                   return false;
                 }
@@ -320,8 +338,9 @@ bool rewritePatternInstance(OpBuilder& builder, const mlir::neura::PatternInstan
       }
     } else {
       for (Value operand : op->getOperands()) {
-        Operation* def_op = operand.getDefiningOp();
-        if (def_op && def_op->getName().getStringRef().str() == "neura.fused_op" &&
+        Operation *def_op = operand.getDefiningOp();
+        if (def_op &&
+            def_op->getName().getStringRef().str() == "neura.fused_op" &&
             pattern_ops.contains(def_op) && original_to_cloned.count(operand)) {
           if (!mapping.contains(operand)) {
             mapping.map(operand, original_to_cloned[operand]);
           }
         }
       }
     }
   }
-  
+
   SmallVector<Value> yield_operands;
   for (size_t i = 0; i < valid_outputs.size(); ++i) {
     Value original_output = valid_outputs[i];
     if (original_to_cloned.count(original_output)) {
       yield_operands.push_back(original_to_cloned[original_output]);
     } else {
       return false;
     }
   }
-  
-  builder.create<neura::YieldOp>(insertion_point->getLoc(), yield_operands);
-  
+
+  builder.create<neura::YieldOp>(insertion_point->getLoc(), ValueRange{},
+                                 yield_operands);
+
   llvm::DenseSet<Value> replaced_outputs;
   for (size_t i = 0; i < valid_outputs.size(); ++i) {
     Value old_value = valid_outputs[i];
     Value new_value = pattern_op.getResult(i);
     old_value.replaceAllUsesWith(new_value);
     replaced_outputs.insert(old_value);
   }
-  
-  for (auto& pair : original_to_cloned) {
+
+  for (auto &pair : original_to_cloned) {
     Value old_value = pair.first;
     if (replaced_outputs.contains(old_value)) {
       continue;
     }
     Value new_value = pair.second;
     if (old_value != new_value) {
       old_value.replaceAllUsesWith(new_value);
     }
   }
-  
+
   original_to_cloned.clear();
-  
-  for (auto it = instance.operations.rbegin(); it != instance.operations.rend(); ++it) {
-    Operation* op = *it;
-    
+
+  for (auto it = instance.operations.rbegin(); it != instance.operations.rend();
+       ++it) {
+    Operation *op = *it;
+
     if (op->getName().getStringRef().str() == "neura.fused_op") {
-      Region& region = op->getRegion(0);
-      Block& block = region.front();
-      
-      for (Operation& body_op : block.getOperations()) {
+      Region &region = op->getRegion(0);
+      Block &block = region.front();
+
+      for (Operation &body_op : block.getOperations()) {
         for (Value result : body_op.getResults()) {
           if (!result.use_empty()) {
             result.dropAllUses();
           }
         }
       }
-      
+
       for (BlockArgument arg : block.getArguments()) {
         if (!arg.use_empty()) {
           arg.dropAllUses();
         }
       }
-      
+
       while (!block.empty()) {
-        Operation& body_op = block.back();
+        Operation &body_op = block.back();
         body_op.dropAllReferences();
         body_op.erase();
       }
     }
-    
+
     op->dropAllUses();
     op->erase();
   }
-  
+
   return true;
 }

-int rewritePatternsToRegions(mlir::neura::DfgGraph* dfg_graph, ModuleOp module_op, const std::vector& patterns_with_instances) {
+int rewritePatternsToRegions(
+    mlir::neura::DfgGraph *dfg_graph, ModuleOp module_op,
+    const std::vector
+        &patterns_with_instances) {
   int rewrite_count = 0;
   size_t total_critical = 0;
   size_t total_non_critical = 0;
-  MLIRContext* context = module_op.getContext();
+  MLIRContext *context = module_op.getContext();
   OpBuilder builder(context);
-  
-  for (const auto& pwsi : patterns_with_instances) {
-    if (pwsi.pattern.getNodes().size() < 2) continue;
+
+  for (const auto &pwsi : patterns_with_instances) {
+    if (pwsi.pattern.getNodes().size() < 2)
+      continue;
     total_critical += pwsi.critical_instances.size();
     total_non_critical += pwsi.non_critical_instances.size();
   }
-  
+
   size_t total_instances = total_critical + total_non_critical;
   if (total_instances == 0) {
     llvm::errs() << "  No valid instances to rewrite\n";
     return 0;
   }
-  
+
   std::set<std::string> attempted_patterns;
-  
+
   // Phase 1: Rewrites all critical path instances across all patterns
   llvm::errs() << "  Phase 1: Rewriting critical path instances...\n";
-  for (const auto& pwsi : patterns_with_instances) {
+  for (const auto &pwsi : patterns_with_instances) {
     if (pwsi.pattern.getNodes().size() < 2 || pwsi.critical_instances.empty()) {
       continue;
     }
-    
+
     attempted_patterns.insert(pwsi.pattern.getPattern());
-    
-    for (const auto& instance : pwsi.critical_instances) {
+
+    for (const auto &instance : pwsi.critical_instances) {
       rewritePatternInstance(builder, instance, pwsi.pattern);
     }
   }

   // Phase 2: Rewrites all non-critical path instances across all patterns
   llvm::errs() << "  Phase 2: Rewriting non-critical path instances...\n";
-  for (const auto& pwsi : patterns_with_instances) {
-    if (pwsi.pattern.getNodes().size() < 2 || pwsi.non_critical_instances.empty()) {
+  for (const auto &pwsi : patterns_with_instances) {
+    if (pwsi.pattern.getNodes().size() < 2 ||
+        pwsi.non_critical_instances.empty()) {
       continue;
     }
-    
+
     // Marks pattern as attempted before trying to fuse instances
     attempted_patterns.insert(pwsi.pattern.getPattern());
-    
-    for (const auto& instance : pwsi.non_critical_instances) {
+
+    for (const auto &instance : pwsi.non_critical_instances) {
       rewritePatternInstance(builder, instance, pwsi.pattern);
     }
   }

   // Marks all attempted patterns
-  for (const auto& pattern_str : attempted_patterns) {
+  for (const auto &pattern_str : attempted_patterns) {
     mlir::neura::GraMi::markPatternAsAttempted(pattern_str);
   }
-  
+
   return rewrite_count;
 }

@@ -478,78 +504,93 @@ struct IterMergePatternPass

   StringRef getArgument() const override { return "iter-merge-pattern"; }
   StringRef getDescription() const override {
-    return "Iteratively merge and identify common patterns in DFG using graph mining.";
+    return "Iteratively merge and identify common patterns in DFG using graph "
+           "mining.";
   }

   Option<int> min_support{
       *this, "min-support",
-      llvm::cl::desc("Minimum support threshold for pattern mining (default: 2)"),
+      llvm::cl::desc(
+          "Minimum support threshold for pattern mining (default: 2)"),
       llvm::cl::init(2)};

   Option<int> max_iter{
       *this, "max-iter",
-      llvm::cl::desc("Maximum number of iterations for pattern merging (default: 2)"),
+      llvm::cl::desc(
+          "Maximum number of iterations for pattern merging (default: 2)"),
       llvm::cl::init(2)};

   void runOnOperation() override {
-    
+
     ModuleOp module_op = getOperation();
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "IterMergePatternPass: Starting pattern mining\n";
-    llvm::errs() << "Minimum support threshold: " << min_support.getValue() << "\n";
+    llvm::errs() << "Minimum support threshold: " << min_support.getValue()
+                 << "\n";
     llvm::errs() << "========================================\n\n";
-    
+
     int iter = 0;
-    bool cleared_attempted = false;  // Tracks if it has cleared attempted marks once
+    bool cleared_attempted =
+        false; // Tracks if it has cleared attempted marks once
     while (iter < max_iter.getValue()) {
       llvm::errs() << "Iteration " << iter << "\n";
-      
-      // Re-collects critical path operations from all functions for this iteration
-      // Critical path may change after each iteration due to pattern fusion
-      llvm::DenseSet<Operation*> all_critical_ops;
+
+      // Re-collects critical path operations from all functions for this
+      // iteration Critical path may change after each iteration due to pattern
+      // fusion
+      llvm::DenseSet<Operation *> all_critical_ops;
       module_op.walk([&](func::FuncOp func) {
         auto critical_ops =
            mlir::neura::GraMi::collectCriticalPathOps(func);
-        for (Operation* op : critical_ops) {
+        for (Operation *op : critical_ops) {
           all_critical_ops.insert(op);
         }
       });
-      llvm::errs() << "  Collected " << all_critical_ops.size() << " critical path operations for iteration " << iter << "\n";
-      
+      llvm::errs() << "  Collected " << all_critical_ops.size()
+                   << " critical path operations for iteration " << iter
+                   << "\n";
+
       auto dfg_graph = mlir::neura::DfgExtractor::extractFromModule(module_op);
-      
+
       if (!dfg_graph) {
         llvm::errs() << "Error: Failed to extract DFG from module\n";
         signalPassFailure();
         return;
-      } 
-      
+      }
+
       printDFGStatistics(dfg_graph.get());

       mlir::neura::GraMi grami(dfg_graph.get(), min_support.getValue());
       grami.setCriticalPathOps(all_critical_ops);
-      std::vector patterns_with_instances = grami.mineFrequentSubgraphs();
-      
+      std::vector
+          patterns_with_instances = grami.mineFrequentSubgraphs();
+
       // If no patterns were fused and it hasn't cleared attempted marks yet,
-      // clears them and tries one more iteration (without incrementing iter count)
+      // clears them and tries one more iteration (without incrementing iter
+      // count)
       if (patterns_with_instances.empty() && !cleared_attempted) {
-        llvm::errs() << "  No patterns fused in this iteration. Clearing attempted marks and retrying...\n";
+        llvm::errs() << "  No patterns fused in this iteration. Clearing "
+                        "attempted marks and retrying...\n";
         mlir::neura::GraMi::clearAttemptedPatterns();
         cleared_attempted = true;
         // Retries this iteration with cleared marks (doesn't increment iter)
         continue;
       }

-      // If it cleared marks and still got 0, or if it has reached max iterations, stops
+      // If it cleared marks and still got 0, or if it has reached max
+      // iterations, stops
       if (patterns_with_instances.empty() && cleared_attempted) {
-        llvm::errs() << "  No patterns fused even after clearing attempted marks. Stopping.\n";
+        llvm::errs() << "  No patterns fused even after clearing attempted "
+                        "marks. Stopping.\n";
         break;
       }

-      int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op, patterns_with_instances);
-      llvm::errs() << "  - Rewrote " << rewrite_count << " pattern instances\n\n";
-      
+      int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op,
+                                                   patterns_with_instances);
+      llvm::errs() << "  - Rewrote " << rewrite_count
+                   << " pattern instances\n\n";
+
       iter++;
     }
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "IterMergePatternPass: Completed\n";
     llvm::errs() << "========================================\n\n";
@@ -571,43 +612,48 @@ struct InitPatternPass

   Option<int> min_support{
       *this, "min-support",
-      llvm::cl::desc("Minimum support threshold for pattern mining (default: 2)"),
+      llvm::cl::desc(
+          "Minimum support threshold for pattern mining (default: 2)"),
       llvm::cl::init(2)};

   void runOnOperation() override {
     ModuleOp module_op = getOperation();
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "InitPatternPass: Starting pattern mining\n";
-    llvm::errs() << "Minimum support threshold: " << min_support.getValue() << "\n";
+    llvm::errs() << "Minimum support threshold: " << min_support.getValue()
+                 << "\n";
     llvm::errs() << "========================================\n\n";
-    
+
     // Collects critical path operations from all functions
-    llvm::DenseSet<Operation*> all_critical_ops;
+    llvm::DenseSet<Operation *> all_critical_ops;
     module_op.walk([&](func::FuncOp func) {
       auto critical_ops = mlir::neura::GraMi::collectCriticalPathOps(func);
-      for (Operation* op : critical_ops) {
+      for (Operation *op : critical_ops) {
        all_critical_ops.insert(op);
      }
    });
-    llvm::errs() << "Collected " << all_critical_ops.size() << " critical path operations\n\n";
-    
+    llvm::errs() << "Collected " << all_critical_ops.size()
+                 << " critical path operations\n\n";
+
     auto dfg_graph = mlir::neura::DfgExtractor::extractFromModule(module_op);
-    
+
     if (!dfg_graph) {
       llvm::errs() << "Error: Failed to extract DFG from module\n";
       signalPassFailure();
       return;
-    } 
-    
+    }
+
     printDFGStatistics(dfg_graph.get());

     mlir::neura::GraMi grami(dfg_graph.get(), min_support.getValue());
     grami.setCriticalPathOps(all_critical_ops);
-    std::vector patterns_with_instances = grami.mineFrequentSubgraphs();
-    
-    int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op, patterns_with_instances);
+    std::vector
+        patterns_with_instances = grami.mineFrequentSubgraphs();
+
+    int rewrite_count = rewritePatternsToRegions(dfg_graph.get(), module_op,
+                                                 patterns_with_instances);
     llvm::errs() << " - Rewrote " << rewrite_count << " pattern instances\n\n";
-    
+
     llvm::errs() << "\n========================================\n";
     llvm::errs() << "InitPatternPass: Completed\n";
     llvm::errs() << "========================================\n\n";
@@ -625,4 +671,3 @@ std::unique_ptr createInitPatternPass() {
   return std::make_unique<InitPatternPass>();
 }
 } // namespace mlir::neura
-
diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
index 1a2c9391..ac664382 100644
--- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
+++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
@@ -72,10 +72,10 @@ static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
   if (has_outputs) {
     // If the loop has outputs, yield the loop results.
     SmallVector<Value> yield_operands(for_op.getResults());
-    builder.create<neura::YieldOp>(loc, yield_operands);
+    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
   } else {
     // If the loop has no outputs, create an empty yield.
-    builder.create<neura::YieldOp>(loc, ValueRange{});
+    builder.create<neura::YieldOp>(loc);
   }

   return success();
diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
index 458602bd..ad24eac4 100644
--- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
+++ b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
@@ -1,7 +1,8 @@
 // Wraps the innermost loop within neura.kernel operation.
 // RUN: mlir-neura-opt %s \
-// RUN: --wrap-loop-in-kernel \
-// RUN: | FileCheck %s
+// RUN:   --wrap-loop-in-kernel \
+// RUN:   -o %t-wrapped.mlir
+// RUN: FileCheck %s --input-file=%t-wrapped.mlir

 module attributes {} {
   func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {

From 5bb37772f8fd8c1ea049b52d46b55ec89b103239 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 19:01:38 +0800
Subject: [PATCH 03/25] change the definition of neura.kernel

---
 include/NeuraDialect/NeuraOps.td              |  38 ++---
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 155 ++++++++++--------
 .../Transforms/CanonicalizeTaskPass.cpp       |  15 +-
 .../ConstructHyperblockFromTaskPass.cpp       |   3 +-
 4 files changed, 121 insertions(+), 90 deletions(-)

diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 7971d6c6..71218450 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -53,11 +53,11 @@ def Neura_KernelOp : Op
   let results = (outs Variadic<AnyType>:$outputs);
-  let regions = (region SizedRegion<1>:$body);
+  let regions = (region AnyRegion:$body);

   let assemblyFormat = [{
-    (`ins` `(` $inputs^ `:` type($inputs) `)` )?
-    (`ins` `(` $iter_args_init^ `:` type($iter_args_init) `)` )?
+    (`inputs` `(` $inputs^ `:` type($inputs) `)` )?
+    (`iter_args_init` `(` $iter_args_init^ `:` type($iter_args_init) `)` )?
     attr-dict-with-keyword
     $body
     (`:` type($outputs)^)?
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
index 460a5a15..ea46d969 100644
--- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -18,6 +18,25 @@ using namespace mlir;
 using namespace mlir::taskflow;

 namespace {
+// Pattern to convert taskflow.hyperblock to neura.kernel.
+//
+// Hyperblock structure:
+//   %result = taskflow.hyperblock(%idx, %iter_init) {
+//   ^bb0(%idx_arg: index, %iter_arg: T):
+//     ... body ...
+//     taskflow.hyperblock.yield outputs(%next_iter : T)
+//   } : (index, T) -> T
+//
+// Kernel structure:
+//   %result = neura.kernel ins(%idx, %live_in...) iter_args(%iter_init) {
+//   ^bb0(%idx_arg: index, %live_in_args..., %iter_arg: T):
+//     ... body ...
+//     neura.yield iter_args(%next_iter) results(%next_iter)
+//   } -> T
+//
+// Block argument order must match:
+//   Hyperblock: [indices..., iter_args...]
+//   Kernel:     [inputs (indices + live_ins)..., iter_args...]
 struct HyperblockToKernelPattern
     : public OpRewritePattern<TaskflowHyperblockOp> {
   using OpRewritePattern<TaskflowHyperblockOp>::OpRewritePattern;

   LogicalResult matchAndRewrite(TaskflowHyperblockOp hyperblock_op,
                                 PatternRewriter &rewriter) const override {
     Location loc = hyperblock_op.getLoc();

-    // Find the parent task to get access to task's block arguments.
-    auto taskOp = hyperblock_op->getParentOfType<TaskflowTaskOp>();
-    if (!taskOp)
+    // Finds the parent task to access task's block arguments.
+    TaskflowTaskOp task_op = hyperblock_op->getParentOfType<TaskflowTaskOp>();
+    if (!task_op) {
       return failure();
+    }
+
+    Block &hb_block = hyperblock_op.getBody().front();
+    Block &task_block = task_op.getBody().front();

-    // Collect live-in values: values used in hyperblock but defined outside.
-    // These are the task's block arguments that the hyperblock body uses.
-    llvm::DenseSet<Value> liveInSet;
-    SmallVector<Value> liveInValues;
+    // Gets hyperblock operands.
+    SmallVector<Value> indices(hyperblock_op.getIndices());
+    SmallVector<Value> iter_args_init(hyperblock_op.getIterArgs());
+    size_t num_indices = indices.size();
+    size_t num_iter_args_init = iter_args_init.size();

-    Block &hbBlock = hyperblock_op.getBody().front();
-    Block &taskBlock = taskOp.getBody().front();
+    // Collects live-in values of the hyperblock: task block arguments used in
+    // the hyperblock body.
+    llvm::DenseSet<Value> live_in_set;
+    SmallVector<Value> live_in_values;

-    // Walk hyperblock body to find uses of task block arguments.
     hyperblock_op.walk([&](Operation *op) {
       for (Value operand : op->getOperands()) {
-        // Check if operand is a task block argument.
         if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
-          if (blockArg.getOwner() == &taskBlock) {
-            if (liveInSet.insert(operand).second) {
-              liveInValues.push_back(operand);
+          if (blockArg.getOwner() == &task_block) {
+            if (live_in_set.insert(operand).second) {
+              live_in_values.push_back(operand);
             }
           }
         }
+        assert(!operand.getDefiningOp() && "Unexpected non-block-arg operand");
       }
     });

-    // Collect iter_args initial values.
-    SmallVector<Value> iterArgsInit(hyperblock_op.getIterArgs().begin(),
-                                    hyperblock_op.getIterArgs().end());
+    // Builds the neura.kernel inputs: [indices..., live_ins...].
+    SmallVector<Value> kernel_inputs;
+    kernel_inputs.append(indices);
+    kernel_inputs.append(live_in_values);

-    // Determine result types.
-    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes().begin(),
-                                  hyperblock_op.getResultTypes().end());
+    // Result types from hyperblock.
+    SmallVector<Type> resultTypes(hyperblock_op.getResultTypes());

-    // Collect input types.
-    SmallVector<Type> inputTypes;
-    for (Value v : liveInValues) {
-      inputTypes.push_back(v.getType());
-    }
-
-    SmallVector<Type> iterArgsTypes;
-    for (Value v : iterArgsInit) {
-      iterArgsTypes.push_back(v.getType());
-    }
-
-    // Create neura.kernel.
-    auto kernelOp = rewriter.create<neura::KernelOp>(
-        loc, resultTypes, liveInValues, iterArgsInit,
-        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
-        /*kernel_name=*/rewriter.getStringAttr("kernel"),
-        /*accelerator=*/rewriter.getStringAttr("neura"));
+    // Creates neura.kernel.
+    neura::KernelOp kernelOp = rewriter.create<neura::KernelOp>(
+        loc, resultTypes, kernel_inputs, iter_args_init,
+        /*Optional cgra_id*/ nullptr, /*Optional kernel_name*/ nullptr,
+        /*Optional accelerator*/ nullptr);

-    // Create entry block for kernel.
-    Region &kernelRegion = kernelOp.getBody();
-    Block *entryBlock = rewriter.createBlock(&kernelRegion);
+    // Creates the entry block for kernel.
+    Region &kernel_region = kernelOp.getBody();
+    Block *entry_block = rewriter.createBlock(&kernel_region);

     IRMapping mapping;

-    // Add block arguments for live-in values (inputs).
-    for (auto [idx, liveIn] : llvm::enumerate(liveInValues)) {
-      BlockArgument arg = entryBlock->addArgument(liveIn.getType(), loc);
-      mapping.map(liveIn, arg);
+    // Kernel block argument layout: [inputs..., iter_args...]
+    // Where inputs = [indices..., live_ins...]
+    //
+    // Hyperblock block argument layout: [indices..., iter_args...]
+
+    // 1. Adds block arguments for indices and map to hyperblock's index args.
+    for (size_t i = 0; i < num_indices; ++i) {
+      BlockArgument kernel_indices_arg =
+          entry_block->addArgument(indices[i].getType(), loc);
+      BlockArgument hb_arg = hb_block.getArgument(i);
+      mapping.map(hb_arg, kernel_indices_arg);
     }

-    // Add block arguments for iter_args.
-    size_t numIndices = hyperblock_op.getIndices().size();
-    for (auto [idx, iterArg] : llvm::enumerate(iterArgsInit)) {
-      BlockArgument arg = entryBlock->addArgument(iterArg.getType(), loc);
-      // Map hyperblock's iter_arg block argument to kernel's block argument.
-      mapping.map(hbBlock.getArgument(numIndices + idx), arg);
+    // 2. Adds block arguments for live-in values and map to task block args.
+    for (Value live_in : live_in_values) {
+      BlockArgument kernel_live_in_arg =
+          entry_block->addArgument(live_in.getType(), loc);
+      mapping.map(live_in, kernel_live_in_arg);
     }

-    // Map hyperblock's index arguments - these will be replaced by counters
-    // later. For now, create placeholder block arguments.
-    for (size_t i = 0; i < numIndices; ++i) {
-      BlockArgument hbArg = hbBlock.getArgument(i);
-      BlockArgument arg = entryBlock->addArgument(hbArg.getType(), loc);
-      mapping.map(hbArg, arg);
+    // 3. Adds block arguments for iter_args and map to hyperblock's iter_args.
+    for (size_t i = 0; i < num_iter_args_init; ++i) {
+      BlockArgument kernel_iter_arg =
+          entry_block->addArgument(iter_args_init[i].getType(), loc);
+      BlockArgument hb_arg = hb_block.getArgument(num_indices + i);
+      mapping.map(hb_arg, kernel_iter_arg);
     }

-    // Clone hyperblock body into kernel.
-    rewriter.setInsertionPointToEnd(entryBlock);
-    for (Operation &op : hbBlock.without_terminator()) {
+    // Clones hyperblock body into kernel.
+    rewriter.setInsertionPointToEnd(entry_block);
+    for (Operation &op : hb_block.without_terminator()) {
       rewriter.clone(op, mapping);
     }

-    // Convert hyperblock.yield to neura.yield.
-    auto yieldOp = cast<TaskflowHyperblockYieldOp>(hbBlock.getTerminator());
-    SmallVector<Value> iterArgsNext;
-    SmallVector<Value> results;
+    // Converts hyperblock.yield to neura.yield.
+    TaskflowHyperblockYieldOp hb_yield_op =
+        cast<TaskflowHyperblockYieldOp>(hb_block.getTerminator());
+
+    SmallVector<Value> iter_args_next;
+    SmallVector<Value> results;

-    for (Value out : yieldOp.getOutputs()) {
-      Value mapped = mapping.lookupOrDefault(out);
-      // For kernels with iter_args, output goes to both iter_args_next and
-      // results.
-      iterArgsNext.push_back(mapped);
-      results.push_back(mapped);
-    }
+    // Maps yield outputs.
+    for (Value out : hb_yield_op.getResults()) {
+      Value mapped = mapping.lookupOrDefault(out);
+      results.push_back(mapped);
+    }

-    rewriter.create<neura::YieldOp>(loc, iterArgsNext, results);
+    for (Value iter_arg : hb_yield_op.getIterArgsNext()) {
+      Value mapped = mapping.lookupOrDefault(iter_arg);
+      iter_args_next.push_back(mapped);
+    }
+
+    rewriter.create<neura::YieldOp>(loc, iter_args_next, results);

-    // Replace hyperblock results with kernel results.
+    // Replaces hyperblock with kernel.
     rewriter.replaceOp(hyperblock_op, kernelOp.getResults());

     return success();
diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
index 151226cf..4281fae2 100644
--- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp
@@ -395,13 +395,18 @@ class AtomicTaskBuilder {

     if (auto yield =
             dyn_cast<TaskflowHyperblockYieldOp>(old_body->getTerminator())) {
-      SmallVector<Value> yield_ops;
-      for (Value v : yield.getOutputs()) {
-        yield_ops.push_back(mapping.lookupOrDefault(v));
+      SmallVector<Value> yield_results;
+      SmallVector<Value> yield_iter_args_next;
+      for (Value v : yield.getResults()) {
+        yield_results.push_back(mapping.lookupOrDefault(v));
       }
-      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc, yield_ops);
+      for (Value v : yield.getIterArgsNext()) {
+        yield_iter_args_next.push_back(mapping.lookupOrDefault(v));
+      }
+      hb_builder.create<TaskflowHyperblockYieldOp>(
+          this->loc, yield_iter_args_next, yield_results);
     } else {
-      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc, ValueRange{});
+      hb_builder.create<TaskflowHyperblockYieldOp>(this->loc);
     }
   }

diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 5680acf7..690d3552 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -433,7 +433,8 @@ static TaskflowHyperblockOp createHyperblock(
         }

         // Creates hyperblock.yield with the mapped operands.
-        hyperblock_builder.create<TaskflowHyperblockYieldOp>(loc, yield_operands);
+        hyperblock_builder.create<TaskflowHyperblockYieldOp>(loc, yield_operands,
+                                                             yield_operands);
         has_terminator = true;
         continue;
       }

From db78bc76fd9bf2e4e616c7ef05b04cef9baf21d3 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 22 Jan 2026 22:50:18 +0800
Subject: [PATCH 04/25] enable taskflow to neura conversion

---
 include/NeuraDialect/NeuraOps.td              |  26 ++--
 include/TaskflowDialect/TaskflowOps.td        |   5 +-
 .../TaskflowToNeura/TaskflowToNeuraPass.cpp   | 147 ++++++++++++++++--
 .../Transforms/ClassifyCountersPass.cpp       |   4 +
 .../ConstructHyperblockFromTaskPass.cpp       |   4 +-
 5 files changed, 164 insertions(+), 22 deletions(-)

diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 71218450..80006ce6 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -780,26 +780,26 @@ def Neura_LoopControlOp : Op{

-// def Neura_CounterOp : Op<Neura_Dialect, "counter">{
-//   let summary = "Hardware loop counter for CGRA execution.";
-//   let description = [{
-//     Represents a hardware loop counter unit that generates loop indices.
-//     This maps directly to a counter FU on the CGRA.
+def Neura_CounterOp : Op<Neura_Dialect, "counter">{
+  let summary = "Hardware loop counter for CGRA execution.";
+  let description = [{
+    Represents a hardware loop counter unit that generates loop indices.
+    This maps directly to a counter FU on the CGRA.

-//     The counter produces:
-//     - current index: the current loop index value.
+    The counter produces:
+    - current index: the current loop index value.
   Example:
-    %current_idx = neura.counter () <{
-      start_value = 0 : i64,
-      end_value = 100 : i64,
-      step_value = 1 : i64
-    }> : -> !neura.data
+    %idx = neura.counter {
+      lower_bound = 0 : index,
+      upper_bound = 32 : index,
+      step = 1 : index,
+      counter_type = "leaf"
+    } : index
   }];
 
   let arguments = (ins
-
+    IndexAttr:$lower_bound,
+    IndexAttr:$upper_bound,
+    IndexAttr:$step,
+    StrAttr:$counter_type,
+    I32Attr:$counter_id
   );
+
+  let results = (outs AnyType:$current_index);
+  let assemblyFormat = "attr-dict `:` type($current_index)";
 }
 
 // ----------------------------------------------------
diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td
index 094b31e2..a7ee4a6c 100644
--- a/include/TaskflowDialect/TaskflowOps.td
+++ b/include/TaskflowDialect/TaskflowOps.td
@@ -144,7 +144,7 @@ def TaskflowChannelOp : TaskflowOpBase<"channel", [Pure, SameOperandsAndResultTy
 // Intra-Task Operations.
 //----------------------------------------------------------------------
 // Counter operation representing loop iteration control within a Taskflow task.
-def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
+def TaskflowCounterOp : TaskflowOpBase<"counter", []>{
 
   let summary = "Loop counter operation with hardware counter semantics";
   let description = [{
@@ -178,7 +178,8 @@ def TaskflowCounterOp : TaskflowOpBase<"counter", [Pure]>{
     IndexAttr:$lower_bound,
     IndexAttr:$upper_bound,
     IndexAttr:$step,
-    OptionalAttr<StrAttr>:$counter_type
+    OptionalAttr<StrAttr>:$counter_type,
+    OptionalAttr<I32Attr>:$counter_id
   );
 
   let results = (outs AnyType:$counter_index);
diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
index ea46d969..fc34a545 100644
--- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
+++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp
@@ -10,9 +10,12 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace mlir::taskflow;
@@ -56,7 +59,10 @@ struct HyperblockToKernelPattern
 
     // Gets hyperblock operands.
     SmallVector<Value> indices(hyperblock_op.getIndices());
+    DenseSet<Value> indices_set(indices.begin(), indices.end());
     SmallVector<Value> iter_args_init(hyperblock_op.getIterArgs());
+    DenseSet<Value> iter_args_init_set(iter_args_init.begin(),
+                                       iter_args_init.end());
     size_t num_indices = indices.size();
     size_t num_iter_args_init = iter_args_init.size();
 
@@ -69,12 +75,27 @@ struct HyperblockToKernelPattern
       for (Value operand : op->getOperands()) {
         if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
           if (blockArg.getOwner() == &task_block) {
+            if (iter_args_init_set.contains(operand) ||
+                indices_set.contains(operand)) {
+              // Skips iter args and indices.
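+              // (Indices arrive via counters and iter_args are threaded
+              // through the kernel's own argument list, so neither counts
+              // as a live-in.)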
+              continue;
+            }
             if (live_in_set.insert(operand).second) {
               live_in_values.push_back(operand);
             }
+          } else {
+            assert(blockArg.getOwner() == &hb_block &&
+                   "Unexpected block argument from other block");
           }
+        } else if (operand.getDefiningOp()) {
+          Operation *def_op = operand.getDefiningOp();
+          llvm::errs() << "[taskflow2neura] Operand from op: "
+                       << *(operand.getDefiningOp()) << "\n";
+          assert(((isa<TaskflowCounterOp>(def_op) &&
+                   def_op->getParentOp() == task_op) ||
+                  (hyperblock_op->isProperAncestor(def_op))) &&
+                 "Unexpected non-block-arg operand in hyperblock");
         }
-        assert(!operand.getDefiningOp() && "Unexpected non-block-arg operand");
       }
     });
 
@@ -89,9 +110,8 @@
     // Creates neura.kernel.
     neura::KernelOp kernelOp = rewriter.create<neura::KernelOp>(
         loc, resultTypes, kernel_inputs, iter_args_init,
-        /*cgra_id=*/rewriter.getI32IntegerAttr(0),
-        /*kernel_name=*/rewriter.getStringAttr("kernel"),
-        /*accelerator=*/rewriter.getStringAttr("neura"));
+        /*Optional cgra_id*/ nullptr, /*Optional kernel_name*/ nullptr,
+        /*Optional accelerator*/ nullptr);
 
     // Creates the entry block for the kernel.
     Region &kernel_region = kernelOp.getBody();
@@ -160,6 +180,102 @@
   }
 };
 
+struct InternalizeCounterPattern
+    : public OpRewritePattern<neura::KernelOp> {
+  using OpRewritePattern<neura::KernelOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(neura::KernelOp kernel_op,
+                                PatternRewriter &rewriter) const override {
+    SmallVector<Value> inputs(kernel_op.getInputs());
+    SmallVector<Value> iter_args_init(kernel_op.getIterArgsInit());
+
+    // Finds counter inputs: inputs defined by taskflow.counter ops.
+    SmallVector<std::pair<size_t, TaskflowCounterOp>> counter_inputs;
+
+    for (size_t i = 0; i < inputs.size(); i++) {
+      if (TaskflowCounterOp counter_op =
+              inputs[i].getDefiningOp<TaskflowCounterOp>()) {
+        counter_inputs.push_back({i, counter_op});
+      }
+    }
+
+    // If there are no counter inputs, there is nothing to do.
+    if (counter_inputs.empty()) {
+      return failure();
+    }
+
+    Location loc = kernel_op.getLoc();
+    Block &old_block = kernel_op.getBody().front();
+
+    // Builds new inputs (excluding counter inputs).
+    DenseSet<size_t> counter_idx_set;
+    for (auto &[idx, _] : counter_inputs) {
+      counter_idx_set.insert(idx);
+    }
+    SmallVector<Value> new_inputs;
+    for (size_t i = 0; i < inputs.size(); i++) {
+      if (!counter_idx_set.contains(i)) {
+        new_inputs.push_back(inputs[i]);
+      }
+    }
+
+    // Creates new kernel with updated inputs.
+    SmallVector<Type> result_types(kernel_op.getResultTypes());
+    neura::KernelOp new_kernel_op = rewriter.create<neura::KernelOp>(
+        loc, result_types, new_inputs, iter_args_init,
+        /*cgra_id=*/kernel_op.getCgraIdAttr(),
+        /*kernel_name=*/kernel_op.getKernelNameAttr(),
+        /*accelerator=*/kernel_op.getAcceleratorAttr());
+
+    // Creates the entry block for the new kernel.
+    Region &new_region = new_kernel_op.getBody();
+    Block *new_block = rewriter.createBlock(&new_region);
+
+    IRMapping mapping;
+    // Maps non-counter input block arguments.
+    for (size_t i = 0; i < inputs.size(); i++) {
+      BlockArgument old_arg = old_block.getArgument(i);
+      if (!counter_idx_set.contains(i)) {
+        BlockArgument new_arg = new_block->addArgument(old_arg.getType(), loc);
+        mapping.map(old_arg, new_arg);
+      }
+    }
+
+    // Maps iter_args block arguments.
+    size_t num_inputs = inputs.size();
+    for (size_t i = 0; i < iter_args_init.size(); i++) {
+      BlockArgument old_arg = old_block.getArgument(num_inputs + i);
+      BlockArgument new_arg = new_block->addArgument(old_arg.getType(), loc);
+      mapping.map(old_arg, new_arg);
+    }
+
+    // Inserts neura.counter ops at the start of the new block.
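+    // For example (sketch; the exact printed form may differ), a kernel
+    // that consumed a counter as an input:
+    //   %i = taskflow.counter {lower_bound = 0 : index, upper_bound = 16 : index,
+    //                          step = 1 : index, counter_type = "leaf"} : index
+    //   neura.kernel inputs(%i : index) { ... }
+    // is rewritten so the counter lives inside the kernel body:
+    //   neura.kernel {
+    //     %i = neura.counter {lower_bound = 0 : index, upper_bound = 16 : index,
+    //                         step = 1 : index, counter_type = "leaf",
+    //                         counter_id = 0 : i32} : index
+    //     ...
+    //   }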
+    rewriter.setInsertionPointToStart(new_block);
+    for (auto &[old_idx, source_counter] : counter_inputs) {
+      BlockArgument old_counter_arg = old_block.getArgument(old_idx);
+
+      // Creates neura.counter op.
+      neura::CounterOp new_counter_op = rewriter.create<neura::CounterOp>(
+          source_counter.getLoc(), old_counter_arg.getType(),
+          source_counter.getLowerBoundAttr(),
+          source_counter.getUpperBoundAttr(), source_counter.getStepAttr(),
+          source_counter.getCounterTypeAttr(),
+          source_counter.getCounterIdAttr());
+      mapping.map(old_counter_arg, new_counter_op.getCurrentIndex());
+    }
+
+    // Clones the rest of the body.
+    rewriter.setInsertionPointToEnd(new_block);
+    for (Operation &op : old_block.getOperations()) {
+      rewriter.clone(op, mapping);
+    }
+
+    // Replaces the old kernel with the new kernel.
+    rewriter.replaceOp(kernel_op, new_kernel_op.getResults());
+
+    return success();
+  }
+};
+
 struct ConvertTaskflowToNeuraPass
     : public PassWrapper<ConvertTaskflowToNeuraPass, OperationPass<ModuleOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTaskflowToNeuraPass)
@@ -178,12 +294,25 @@ struct ConvertTaskflowToNeuraPass
     MLIRContext *ctx = &getContext();
 
     // Phase 1: Converts hyperblocks to kernels.
-    RewritePatternSet patterns(ctx);
-    patterns.add<HyperblockToKernelPattern>(ctx);
+    {
+      RewritePatternSet patterns(ctx);
+      patterns.add<HyperblockToKernelPattern>(ctx);
 
-    if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
-      signalPassFailure();
-      return;
+      if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+        signalPassFailure();
+        return;
+      }
+    }
+
+    // Phase 2: Internalizes counters into kernels.
+    {
+      RewritePatternSet patterns(ctx);
+      patterns.add<InternalizeCounterPattern>(ctx);
+
+      if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+        signalPassFailure();
+        return;
+      }
     }
   }
 };
diff --git a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
index 354ee7d7..8555f6de 100644
--- a/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ClassifyCountersPass.cpp
@@ -44,6 +44,8 @@ void classifyCountersInTask(TaskflowTaskOp task_op) {
     }
   }
 
+  int global_counter_id = 0;
+
   // Classifies each counter.
   OpBuilder builder(task_op.getContext());
   for (TaskflowCounterOp counter_op : counters) {
@@ -67,6 +69,8 @@ void classifyCountersInTask(TaskflowTaskOp task_op) {
 
     // Sets the counter type attribute.
     counter_op.setCounterTypeAttr(builder.getStringAttr(counter_type));
+    // Sets the counter id attribute.
+    counter_op.setCounterIdAttr(builder.getI32IntegerAttr(global_counter_id++));
   }
 }
diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
index 690d3552..6955e29c 100644
--- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
+++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp
@@ -135,7 +135,7 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc,
           builder.getIndexAttr(loop_info->lower_bound),
           builder.getIndexAttr(loop_info->upper_bound),
           builder.getIndexAttr(loop_info->step),
-          /*Counter Type*/ nullptr);
+          /*Counter Type*/ nullptr, /*Counter ID*/ nullptr);
       counter_index = counter_op.getCounterIndex();
     } else {
       // Top-level counter.
@@ -144,7 +144,7 @@ static void createCounterChainRecursivly(OpBuilder &builder, Location loc, builder.getIndexAttr(loop_info->lower_bound), builder.getIndexAttr(loop_info->upper_bound), builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr); + /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); counter_index = counter_op.getCounterIndex(); } From 9ef22162d94468cdb26223532225703f3b2c2fb5 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 13:17:01 +0800 Subject: [PATCH 05/25] assign accelerator for neura.kernel --- lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp index 0dbed531..11688539 100644 --- a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp @@ -1,4 +1,5 @@ #include "Common/AcceleratorAttrs.h" +#include "NeuraDialect/NeuraOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Builders.h" @@ -31,6 +32,12 @@ struct AssignAcceleratorPass func->setAttr(mlir::accel::kAcceleratorAttr, builder.getStringAttr(mlir::accel::kNeuraTarget)); } + } else if (neura::KernelOp kernel_op = dyn_cast(op)) { + // Handles neura.kernel ops as well. + if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + kernel_op->setAttr(mlir::accel::kAcceleratorAttr, + builder.getStringAttr(mlir::accel::kNeuraTarget)); + } } }); } From 3ff449ba6c0b084342235931becf8270919d4dbb Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 14:06:14 +0800 Subject: [PATCH 06/25] enable neura.kernel lowering in conversion passes --- .../ArithToNeura/ArithToNeuraPass.cpp | 69 ++++---- .../ArithToNeura/ArithToNeuraPatterns.td | 4 - lib/Conversion/ArithToNeura/CMakeLists.txt | 12 -- .../BuiltinToNeura/BuiltinToNeuraPass.cpp | 27 ++- lib/Conversion/LlvmToNeura/CMakeLists.txt | 13 -- .../LlvmToNeura/LlvmToNeuraPass.cpp | 165 ++++++++++-------- .../LlvmToNeura/LlvmToNeuraPatterns.td | 4 - .../MemRefToNeura/MemRefToNeuraPass.cpp | 30 +++- .../AffineToNeura/unsupported-affine-if.mlir | 4 +- test/mapping_quality/branch_for.mlir | 6 +- test/neura/ctrl/branch_for.mlir | 6 +- 11 files changed, 187 insertions(+), 153 deletions(-) delete mode 100644 lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td delete mode 100644 lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index 7241d7a8..a6e68ef9 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -5,22 +5,12 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/StringRef.h" -namespace mlir { -namespace neura { -// Uses arith2neura instead of llvm to avoid conflicts. 
-namespace arith2neura {
-
-#include "ArithToNeuraPatterns.inc"
-
-} // namespace arith2neura
-} // namespace neura
-} // namespace mlir
-
 using namespace mlir;
 using namespace mlir::func;
 using namespace mlir::neura;
@@ -96,7 +86,6 @@ struct ArithSubFToNeuraFSub : public OpRewritePattern<arith::SubFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-
     rewriter.replaceOpWithNewOp<neura::FSubOp>(op, result_type, lhs, rhs);
     return success();
   }
@@ -126,7 +115,6 @@ struct ArithMulFToNeuraFMul : public OpRewritePattern<arith::MulFOp> {
     Value rhs = op.getRhs();
     Type result_type = op.getType();
 
-
     rewriter.replaceOpWithNewOp<neura::FMulOp>(op, result_type, lhs, rhs);
     return success();
   }
@@ -171,8 +159,7 @@ struct ArithRemSIToNeuraOp : public OpRewritePattern<arith::RemSIOp> {
     Location loc = op.getLoc();
 
     // Converts arith RemSIOp to basic Neura Op.
-    Value div =
-        rewriter.create<neura::DivOp>(loc, result_type, lhs, rhs);
+    Value div = rewriter.create<neura::DivOp>(loc, result_type, lhs, rhs);
     Value mul = rewriter.create<neura::MulOp>(loc, result_type, rhs, div);
     Value rem = rewriter.create<neura::SubOp>(loc, result_type, lhs, mul);
 
@@ -244,7 +231,8 @@ struct ArithSelectToNeuraSel : public OpRewritePattern<arith::SelectOp> {
     Value false_value = op.getFalseValue();
     Type result_type = op.getType();
 
-    // Converts arith SelectOp to Neura SelOp with consistent order: (cond, ifTrue, ifFalse).
+    // Converts arith SelectOp to Neura SelOp with consistent order: (cond,
+    // ifTrue, ifFalse).
     rewriter.replaceOpWithNewOp<neura::SelOp>(op, result_type, condition,
                                               true_value, false_value);
     return success();
@@ -261,8 +249,8 @@ struct ArithExtUIToNeuraCast : public OpRewritePattern<arith::ExtUIOp> {
 
     // Converts arith ExtUIOp to Neura cast operation.
-    rewriter.replaceOpWithNewOp<neura::CastOp>(
-        op, result_type, input, rewriter.getStringAttr("extui"));
+    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
+                                               rewriter.getStringAttr("extui"));
     return success();
   }
 };
@@ -277,8 +265,8 @@ struct ArithExtfToNeuraCast : public OpRewritePattern<arith::ExtFOp> {
 
     // Converts arith ExtFOp to Neura cast operation.
-    rewriter.replaceOpWithNewOp<neura::CastOp>(
-        op, result_type, input, rewriter.getStringAttr("extf"));
+    rewriter.replaceOpWithNewOp<neura::CastOp>(op, result_type, input,
+                                               rewriter.getStringAttr("extf"));
     return success();
   }
 };
@@ -326,26 +314,47 @@ struct LowerArithToNeuraPass
     registry.insert();
   }
 
+  RewritePatternSet populateArithToNeuraPatterns(MLIRContext *context) {
+    RewritePatternSet patterns(context);
+    patterns
+        .add<ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
+             ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp, ArithSelectToNeuraSel,
+             ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
+             ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
+             ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
+             ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+    return patterns;
+  }
+
   void runOnOperation() override {
     ModuleOp module_op = getOperation();
     MLIRContext *context = &getContext();
+
    module_op.walk([&](func::FuncOp func_op) {
      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
        auto target =
            func_op->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
        if (target && target.getValue() == mlir::accel::kNeuraTarget) {
-          RewritePatternSet patterns(&getContext());
-          mlir::neura::arith2neura::populateWithGenerated(patterns);
-          patterns.add<
-              ArithFAddToNeuraFAdd, ArithConstantToNeuraConstant,
-              ArithAddIToNeuraAdd, ArithCmpiToNeuraICmp, ArithSelectToNeuraSel,
-              ArithExtUIToNeuraCast, ArithIndexCastToNeuraCast,
-              ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
-              ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
-              ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+          RewritePatternSet patterns = populateArithToNeuraPatterns(context);
           // Applies patterns to the function, not the entire module.
+          if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) {
+            signalPassFailure();
+          }
+        }
+      }
+    });
+
+    // Applies patterns to the neura.kernel regions.
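+    // neura.kernel ops carry their own accelerator attribute (assigned by
+    // the assign-accelerator pass), so they are matched here independently
+    // of whether their parent function is annotated.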
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateArithToNeuraPatterns(context); if (failed( - applyPatternsGreedily(func_op, std::move(patterns)))) { + applyPatternsGreedily(kernel_region, std::move(patterns)))) { signalPassFailure(); } } diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td b/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td deleted file mode 100644 index 7715f90f..00000000 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPatterns.td +++ /dev/null @@ -1,4 +0,0 @@ -include "mlir/IR/OpBase.td" -include "mlir/IR/PatternBase.td" -include "mlir/Dialect/Arith/IR/ArithOps.td" -include "NeuraDialect/NeuraOps.td" diff --git a/lib/Conversion/ArithToNeura/CMakeLists.txt b/lib/Conversion/ArithToNeura/CMakeLists.txt index 4ace588e..c5397d26 100644 --- a/lib/Conversion/ArithToNeura/CMakeLists.txt +++ b/lib/Conversion/ArithToNeura/CMakeLists.txt @@ -1,20 +1,9 @@ -set(LLVM_TARGET_DEFINITIONS ${CMAKE_CURRENT_SOURCE_DIR}/ArithToNeuraPatterns.td) -mlir_tablegen(ArithToNeuraPatterns.inc - -gen-rewriters - -I ${MLIR_SOURCE_DIR}/include - -I ${MLIR_BINARY_DIR}/include - -I ${CMAKE_SOURCE_DIR}/include - -I ${CMAKE_CURRENT_SOURCE_DIR} -) -add_public_tablegen_target(MLIRNeuraArithToNeuraIncGen) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_mlir_conversion_library(MLIRNeuraArithToNeuraPass ArithToNeuraPass.cpp DEPENDS - MLIRNeuraArithToNeuraIncGen MLIRConversionIncGen LINK_LIBS PUBLIC @@ -22,5 +11,4 @@ add_mlir_conversion_library(MLIRNeuraArithToNeuraPass MLIRPass MLIRSupport MLIRTransforms - # MLIRNeura ) \ No newline at end of file diff --git a/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp index 78550c77..e8d148b5 100644 --- a/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp +++ b/lib/Conversion/BuiltinToNeura/BuiltinToNeuraPass.cpp @@ -57,22 +57,45 @@ struct LowerBuiltinToNeuraPass registry.insert(); } + RewritePatternSet populateBuiltinToNeuraPatterns(MLIRContext *context) { + RewritePatternSet patterns(context); + patterns.add(context); + return patterns; + } + void runOnOperation() override { ModuleOp module_op = getOperation(); MLIRContext *context = &getContext(); - RewritePatternSet patterns(&getContext()); - patterns.add(context); + module_op.walk([&](func::FuncOp func_op) { if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { auto target = func_op->getAttrOfType(mlir::accel::kAcceleratorAttr); if (target && target.getValue() == mlir::accel::kNeuraTarget) { + RewritePatternSet patterns = populateBuiltinToNeuraPatterns(context); if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { return signalPassFailure(); } } } }); + + // Applies patterns to the neura.kernel regions. 
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateBuiltinToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/lib/Conversion/LlvmToNeura/CMakeLists.txt b/lib/Conversion/LlvmToNeura/CMakeLists.txt index 7ced65aa..1c04e922 100644 --- a/lib/Conversion/LlvmToNeura/CMakeLists.txt +++ b/lib/Conversion/LlvmToNeura/CMakeLists.txt @@ -1,22 +1,9 @@ -set(LLVM_TARGET_DEFINITIONS - ${CMAKE_CURRENT_SOURCE_DIR}/LlvmToNeuraPatterns.td -) -mlir_tablegen(LlvmToNeuraPatterns.inc - -gen-rewriters - -I ${MLIR_SOURCE_DIR}/include - -I ${MLIR_BINARY_DIR}/include - -I ${CMAKE_SOURCE_DIR}/include - -I ${CMAKE_CURRENT_SOURCE_DIR} -) -add_public_tablegen_target(MLIRLlvmToNeuraPatternIncGen) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_mlir_conversion_library(MLIRNeuraLlvmToNeuraPass LlvmToNeuraPass.cpp DEPENDS - MLIRLlvmToNeuraPatternIncGen MLIRConversionIncGen LINK_LIBS PUBLIC diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index 959b015b..c28e50db 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -9,17 +9,6 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -namespace mlir { -namespace neura { -// Uses llvm2neura instead of llvm to avoid conflicts. -namespace llvm2neura { - -#include "LlvmToNeuraPatterns.inc" - -} // namespace llvm2neura -} // namespace neura -} // namespace mlir - using namespace mlir; using namespace mlir::neura; @@ -158,8 +147,8 @@ struct LlvmMaxNumToNeuraFMax : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("maxnum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("maxnum")); return success(); } }; @@ -177,8 +166,8 @@ struct LlvmMaximumToNeuraFMax : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("maximum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("maximum")); return success(); } }; @@ -196,8 +185,8 @@ struct LlvmMinNumToNeuraFMin : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("minnum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("minnum")); return success(); } }; @@ -215,8 +204,8 @@ struct LlvmMinimumToNeuraFMin : public OpRewritePattern { if (!mlir::isa(result_type)) return failure(); - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - rewriter.getStringAttr("minimum")); + rewriter.replaceOpWithNewOp( + op, result_type, lhs, rhs, rewriter.getStringAttr("minimum")); return success(); } }; @@ -248,8 +237,8 @@ struct LlvmFPToSIToNeuraCast : public OpRewritePattern { Type result_type = op.getType(); // Creates a cast operation with "fptosi" as the cast type. 
- rewriter.replaceOpWithNewOp(op, result_type, input, - rewriter.getStringAttr("fptosi")); + rewriter.replaceOpWithNewOp( + op, result_type, input, rewriter.getStringAttr("fptosi")); return success(); } }; @@ -264,14 +253,16 @@ struct LlvmSelectToNeuraSel : public OpRewritePattern { Value false_value = op.getFalseValue(); Type result_type = op.getType(); - // neura.sel now follows the same order as llvm.select: (cond, ifTrue, ifFalse) - rewriter.replaceOpWithNewOp(op, result_type, - cond, true_value, false_value); + // neura.sel now follows the same order as llvm.select: (cond, ifTrue, + // ifFalse) + rewriter.replaceOpWithNewOp(op, result_type, cond, true_value, + false_value); return success(); } }; -struct LlvmFMulAddToNeuraFMulFAdd : public OpRewritePattern { +struct LlvmFMulAddToNeuraFMulFAdd + : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(mlir::LLVM::FMulAddOp op, @@ -301,12 +292,12 @@ struct LlvmMemsetToNeuraOps : public OpRewritePattern { auto value = op.getVal(); auto len = op.getLen(); auto is_volatile = op.getIsVolatile(); - + // Creates neura.memset operation with full semantics. // Passes all operands to the hardware-specific operation. // The RTL layer can implement this as appropriate for the target hardware. - rewriter.replaceOpWithNewOp(op, dest, value, len, - is_volatile); + rewriter.replaceOpWithNewOp(op, dest, value, len, + is_volatile); return success(); } }; @@ -398,11 +389,12 @@ struct LlvmVectorReduceAddToNeuraVectorReduceAdd : public RewritePattern { // Checks that we have exactly one operand and one result. if (op->getNumOperands() != 1 || op->getNumResults() != 1) return failure(); - + Value input = op->getOperand(0); Type result_type = op->getResult(0).getType(); - rewriter.replaceOpWithNewOp(op, result_type, input); + rewriter.replaceOpWithNewOp(op, result_type, + input); return success(); } }; @@ -511,10 +503,10 @@ struct LlvmCondBrToNeuraCondBr : public OpRewritePattern { auto new_op = rewriter.create( op.getLoc(), // Location op.getCondition(), // Condition - true_operands, // True destination operands - false_operands, // False destination operands - true_dest, // True destination block - false_dest // False destination block + true_operands, // True destination operands + false_operands, // False destination operands + true_dest, // True destination block + false_dest // False destination block ); // Replaces the old op with the new one. @@ -590,27 +582,30 @@ struct LlvmSubToNeuraSub : public OpRewritePattern { // TODO: Implements LlvmAndToNeuraMul. Used in ADPCM coder and MVT kernels. // llvm.and operations appear in: // - adpcm_coder-kernel.mlir (lines 55, 94: bitwise AND operations) -// - mvt-kernel.mlir (lines 44, 47, 50, 53: vector and scalar AND operations) -// Implementation: and(a, b) = mul(a, b) for boolean values. +// - mvt-kernel.mlir (lines 44, 47, 50, 53: vector and scalar AND +// operations) Implementation: and(a, b) = mul(a, b) for boolean values. // TODO: Implements LlvmAllocaToNeuraOps. Used in DTW kernel. // llvm.alloca operations appear in: // - dtw-kernel-O0.mlir (lines 19-23: multiple stack allocations) -// Implementation: For CGRA, erases alloca or converts to register allocation. +// Implementation: For CGRA, erases alloca or converts to register +// allocation. -// TODO: Implements LlvmLShrToNeuraShl. Used in ADPCM coder/decoder and FFT kernels. +// TODO: Implements LlvmLShrToNeuraShl. Used in ADPCM coder/decoder and FFT +// kernels. 
// llvm.lshr operations appear in: // - adpcm_coder-kernel.mlir (line 54: %42 = llvm.lshr %40, %7 : i32) // - adpcm_decoder-kernel.ll (line 35: %30 = lshr i32 %29, 4) // - fft_kernel.mlir (line 67: %49 = llvm.lshr %7, %1 : i32) -// Implementation: Needs proper logical right shift (lshr(x,n) != shl(x,-n)). +// Implementation: Needs proper logical right shift (lshr(x,n) != +// shl(x,-n)). // TODO: Implements LlvmAShrToNeuraAShr. Used in ADPCM coder/decoder kernels. // llvm.ashr operations appear in: // - adpcm_coder-kernel.mlir (lines 57, 63, 70: multiple ashr operations) // - adpcm_decoder-kernel.ll (lines 49, 56, 61: ashr i32 %20, 3/1/2) -// Implementation: Needs proper arithmetic right shift (preserves sign bit). - +// Implementation: Needs proper arithmetic right shift (preserves sign +// bit). struct LlvmSMaxToNeuraSMax : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -624,10 +619,9 @@ struct LlvmSMaxToNeuraSMax : public OpRewritePattern { Location loc = op.getLoc(); // Implements smax(a, b) = a >= b ? a : b. - auto cmp = rewriter.create(loc, rewriter.getI1Type(), - lhs, rhs, - rewriter.getStringAttr("sge")); - + auto cmp = rewriter.create( + loc, rewriter.getI1Type(), lhs, rhs, rewriter.getStringAttr("sge")); + // Selects: a >= b ? a : b. rewriter.replaceOpWithNewOp(op, result_type, cmp, lhs, rhs); return success(); @@ -716,9 +710,9 @@ struct LlvmTruncToNeuraCast : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::TruncOp op, PatternRewriter &rewriter) const override { // Trunc is a simple cast operation. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getArg(), - rewriter.getStringAttr("trunc")); + auto result = + rewriter.create(op.getLoc(), op.getType(), op.getArg(), + rewriter.getStringAttr("trunc")); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -730,8 +724,8 @@ struct LlvmUDivToNeuraDiv : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::UDivOp op, PatternRewriter &rewriter) const override { // UDiv is unsigned division. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getLhs(), op.getRhs()); + auto result = rewriter.create(op.getLoc(), op.getType(), + op.getLhs(), op.getRhs()); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -743,8 +737,8 @@ struct LlvmURemToNeuraRem : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::URemOp op, PatternRewriter &rewriter) const override { // URem is unsigned remainder. - auto result = rewriter.create( - op.getLoc(), op.getType(), op.getLhs(), op.getRhs()); + auto result = rewriter.create(op.getLoc(), op.getType(), + op.getLhs(), op.getRhs()); rewriter.replaceOp(op, result.getResult()); return success(); } @@ -792,7 +786,7 @@ struct LlvmFuncToNeuraFunc : public OpRewritePattern { // Converts LLVMFunctionType to FunctionType. auto llvm_func_type = op.getFunctionType(); auto func_type = rewriter.getFunctionType(llvm_func_type.getParams(), - llvm_func_type.getReturnType()); + llvm_func_type.getReturnType()); // Creates the new func.func operation using OperationState to have full // control. @@ -811,11 +805,9 @@ struct LlvmFuncToNeuraFunc : public OpRewritePattern { } state.addAttributes(attrs); - // Adds the function body region. state.addRegion(); - auto new_func = cast(rewriter.create(state)); // Moves the function body. @@ -854,7 +846,6 @@ struct LlvmCallToFuncCall : public OpRewritePattern { // Gets the result types from the function signature. 
     auto result_types = func_op.getFunctionType().getResults();
-
     // Converts the call to func.call.
     auto new_call = rewriter.create<func::CallOp>(
         op.getLoc(), result_types, callee.value(), op.getArgOperands());
@@ -886,10 +877,8 @@ struct LowerLlvmToNeuraPass
     registry.insert();
   }
 
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    // Adds DRR patterns.
-    mlir::neura::llvm2neura::populateWithGenerated(patterns);
+  RewritePatternSet populateLlvmToNeuraPatterns(MLIRContext *context) {
+    RewritePatternSet patterns(context);
     patterns.add(&getContext());
     // Vector operations must be registered before scalar operations
     // to ensure vector types are matched first.
     patterns.add(&getContext());
@@ -939,26 +928,34 @@ struct LowerLlvmToNeuraPass
     patterns.add(&getContext());
     patterns.add(&getContext());
     // TODO: Adds more LLVM to Neura conversion patterns as needed.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder + FFT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder + MVT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in DTW kernel.
-    // TODO: Fixes right shift implementations. Current implementations are incorrect.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder/decoder + FFT kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder/decoder kernels.
-    // patterns.add(&getContext()); // TODO: Uses in ADPCM coder kernel.
-
-    FrozenRewritePatternSet frozen(std::move(patterns));
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder + FFT kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder + MVT kernels.
+    // patterns.add(&getContext()); // TODO: Used in DTW kernel.
+    // TODO: Fixes right shift implementations. Current implementations are
+    // incorrect.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder/decoder + FFT kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder/decoder kernels.
+    // patterns.add(&getContext()); // TODO: Used in ADPCM coder kernel.
+    return patterns;
+  }
 
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
     ModuleOp module_op = getOperation();
 
-    // Performs function-level conversions.
-    if (failed(applyPatternsGreedily(module_op, frozen))) {
+    // Performs the llvm.func -> func.func conversion first.
+    RewritePatternSet func_patterns(context);
+    func_patterns.add<LlvmFuncToNeuraFunc>(context);
+    func_patterns.add<LlvmCallToFuncCall>(context);
+
+    if (failed(applyPatternsGreedily(module_op, std::move(func_patterns)))) {
       signalPassFailure();
-      return;
     }
 
-    // Performs operation-level conversions.
+    // Performs operation-level conversions for func::FuncOp.
     // Applies to every region inside the module (regardless of func type,
     // e.g., mlir func or llvm func).
     module_op.walk([&](FunctionOpInterface func) {
@@ -967,13 +964,31 @@ struct LowerLlvmToNeuraPass
           func->getAttrOfType<StringAttr>(mlir::accel::kAcceleratorAttr);
       if (target && target.getValue() == mlir::accel::kNeuraTarget) {
         for (Region &region : func->getRegions()) {
-          if (failed(applyPatternsGreedily(region, frozen))) {
+          RewritePatternSet patterns = populateLlvmToNeuraPatterns(context);
+          if (failed(applyPatternsGreedily(region, std::move(patterns)))) {
             signalPassFailure();
           }
         }
       }
     });
+
+    // Applies patterns to the neura.kernel regions.
+ module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateLlvmToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td b/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td deleted file mode 100644 index 1b99a47c..00000000 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td +++ /dev/null @@ -1,4 +0,0 @@ -include "mlir/IR/OpBase.td" -include "mlir/IR/PatternBase.td" -include "mlir/Dialect/LLVMIR/LLVMOps.td" -include "NeuraDialect/NeuraOps.td" diff --git a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp index c7157120..c8f3501f 100644 --- a/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp +++ b/lib/Conversion/MemRefToNeura/MemRefToNeuraPass.cpp @@ -83,26 +83,46 @@ struct LowerMemRefToNeuraPass registry.insert(); } - void runOnOperation() override { - ModuleOp module_op = getOperation(); - MLIRContext *context = &getContext(); - RewritePatternSet patterns(&getContext()); - + RewritePatternSet populateMemRefToNeuraPatterns(MLIRContext *context) { + RewritePatternSet patterns(context); patterns.add(context); patterns.add(context); patterns.add(context); + return patterns; + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + MLIRContext *context = &getContext(); module_op.walk([&](func::FuncOp func_op) { if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { auto target = func_op->getAttrOfType(mlir::accel::kAcceleratorAttr); if (target && target.getValue() == mlir::accel::kNeuraTarget) { + RewritePatternSet patterns = populateMemRefToNeuraPatterns(context); if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { return signalPassFailure(); } } } }); + + module_op.walk([&](neura::KernelOp kernel_op) { + if (kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto accel_target = + kernel_op->getAttrOfType(mlir::accel::kAcceleratorAttr); + if (accel_target && + accel_target.getValue() == mlir::accel::kNeuraTarget) { + Region &kernel_region = kernel_op.getBody(); + RewritePatternSet patterns = populateMemRefToNeuraPatterns(context); + if (failed( + applyPatternsGreedily(kernel_region, std::move(patterns)))) { + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/test/Conversion/AffineToNeura/unsupported-affine-if.mlir b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir index 1095a239..8c799ee2 100644 --- a/test/Conversion/AffineToNeura/unsupported-affine-if.mlir +++ b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir @@ -1,6 +1,6 @@ // RUN: mlir-neura-opt %s --lower-affine | FileCheck %s --check-prefix=CHECK-SCF // RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm | FileCheck %s --check-prefix=CHECK-LLVM -// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm --lower-llvm-to-neura | FileCheck %s --check-prefix=CHECK-NEURA-BR +// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --assign-accelerator 
--convert-func-to-llvm --lower-llvm-to-neura | FileCheck %s --check-prefix=CHECK-NEURA-BR // This test demonstrates the complete multi-stage lowering chain for conditionals. // Note: Direct lowering affine.if to Neura is not supported. @@ -54,7 +54,7 @@ module { // CHECK-LLVM: %{{.*}} = llvm.icmp "sge" %{{.*}}, %{{.*}} : i64 // CHECK-LLVM: llvm.cond_br %{{.*}}, ^bb3, ^bb4 -// CHECK-NEURA-BR-LABEL: llvm.func @affine_if_example +// CHECK-NEURA-BR-LABEL: func.func @affine_if_example // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = -5 : index}> : () -> i64 // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 1 : index}> : () -> i64 // CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 10 : index}> : () -> i64 diff --git a/test/mapping_quality/branch_for.mlir b/test/mapping_quality/branch_for.mlir index 07db3866..f78a1be1 100644 --- a/test/mapping_quality/branch_for.mlir +++ b/test/mapping_quality/branch_for.mlir @@ -103,9 +103,9 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: %7 = "neura.fadd"(%6, %3) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %8 = "neura.add"(%5, %2) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else %7 : !neura.data to ^bb2 -// CHECK-NEXT: ^bb2(%10: !neura.data): // pred: ^bb1 -// CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () +// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else to ^bb2 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: "neura.return"(%7) : (!neura.data) -> () // CHECK-NEXT: } // CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index 6ea1910d..bbb06ed9 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -110,9 +110,9 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: %7 = "neura.fadd"(%6, %3) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %8 = "neura.add"(%5, %2) : (!neura.data, !neura.data) -> !neura.data // CHECK-NEXT: %9 = "neura.icmp"(%8, %0) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else %7 : !neura.data to ^bb2 -// CHECK-NEXT: ^bb2(%10: !neura.data): // pred: ^bb1 -// CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () +// CHECK-NEXT: neura.cond_br %9 : !neura.data then %8, %7 : !neura.data, !neura.data to ^bb1 else to ^bb2 +// CHECK-NEXT: ^bb2: // pred: ^bb1 +// CHECK-NEXT: "neura.return"(%7) : (!neura.data) -> () // CHECK-NEXT: } // CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { From f41691a07be506e16802b34df499b97fbaf6e0be Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 15:02:54 +0800 Subject: [PATCH 07/25] enable promote func/kernel arguments to constant --- include/NeuraDialect/NeuraPasses.h | 2 +- include/NeuraDialect/NeuraPasses.td | 8 +- lib/NeuraDialect/NeuraPasses.cpp | 2 +- lib/NeuraDialect/Transforms/CMakeLists.txt | 2 +- ...ass.cpp => PromoteInputArgToConstPass.cpp} | 75 +++++++++++++++++-- .../bert/bert_node1/bert_node1.mlir | 2 +- .../bert/bert_node28/bert_node28.mlir | 2 +- test/c2llvm2mlir/nested_loop/test.mlir | 4 +- test/c2llvm2mlir/simple_loop/test.mlir | 4 +- .../complex_nested/complex_nested.mlir | 2 +- .../non_perfect_nested.mlir 
| 2 +- .../perfect_nested/perfect_nested.mlir | 4 +- .../perfect_reduction/perfect_reduction.mlir | 2 +- .../simple_loop/simple_loop.mlir | 8 +- .../simple_loop_reduction.mlir | 8 +- test/e2e/axpy/axpy_kernel.mlir | 2 +- test/e2e/bicg/bicg_kernel.mlir | 8 +- test/e2e/fir/fir_kernel.mlir | 4 +- test/e2e/fir/fir_kernel_vec.mlir | 2 +- test/e2e/gemm/gemm_kernel.mlir | 2 +- test/e2e/gemv/gemv_kernel.mlir | 2 +- test/e2e/histogram/histogram_kernel.mlir | 4 +- test/e2e/relu/relu_kernel.mlir | 2 +- test/e2e/spmv/spmv_kernel.mlir | 2 +- test/honor_arch/fir_removed_tiles_test.mlir | 2 +- test/neura/ctrl/branch.mlir | 4 +- test/neura/ctrl/branch_for.mlir | 12 +-- .../ctrl/branch_with_and_without_arg.mlir | 4 +- test/neura/ctrl/branch_without_arg.mlir | 4 +- test/neura/ctrl/for_with_if.mlir | 2 +- test/neura/ctrl/nested_branch.mlir | 4 +- test/neura/for_loop/kernel_test.mlir | 6 +- test/neura/for_loop/relu_test.mlir | 6 +- test/neura/fusion/test.mlir | 20 ++--- test/neura/steer_ctrl/for_with_if.mlir | 2 +- .../steer_ctrl/loop_with_return_value.mlir | 4 +- .../steer_ctrl/loop_without_return_value.mlir | 2 +- .../constant_folding/simple_loop.mlir | 2 +- 38 files changed, 144 insertions(+), 85 deletions(-) rename lib/NeuraDialect/Transforms/{PromoteFuncArgToConstPass.cpp => PromoteInputArgToConstPass.cpp} (58%) diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 95aa70c8..0b77521d 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -27,7 +27,7 @@ std::unique_ptr createMapToAcceleratorPass(); std::unique_ptr createGenerateCodePass(); std::unique_ptr createCanonicalizeReturnPass(); std::unique_ptr createCanonicalizeLiveInPass(); -std::unique_ptr createPromoteFuncArgToConstPass(); +std::unique_ptr createPromoteInputArgToConstPass(); std::unique_ptr createTransformToSteerControlPass(); std::unique_ptr createRemovePredicatedTypePass(); std::unique_ptr createWrapLoopInKernelPass(); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index ec0df60b..fc6cec1e 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -97,12 +97,12 @@ def CanonicalizeLiveIn : Pass<"canonicalize-live-in", "ModuleOp"> { let constructor = "neura::createCanonicalizeLiveInPass()"; } -def PromoteFuncArgToConst : Pass<"promote-func-arg-to-const", "ModuleOp"> { - let summary = "Promotes function arguments to neura constant operations"; +def PromoteInputArgToConst : Pass<"promote-input-arg-to-const", "ModuleOp"> { + let summary = "Promotes input arguments of functions or neura.kernels to neura constant operations"; let description = [{ - This pass promotes function arguments to neura constant operations. + This pass promotes input arguments of functions or neura.kernels to neura constant operations. 
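+
+    For example (a sketch; the exact printed form of the constant may
+    differ), a kernel input
+      neura.kernel inputs(%arg0 : i64) { ... }
+    is rewritten so the body defines a placeholder constant
+      %input0 = "neura.constant"() <{value = "%input0"}> : () -> i64
+    while iter_args are left to be handled by later passes.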
}]; - let constructor = "neura::createPromoteFuncArgToConstPass()"; + let constructor = "neura::createPromoteInputArgToConstPass()"; } def CanonicalizeCast : Pass<"canonicalize-cast", "ModuleOp"> { diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp index 26c1b6f2..80b6a6f1 100644 --- a/lib/NeuraDialect/NeuraPasses.cpp +++ b/lib/NeuraDialect/NeuraPasses.cpp @@ -31,7 +31,7 @@ void mlir::neura::registerNeuraConversionPassPipeline() { pm.addPass(mlir::neura::createCanonicalizeReturnPass()); pm.addPass(mlir::neura::createCanonicalizeCastPass()); - pm.addPass(mlir::neura::createPromoteFuncArgToConstPass()); + pm.addPass(mlir::neura::createPromoteInputArgToConstPass()); pm.addPass(mlir::neura::createFoldConstantPass()); pm.addPass(mlir::neura::createCanonicalizeLiveInPass()); pm.addPass(mlir::neura::createLeveragePredicatedValuePass()); diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 85200b48..da7056fb 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -13,7 +13,7 @@ add_mlir_library( CanonicalizeReturnPass.cpp CanonicalizeLiveInPass.cpp CanonicalizeCastPass.cpp - PromoteFuncArgToConstPass.cpp + PromoteInputArgToConstPass.cpp IterMergePatternPass.cpp TransformToSteerControlPass.cpp RemovePredicatedTypePass.cpp diff --git a/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp similarity index 58% rename from lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp rename to lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp index 8db54b2e..7889922c 100644 --- a/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp +++ b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp @@ -4,6 +4,7 @@ #include "NeuraDialect/NeuraPasses.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Block.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" @@ -15,7 +16,7 @@ using namespace mlir; -#define GEN_PASS_DEF_PROMOTEFUNCARGTOCONST +#define GEN_PASS_DEF_PROMOTEINPUTARGTOCONST #include "NeuraDialect/NeuraPasses.h.inc" namespace { @@ -73,13 +74,58 @@ LogicalResult promoteFunctionArgsToConstants(Region ®ion) { return success(); } -struct PromoteFuncArgToConstPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteFuncArgToConstPass) +LogicalResult promoteKernelArgsToConstants(neura::KernelOp kernel_op) { + Region &kernel_region = kernel_op.getBody(); + if (kernel_region.empty()) { + return success(); + } + + Block &entry_block = kernel_region.front(); + OpBuilder builder(&entry_block, entry_block.begin()); + + // Gets the number of inputs and iter_args from kernel operands. + size_t num_inputs = kernel_op.getInputs().size(); + size_t num_iter_args = kernel_op.getIterArgsInit().size(); + + // Verifies block arguments layout: [inputs..., iter_args...] + SmallVector args(entry_block.getArguments().begin(), + entry_block.getArguments().end()); + + assert(args.size() == num_inputs + num_iter_args && + "Kernel block arguments size mismatch"); + + // Only promotes input arguments (not iter_args). + // Block arguments layout: [input0, input1, ..., iter_arg0, iter_arg1, ...] + for (size_t i = 0; i < num_inputs; ++i) { + BlockArgument input_arg = args[i]; + + // Creates a constant for this input. 
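+      // The constant carries the symbolic placeholder name "%input<i>" as
+      // its value, mirroring the function-argument promotion above.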
+ std::string const_name = "%input" + std::to_string(i); + auto const_op = builder.create( + input_arg.getLoc(), input_arg.getType(), + builder.getStringAttr(const_name)); + + // Replaces all uses of this input argument with the constant. + input_arg.replaceAllUsesWith(const_op.getResult()); + } + + // Note: iter_args (args[num_inputs] to args[num_inputs + num_iter_args - 1]) + // are NOT promoted here. They will be handled in transform-ctrl-to-data-flow + // pass. - StringRef getArgument() const override { return "promote-func-arg-to-const"; } + return success(); +} + +struct PromoteInputArgToConstPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteInputArgToConstPass) + + StringRef getArgument() const override { + return "promote-input-arg-to-const"; + } StringRef getDescription() const override { - return "Promotes function arguments to constants."; + return "Promotes input arguments of functions or neura.kernels to neura " + "constant operations."; } void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); @@ -118,12 +164,25 @@ struct PromoteFuncArgToConstPass return; } }); + + // Processes neura.kernel input arguments. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } + if (failed(promoteKernelArgsToConstants(kernel_op))) { + signalPassFailure(); + return; + } + }); } }; } // namespace namespace mlir::neura { -std::unique_ptr createPromoteFuncArgToConstPass() { - return std::make_unique(); +std::unique_ptr createPromoteInputArgToConstPass() { + return std::make_unique(); } } // namespace mlir::neura \ No newline at end of file diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir index 64610dfd..70132cdd 100644 --- a/test/affine2neura/bert/bert_node1/bert_node1.mlir +++ b/test/affine2neura/bert/bert_node1/bert_node1.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir index 65494bb4..75f66c35 100644 --- a/test/affine2neura/bert/bert_node28/bert_node28.mlir +++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/c2llvm2mlir/nested_loop/test.mlir b/test/c2llvm2mlir/nested_loop/test.mlir index 924e15fc..3bf536ff 100644 --- a/test/c2llvm2mlir/nested_loop/test.mlir +++ b/test/c2llvm2mlir/nested_loop/test.mlir @@ -3,7 +3,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -14,7 +14,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: 
--promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/c2llvm2mlir/simple_loop/test.mlir b/test/c2llvm2mlir/simple_loop/test.mlir index 09285607..2af2d0c6 100644 --- a/test/c2llvm2mlir/simple_loop/test.mlir +++ b/test/c2llvm2mlir/simple_loop/test.mlir @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -36,7 +36,7 @@ // Test with mapping table dump enabled // RUN: mlir-neura-opt --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/complex_nested/complex_nested.mlir b/test/controflow_fuse/complex_nested/complex_nested.mlir index 0fa153b4..77a4eb2f 100644 --- a/test/controflow_fuse/complex_nested/complex_nested.mlir +++ b/test/controflow_fuse/complex_nested/complex_nested.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir index 598ac289..95765b42 100644 --- a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir +++ b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir @@ -14,7 +14,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/controflow_fuse/perfect_nested/perfect_nested.mlir b/test/controflow_fuse/perfect_nested/perfect_nested.mlir index a664bb16..bbc5877e 100644 --- a/test/controflow_fuse/perfect_nested/perfect_nested.mlir +++ b/test/controflow_fuse/perfect_nested/perfect_nested.mlir @@ -23,7 +23,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -37,7 +37,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir b/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir index 77c41c7b..d009ea04 100644 --- a/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir +++ b/test/controflow_fuse/perfect_reduction/perfect_reduction.mlir @@ -24,7 +24,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: 
--leverage-predicated-value \ diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir index 13b2e91f..e9c04f7c 100644 --- a/test/controflow_fuse/simple_loop/simple_loop.mlir +++ b/test/controflow_fuse/simple_loop/simple_loop.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in | FileCheck %s --check-prefix=CANONICALIZE // RUN: mlir-neura-opt %t-llvm.mlir \ @@ -28,7 +28,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -42,7 +42,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -60,7 +60,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir index 3619db45..ace0dd26 100644 --- a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir +++ b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s --check-prefix=CANONICALIZE @@ -29,7 +29,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -43,7 +43,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -61,7 +61,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/axpy/axpy_kernel.mlir b/test/e2e/axpy/axpy_kernel.mlir index fc07dcef..8d3e9fba 100644 --- a/test/e2e/axpy/axpy_kernel.mlir +++ b/test/e2e/axpy/axpy_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir index 2824b6fc..d353ec1f 100644 --- 
a/test/e2e/bicg/bicg_kernel.mlir +++ b/test/e2e/bicg/bicg_kernel.mlir @@ -13,7 +13,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: -o %t-before-canonicalize.mlir // RUN: FileCheck %s --input-file=%t-before-canonicalize.mlir -check-prefix=BEFORE_CANONICALIZE @@ -21,7 +21,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: -o %t-after-canonicalize.mlir @@ -30,7 +30,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -338,7 +338,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/fir/fir_kernel.mlir b/test/e2e/fir/fir_kernel.mlir index 6b476a36..f7049b62 100644 --- a/test/e2e/fir/fir_kernel.mlir +++ b/test/e2e/fir/fir_kernel.mlir @@ -8,7 +8,7 @@ // RUN: cd %t.dir && mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -192,7 +192,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/fir/fir_kernel_vec.mlir b/test/e2e/fir/fir_kernel_vec.mlir index 2c0e8207..366feba8 100644 --- a/test/e2e/fir/fir_kernel_vec.mlir +++ b/test/e2e/fir/fir_kernel_vec.mlir @@ -6,7 +6,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/gemm/gemm_kernel.mlir b/test/e2e/gemm/gemm_kernel.mlir index 674a027e..3376fe0a 100644 --- a/test/e2e/gemm/gemm_kernel.mlir +++ b/test/e2e/gemm/gemm_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/gemv/gemv_kernel.mlir b/test/e2e/gemv/gemv_kernel.mlir index fc6d862d..9f8f1317 100644 --- a/test/e2e/gemv/gemv_kernel.mlir +++ b/test/e2e/gemv/gemv_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git 
a/test/e2e/histogram/histogram_kernel.mlir b/test/e2e/histogram/histogram_kernel.mlir index ca045ef3..9f2d6f23 100644 --- a/test/e2e/histogram/histogram_kernel.mlir +++ b/test/e2e/histogram/histogram_kernel.mlir @@ -8,7 +8,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -154,7 +154,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir index cf09e451..b5c46f98 100644 --- a/test/e2e/relu/relu_kernel.mlir +++ b/test/e2e/relu/relu_kernel.mlir @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/e2e/spmv/spmv_kernel.mlir b/test/e2e/spmv/spmv_kernel.mlir index 9e871ed4..32a50da9 100644 --- a/test/e2e/spmv/spmv_kernel.mlir +++ b/test/e2e/spmv/spmv_kernel.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/honor_arch/fir_removed_tiles_test.mlir b/test/honor_arch/fir_removed_tiles_test.mlir index a1d94188..23e4009d 100644 --- a/test/honor_arch/fir_removed_tiles_test.mlir +++ b/test/honor_arch/fir_removed_tiles_test.mlir @@ -8,7 +8,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/ctrl/branch.mlir b/test/neura/ctrl/branch.mlir index eba379e0..d56813d6 100644 --- a/test/neura/ctrl/branch.mlir +++ b/test/neura/ctrl/branch.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -10,7 +10,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index bbb06ed9..a626575e 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s -check-prefix=CANONICALIZE @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %s \ // RUN: 
--assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -26,7 +26,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -38,7 +38,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -51,7 +51,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -66,7 +66,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/ctrl/branch_with_and_without_arg.mlir b/test/neura/ctrl/branch_with_and_without_arg.mlir index d861d1d5..87e6b61b 100644 --- a/test/neura/ctrl/branch_with_and_without_arg.mlir +++ b/test/neura/ctrl/branch_with_and_without_arg.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/branch_without_arg.mlir b/test/neura/ctrl/branch_without_arg.mlir index e505afda..726f8e1a 100644 --- a/test/neura/ctrl/branch_without_arg.mlir +++ b/test/neura/ctrl/branch_without_arg.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/for_with_if.mlir b/test/neura/ctrl/for_with_if.mlir index ad8ba343..0f93ace2 100644 --- a/test/neura/ctrl/for_with_if.mlir +++ b/test/neura/ctrl/for_with_if.mlir @@ -18,7 +18,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/ctrl/nested_branch.mlir b/test/neura/ctrl/nested_branch.mlir index 92fe6975..5af809b4 
100644 --- a/test/neura/ctrl/nested_branch.mlir +++ b/test/neura/ctrl/nested_branch.mlir @@ -1,7 +1,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -10,7 +10,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/for_loop/kernel_test.mlir b/test/neura/for_loop/kernel_test.mlir index 1c00b1d5..298a9426 100644 --- a/test/neura/for_loop/kernel_test.mlir +++ b/test/neura/for_loop/kernel_test.mlir @@ -7,7 +7,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: | FileCheck %s @@ -15,7 +15,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -27,7 +27,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir index 366083d6..a34e4fd7 100644 --- a/test/neura/for_loop/relu_test.mlir +++ b/test/neura/for_loop/relu_test.mlir @@ -5,14 +5,14 @@ // RUN: mlir-neura-opt %t-relu.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s // RUN: mlir-neura-opt %t-relu.mlir\ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -22,7 +22,7 @@ // RUN: mlir-neura-opt %t-relu.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir index 35643cdf..0e6a3dce 100644 --- a/test/neura/fusion/test.mlir +++ b/test/neura/fusion/test.mlir @@ -2,7 +2,7 @@ // RUN: mlir-translate --import-llvm %t-kernel.ll -o %t-kernel.mlir // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -16,7 +16,7 @@ // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: 
--promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -38,7 +38,7 @@ // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -53,27 +53,27 @@ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = neura.phi_start %61, %arg5 : !neura.data, !neura.data -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %62 : !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %62 : !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data) -> (!neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN: %16:2 = "neura.fused_op"(%4, %13, %15) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data, %arg7: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = neura.phi_start %arg5, %arg6 : !neura.data, !neura.data -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = "neura.gep"(%61, %arg7) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %63 = "neura.load"(%62) : (!neura.data) -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %63 : !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %63 : !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data, !neura.data, !neura.data) -> (!neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN: %17:3 = "neura.fused_op"(%2, %12, %15) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({ // CHECK-ITER-MERGE-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data, %arg7: !neura.data): // CHECK-ITER-MERGE-PATTERN-NEXT: %61 = neura.phi_start %arg5, %arg6 : !neura.data, !neura.data -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %62 = "neura.gep"(%61, %arg7) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-ITER-MERGE-PATTERN-NEXT: %63 = "neura.load"(%62) : (!neura.data) -> !neura.data -// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield %61, %62, %63 : !neura.data, !neura.data, !neura.data +// CHECK-ITER-MERGE-PATTERN-NEXT: neura.yield results(%61, %62, %63 : !neura.data, !neura.data, !neura.data) // CHECK-ITER-MERGE-PATTERN-NEXT: }) : (!neura.data, !neura.data, !neura.data) -> (!neura.data, !neura.data, !neura.data) // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ @@ -88,19 +88,19 @@ // CHECK-INIT-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data): // CHECK-INIT-PATTERN-NEXT: %72 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // 
CHECK-INIT-PATTERN-NEXT: %73 = "neura.load"(%72) : (!neura.data) -> !neura.data -// CHECK-INIT-PATTERN-NEXT: neura.yield %72, %73 : !neura.data, !neura.data +// CHECK-INIT-PATTERN-NEXT: neura.yield results(%72, %73 : !neura.data, !neura.data) // CHECK-INIT-PATTERN-NEXT: }) : (!neura.data, !neura.data) -> (!neura.data, !neura.data) // CHECK-INIT-PATTERN-NEXT: %22 = "neura.fused_op"(%18, %20) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({ // CHECK-INIT-PATTERN-NEXT: ^bb0(%arg5: !neura.data, %arg6: !neura.data): // CHECK-INIT-PATTERN-NEXT: %72 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array}> : (!neura.data, !neura.data) -> !neura.data // CHECK-INIT-PATTERN-NEXT: %73 = "neura.load"(%72) : (!neura.data) -> !neura.data -// CHECK-INIT-PATTERN-NEXT: neura.yield %73 : !neura.data +// CHECK-INIT-PATTERN-NEXT: neura.yield results(%73 : !neura.data) // CHECK-INIT-PATTERN-NEXT: }) : (!neura.data, !neura.data) -> !neura.data // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-cast \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ diff --git a/test/neura/steer_ctrl/for_with_if.mlir b/test/neura/steer_ctrl/for_with_if.mlir index fe145f99..1fb72aac 100644 --- a/test/neura/steer_ctrl/for_with_if.mlir +++ b/test/neura/steer_ctrl/for_with_if.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/steer_ctrl/loop_with_return_value.mlir b/test/neura/steer_ctrl/loop_with_return_value.mlir index e9e1bf11..1104a7a7 100644 --- a/test/neura/steer_ctrl/loop_with_return_value.mlir +++ b/test/neura/steer_ctrl/loop_with_return_value.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ @@ -22,7 +22,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/neura/steer_ctrl/loop_without_return_value.mlir b/test/neura/steer_ctrl/loop_without_return_value.mlir index 55e28f08..7311c543 100644 --- a/test/neura/steer_ctrl/loop_without_return_value.mlir +++ b/test/neura/steer_ctrl/loop_without_return_value.mlir @@ -6,7 +6,7 @@ // RUN: --lower-builtin-to-neura \ // RUN: --lower-llvm-to-neura \ // RUN: --canonicalize-cast \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ diff --git a/test/optimization/constant_folding/simple_loop.mlir b/test/optimization/constant_folding/simple_loop.mlir index 483a042c..4df60b58 100644 --- a/test/optimization/constant_folding/simple_loop.mlir +++ b/test/optimization/constant_folding/simple_loop.mlir @@ -1,5 +1,5 @@ // RUN: 
mlir-neura-opt %s \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: | FileCheck %s -check-prefix=FOLD From 3907f67cc5fb513cc8fb979d0f8b34aabcdf9a34 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 15:59:37 +0800 Subject: [PATCH 08/25] enable canonicalize-return for neura.kernel --- .../Transforms/AssignAcceleratorPass.cpp | 33 +- .../Transforms/CanonicalizeReturnPass.cpp | 321 +++++++++++++++--- 2 files changed, 302 insertions(+), 52 deletions(-) diff --git a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp index 11688539..47f5771b 100644 --- a/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/AssignAcceleratorPass.cpp @@ -12,6 +12,16 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { +// Checks if a function contains any neura.kernel operations. +static bool containsNeuraKernelOp(FunctionOpInterface func_op) { + bool has_kernel = false; + func_op.walk([&](neura::KernelOp kernel_op) { + has_kernel = true; + return WalkResult::interrupt(); + }); + return has_kernel; +} + struct AssignAcceleratorPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AssignAcceleratorPass) @@ -25,19 +35,28 @@ struct AssignAcceleratorPass ModuleOp module = getOperation(); Builder builder(&getContext()); + // Firstly assigns accelerator to all neura.kernel ops. + module.walk([&](neura::KernelOp kernel_op) { + // Handles neura.kernel ops. + if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + kernel_op->setAttr(mlir::accel::kAcceleratorAttr, + builder.getStringAttr(mlir::accel::kNeuraTarget)); + } + }); + + // Secondly assigns accelerator to functions. + // Skips functions that: + // 1. Are named "main"; + // 2. Already have accelerator attribute; + // 3. Contain neura.kernel operations. module.walk([&](Operation *op) { if (auto func = dyn_cast(op)) { if (func.getName() != "main" && !func.isExternal() && - !func->hasAttr(mlir::accel::kAcceleratorAttr)) { + !func->hasAttr(mlir::accel::kAcceleratorAttr) && + !containsNeuraKernelOp(func)) { func->setAttr(mlir::accel::kAcceleratorAttr, builder.getStringAttr(mlir::accel::kNeuraTarget)); } - } else if (neura::KernelOp kernel_op = dyn_cast(op)) { - // Handles neura.kernel ops as well. - if (!kernel_op->hasAttr(mlir::accel::kAcceleratorAttr)) { - kernel_op->setAttr(mlir::accel::kAcceleratorAttr, - builder.getStringAttr(mlir::accel::kNeuraTarget)); - } } }); } diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 46c5407e..22e3869c 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -20,8 +20,9 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -// Return type attribute values. +// Return/Yield type attribute values. constexpr const char *kReturnTypeAttr = "return_type"; +constexpr const char *kYieldTypeAttr = "yield_type"; constexpr const char *kReturnTypeVoid = "void"; constexpr const char *kReturnTypeValue = "value"; @@ -58,6 +59,197 @@ static void processReturns(Region ®ion, OpBuilder &builder) { } } +// Processes neura.yield operations in kernel regions. 
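+// Three situations are distinguished below: a yield that carries results
+// is tagged yield_type = "value"; an empty yield in a kernel that has a
+// counter is rebuilt with the counter index as its trigger result; an
+// empty yield with no counter is tagged "void" and handled like a void
+// return. Illustrative sketch (assumed kernel IR, not copied from a test):
+//   neura.yield results(%sum : !neura.data)   // tagged "value"
+//   neura.yield                               // empty: trigger or "void"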
+static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { + SmallVector empty_yields; + + kernel_op.walk([&](neura::YieldOp yield_op) { + llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; + llvm::errs() << yield_op << "\n"; + + // Case 1: yield has results - mark as value type. + if (yield_op.getResults().size() > 0) { + llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; + yield_op->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeValue)); + return; + } + + // Case 2 & 3: yield has no results. + empty_yields.push_back(yield_op); + }); + + // Processes empty yields. + for (neura::YieldOp yield_op : empty_yields) { + llvm::errs() << "[canonicalize] Processing empty neura.yield...\n"; + + // Searches for counters in the kernel. + neura::CounterOp root_counter = nullptr; + neura::CounterOp any_counter = nullptr; + + kernel_op.walk([&](neura::CounterOp counter_op) { + any_counter = counter_op; + + if (counter_op.getCounterTypeAttr() && + counter_op.getCounterTypeAttr().getValue() == "root") { + root_counter = counter_op; + } + }); + + // Case 2: Has counter - uses counter as trigger. + if (root_counter || any_counter) { + Value trigger_value = root_counter ? root_counter.getCurrentIndex() + : any_counter.getCurrentIndex(); + + llvm::errs() << "[canonicalize] Using " + << (root_counter ? "root" : "leaf") + << " counter as trigger.\n"; + + // Creates new yield with trigger value as result. + builder.setInsertionPoint(yield_op); + + SmallVector iter_args_next(yield_op.getIterArgsNext()); + SmallVector results = {trigger_value}; + + auto new_yield = builder.create(yield_op.getLoc(), + iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeVoid)); + + yield_op.erase(); + } else { + // Case 3: No counter - mark for void processing (similar to return). + llvm::errs() + << "[canonicalize] No counter found, marking as void yield\n"; + yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + } + } +} + +// Processes empty yield void blocks (similar to processEmptyReturnVoidBlock). +static void processEmptyYieldVoidBlock(Block *yield_block, + neura::YieldOp void_yield_op, + OpBuilder &builder) { + SmallVector predecessor_blocks(yield_block->getPredecessors()); + + // Entry block with yield_void is unreachable; no action needed. + if (predecessor_blocks.empty()) { + llvm::errs() + << "[canonicalize] Entry block with void yield is unreachable\n"; + return; + } + + // Separates predecessor blocks into cond_br and br blocks. + SmallVector cond_br_preds; + SmallVector br_preds; + + for (Block *pred_block : predecessor_blocks) { + Operation *terminator = pred_block->getTerminator(); + if (isa(terminator)) { + cond_br_preds.push_back(pred_block); + } else if (isa(terminator)) { + br_preds.push_back(pred_block); + } + } + + // Handles br_preds: copy yield_void to pred_block with a trigger value. + for (Block *pred_block : br_preds) { + neura::Br br = cast(pred_block->getTerminator()); + + // Finds a suitable trigger value in the predecessor block. 
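+    // The search below scans the block in reverse and takes the first
+    // result-producing op (skipping the branch itself) as the trigger;
+    // if the block defines no values, an error is reported and the block
+    // is left unchanged.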
+ Value trigger_value = nullptr; + + for (Operation &op : llvm::reverse(*pred_block)) { + if (&op == br) { + continue; + } + + if (op.getNumResults() > 0) { + trigger_value = op.getResult(0); + break; + } + } + + if (!trigger_value) { + llvm::errs() << "[canonicalize] Error: No suitable value found in " + "predecessor block\n"; + return; + } + + builder.setInsertionPoint(br); + + SmallVector iter_args_next(void_yield_op.getIterArgsNext()); + SmallVector results = {trigger_value}; + + auto new_yield = + builder.create(br.getLoc(), iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + br.erase(); + } + + // If there are no cond_br predecessors, remove the yield_void block. + if (cond_br_preds.empty()) { + void_yield_op.erase(); + yield_block->erase(); + return; + } + + // Handles cond_preds: add a block argument for the trigger value. + BlockArgument trigger_arg = + yield_block->addArgument(builder.getI1Type(), void_yield_op.getLoc()); + + // Updates each cond_pred block's terminator to pass the trigger value. + for (Block *pred_block : cond_br_preds) { + neura::CondBr cond_br = cast(pred_block->getTerminator()); + Value cond = cond_br.getCondition(); + Value trigger_value = nullptr; + + bool is_true_branch = (cond_br.getTrueDest() == yield_block); + bool is_false_branch = (cond_br.getFalseDest() == yield_block); + + if (is_true_branch && !is_false_branch) { + trigger_value = cond; + } else if (!is_true_branch && is_false_branch) { + builder.setInsertionPoint(cond_br); + Value negated_cond = + builder.create(cond_br.getLoc(), cond.getType(), cond); + trigger_value = negated_cond; + } else { + llvm::errs() << "[canonicalize] Error: Both branches lead to yield\n"; + return; + } + + if (trigger_value) { + SmallVector true_args(cond_br.getTrueArgs()); + SmallVector false_args(cond_br.getFalseArgs()); + + if (is_true_branch) { + true_args.push_back(trigger_value); + } + if (is_false_branch) { + false_args.push_back(trigger_value); + } + + builder.setInsertionPoint(cond_br); + builder.create( + cond_br.getLoc(), cond_br.getCondition(), true_args, false_args, + cond_br.getTrueDest(), cond_br.getFalseDest()); + cond_br.erase(); + } + } + + // Updates the yield_void operation to use the block argument as trigger. + builder.setInsertionPoint(void_yield_op); + + SmallVector iter_args_next(void_yield_op.getIterArgsNext()); + SmallVector results = {trigger_arg}; + + auto new_yield = builder.create(void_yield_op.getLoc(), + iter_args_next, results); + new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + void_yield_op.erase(); +} + static void processEmptyReturnVoidBlock(Block *ret_block, neura::ReturnOp void_ret_op, OpBuilder &builder) { @@ -177,7 +369,7 @@ static void processEmptyReturnVoidBlock(Block *ret_block, } struct CanonicalizeReturnPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeReturnPass) StringRef getArgument() const override { return "canonicalize-return"; } @@ -190,58 +382,97 @@ struct CanonicalizeReturnPass } void runOnOperation() override { - func::FuncOp func_op = getOperation(); - // Checks for neura accelerator attribute. - auto accel_attr = - func_op->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr) { - return; - } - - Region ®ion = func_op.getBody(); - if (region.empty()) { - return; - } + ModuleOp module_op = getOperation(); + OpBuilder builder(module_op.getContext()); + + // Processes all functions. 
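+    // (Previously this pass was anchored on func::FuncOp; running on the
+    // module lets one invocation canonicalize both function bodies and
+    // the neura.kernel regions handled in the second walk below.)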
+ module_op.walk([&](func::FuncOp func_op) { + // Checks for neura accelerator attribute. + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr) { + return; + } - OpBuilder builder(func_op.getContext()); + Region ®ion = func_op.getBody(); + if (region.empty()) { + return; + } - // Step 1: Marks empty returns with "void" attribute. - processReturns(region, builder); + // Step 1: Marks empty returns with "void" attribute. + processReturns(region, builder); - if (!isVoidFunction(func_op)) { - llvm::errs() << "[ctrl2data] Function is not void, no further action " - "needed.\n"; - return; - } + if (!isVoidFunction(func_op)) { + llvm::errs() << "[ctrl2data] Function is not void, no further action " + "needed.\n"; + return; + } - // Step 2: Collects all return operations with "is_void" attribute. - SmallVector ret_void_ops; - region.walk([&](neura::ReturnOp ret_op) { - if (ret_op->hasAttr(kReturnTypeAttr)) { - if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)).getValue() == - kReturnTypeVoid) { - ret_void_ops.push_back(ret_op); + // Step 2: Collects all return operations with "is_void" attribute. + SmallVector ret_void_ops; + region.walk([&](neura::ReturnOp ret_op) { + if (ret_op->hasAttr(kReturnTypeAttr)) { + if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)) + .getValue() == kReturnTypeVoid) { + ret_void_ops.push_back(ret_op); + } + } + }); + + // Step 3: Processes each return_void block. + for (neura::ReturnOp ret_void_op : ret_void_ops) { + Block *ret_block = ret_void_op->getBlock(); + + // Checks if ret_block only contains the return_void operation. + bool is_empty_block = (ret_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); + } else { + // TODO: Handle non-empty return blocks. + // The basic idea is to create a new block that only contains the + // return_void operation, and redirect the original return block to + // this new block. + assert(false && "Unsupported case: return block is not empty."); } } }); - // Step 3: Processes each return_void block. - for (neura::ReturnOp ret_void_op : ret_void_ops) { - Block *ret_block = ret_void_op->getBlock(); - - // Checks if ret_block only contains the return_void operation. - bool is_empty_block = (ret_block->getOperations().size() == 1); - - if (is_empty_block) { - processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); - } else { - // TODO: Handle non-empty return blocks. - // The basic idea is to create a new block that only contains the - // return_void operation, and redirect the original return block to this - // new block. - assert(false && "Unsupported case: return block is not empty."); + // Processes all neura.kernel operations. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr) { + return; } - } + + // Step 1: Processes yields (handles cases 1 & 2) + processYields(kernel_op, builder); + + // Step 2: Collects void yields without trigger values (case 3). 
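+      // Case 3 arises when the kernel has no counter: the void yield
+      // carries no trigger operand, so one must be borrowed from a
+      // predecessor block, mirroring the return_void handling above.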
+ SmallVector yield_void_ops; + kernel_op.walk([&](neura::YieldOp yield_op) { + if (yield_op->hasAttr(kYieldTypeAttr)) { + if (dyn_cast(yield_op->getAttr(kYieldTypeAttr)) + .getValue() == kReturnTypeVoid && + yield_op.getResults().size() == 0) { + yield_void_ops.push_back(yield_op); + } + } + }); + + // Step 3: Processes each yield_void block (case 3) + for (neura::YieldOp yield_void_op : yield_void_ops) { + Block *yield_block = yield_void_op->getBlock(); + bool is_empty_block = (yield_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyYieldVoidBlock(yield_block, yield_void_op, builder); + } else { + assert(false && "Unsupported case: yield block is not empty."); + } + } + }); } }; } // namespace From 8ae81872d611252613fc11d8a5a9c605fb2dfa55 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 23 Jan 2026 16:40:30 +0800 Subject: [PATCH 09/25] enable leverage-predicated-values for neura.kernel --- .../Transforms/CanonicalizeLiveInPass.cpp | 26 +++++ .../LeveragePredicatedValuePass.cpp | 99 ++++++++++++------- 2 files changed, 91 insertions(+), 34 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp index 56f72a06..e02ebcc8 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp @@ -794,6 +794,8 @@ struct CanonicalizeLiveInPass void runOnOperation() override { ModuleOp module_op = getOperation(); + + // Processes functions. module_op.walk([&](Operation *op) { Region *region = nullptr; if (auto func_op = dyn_cast(op)) { @@ -827,6 +829,30 @@ struct CanonicalizeLiveInPass return; } }); + + // Processes neura.kernel operations. + module_op.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } + + Region &kernel_region = kernel_op.getBody(); + if (kernel_region.empty()) { + return; + } + + // Creates dominance info for the kernel region. + DominanceInfo dom_info(kernel_op); + PostDominanceInfo post_dom_info(kernel_op); + + if (failed(promoteLiveInValuesToBlockArgs(kernel_region, dom_info, + post_dom_info))) { + signalPassFailure(); + return; + } + }); } }; } // namespace diff --git a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp index 56516c0e..54bc73b5 100644 --- a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp +++ b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp @@ -7,6 +7,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" using namespace mlir; @@ -39,50 +40,80 @@ struct LeveragePredicatedValuePass if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { return; } - // Converts block argument types to predicated values. - func.walk([&](Block *block) { - // skips the entry (first) block of the function. 
- if (block == &block->getParent()->front()) { - return; - } - - for (BlockArgument arg : block->getArguments()) { - Type orig_type = arg.getType(); + if (failed(processRegion(func.getFunctionBody()))) { + llvm::errs() << "Failed to process function: " << func.getName() + << "\n"; + signalPassFailure(); + return; + } + }); - // Avoid double-wrapping if already predicated - if (llvm::isa(orig_type)) { - continue; - } + // Processes each neura.kernel operation. + module.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; + } - auto predicated_type = neura::PredicatedValue::get( - func.getContext(), orig_type, - IntegerType::get(func.getContext(), 1)); - arg.setType(predicated_type); - } - }); - - // Gets operations in topological order (operands before users). - SmallVector orderedOps; - getOperationsInTopologicalOrder(func, orderedOps); - - // Processes each operation in order. - for (Operation *op : orderedOps) { - if (failed(applyPredicatedDataType(op))) { - llvm::errs() << "Failed to convert op to predicated form: " << *op - << "\n"; - signalPassFailure(); - return; - } + if (failed(processRegion(kernel_op.getBody()))) { + llvm::errs() << "Failed to process neura.kernel operation: " + << *kernel_op << "\n"; + signalPassFailure(); + return; } }); } private: + // Processes a region (function body or kernel body). + LogicalResult processRegion(Region ®ion) { + if (region.empty()) { + return success(); + } + + for (Block &block : region) { + // Skips the entry (first) block of the function. + if (&block == ®ion.front()) { + continue; + } + + for (BlockArgument arg : block.getArguments()) { + Type orig_type = arg.getType(); + + // Avoids double-wrapping if already predicated. + if (llvm::isa(orig_type)) { + continue; + } + + auto predicated_type = neura::PredicatedValue::get( + region.getContext(), orig_type, + IntegerType::get(region.getContext(), 1)); + arg.setType(predicated_type); + } + } + + // Gets operations in topological order (operands before users). + SmallVector ordered_ops; + getOperationsInTopologicalOrder(region, ordered_ops); + + // Processes each operation in order. + for (Operation *op : ordered_ops) { + if (failed(applyPredicatedDataType(op))) { + llvm::errs() << "Failed to convert op to predicated form: " << *op + << "\n"; + return failure(); + } + } + + return success(); + } + // Gets operations in topological order. - void getOperationsInTopologicalOrder(FunctionOpInterface func, + void getOperationsInTopologicalOrder(Region ®ion, SmallVector &ordered) { DenseSet visited; - func.walk([&](Operation *op) { + region.walk([&](Operation *op) { // Uses standard DFS to build topological order. 
if (visited.contains(op)) { return; From deffa0a7f6aba0d00b5509df393885fbfc46750a Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 13:30:35 +0800 Subject: [PATCH 10/25] enable kernel with counters dataflow lowering --- include/NeuraDialect/NeuraOps.td | 21 + .../Transforms/CanonicalizeReturnPass.cpp | 261 +++---------- .../Transforms/PromoteInputArgToConstPass.cpp | 54 +-- .../TransformCtrlToDataFlowPass.cpp | 360 ++++++++++++++++-- test/neura/ctrl/branch_for.mlir | 3 +- 5 files changed, 440 insertions(+), 259 deletions(-) diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 80006ce6..7aa0d783 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -780,6 +780,7 @@ def Neura_LoopControlOp : Op{ // " `(``parent_valid` `=` $parentValid `,` `start` `=` $start `,` `end` `=` $end `,` `step` `=` $step`)` attr-dict `:` type($parentValid) `,` type($start) `,` type($end) `,` type($step) `->` type($nextindex) `,` type($valid)"; } +// Defines an operation for hardware loop counters. def Neura_CounterOp : Op{ let summary = "Hardware loop counter for CGRA execution."; let description = [{ @@ -809,6 +810,26 @@ def Neura_CounterOp : Op{ let assemblyFormat = "attr-dict `:` type($current_index)"; } +// Defines an operation to extract the predicate bit from a predicated value. +def Neura_ExtractPredicateOp : Op{ + let summary = "Extracts the predicate bit from a predicated value."; + let description = [{ + Extracts the predicate bit from a predicated value, + producing a boolean predicated value: !neura.predicated. + + Example: + %counter = neura.counter {bound = 16} : !neura.predicated + %is_valid = neura.extract_predicate %counter : !neura.predicated -> !neura.predicated + + // Use for gating final results: + %gated = neura.grant_predicate %result, %is_valid + }]; + + let arguments = (ins AnyType:$input); + let results = (outs AnyType:$predicate); + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($predicate)"; +} + // ---------------------------------------------------- // Defines operations for steering-control based DFG execution. // ---------------------------------------------------- diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 22e3869c..1ce7fe9b 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -39,6 +39,16 @@ static bool isVoidFunction(func::FuncOp func_op) { return false; } +// Checks if kernel has any counter. +static bool kernelHasCounter(neura::KernelOp kernel_op) { + bool has_counter = false; + kernel_op.walk([&](neura::CounterOp counter_op) { + has_counter = true; + return WalkResult::interrupt(); + }); + return has_counter; +} + // Marks empty returns with "is_void" attribute and adds trigger values. static void processReturns(Region ®ion, OpBuilder &builder) { SmallVector empty_returns; @@ -59,195 +69,65 @@ static void processReturns(Region ®ion, OpBuilder &builder) { } } -// Processes neura.yield operations in kernel regions. -static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { - SmallVector empty_yields; - - kernel_op.walk([&](neura::YieldOp yield_op) { - llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; - llvm::errs() << yield_op << "\n"; - - // Case 1: yield has results - mark as value type. 
- if (yield_op.getResults().size() > 0) { - llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; - yield_op->setAttr(kYieldTypeAttr, - builder.getStringAttr(kReturnTypeValue)); - return; - } - - // Case 2 & 3: yield has no results. - empty_yields.push_back(yield_op); - }); - - // Processes empty yields. - for (neura::YieldOp yield_op : empty_yields) { - llvm::errs() << "[canonicalize] Processing empty neura.yield...\n"; +// Converts yields to returns (for kernels without counters). +static void convertYieldsToReturns(neura::KernelOp kernel_op, + OpBuilder &builder) { + SmallVector yields_to_convert; - // Searches for counters in the kernel. - neura::CounterOp root_counter = nullptr; - neura::CounterOp any_counter = nullptr; - - kernel_op.walk([&](neura::CounterOp counter_op) { - any_counter = counter_op; - - if (counter_op.getCounterTypeAttr() && - counter_op.getCounterTypeAttr().getValue() == "root") { - root_counter = counter_op; - } - }); + // Collects all yields in kernel. + kernel_op.walk( + [&](neura::YieldOp yield_op) { yields_to_convert.push_back(yield_op); }); - // Case 2: Has counter - uses counter as trigger. - if (root_counter || any_counter) { - Value trigger_value = root_counter ? root_counter.getCurrentIndex() - : any_counter.getCurrentIndex(); + for (neura::YieldOp yield_op : yields_to_convert) { + llvm::errs() << "[canonicalize] Converting yield to return: " << yield_op + << "\n"; - llvm::errs() << "[canonicalize] Using " - << (root_counter ? "root" : "leaf") - << " counter as trigger.\n"; + builder.setInsertionPoint(yield_op); - // Creates new yield with trigger value as result. - builder.setInsertionPoint(yield_op); - - SmallVector iter_args_next(yield_op.getIterArgsNext()); - SmallVector results = {trigger_value}; - - auto new_yield = builder.create(yield_op.getLoc(), - iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, - builder.getStringAttr(kReturnTypeVoid)); - - yield_op.erase(); + if (yield_op.getResults().size() > 0) { + // Yield with results → return with values + llvm::errs() << "[canonicalize] Yield has results\n"; + builder.create(yield_op.getLoc(), yield_op.getResults()); } else { - // Case 3: No counter - mark for void processing (similar to return). - llvm::errs() - << "[canonicalize] No counter found, marking as void yield\n"; - yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + // Yield without results → return without operands (empty return) + llvm::errs() << "[canonicalize] Yield is void\n"; + builder.create(yield_op.getLoc(), ValueRange{}); } - } -} - -// Processes empty yield void blocks (similar to processEmptyReturnVoidBlock). -static void processEmptyYieldVoidBlock(Block *yield_block, - neura::YieldOp void_yield_op, - OpBuilder &builder) { - SmallVector predecessor_blocks(yield_block->getPredecessors()); - // Entry block with yield_void is unreachable; no action needed. - if (predecessor_blocks.empty()) { - llvm::errs() - << "[canonicalize] Entry block with void yield is unreachable\n"; - return; + yield_op.erase(); } +} - // Separates predecessor blocks into cond_br and br blocks. - SmallVector cond_br_preds; - SmallVector br_preds; - - for (Block *pred_block : predecessor_blocks) { - Operation *terminator = pred_block->getTerminator(); - if (isa(terminator)) { - cond_br_preds.push_back(pred_block); - } else if (isa(terminator)) { - br_preds.push_back(pred_block); - } - } - - // Handles br_preds: copy yield_void to pred_block with a trigger value. 
- for (Block *pred_block : br_preds) { - neura::Br br = cast(pred_block->getTerminator()); - - // Finds a suitable trigger value in the predecessor block. - Value trigger_value = nullptr; - - for (Operation &op : llvm::reverse(*pred_block)) { - if (&op == br) { - continue; - } - - if (op.getNumResults() > 0) { - trigger_value = op.getResult(0); - break; +// Processes neura.yield operations in kernel regions. +static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { + // Checks if kernel has counter. + bool has_counter = kernelHasCounter(kernel_op); + + if (has_counter) { + // Case 1: kernel has counter -> keep yields, just marks them. + kernel_op.walk([&](neura::YieldOp yield_op) { + llvm::errs() << "[canonicalize] Processing neura.yield operation...\n"; + llvm::errs() << yield_op << "\n"; + + // yield has results - mark as value type. + if (yield_op.getResults().size() > 0) { + llvm::errs() << "[canonicalize] Marking neura.yield with value...\n"; + yield_op->setAttr(kYieldTypeAttr, + builder.getStringAttr(kReturnTypeValue)); + return; } - } - - if (!trigger_value) { - llvm::errs() << "[canonicalize] Error: No suitable value found in " - "predecessor block\n"; - return; - } - builder.setInsertionPoint(br); - - SmallVector iter_args_next(void_yield_op.getIterArgsNext()); - SmallVector results = {trigger_value}; - - auto new_yield = - builder.create(br.getLoc(), iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); - br.erase(); - } - - // If there are no cond_br predecessors, remove the yield_void block. - if (cond_br_preds.empty()) { - void_yield_op.erase(); - yield_block->erase(); - return; - } - - // Handles cond_preds: add a block argument for the trigger value. - BlockArgument trigger_arg = - yield_block->addArgument(builder.getI1Type(), void_yield_op.getLoc()); - - // Updates each cond_pred block's terminator to pass the trigger value. - for (Block *pred_block : cond_br_preds) { - neura::CondBr cond_br = cast(pred_block->getTerminator()); - Value cond = cond_br.getCondition(); - Value trigger_value = nullptr; - - bool is_true_branch = (cond_br.getTrueDest() == yield_block); - bool is_false_branch = (cond_br.getFalseDest() == yield_block); - - if (is_true_branch && !is_false_branch) { - trigger_value = cond; - } else if (!is_true_branch && is_false_branch) { - builder.setInsertionPoint(cond_br); - Value negated_cond = - builder.create(cond_br.getLoc(), cond.getType(), cond); - trigger_value = negated_cond; - } else { - llvm::errs() << "[canonicalize] Error: Both branches lead to yield\n"; - return; - } - - if (trigger_value) { - SmallVector true_args(cond_br.getTrueArgs()); - SmallVector false_args(cond_br.getFalseArgs()); - - if (is_true_branch) { - true_args.push_back(trigger_value); - } - if (is_false_branch) { - false_args.push_back(trigger_value); - } + // yield has NO results, marks as "void" type. + yield_op->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); + }); + } else { + // Case 2: kernel has NO counter -> converts yields to direct returns. + llvm::errs() + << "[canonicalize] No counter -> converting yields to returns\n"; + convertYieldsToReturns(kernel_op, builder); - builder.setInsertionPoint(cond_br); - builder.create( - cond_br.getLoc(), cond_br.getCondition(), true_args, false_args, - cond_br.getTrueDest(), cond_br.getFalseDest()); - cond_br.erase(); - } + // No marks the returns we converted. } - - // Updates the yield_void operation to use the block argument as trigger. 
- builder.setInsertionPoint(void_yield_op); - - SmallVector iter_args_next(void_yield_op.getIterArgsNext()); - SmallVector results = {trigger_arg}; - - auto new_yield = builder.create(void_yield_op.getLoc(), - iter_args_next, results); - new_yield->setAttr(kYieldTypeAttr, builder.getStringAttr(kReturnTypeVoid)); - void_yield_op.erase(); } static void processEmptyReturnVoidBlock(Block *ret_block, @@ -439,6 +319,9 @@ struct CanonicalizeReturnPass }); // Processes all neura.kernel operations. + // There are two cases to handle: + // 1) kernel with counters - the return process is triggered by the counter. + // 2) kernel without counters - same logic as function return. module_op.walk([&](neura::KernelOp kernel_op) { auto accel_attr = kernel_op->getAttrOfType(accel::kAcceleratorAttr); @@ -446,32 +329,8 @@ struct CanonicalizeReturnPass return; } - // Step 1: Processes yields (handles cases 1 & 2) + // Step 1: Processes yields. processYields(kernel_op, builder); - - // Step 2: Collects void yields without trigger values (case 3). - SmallVector yield_void_ops; - kernel_op.walk([&](neura::YieldOp yield_op) { - if (yield_op->hasAttr(kYieldTypeAttr)) { - if (dyn_cast(yield_op->getAttr(kYieldTypeAttr)) - .getValue() == kReturnTypeVoid && - yield_op.getResults().size() == 0) { - yield_void_ops.push_back(yield_op); - } - } - }); - - // Step 3: Processes each yield_void block (case 3) - for (neura::YieldOp yield_void_op : yield_void_ops) { - Block *yield_block = yield_void_op->getBlock(); - bool is_empty_block = (yield_block->getOperations().size() == 1); - - if (is_empty_block) { - processEmptyYieldVoidBlock(yield_block, yield_void_op, builder); - } else { - assert(false && "Unsupported case: yield block is not empty."); - } - } }); } }; diff --git a/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp index 7889922c..39edd185 100644 --- a/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp +++ b/lib/NeuraDialect/Transforms/PromoteInputArgToConstPass.cpp @@ -20,36 +20,9 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -/** - * @brief Specializes a region by "internalizing" its input arguments as - * constants. - * - * This function performs a redirection of the dataflow. It identifies all - * input arguments of the entry block, creates a corresponding - * `neura::ConstantOp` for each, and re-links all internal operations to use - * these constants instead of the original block parameters. - * - * ### Example Transformation: - * * **Before:** - * @code - * func.func @compute(%arg0: i32) { - * %0 = arith.addi %arg0, %arg0 : i32 - * return %0 : i32 - * } - * @endcode - * * **After:** - * @code - * func.func @compute(%arg0: i32) { - * %0 = "neura.constant"() {value = "%arg0"} : () -> i32 - * %1 = arith.addi %0, %0 : i32 // Uses replaced - * return %1 : i32 - * } - * @endcode - * - * @param region The MLIR Region (typically a function body) to transform. - * @return Success if the transformation was applied (even if the region was - * empty). - */ +// Attribute name to mark iter_arg init constants. +constexpr const char *kIterArgInitAttr = "is_iter_arg_init"; + LogicalResult promoteFunctionArgsToConstants(Region ®ion) { if (region.empty()) { return success(); @@ -94,7 +67,7 @@ LogicalResult promoteKernelArgsToConstants(neura::KernelOp kernel_op) { assert(args.size() == num_inputs + num_iter_args && "Kernel block arguments size mismatch"); - // Only promotes input arguments (not iter_args). 
+  // Step 1: promotes input arguments (not iter_args).
   // Block arguments layout: [input0, input1, ..., iter_arg0, iter_arg1, ...]
   for (size_t i = 0; i < num_inputs; ++i) {
     BlockArgument input_arg = args[i];
@@ -109,9 +82,22 @@
     input_arg.replaceAllUsesWith(const_op.getResult());
   }
 
-  // Note: iter_args (args[num_inputs] to args[num_inputs + num_iter_args - 1])
-  // are NOT promoted here. They will be handled in transform-ctrl-to-data-flow
-  // pass.
+  // Step 2: promotes iter_args_init values to constants with a special
+  // attribute.
+  for (size_t i = 0; i < num_iter_args; i++) {
+    BlockArgument iter_arg = args[num_inputs + i];
+
+    // Creates a constant for this iter_arg_init value.
+    std::string const_name = "%iter_arg_init" + std::to_string(i);
+    auto const_op =
+        builder.create<neura::ConstantOp>(iter_arg.getLoc(), iter_arg.getType(),
+                                          builder.getStringAttr(const_name));
+
+    // Marks this constant as an iter_arg init value.
+    const_op->setAttr(kIterArgInitAttr, builder.getBoolAttr(true));
+
+    // Replaces all uses of this iter_arg argument with the constant.
+    iter_arg.replaceAllUsesWith(const_op.getResult());
+  }
 
   return success();
 }
diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 9f8f2a86..6847d04d 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -24,9 +24,67 @@ using namespace mlir;
 
 #define GEN_PASS_DEF_TRANSFORMCTRLTODATAFLOW
 #include "NeuraDialect/NeuraPasses.h.inc"
 
+// Attribute name to mark iter_arg init constants.
+constexpr const char *kIterArgInitAttr = "is_iter_arg_init";
+
+//---------------------------------------------------------------------------
+// Checks if the task (kernel) contains any counter.
+//---------------------------------------------------------------------------
+bool taskHasCounter(neura::KernelOp kernel_op) {
+  if (!kernel_op) {
+    return false;
+  }
+
+  bool found_counter = false;
+  kernel_op.walk([&](neura::CounterOp counter_op) {
+    if (counter_op) {
+      found_counter = true;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  return found_counter;
+}
+
+//---------------------------------------------------------------------------
+// Finds the root counter in the neura.kernel operation, falling back to a
+// leaf counter if no root exists.
+//---------------------------------------------------------------------------
+neura::CounterOp findRootCounterInKernel(neura::KernelOp kernel_op) {
+  if (!kernel_op) {
+    return nullptr;
+  }
+
+  neura::CounterOp root_counter = nullptr;
+  neura::CounterOp leaf_counter = nullptr;
+
+  // Walks through the kernel body to find counters.
+  kernel_op.walk([&](neura::CounterOp counter_op) {
+    StringRef counter_type = counter_op.getCounterType();
+
+    if (counter_type == "root") {
+      root_counter = counter_op;
+    } else if (counter_type == "leaf") {
+      leaf_counter = counter_op;
+    }
+  });
+
+  if (root_counter) {
+    return root_counter;
+  } else if (leaf_counter) {
+    return leaf_counter;
+  }
+
+  return nullptr;
+}
+
 // Inserts `grant_once` for every predicated value defined in the entry block
 // that is used outside of the block (i.e., a live-out).
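+// (Rationale, as an inference from the early return added below: a
+// counter-driven kernel already produces its first valid token from the
+// hardware counter, so entry-block live-outs need no grant_once seeding.)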
-void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder) { +void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder, + bool has_task_counter = false) { + if (has_task_counter) { + return; + } SmallVector live_out_arg_values; SmallVector live_out_non_arg_values; @@ -86,6 +144,183 @@ void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder) { } } +//--------------------------------------------------------------------------- +// Iter_args handling (always grant_once for now). +//--------------------------------------------------------------------------- +void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block, + OpBuilder &builder, + SmallVector &iter_arg_final_values) { + llvm::errs() << "[iter_args] Handling kernel iter_args...\n"; + + SmallVector iter_arg_init_ops; + for (Operation &op : kernel_op.getOps()) { + if (auto const_op = dyn_cast(op)) { + if (const_op->hasAttr(kIterArgInitAttr) && + const_op->getAttrOfType(kIterArgInitAttr).getValue()) { + iter_arg_init_ops.push_back(const_op); + } + } + } + + if (iter_arg_init_ops.empty()) { + llvm::errs() << "[iter_args] No iter_args\n"; + return; + } + + neura::YieldOp yield_op = nullptr; + for (Operation &op : kernel_op.getOps()) { + if (auto yld = dyn_cast(op)) { + yield_op = yld; + break; + } + } + + if (!yield_op || yield_op.getIterArgsNext().empty()) { + llvm::errs() << "[iter_args] No iter_args_next in yield\n"; + return; + } + + for (size_t i = 0; i < iter_arg_init_ops.size(); ++i) { + neura::ConstantOp init_const = iter_arg_init_ops[i]; + Value feedback_value = yield_op.getIterArgsNext()[i]; + + llvm::errs() << "[iter_args] Processing iter_arg " << i << "\n"; + + // Grants once the init. + builder.setInsertionPointAfter(init_const); + neura::GrantOnceOp granted_init = builder.create( + init_const.getLoc(), init_const.getType(), init_const.getResult()); + + // Creates reserve for feedback value. + builder.setInsertionPointAfter(granted_init); + neura::ReserveOp reserve_op = builder.create( + init_const.getLoc(), init_const.getType()); + + // Creates phi for init and feedback. + builder.setInsertionPointAfter(reserve_op); + neura::PhiOp phi = builder.create( + init_const.getLoc(), init_const.getType(), + ValueRange{granted_init.getResult(), reserve_op.getResult()}); + + // Replaces uses. + init_const.getResult().replaceUsesWithIf( + phi.getResult(), [&](OpOperand &use) { + Operation *user = use.getOwner(); + return user != granted_init && !isa(user); + }); + + // Creates ctrl_mov. + builder.setInsertionPoint(yield_op); + builder.create(yield_op.getLoc(), feedback_value, + reserve_op.getResult()); + + iter_arg_final_values.push_back(feedback_value); + llvm::errs() << "[iter_args] Created iter_arg with grant_once\n"; + } + + llvm::errs() << "[iter_args] Iter_args complete\n\n"; +} + +//--------------------------------------------------------------------------- +// Handles kernel yield with counter-based gating. 
+//--------------------------------------------------------------------------- +void handleKernelYieldTermination( + neura::KernelOp kernel_op, Block *entry_block, OpBuilder &builder, + bool has_task_counter, const SmallVector &iter_arg_final_values) { + llvm::errs() << "[yield] ========================================\n"; + llvm::errs() << "[yield] Handling Yield Termination\n"; + llvm::errs() << "[yield] ========================================\n"; + + neura::YieldOp yield_op = nullptr; + for (Operation &op : kernel_op.getOps()) { + if (auto yld = dyn_cast(op)) { + yield_op = yld; + break; + } + } + + if (!yield_op) { + llvm::errs() << "[yield] No yield operation found\n"; + return; + } + + builder.setInsertionPoint(yield_op); + + if (!yield_op->hasAttr("yield_type")) { + llvm::errs() << "[yield] No yield_type attribute\n"; + yield_op.erase(); + return; + } + + StringRef yield_type = + yield_op->getAttrOfType("yield_type").getValue(); + + //-------------------------------------------------------------------------- + // Case 1: VALUE yield + //-------------------------------------------------------------------------- + if (yield_type == "value") { + llvm::errs() << "[yield] Processing VALUE yield\n"; + + if (has_task_counter) { + llvm::errs() + << "[yield] Has counter → Gate with NOT(counter predicate)\n"; + + // Finds counter in kernel that defines the predicate. + neura::CounterOp counter_op = findRootCounterInKernel(kernel_op); + + assert(counter_op && + "Kernel has outer task counter but no neura::CounterOp found."); + + // Extracts predicate and negates it. + Value counter_value = counter_op.getCurrentIndex(); + + auto pred_type = builder.getType( + builder.getI1Type(), builder.getI1Type()); + + auto extract_pred = builder.create( + counter_op.getLoc(), pred_type, counter_value); + + Value counter_pred = extract_pred.getPredicate(); + + // When the counter predicate is false, we want to trigger the return. + auto not_op = builder.create( + counter_op.getLoc(), counter_pred.getType(), counter_pred); + + Value return_gate = not_op.getResult(); + + llvm::errs() << "[yield] Extracted counter predicate\n"; + llvm::errs() << "[yield] Created NOT gate for return\n"; + + // Gates all results with NOT (counter predicate). + SmallVector gated_results; + for (Value result : yield_op.getResults()) { + auto gated = builder.create( + yield_op.getLoc(), result.getType(), result, return_gate); + gated_results.push_back(gated.getResult()); + + llvm::errs() << "[yield] Gated result with NOT(counter_pred)\n"; + } + + auto return_val = builder.create(yield_op.getLoc(), + gated_results); + llvm::errs() << "[yield] Created return_value with counter gating\n"; + builder.setInsertionPointAfter(return_val); + builder.create(builder.getUnknownLoc()); + + } else { + llvm::errs() << "[yield] No counter, handled as normal case.\n"; + } + yield_op.erase(); + } + //-------------------------------------------------------------------------- + // Case 2: VOID yield + //-------------------------------------------------------------------------- + else if (yield_type == "void") { + llvm::errs() << "[yield] Processing VOID yield\n"; + } + llvm::errs() << "[yield] ========================================\n\n"; +} + // Control flow struct. 
struct ControlFlowInfo { struct Edge { @@ -264,6 +499,8 @@ void buildControlFlowInfo(Region ®ion, ControlFlowInfo &ctrl_info, } else if (auto rt = dyn_cast(terminator)) { llvm::errs() << "[ctrl2data] ReturnOp found: " << *rt << "\n"; + } else if (auto yield = dyn_cast(terminator)) { + llvm::errs() << "[ctrl2data] YieldOp found: " << *yield << "\n"; } else { assert(false && "Unknown terminator operation in control flow graph."); } @@ -480,8 +717,8 @@ void createReserveAndPhiOps( // Transforms control flow into data flow. void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, - DominanceInfo &dom_info, - OpBuilder &builder) { + DominanceInfo &dom_info, OpBuilder &builder, + bool is_kernel = false) { // Asserts that all live-out values are dominated by block arguments. assertLiveOutValuesDominatedByBlockArgs(region); @@ -572,6 +809,10 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, block->erase(); } + if (is_kernel) { + return; + } + // Converts neura.return to return_void or return_value. SmallVector return_ops; for (Operation &op : llvm::make_early_inc_range(*entry_block)) { @@ -710,30 +951,103 @@ struct TransformCtrlToDataFlowPass void runOnOperation() override { ModuleOp module = getOperation(); - module.walk([&](Operation *op) { - Region *region = nullptr; - DominanceInfo domInfo; - OpBuilder builder(op->getContext()); - - if (auto func = dyn_cast(op)) { - auto accel_attr = - func->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { - return; - } - region = &func.getBody(); - domInfo = DominanceInfo(func); - GrantPredicateInEntryBlock(®ion->front(), builder); - assertLiveOutValuesDominatedByBlockArgs(*region); - } else { + + // Step 1: Processes each function with neura target in the module. + module.walk([&](func::FuncOp func_op) { + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { return; } - ControlFlowInfo ctrlInfo; - buildControlFlowInfo(*region, ctrlInfo, domInfo); - transformControlFlowToDataFlow(*region, ctrlInfo, domInfo, builder); + + Region ®ion = func_op.getBody(); + DominanceInfo dom_info(func_op); + OpBuilder builder(func_op.getContext()); + GrantPredicateInEntryBlock(®ion.front(), builder); + assertLiveOutValuesDominatedByBlockArgs(region); + + ControlFlowInfo ctrl_info; + buildControlFlowInfo(region, ctrl_info, dom_info); + transformControlFlowToDataFlow(region, ctrl_info, dom_info, builder); // Converts phi operations to phi_start operations. - convertPhiToPhiStart(*region, builder); + convertPhiToPhiStart(region, builder); + }); + + // Step 2: Processes neura.kernel operation. 
+  // For neura.kernel operations, we need to handle three cases:
+  // Case 1: outer task has counter, no return value
+  //  - Skips grant predicate in entry block
+  //  - Outer counter (root counter) gates the return
+  // Case 2: outer task has counter, with return value
+  //  - Skips grant predicate in entry block
+  //  - Outer counter (root counter) gates the return
+  //  - Inserts extract_predicate from outer counter (root counter) to gate
+  //  return values
+  // Case 3: outer task has no counter, with/without return value
+  //  - Normal grant predicate in entry block
+  //  - Normal transform-ctrl-to-data-flow process
+  module.walk([&](neura::KernelOp kernel_op) {
+    auto accel_attr =
+        kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+    if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+      return;
+    }
+
+    llvm::errs()
+        << "\n[ctrl2data] ========================================\n";
+    llvm::errs() << "[ctrl2data] Processing KERNEL\n";
+    llvm::errs() << "[ctrl2data] ========================================\n";
+
+    Region &kernel_region = kernel_op.getBody();
+    Block *entry_block = &kernel_region.front();
+    OpBuilder builder(kernel_op.getContext());
+    DominanceInfo dom_info(kernel_op);
+
+    // STEP 0: Checks if the kernel has a root counter.
+    bool has_task_counter = taskHasCounter(kernel_op);
+
+    llvm::errs() << "[ctrl2data] Task has counter: "
+                 << (has_task_counter ? "YES" : "NO") << "\n\n";
+
+    SmallVector<Value> iter_arg_final_values;
+
+    // STEP 1: Handles iter_args of the neura.kernel.
+    llvm::errs() << "[ctrl2data] === STEP 1: Handle iter_args ===\n";
+    handleKernelIterArgs(kernel_op, entry_block, builder,
+                         iter_arg_final_values);
+
+    // STEP 2: Grants predicates (only if NO task counter).
+    llvm::errs() << "[ctrl2data] === STEP 2: Grant predicates ===\n";
+    GrantPredicateInEntryBlock(entry_block, builder, has_task_counter);
+
+    // STEP 3: Transforms control flow (if multi-block).
+    if (kernel_region.getBlocks().size() > 1) {
+      llvm::errs() << "[ctrl2data] === STEP 3: Transform control flow ===\n";
+      assertLiveOutValuesDominatedByBlockArgs(kernel_region);
+      ControlFlowInfo ctrl_info;
+      buildControlFlowInfo(kernel_region, ctrl_info, dom_info);
+      transformControlFlowToDataFlow(kernel_region, ctrl_info, dom_info,
+                                     builder, true);
+    } else {
+      llvm::errs() << "[ctrl2data] === STEP 3: Single block (skip) ===\n";
+    }
+    convertPhiToPhiStart(kernel_region, builder);
+
+    // STEP 4: Handles yield termination in neura.kernel.
+ llvm::errs() << "[ctrl2data] === STEP 4: Handle yield ===\n"; + handleKernelYieldTermination(kernel_op, entry_block, builder, + has_task_counter, iter_arg_final_values); + + kernel_op->setAttr(neura::attr::kDataflowMode, + StringAttr::get(kernel_op.getContext(), + neura::attr::val::kModePredicate)); + + llvm::errs() << "[ctrl2data] ========================================\n"; + llvm::errs() << "[ctrl2data] ✅ KERNEL Complete\n"; + llvm::errs() + << "[ctrl2data] ========================================\n\n"; + llvm::errs() << "transformed kernel op:\n" << kernel_op << "\n"; }); } }; diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index a626575e..0a7d6031 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -21,7 +21,8 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: | FileCheck %s -check-prefix=CTRL2DATA +// RUN: -o %t-transformed.mlir +// RU: | FileCheck %s -check-prefix=CTRL2DATA // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ From c5e42ebe53335378df708736131ee2a04541d274 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 13:55:00 +0800 Subject: [PATCH 11/25] enable kernel without counters dataflow lowering --- .../Transforms/CanonicalizeReturnPass.cpp | 41 +++++++++ .../TransformCtrlToDataFlowPass.cpp | 84 +++++++++++++------ 2 files changed, 99 insertions(+), 26 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp index 1ce7fe9b..d17bf786 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeReturnPass.cpp @@ -127,6 +127,8 @@ static void processYields(neura::KernelOp kernel_op, OpBuilder &builder) { convertYieldsToReturns(kernel_op, builder); // No marks the returns we converted. + Region &kernel_region = kernel_op.getBody(); + processReturns(kernel_region, builder); } } @@ -248,6 +250,38 @@ static void processEmptyReturnVoidBlock(Block *ret_block, void_ret_op.erase(); } +// Processes void returns in kernel (same logic as function). +static void processVoidReturnsInKernel(neura::KernelOp kernel_op, + OpBuilder &builder) { + Region &kernel_region = kernel_op.getBody(); + + // Collects all return operations with "void" attribute. + SmallVector ret_void_ops; + kernel_region.walk([&](neura::ReturnOp ret_op) { + if (ret_op->hasAttr(kReturnTypeAttr)) { + if (dyn_cast(ret_op->getAttr(kReturnTypeAttr)).getValue() == + kReturnTypeVoid) { + ret_void_ops.push_back(ret_op); + } + } + }); + + llvm::errs() << "[canonicalize] Found " << ret_void_ops.size() + << " void returns in kernel\n"; + + // Processes each return_void block. + for (neura::ReturnOp ret_void_op : ret_void_ops) { + Block *ret_block = ret_void_op->getBlock(); + bool is_empty_block = (ret_block->getOperations().size() == 1); + + if (is_empty_block) { + processEmptyReturnVoidBlock(ret_block, ret_void_op, builder); + } else { + assert(false && "Unsupported case: return block is not empty."); + } + } +} + struct CanonicalizeReturnPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeReturnPass) @@ -331,6 +365,13 @@ struct CanonicalizeReturnPass // Step 1: Processes yields. processYields(kernel_op, builder); + + // Step 2: If yields are converted to returns, processes void returns. 
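For a kernel without a counter, the net effect of processYields here can be sketched as follows (generic syntax; the attribute strings are assumed spellings of the kYieldTypeAttr and kReturnTypeAttr constants):

    "neura.yield"() {yield_type = "void"} : () -> ()     // before canonicalize-return
    "neura.return"() {return_type = "void"} : () -> ()   // after; lowered like a plain function return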
+ bool has_counter = kernelHasCounter(kernel_op); + if (!has_counter) { + llvm::errs() << "[canonicalize] Processing void returns in kernel\n"; + processVoidReturnsInKernel(kernel_op, builder); + } }); } }; diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 6847d04d..556d6181 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -718,7 +718,7 @@ void createReserveAndPhiOps( // Transforms control flow into data flow. void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, DominanceInfo &dom_info, OpBuilder &builder, - bool is_kernel = false) { + bool has_task_counter = false) { // Asserts that all live-out values are dominated by block arguments. assertLiveOutValuesDominatedByBlockArgs(region); @@ -809,7 +809,7 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, block->erase(); } - if (is_kernel) { + if (has_task_counter) { return; } @@ -851,29 +851,24 @@ void transformControlFlowToDataFlow(Region ®ion, ControlFlowInfo &ctrl_info, builder.create(builder.getUnknownLoc()); // Sets the "dataflow_mode" attribute to "predicate" for the parent - // function. - if (auto func = dyn_cast(region.getParentOp())) { - if (!func->hasAttr(neura::attr::kDataflowMode)) { - func->setAttr( - neura::attr::kDataflowMode, - StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); - llvm::errs() - << "[ctrl2data] Set dataflow mode to predicate for function: " - << func.getName() << "\n"; - } else { - llvm::errs() << "[ctrl2data] Function " << func.getName() - << " already has dataflow_mode set to " - << func->getAttrOfType( - neura::attr::kDataflowMode) - .getValue() - << "\n"; - func->setAttr( - neura::attr::kDataflowMode, - StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); - } + // function/kernel. + Operation *parent_op = region.getParentOp(); + llvm::errs() << "[ctrl2data] Parent operation: " << *parent_op << "\n"; + if (auto func = dyn_cast(parent_op)) { + func->setAttr( + neura::attr::kDataflowMode, + StringAttr::get(func.getContext(), neura::attr::val::kModePredicate)); + llvm::errs() << "[ctrl2data] Set dataflow mode to predicate for function: " + << func.getName() << "\n"; + } else if (auto kernel = dyn_cast(parent_op)) { + // Parent is a kernel. + kernel->setAttr( + neura::attr::kDataflowMode, + StringAttr::get(kernel.getContext(), neura::attr::val::kModePredicate)); + llvm::errs() << "[ctrl2data] Set dataflow mode to predicate for kernel.\n"; } else { - assert(false && - "[ctrl2data] Warning: Parent operation is not a func::FuncOp.\n"); + assert(false && "[ctrl2data] Warning: Parent operation is neither a " + "func::FuncOp nor a neura::KernelOp.\n"); } } @@ -982,8 +977,8 @@ struct TransformCtrlToDataFlowPass // Case 2: outer task has counter, with return value // - Skips grant predicate in entry block // - Outer counter (root counter) gates the return - // - Inserts extract_predicate from outer counter (root counter) to gate - // return values + // - Inserts extract_predicate from outer counter (root counter) to + // gate return values // Case 3: outer task has no counter, with/without return value // - Normal grant predicate in entry block // - Normal transfrom-ctrl-to-data-flow process @@ -1009,6 +1004,43 @@ struct TransformCtrlToDataFlowPass llvm::errs() << "[ctrl2data] Task has counter: " << (has_task_counter ? 
"YES" : "NO") << "\n\n"; + if (!has_task_counter) { + llvm::errs() << "[ctrl2data] === Kernel WITHOUT counter ===\n"; + llvm::errs() << "[ctrl2data] Using standard function lowering flow\n\n"; + + // Step 1: Grant predicates in entry block + llvm::errs() << "[ctrl2data] STEP 1: Grant predicates\n"; + GrantPredicateInEntryBlock(entry_block, builder, false); + + // Step 2: Assert live-out values + llvm::errs() << "[ctrl2data] STEP 2: Assert live-out values\n"; + assertLiveOutValuesDominatedByBlockArgs(kernel_region); + + // Step 3: Build control flow info + llvm::errs() << "[ctrl2data] STEP 3: Build control flow info\n"; + ControlFlowInfo ctrl_info; + buildControlFlowInfo(kernel_region, ctrl_info, dom_info); + + // Step 4: Transform control flow to data flow + llvm::errs() << "[ctrl2data] STEP 4: Transform control flow\n"; + transformControlFlowToDataFlow(kernel_region, ctrl_info, dom_info, + builder, + false); // ✅ false = use function logic + + // Step 5: Convert phi to phi_start + llvm::errs() << "[ctrl2data] STEP 5: Convert phi to phi_start\n"; + convertPhiToPhiStart(kernel_region, builder); + + llvm::errs() << "[ctrl2data] ✅ Kernel WITHOUT counter complete\n"; + llvm::errs() + << "[ctrl2data] ========================================\n\n"; + + // Set dataflow mode attribute + kernel_op->setAttr(neura::attr::kDataflowMode, + StringAttr::get(kernel_op.getContext(), + neura::attr::val::kModePredicate)); + return; + } SmallVector iter_arg_final_values; From 2842fae364b7e6fbb951e29fea5f3e2c0b3ea102 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 15:14:17 +0800 Subject: [PATCH 12/25] enable kenrel mapping --- include/NeuraDialect/Mapping/mapping_util.h | 12 +- lib/NeuraDialect/Mapping/mapping_util.cpp | 63 +-- .../Transforms/GraphMining/GraMi.cpp | 475 ++++++++++-------- .../Transforms/InsertDataMovPass.cpp | 112 +++-- .../Transforms/MapToAcceleratorPass.cpp | 403 +++++++++------ .../interpreter/lower_and_interpret.mlir | 2 +- .../interpreter/lower_and_interpret_subf.mlir | 2 +- 7 files changed, 642 insertions(+), 427 deletions(-) diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h index 0a36d476..ee6fcefc 100644 --- a/include/NeuraDialect/Mapping/mapping_util.h +++ b/include/NeuraDialect/Mapping/mapping_util.h @@ -30,13 +30,13 @@ struct RecurrenceCycle { }; // Collects recurrence cycles rooted at reserve and closed by ctrl_mov. -SmallVector collectRecurrenceCycles(Operation *func_op); +SmallVector collectRecurrenceCycles(Region ®ion); // Calculates ResMII: ceil(#ops / #tiles). -int calculateResMii(Operation *func_op, const Architecture &architecture); +int calculateResMii(Region ®ion, const Architecture &architecture); -// Returns topologically sorted operations in func_op. -std::vector getTopologicallySortedOps(Operation *func_op); +// Returns topologically sorted operations in region. +std::vector getTopologicallySortedOps(Region ®ion); // Given the sorted operations, returns a vector of pairs where each pair // contains a vector of operations at the same ALAP (as late as possible) @@ -82,8 +82,8 @@ bool tryRouteBackwardMove(Operation *mov_op, MappingLoc src_loc, // ctrl_mov users found. llvm::SmallVector getCtrlMovUsers(Operation *op); -// Identifies operations on the critical path (i.e., operations with zero slack). -// Returns pair of: (critical_ops_set, asap_level_map) +// Identifies operations on the critical path (i.e., operations with zero +// slack). 
Returns pair of: (critical_ops_set, asap_level_map) std::pair, llvm::DenseMap> identifyCriticalPathOps(const std::vector &sorted_ops); diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp index f5b7a86d..814c59a3 100644 --- a/lib/NeuraDialect/Mapping/mapping_util.cpp +++ b/lib/NeuraDialect/Mapping/mapping_util.cpp @@ -191,10 +191,10 @@ void traverseAlongPath(Operation *op, Value reserve_value, } // namespace SmallVector -mlir::neura::collectRecurrenceCycles(Operation *func_op) { +mlir::neura::collectRecurrenceCycles(Region ®ion) { SmallVector recurrence_cycles; - func_op->walk([&](neura::CtrlMovOp ctrl_mov_op) { + region.walk([&](neura::CtrlMovOp ctrl_mov_op) { Value target = ctrl_mov_op.getTarget(); auto reserve_op = target.getDefiningOp(); if (!reserve_op) { @@ -226,12 +226,12 @@ mlir::neura::collectRecurrenceCycles(Operation *func_op) { return recurrence_cycles; } -int mlir::neura::calculateResMii(Operation *func_op, +int mlir::neura::calculateResMii(Region ®ion, const Architecture &architecture) { int num_ops = 0; // Count all "compute" operations (non-terminators, non-block ops). - func_op->walk([&](Operation *op) { + region.walk([&](Operation *op) { // Skips non-materialized ops. if (isa(op) || isa(op)) { @@ -254,13 +254,13 @@ int mlir::neura::calculateResMii(Operation *func_op, } std::vector -mlir::neura::getTopologicallySortedOps(Operation *func_op) { +mlir::neura::getTopologicallySortedOps(Region ®ion) { std::vector sorted_ops; llvm::DenseMap pending_deps; std::deque ready_queue; // Collects recurrence cycle ops. - auto recurrence_cycles = collectRecurrenceCycles(func_op); + auto recurrence_cycles = collectRecurrenceCycles(region); llvm::DenseSet recurrence_ops; for (const auto &cycle : recurrence_cycles) { for (Operation *op : cycle.operations) { @@ -268,10 +268,10 @@ mlir::neura::getTopologicallySortedOps(Operation *func_op) { } } // Counts unresolved dependencies for each op. - func_op->walk([&](Operation *op) { - if (op == func_op) { - return; - } + region.walk([&](Operation *op) { + // if (op == func_op) { + // return; + // } int dep_count = 0; for (Value operand : op->getOperands()) { if (operand.getDefiningOp()) { @@ -417,14 +417,14 @@ std::vector> mlir::neura::flatten_level_buckets( const std::pair &b_pair) { Operation *a = a_pair.first; Operation *b = b_pair.first; - + bool a_is_critical = critical_ops.count(a) > 0; bool b_is_critical = critical_ops.count(b) > 0; - + // Priority 1: Critical ops come first (within same ALAP level). if (a_is_critical != b_is_critical) return a_is_critical > b_is_critical; - + // Priority 2: Degree (connectivity) - higher degree first. int degree_a = a->getNumOperands(); int degree_b = b->getNumOperands(); @@ -438,7 +438,7 @@ std::vector> mlir::neura::flatten_level_buckets( } if (degree_a != degree_b) return degree_a > degree_b; - + // Priority 3: Original index (stability tie-breaker). return a_pair.second < b_pair.second; }); @@ -1036,18 +1036,22 @@ mlir::neura::calculateAward(Operation *op, std::set &critical_ops, } float in_ratio = (total_in > 0) ? (float)occupied_in / total_in : 0; - float out_ratio = (total_out > 0) ? (float)occupied_out / total_out : 0; - + float out_ratio = + (total_out > 0) ? 
(float)occupied_out / total_out : 0; + // Adaptive penalty strategy: - // - Use very strong penalty (60) only for high fan-in ops (>= 3 producers) + // - Use very strong penalty (60) only for high fan-in ops (>= 3 + // producers) // - Use weak penalty (15) for low fan-in ops - // This optimizes fuse-pattern (II=11 target) without breaking iter-merge + // This optimizes fuse-pattern (II=11 target) without breaking + // iter-merge int base_penalty_coeff = (producers.size() >= 3) ? kStrongCongestionPenalty : kWeakCongestionPenalty; - - int congestion_penalty = static_cast(in_ratio * in_ratio * base_penalty_coeff) + - static_cast(out_ratio * out_ratio * base_penalty_coeff); + + int congestion_penalty = + static_cast(in_ratio * in_ratio * base_penalty_coeff) + + static_cast(out_ratio * out_ratio * base_penalty_coeff); int total_award = tile_award + time_bonus - congestion_penalty; updateAward(locs_with_award, tile_loc_candidate, total_award); @@ -1062,15 +1066,14 @@ mlir::neura::calculateAward(Operation *op, std::set &critical_ops, // Sorts by award (descending). Use stable sort/tie-breaker logic // to minimize noise in mapping results. - std::sort( - locs_award_vec.begin(), locs_award_vec.end(), - [](const std::pair &a, - const std::pair &b) { - if (a.second != b.second) - return a.second > b.second; - // Tie-breaker: earlier time step first. - return a.first.time_step < b.first.time_step; - }); + std::sort(locs_award_vec.begin(), locs_award_vec.end(), + [](const std::pair &a, + const std::pair &b) { + if (a.second != b.second) + return a.second > b.second; + // Tie-breaker: earlier time step first. + return a.first.time_step < b.first.time_step; + }); // TODO: Needs to handle tie case and prioritize lower resource utilization, // however, compiled II becomes worse after adding this tie-breaker: // https://github.com/coredac/dataflow/issues/59. 
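To make the penalty concrete: with 3 of 4 output ports occupied, out_ratio = 0.75, so a high fan-in op (coefficient 60) pays static_cast<int>(0.75 * 0.75 * 60) = 33, while a low fan-in op (coefficient 15) pays only 8; squaring the ratio keeps nearly-full tiles far less attractive than half-full ones.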
diff --git a/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp b/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp index 908d68a4..04cbaf24 100644 --- a/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp +++ b/lib/NeuraDialect/Transforms/GraphMining/GraMi.cpp @@ -1,12 +1,12 @@ -#include "Common/AcceleratorAttrs.h" -#include "NeuraDialect/NeuraAttributes.h" #include "NeuraDialect/Transforms/GraphMining/GraMi.h" +#include "Common/AcceleratorAttrs.h" #include "NeuraDialect/Mapping/mapping_util.h" +#include "NeuraDialect/NeuraAttributes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Value.h" #include "mlir/IR/Block.h" +#include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" +#include "mlir/IR/Value.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -17,14 +17,14 @@ using namespace mlir::neura; // Static member definition for tracking attempted patterns std::set GraMi::attempted_patterns_; -DfgNode* DfgGraph::addNode(mlir::Operation* op, const std::string& label) { +DfgNode *DfgGraph::addNode(mlir::Operation *op, const std::string &label) { auto node = new DfgNode(next_node_id_++, op, label); nodes_.push_back(node); op_to_node_[op] = node; return node; } -DfgEdge* DfgGraph::addEdge(DfgNode* from, DfgNode* to, mlir::Value value) { +DfgEdge *DfgGraph::addEdge(DfgNode *from, DfgNode *to, mlir::Value value) { auto edge = new DfgEdge(next_edge_id_++, from, to, value); edges_.push_back(edge); from->addOutgoingEdge(edge); @@ -32,14 +32,14 @@ DfgEdge* DfgGraph::addEdge(DfgNode* from, DfgNode* to, mlir::Value value) { return edge; } -DfgNode* DfgGraph::getNode(DfgNode::NodeId id) const { +DfgNode *DfgGraph::getNode(DfgNode::NodeId id) const { if (id < nodes_.size()) { return nodes_[id]; } return nullptr; } -DfgEdge* DfgGraph::getEdge(DfgEdge::EdgeId id) const { +DfgEdge *DfgGraph::getEdge(DfgEdge::EdgeId id) const { if (id < edges_.size()) { return edges_[id]; } @@ -47,10 +47,10 @@ DfgEdge* DfgGraph::getEdge(DfgEdge::EdgeId id) const { } void DfgGraph::clear() { - for (auto* node : nodes_) { + for (auto *node : nodes_) { delete node; } - for (auto* edge : edges_) { + for (auto *edge : edges_) { delete edge; } nodes_.clear(); @@ -60,14 +60,14 @@ void DfgGraph::clear() { next_edge_id_ = 0; } -std::string DfgExtractor::getOperationLabel(mlir::Operation* op) { +std::string DfgExtractor::getOperationLabel(mlir::Operation *op) { std::string op_name = op->getName().getStringRef().str(); - + size_t dot_pos = op_name.find('.'); if (dot_pos != std::string::npos) { op_name = op_name.substr(dot_pos + 1); } - + if (op->getNumResults() > 0) { Type result_type = op->getResult(0).getType(); if (auto int_type = mlir::dyn_cast(result_type)) { @@ -76,36 +76,38 @@ std::string DfgExtractor::getOperationLabel(mlir::Operation* op) { op_name += "_f" + std::to_string(float_type.getWidth()); } } - + return op_name; } -// Excludes operations that are not part of the DFG since they don't involve computation and will not be mapped onto the functional units. -bool DfgExtractor::shouldIncludeOperation(mlir::Operation* op) { +// Excludes operations that are not part of the DFG since they don't involve +// computation and will not be mapped onto the functional units. 
+bool DfgExtractor::shouldIncludeOperation(mlir::Operation *op) { if (op->getName().getStringRef().contains("func.") || op->getName().getStringRef().contains("module") || - op->getName().getStringRef().contains("return") || - op->getName().getStringRef().contains("data_mov") || + op->getName().getStringRef().contains("return") || + op->getName().getStringRef().contains("data_mov") || op->getName().getStringRef().contains("ctrl_mov") || - op->getName().getStringRef().contains("reserve") || + op->getName().getStringRef().contains("reserve") || op->getName().getStringRef().contains("alloca") || op->getName().getStringRef().contains("yield")) { return false; } - + if (op->getDialect()->getNamespace() == "neura") { return true; } - + if (op->getDialect()->getNamespace() == "llvm") { return false; } - + if (op->getDialect()->getNamespace() == "arith") { return false; } - - llvm::errs() << "Excluding operation: " << op->getName().getStringRef() << "\n"; + + llvm::errs() << "Excluding operation: " << op->getName().getStringRef() + << "\n"; return false; } @@ -113,100 +115,108 @@ bool DfgExtractor::shouldIncludeOperation(mlir::Operation* op) { // Extracts the data flow graph from the module. std::unique_ptr DfgExtractor::extractFromModule(ModuleOp module) { auto graph = std::make_unique(); - + module.walk([&](func::FuncOp func) { llvm::errs() << "Extracting DFG from function: " << func.getName() << "\n"; auto func_graph = extractFromFunction(func); if (func_graph) { - for (auto* node : func_graph->getNodes()) { + for (auto *node : func_graph->getNodes()) { graph->addNode(node->getOperation(), node->getLabel()); } - for (auto* edge : func_graph->getEdges()) { + for (auto *edge : func_graph->getEdges()) { graph->addEdge(edge->getFrom(), edge->getTo(), edge->getValue()); } } }); - + return graph; } // Extracts the data flow graph from the function. std::unique_ptr DfgExtractor::extractFromFunction(func::FuncOp func) { auto graph = std::make_unique(); - - func.walk([&](Block* block) { + + func.walk([&](Block *block) { auto block_graph = extractFromBlock(block); if (block_graph) { - for (auto* node : block_graph->getNodes()) { + for (auto *node : block_graph->getNodes()) { graph->addNode(node->getOperation(), node->getLabel()); } - for (auto* edge : block_graph->getEdges()) { + for (auto *edge : block_graph->getEdges()) { graph->addEdge(edge->getFrom(), edge->getTo(), edge->getValue()); } } }); - + return graph; } // Extracts the data flow graph from the block. 
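As an example of the labeling above: a hypothetical neura.add producing an i32 is labeled "add_i32", an edge from it into a neura.mul producing i32 yields the 2-node pattern string "add_i32->mul_i32", and that string is what the miner below counts and deduplicates (the op names are illustrative, not a fixed op inventory).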
-std::unique_ptr DfgExtractor::extractFromBlock(mlir::Block* block) { +std::unique_ptr DfgExtractor::extractFromBlock(mlir::Block *block) { auto graph = std::make_unique(); - llvm::DenseMap value_to_node; - - for (auto& op : block->getOperations()) { + llvm::DenseMap value_to_node; + + for (auto &op : block->getOperations()) { if (shouldIncludeOperation(&op)) { std::string label = getOperationLabel(&op); - DfgNode* node = graph->addNode(&op, label); - + DfgNode *node = graph->addNode(&op, label); + for (mlir::Value result : op.getResults()) { value_to_node[result] = node; } } } - - for (auto& op : block->getOperations()) { + + for (auto &op : block->getOperations()) { if (shouldIncludeOperation(&op)) { - DfgNode* current_node = nullptr; - + DfgNode *current_node = nullptr; + for (mlir::Value result : op.getResults()) { if (value_to_node.count(result)) { current_node = value_to_node[result]; break; } } - - if (!current_node) continue; - + + if (!current_node) + continue; + for (mlir::Value operand : op.getOperands()) { if (value_to_node.count(operand)) { - DfgNode* source_node = value_to_node[operand]; + DfgNode *source_node = value_to_node[operand]; graph->addEdge(source_node, current_node, operand); } } } } - + return graph; } // Mines the frequent subgraphs from the data flow graph. // Algorithm: // 1. Collects all 2-node patterns from the graph -// 2. For each pattern, separates instances into critical path vs non-critical path -// 3. For each pattern, performs MWIS with higher weight for critical path instances +// 2. For each pattern, separates instances into critical path vs non-critical +// path +// 3. For each pattern, performs MWIS with higher weight for critical path +// instances // 4. Performs inter-pattern analysis with critical path conflict priority std::vector GraMi::mineFrequentSubgraphs() { std::vector frequent_subgraphs; - + // Map from pattern string to (critical instances, non-critical instances) - std::map, std::vector>> pattern_instances; - - auto derive_label = [](mlir::Operation* op, const std::string& fallback_label) -> std::string { - if (!op) return fallback_label; + std::map, + std::vector>> + pattern_instances; + + auto derive_label = [](mlir::Operation *op, + const std::string &fallback_label) -> std::string { + if (!op) + return fallback_label; auto name = op->getName().getStringRef(); - if (name.ends_with(attr::val::kOpFused) || name.contains(attr::val::kNeuraFusedOp)) { + if (name.ends_with(attr::val::kOpFused) || + name.contains(attr::val::kNeuraFusedOp)) { if (auto attr = op->getAttr("pattern_name")) { if (auto str_attr = mlir::dyn_cast(attr)) { return std::string("fused_op:") + str_attr.getValue().str(); @@ -217,29 +227,38 @@ std::vector GraMi::mineFrequentSubgraphs() { return fallback_label; }; - llvm::errs() << "[GraMi] Critical path ops count: " << critical_path_ops_.size() << "\n"; + llvm::errs() << "[GraMi] Critical path ops count: " + << critical_path_ops_.size() << "\n"; // Step 1: Collects all 2-node patterns and classifies instances - for (auto* edge : graph_->getEdges()) { - DfgNode* from = edge->getFrom(); - DfgNode* to = edge->getTo(); - - auto* from_op = from->getOperation(); - auto* to_op = to->getOperation(); + for (auto *edge : graph_->getEdges()) { + DfgNode *from = edge->getFrom(); + DfgNode *to = edge->getTo(); + auto *from_op = from->getOperation(); + auto *to_op = to->getOperation(); // Skips operations inside fused_op - if (from_op->getParentRegion()->getParentOp()->getName().getStringRef().str() == "neura.fused_op" || 
to_op->getParentRegion()->getParentOp()->getName().getStringRef().str() == "neura.fused_op") { + if (from_op->getParentRegion() + ->getParentOp() + ->getName() + .getStringRef() + .str() == "neura.fused_op" || + to_op->getParentRegion() + ->getParentOp() + ->getName() + .getStringRef() + .str() == "neura.fused_op") { continue; } std::string from_label = derive_label(from_op, from->getLabel()); std::string to_label = derive_label(to_op, to->getLabel()); std::string pattern = from_label + "->" + to_label; - + PatternInstance instance; instance.frequency = 1; - + if (from_op->isBeforeInBlock(to_op)) { instance.operations.push_back(from_op); instance.operations.push_back(to_op); @@ -249,11 +268,11 @@ std::vector GraMi::mineFrequentSubgraphs() { instance.operations.push_back(from_op); instance.last_op = from_op; } - - llvm::DenseSet pattern_ops; + + llvm::DenseSet pattern_ops; pattern_ops.insert(from_op); pattern_ops.insert(to_op); - + llvm::SetVector input_set; for (mlir::Value operand : from_op->getOperands()) { input_set.insert(operand); @@ -263,29 +282,31 @@ std::vector GraMi::mineFrequentSubgraphs() { input_set.insert(operand); } } - instance.inputs = std::vector(input_set.begin(), input_set.end()); - + instance.inputs = + std::vector(input_set.begin(), input_set.end()); + llvm::SetVector output_set; - for (mlir::Operation* op : instance.operations) { + for (mlir::Operation *op : instance.operations) { for (mlir::Value result : op->getResults()) { bool has_external_use = false; - for (mlir::OpOperand& use : result.getUses()) { - mlir::Operation* user = use.getOwner(); + for (mlir::OpOperand &use : result.getUses()) { + mlir::Operation *user = use.getOwner(); if (!pattern_ops.contains(user)) { has_external_use = true; break; } } - + if (has_external_use) { output_set.insert(result); } } } - instance.outputs = std::vector(output_set.begin(), output_set.end()); - + instance.outputs = + std::vector(output_set.begin(), output_set.end()); + instance.is_on_critical_path = isInstanceOnCriticalPath(instance); - + if (instance.is_on_critical_path) { pattern_instances[pattern].first.push_back(instance); } else { @@ -295,45 +316,55 @@ std::vector GraMi::mineFrequentSubgraphs() { // Step 2: Processes frequent patterns and performs per-pattern MWIS std::vector candidates; - - for (auto& [pattern, instances_pair] : pattern_instances) { - auto& [critical_instances, non_critical_instances] = instances_pair; - size_t total_count = critical_instances.size() + non_critical_instances.size(); - + + for (auto &[pattern, instances_pair] : pattern_instances) { + auto &[critical_instances, non_critical_instances] = instances_pair; + size_t total_count = + critical_instances.size() + non_critical_instances.size(); + // Skips patterns that have been attempted for fusion if (hasPatternBeenAttempted(pattern)) { continue; } - + if (total_count >= min_support_) { size_t pattern_idx = frequent_subgraphs.size(); std::string from_label = pattern.substr(0, pattern.find("->")); std::string to_label = pattern.substr(pattern.find("->") + 2); - FrequentSubgraph subgraph(pattern, total_count, static_cast(pattern_idx)); + FrequentSubgraph subgraph(pattern, total_count, + static_cast(pattern_idx)); subgraph.addNode(0, from_label); subgraph.addNode(1, to_label); subgraph.addEdge(0, 0, 1); frequent_subgraphs.push_back(subgraph); - - for (auto& inst : critical_instances) { + + for (auto &inst : critical_instances) { inst.pattern_id = static_cast(pattern_idx); } - for (auto& inst : non_critical_instances) { + for (auto &inst : 
non_critical_instances) { inst.pattern_id = static_cast(pattern_idx); } - - auto [selected_critical, selected_non_critical] = selectMWISForPattern(critical_instances, non_critical_instances, 10.0); - + + auto [selected_critical, selected_non_critical] = selectMWISForPattern( + critical_instances, non_critical_instances, 10.0); + // Creates PatternWithSelectedInstances PatternWithSelectedInstances pwsi(subgraph); pwsi.critical_instances = selected_critical; pwsi.non_critical_instances = selected_non_critical; - pwsi.selected_instances.insert(pwsi.selected_instances.end(), selected_critical.begin(), selected_critical.end()); - pwsi.selected_instances.insert(pwsi.selected_instances.end(), selected_non_critical.begin(), selected_non_critical.end()); - + pwsi.selected_instances.insert(pwsi.selected_instances.end(), + selected_critical.begin(), + selected_critical.end()); + pwsi.selected_instances.insert(pwsi.selected_instances.end(), + selected_non_critical.begin(), + selected_non_critical.end()); + candidates.push_back(pwsi); - - llvm::errs() << "[GraMi] Pattern #" << pattern_idx << " after intra-pattern MWIS: " << selected_critical.size() << " critical, " << selected_non_critical.size() << " non-critical selected\n"; + + llvm::errs() << "[GraMi] Pattern #" << pattern_idx + << " after intra-pattern MWIS: " << selected_critical.size() + << " critical, " << selected_non_critical.size() + << " non-critical selected\n"; } } @@ -341,43 +372,46 @@ std::vector GraMi::mineFrequentSubgraphs() { llvm::errs() << "[GraMi] No frequent patterns found\n"; return {}; } - + // Step 3: Performs inter-pattern analysis with critical path priority - std::vector result = selectPatternsWithCriticalPriority(candidates, min_support_); - + std::vector result = + selectPatternsWithCriticalPriority(candidates, min_support_); + llvm::errs() << "[GraMi] Final result: " << result.size() << " patterns\n"; // Prints summary size_t total_critical = 0, total_non_critical = 0; - for (const auto& p : result) { + for (const auto &p : result) { total_critical += p.critical_instances.size(); total_non_critical += p.non_critical_instances.size(); } - llvm::errs() << "[GraMi] Summary: " << result.size() << " patterns, " << total_critical << " critical instances, " << total_non_critical << " non-critical instances\n"; + llvm::errs() << "[GraMi] Summary: " << result.size() << " patterns, " + << total_critical << " critical instances, " + << total_non_critical << " non-critical instances\n"; return result; } // Checks if the candidate pattern is frequent using the threshold min_support_. -bool GraMi::isFrequent(const FrequentSubgraph& candidate) { +bool GraMi::isFrequent(const FrequentSubgraph &candidate) { size_t support = countSupport(candidate); return support >= min_support_; } // Counts the support of the pattern in the data flow graph. 
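A worked instance of the selection: a critical instance carries weight 10.0, so if it conflicts with one available non-critical instance it scores 10.0 / (1 + 1) = 5.0 against the neighbor's 1.0 / (1 + 1) = 0.5; the greedy loop in selectMWISForPattern below therefore keeps the critical instance and marks the neighbor unavailable.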
-size_t GraMi::countSupport(const FrequentSubgraph& pattern) { +size_t GraMi::countSupport(const FrequentSubgraph &pattern) { std::map node_counts; - for (const auto& pair : pattern.getNodes()) { + for (const auto &pair : pattern.getNodes()) { node_counts[pair.second]++; } - + std::map graph_node_counts; - for (auto* node : graph_->getNodes()) { + for (auto *node : graph_->getNodes()) { graph_node_counts[node->getLabel()]++; } - + size_t min_count = SIZE_MAX; - for (const auto& pair : node_counts) { + for (const auto &pair : node_counts) { size_t graph_count = graph_node_counts[pair.first]; size_t required_count = pair.second; if (graph_count < required_count) { @@ -385,77 +419,82 @@ size_t GraMi::countSupport(const FrequentSubgraph& pattern) { } min_count = std::min(min_count, graph_count / required_count); } - + return min_count; } // Generates a string representation of the pattern. -std::string GraMi::generatePatternString(const FrequentSubgraph& subgraph) { +std::string GraMi::generatePatternString(const FrequentSubgraph &subgraph) { std::ostringstream oss; oss << "Pattern: "; - + oss << "Nodes["; - for (const auto& pair : subgraph.getNodes()) { + for (const auto &pair : subgraph.getNodes()) { oss << pair.first << ":" << pair.second << " "; } oss << "] "; - + oss << "Edges["; - for (const auto& pair : subgraph.getEdges()) { - oss << pair.first << ":" << pair.second.first << "->" << pair.second.second << " "; + for (const auto &pair : subgraph.getEdges()) { + oss << pair.first << ":" << pair.second.first << "->" << pair.second.second + << " "; } oss << "] "; - + oss << "Support: " << subgraph.getFrequency(); - + return oss.str(); } // Collects critical path operations from the function. // Critical paths are recurrence cycles with maximum length. 
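For example, if collectRecurrenceCycles returns cycles of lengths 3, 5, and 5, then max_length is 5 and the operations of both length-5 cycles are inserted into critical_ops, i.e., multiple equally long critical paths are all kept.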
-llvm::DenseSet GraMi::collectCriticalPathOps(mlir::func::FuncOp func) { - llvm::DenseSet critical_ops; - +llvm::DenseSet +GraMi::collectCriticalPathOps(mlir::func::FuncOp func) { + llvm::DenseSet critical_ops; + // Collects all recurrence cycles - auto recurrence_cycles = collectRecurrenceCycles(func); - + auto recurrence_cycles = collectRecurrenceCycles(func.getBody()); + if (recurrence_cycles.empty()) { llvm::errs() << "[GraMi] No recurrence cycles found\n"; return critical_ops; } - + // Finds the maximum recurrence length int max_length = 0; - for (const auto& cycle : recurrence_cycles) { + for (const auto &cycle : recurrence_cycles) { max_length = std::max(max_length, cycle.length); } - + llvm::errs() << "[GraMi] Maximum recurrence length: " << max_length << "\n"; - + // Collects operations from all cycles with maximum length int critical_cycle_count = 0; - for (const auto& cycle : recurrence_cycles) { + for (const auto &cycle : recurrence_cycles) { if (cycle.length == max_length) { critical_cycle_count++; - for (mlir::Operation* op : cycle.operations) { + for (mlir::Operation *op : cycle.operations) { critical_ops.insert(op); } - llvm::errs() << "[GraMi] Critical path cycle (length " << cycle.length << "):\n"; - for (mlir::Operation* op : cycle.operations) { + llvm::errs() << "[GraMi] Critical path cycle (length " << cycle.length + << "):\n"; + for (mlir::Operation *op : cycle.operations) { llvm::errs() << " " << *op << "\n"; } } } - - llvm::errs() << "[GraMi] Found " << critical_cycle_count << " critical path(s) with " - << critical_ops.size() << " total operations\n"; - + + llvm::errs() << "[GraMi] Found " << critical_cycle_count + << " critical path(s) with " << critical_ops.size() + << " total operations\n"; + return critical_ops; } -// Checks if an instance is on critical path (all operations of the instance must be on critical path) -bool GraMi::isInstanceOnCriticalPath(const PatternInstance& instance) const { - for (mlir::Operation* op : instance.operations) { +// Checks if an instance is on critical path (all operations of the instance +// must be on critical path) +bool GraMi::isInstanceOnCriticalPath(const PatternInstance &instance) const { + for (mlir::Operation *op : instance.operations) { if (!critical_path_ops_.contains(op)) { return false; } @@ -463,20 +502,25 @@ bool GraMi::isInstanceOnCriticalPath(const PatternInstance& instance) const { return true; } -// Checks if the two instances conflict. Conflict occurs if the two instances have the same operation. -bool GraMi::instancesConflict(const PatternInstance& a, const PatternInstance& b) { - for (mlir::Operation* op_a : a.operations) { - for (mlir::Operation* op_b : b.operations) { - if (op_a == op_b) return true; +// Checks if the two instances conflict. Conflict occurs if the two instances +// have the same operation. +bool GraMi::instancesConflict(const PatternInstance &a, + const PatternInstance &b) { + for (mlir::Operation *op_a : a.operations) { + for (mlir::Operation *op_b : b.operations) { + if (op_a == op_b) + return true; } } return false; } -// Checks if the two patterns conflict. If any instance in the two patterns conflict, the patterns conflict. -bool GraMi::patternsConflict(const PatternWithInstances& a, const PatternWithInstances& b) { - for (const auto& inst_a : a.instances) { - for (const auto& inst_b : b.instances) { +// Checks if the two patterns conflict. If any instance in the two patterns +// conflict, the patterns conflict. 
+bool GraMi::patternsConflict(const PatternWithInstances &a, + const PatternWithInstances &b) { + for (const auto &inst_a : a.instances) { + for (const auto &inst_b : b.instances) { if (instancesConflict(inst_a, inst_b)) { return true; } @@ -486,10 +530,10 @@ bool GraMi::patternsConflict(const PatternWithInstances& a, const PatternWithIns } // Checks if two patterns have conflicting critical path instances -bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances& a, - const PatternWithSelectedInstances& b) { - for (const auto& inst_a : a.critical_instances) { - for (const auto& inst_b : b.critical_instances) { +bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances &a, + const PatternWithSelectedInstances &b) { + for (const auto &inst_a : a.critical_instances) { + for (const auto &inst_b : b.critical_instances) { if (instancesConflict(inst_a, inst_b)) { return true; } @@ -500,73 +544,81 @@ bool GraMi::criticalInstancesConflict(const PatternWithSelectedInstances& a, // Selects maximum weighted independent set for a single pattern // Critical path instances have higher weight -std::pair, std::vector> GraMi::selectMWISForPattern(const std::vector& critical_instances, const std::vector& non_critical_instances, double critical_weight_multiplier) { - +std::pair, std::vector> +GraMi::selectMWISForPattern( + const std::vector &critical_instances, + const std::vector &non_critical_instances, + double critical_weight_multiplier) { + // Combines all instances with their weights std::vector> weighted_instances; - - for (const auto& inst : critical_instances) { + + for (const auto &inst : critical_instances) { weighted_instances.push_back({inst, critical_weight_multiplier}); } - for (const auto& inst : non_critical_instances) { + for (const auto &inst : non_critical_instances) { weighted_instances.push_back({inst, 1.0}); } - + if (weighted_instances.empty()) { return {{}, {}}; } - + size_t n = weighted_instances.size(); - + // Builds conflict graph std::vector> conflicts(n); for (size_t i = 0; i < n; ++i) { for (size_t j = i + 1; j < n; ++j) { - if (instancesConflict(weighted_instances[i].first, weighted_instances[j].first)) { + if (instancesConflict(weighted_instances[i].first, + weighted_instances[j].first)) { conflicts[i].push_back(j); conflicts[j].push_back(i); } } } - + // Greedy MWIS selection: prioritizes by weight / (degree + 1) std::vector selected_indices; std::vector available(n, true); - + while (true) { size_t best_idx = n; double best_score = -1.0; - + for (size_t i = 0; i < n; ++i) { - if (!available[i]) continue; - + if (!available[i]) + continue; + size_t active_degree = 0; for (size_t neighbor : conflicts[i]) { - if (available[neighbor]) active_degree++; + if (available[neighbor]) + active_degree++; } - + double score = weighted_instances[i].second / (active_degree + 1); - + if (score > best_score) { best_score = score; best_idx = i; } } - - if (best_idx == n) break; - + + if (best_idx == n) + break; + selected_indices.push_back(best_idx); available[best_idx] = false; - + for (size_t neighbor : conflicts[best_idx]) { available[neighbor] = false; } } - + // Separates selected instances into critical and non-critical std::vector selected_critical; std::vector selected_non_critical; - + size_t critical_count = critical_instances.size(); for (size_t idx : selected_indices) { if (idx < critical_count) { @@ -575,7 +627,7 @@ std::pair, std::vector> GraMi::sel selected_non_critical.push_back(weighted_instances[idx].first); } } - + return {selected_critical, 
selected_non_critical}; } @@ -583,48 +635,56 @@ std::pair, std::vector> GraMi::sel // Rules: // - If two patterns have conflicting critical instances, they cannot coexist // Chooses the pattern with more critical instances -// - Non-critical vs non-critical or non-critical vs critical conflicts are allowed -std::vector GraMi::selectPatternsWithCriticalPriority(std::vector& candidates, size_t min_support) { - - if (candidates.empty()) return {}; - - // Sorts candidates by number of critical instances (descending), then by total instances +// - Non-critical vs non-critical or non-critical vs critical conflicts are +// allowed +std::vector +GraMi::selectPatternsWithCriticalPriority( + std::vector &candidates, size_t min_support) { + + if (candidates.empty()) + return {}; + + // Sorts candidates by number of critical instances (descending), then by + // total instances std::sort(candidates.begin(), candidates.end(), - [](const PatternWithSelectedInstances& a, const PatternWithSelectedInstances& b) { - if (a.critical_instances.size() != b.critical_instances.size()) { - return a.critical_instances.size() > b.critical_instances.size(); - } - return a.selected_instances.size() > b.selected_instances.size(); - }); - + [](const PatternWithSelectedInstances &a, + const PatternWithSelectedInstances &b) { + if (a.critical_instances.size() != b.critical_instances.size()) { + return a.critical_instances.size() > + b.critical_instances.size(); + } + return a.selected_instances.size() > b.selected_instances.size(); + }); + std::vector result; - + for (size_t i = 0; i < candidates.size(); ++i) { // Checks for critical instance conflicts with already selected patterns bool has_critical_conflict = false; - for (const auto& selected : result) { + for (const auto &selected : result) { if (criticalInstancesConflict(candidates[i], selected)) { has_critical_conflict = true; break; } } - + if (has_critical_conflict) { continue; } - + result.push_back(candidates[i]); } - - // Now handles non-critical conflicts: removes conflicting non-critical instances + + // Now handles non-critical conflicts: removes conflicting non-critical + // instances for (size_t i = 0; i < result.size(); ++i) { for (size_t j = i + 1; j < result.size(); ++j) { // Finds non-critical instances in pattern j that conflict with pattern i std::vector remaining_non_critical; - for (const auto& inst_j : result[j].non_critical_instances) { + for (const auto &inst_j : result[j].non_critical_instances) { bool conflicts_with_i = false; // Checks conflict with pattern i - for (const auto& inst_i : result[i].selected_instances) { + for (const auto &inst_i : result[i].selected_instances) { if (instancesConflict(inst_i, inst_j)) { conflicts_with_i = true; break; @@ -635,26 +695,31 @@ std::vector GraMi::selectPatternsWithCriticalPrior } } result[j].non_critical_instances = remaining_non_critical; - + // Updates selected_instances result[j].selected_instances.clear(); - result[j].selected_instances.insert(result[j].selected_instances.end(), result[j].critical_instances.begin(), result[j].critical_instances.end()); - result[j].selected_instances.insert(result[j].selected_instances.end(), result[j].non_critical_instances.begin(), result[j].non_critical_instances.end()); + result[j].selected_instances.insert(result[j].selected_instances.end(), + result[j].critical_instances.begin(), + result[j].critical_instances.end()); + result[j].selected_instances.insert( + result[j].selected_instances.end(), + result[j].non_critical_instances.begin(), + 
result[j].non_critical_instances.end()); } } - + return result; } // Gets the label of the operation. -std::string GraMi::getOperationLabel(mlir::Operation* op) { +std::string GraMi::getOperationLabel(mlir::Operation *op) { std::string op_name = op->getName().getStringRef().str(); - + size_t dot_pos = op_name.find('.'); if (dot_pos != std::string::npos) { op_name = op_name.substr(dot_pos + 1); } - + if (op->getNumResults() > 0) { Type result_type = op->getResult(0).getType(); if (auto int_type = mlir::dyn_cast(result_type)) { @@ -663,17 +728,17 @@ std::string GraMi::getOperationLabel(mlir::Operation* op) { op_name += "_f" + std::to_string(float_type.getWidth()); } } - + return op_name; } // Checks if a pattern has been attempted for fusion -bool GraMi::hasPatternBeenAttempted(const std::string& pattern) { +bool GraMi::hasPatternBeenAttempted(const std::string &pattern) { return attempted_patterns_.find(pattern) != attempted_patterns_.end(); } // Marks a pattern as attempted for fusion -void GraMi::markPatternAsAttempted(const std::string& pattern) { +void GraMi::markPatternAsAttempted(const std::string &pattern) { attempted_patterns_.insert(pattern); llvm::errs() << "[GraMi] Marked pattern as attempted: " << pattern << "\n"; } diff --git a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp index ae7ef859..1c887a67 100644 --- a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp +++ b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp @@ -28,7 +28,7 @@ struct InsertDataMovForNeuraOps : public RewritePattern { return failure(); } - // Skip operations inside fused_op regions + // Skips operations inside fused_op regions. Operation *parent_op = op->getParentOp(); while (parent_op) { if (isa(parent_op)) { @@ -36,7 +36,6 @@ struct InsertDataMovForNeuraOps : public RewritePattern { } parent_op = parent_op->getParentOp(); } - bool all_inputs_are_mov_except_reserve = llvm::all_of(op->getOperands(), [](Value v) { @@ -86,15 +85,16 @@ struct InsertDataMovForNeuraOps : public RewritePattern { return failure(); // do not rewrite } - // Wraps operands in mov, but skip those already wrapped or from reserve. SmallVector new_operands; bool any_change = false; for (Value operand : op->getOperands()) { Operation *producer = operand.getDefiningOp(); - // Skips adding mov for any operand that comes from a reserve op or already from data_mov. - if (producer && (isa(producer) || isa(producer))) { + // Skips adding mov for any operand that comes from a reserve op or + // already from data_mov. + if (producer && (isa(producer) || + isa(producer))) { new_operands.push_back(operand); continue; } @@ -129,7 +129,8 @@ struct InsertDataMovForNeuraOps : public RewritePattern { } }; -// Wraps all fused_op's inputs and outputs with data_mov operations in the module. +// Wraps all fused_op's inputs and outputs with data_mov operations in the +// module. 
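A minimal sketch of the wrapping implemented below, in generic op syntax (the neura.add consumer and the i64 types are illustrative placeholders):

    // Before insert-data-mov:
    %r = "neura.fused_op"(%a, %b) ({ ... }) : (i64, i64) -> i64
    %u = "neura.add"(%r, %c) : (i64, i64) -> i64

    // After: each input is wrapped once, and each external user of each
    // result gets its own data_mov (nested data_mov chains are avoided):
    %a0 = "neura.data_mov"(%a) : (i64) -> i64
    %b0 = "neura.data_mov"(%b) : (i64) -> i64
    %r  = "neura.fused_op"(%a0, %b0) ({ ... }) : (i64, i64) -> i64
    %r0 = "neura.data_mov"(%r) : (i64) -> i64
    %u  = "neura.add"(%r0, %c) : (i64, i64) -> i64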
void wrapFusedOpsWithDataMov(ModuleOp module_op) { SmallVector fused_ops_to_process; module_op.walk([&](neura::FusedOp fused_op) { @@ -145,13 +146,14 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { SmallVector new_operands; for (Value operand : fused_op->getOperands()) { Operation *producer = operand.getDefiningOp(); - + // Skip if already wrapped in data_mov or from reserve if (isa_and_nonnull(producer) || isa_and_nonnull(producer)) { new_operands.push_back(operand); } else { - auto mov = rewriter.create(loc, operand.getType(), operand); + auto mov = + rewriter.create(loc, operand.getType(), operand); new_operands.push_back(mov); } } @@ -161,9 +163,9 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { for (size_t i = 0; i < fused_op->getNumOperands(); ++i) { mapper.map(fused_op->getOperand(i), new_operands[i]); } - + Operation *new_fused_op = rewriter.clone(*fused_op.getOperation(), mapper); - + // Update the operands of the cloned operation for (size_t i = 0; i < new_operands.size(); ++i) { new_fused_op->setOperand(i, new_operands[i]); @@ -171,33 +173,35 @@ void wrapFusedOpsWithDataMov(ModuleOp module_op) { // Wrap outputs with data_mov - create separate data_mov for each user rewriter.setInsertionPointAfter(new_fused_op); - + // For each result of the fused_op, create a separate data_mov for each user - for (size_t result_idx = 0; result_idx < fused_op->getNumResults(); ++result_idx) { + for (size_t result_idx = 0; result_idx < fused_op->getNumResults(); + ++result_idx) { Value old_result = fused_op->getResult(result_idx); Value new_result = new_fused_op->getResult(result_idx); - + // Collect all users first (to avoid iterator invalidation) - SmallVector users_to_update; + SmallVector users_to_update; for (OpOperand &use : old_result.getUses()) { users_to_update.push_back(&use); } - + // Create a separate data_mov for each user for (OpOperand *use : users_to_update) { Operation *user_op = use->getOwner(); - - // If the user is already a data_mov (created by another fused_op's input wrapping), - // just update its operand to avoid nested data_mov + + // If the user is already a data_mov (created by another fused_op's + // input wrapping), just update its operand to avoid nested data_mov if (auto existing_mov = llvm::dyn_cast(user_op)) { if (use->getOperandNumber() == 0) { // data_mov only has one operand existing_mov->setOperand(0, new_result); continue; } } - + // Otherwise, create a new data_mov for this user - auto mov = rewriter.create(loc, new_result.getType(), new_result); + auto mov = rewriter.create(loc, new_result.getType(), + new_result); use->set(mov); } } @@ -226,19 +230,69 @@ struct InsertDataMovPass ModuleOp module_op = getOperation(); - // First, handle fused_op operations specially + // Step 1, handles fused_op operations specially. wrapFusedOpsWithDataMov(module_op); - // Then applies patterns to every region inside the module, excluding fused_op regions. - module_op.walk([&](Operation *op) { - if (!op->getRegions().empty() && !llvm::isa(op)) { - for (Region ®ion : op->getRegions()) { - if (failed(applyPatternsGreedily(region, frozen))) { - signalPassFailure(); - } - } + // Then applies patterns to every region inside the module, excluding + // fused_op regions. 
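[Editor's note] The disabled walk kept below for reference was superseded by the attribute-gated walks in Steps 2 and 3, which repeat the same skip-unless-neura gate. A generic helper could factor that gate out; this is a sketch only, with a hypothetical name and the attribute constants this pass already uses:

template <typename OpType>
static void forEachNeuraTarget(ModuleOp module_op,
                               llvm::function_ref<void(OpType)> fn) {
  module_op.walk([&](OpType op) {
    // Skips any op that is not explicitly assigned to the neura target.
    auto accel_attr =
        op->template getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
    if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget)
      return;
    fn(op);
  });
}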
+    // module_op.walk([&](Operation *op) {
+    //   if (!op->getRegions().empty() && !llvm::isa<neura::FusedOp>(op)) {
+    //     for (Region &region : op->getRegions()) {
+    //       if (failed(applyPatternsGreedily(region, frozen))) {
+    //         signalPassFailure();
+    //       }
+    //     }
+    //   }
+    // });
+
+    // Step 2: Processes functions with the neura accelerator attribute.
+    llvm::errs() << "[InsertDataMovPass] Processing functions...\n";
+    module_op.walk([&](func::FuncOp func_op) {
+      auto accel_attr =
+          func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        llvm::errs() << "[InsertDataMovPass] Skipping function: "
+                     << func_op.getName() << " (not neura target)\n";
+        return;
+      }
+
+      llvm::errs() << "[InsertDataMovPass] Processing function: "
+                   << func_op.getName() << "\n";
+
+      Region &func_region = func_op.getBody();
+      if (failed(applyPatternsGreedily(func_region, frozen))) {
+        llvm::errs() << "[InsertDataMovPass] ❌ Failed to apply patterns\n";
+        signalPassFailure();
+      } else {
+        llvm::errs() << "[InsertDataMovPass] ✅ Successfully processed\n";
+      }
+    });
+
+    // Step 3: Processes kernels with the neura accelerator attribute.
+    llvm::errs() << "[InsertDataMovPass] Processing kernels...\n";
+    module_op.walk([&](neura::KernelOp kernel_op) {
+      auto accel_attr =
+          kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        llvm::errs()
+            << "[InsertDataMovPass] Skipping kernel (not neura target)\n";
+        return;
+      }
+
+      llvm::errs() << "[InsertDataMovPass] Processing kernel...\n";
+
+      Region &kernel_region = kernel_op.getBody();
+      if (failed(applyPatternsGreedily(kernel_region, frozen))) {
+        llvm::errs() << "[InsertDataMovPass] ❌ Failed to apply patterns\n";
+        signalPassFailure();
+      } else {
+        llvm::errs() << "[InsertDataMovPass] ✅ Successfully processed\n";
+      }
+    });
+
+    llvm::errs() << "[InsertDataMovPass] ✅ Pass complete\n";
   }
 };
 } // namespace
diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index 700c2b4d..d8b7ef57 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -157,12 +157,11 @@ struct MapToAcceleratorPass
   }
 
   // Assigns unique dfg_id to all operations in SSA topological order.
-  void assignDfgIds(func::FuncOp func) {
+  void assignDfgIdsInRegion(Region &region, int &next_id) {
     // Uses existing topological sort to get all operations in order.
-    std::vector<Operation *> sorted_ops = getTopologicallySortedOps(func);
+    std::vector<Operation *> sorted_ops = getTopologicallySortedOps(region);
 
-    auto ctx = func.getContext();
-    int next_id = 0;
+    auto ctx = region.getContext();
 
     // Assigns ID to each operation in topological order.
     for (Operation *op : sorted_ops) {
@@ -177,6 +176,169 @@ struct MapToAcceleratorPass
              << " dfg_id(s) in total\n";
   }
 
+  // Generic mapping routine that works for both function and kernel mapping.
+  template <typename OpType>
+  bool mapRegion(OpType op, Region &region, Architecture &architecture,
+                 Mapping *mapping_strategy, bool is_spatial_only,
+                 int max_ctrl_mem_items,
+                 const std::string &resolved_mapping_mode,
+                 const std::string &resolved_mapping_strategy) {
+    // Checks steering mode compatibility with the architecture.
+    auto dataflow_mode_attr =
+        op->template getAttrOfType<StringAttr>(attr::kDataflowMode);
+    bool is_steering_mode =
+        (dataflow_mode_attr &&
+         dataflow_mode_attr.getValue() == attr::val::kModeSteering);
+    if (is_steering_mode) {
+      if (!is_spatial_only) {
+        op.emitError()
+            << "Steering mode mapping only supports spatial-only mapping mode.";
+        return false;
+      }
+    }
+
+    // Collects and reports recurrence cycles found in the region.
+    auto recurrence_cycles = collectRecurrenceCycles(region);
+    std::set<Operation *> critical_ops;
+    RecurrenceCycle *longest = nullptr;
+    int rec_mii = 1;
+    for (auto &cycle : recurrence_cycles) {
+      llvm::outs() << "[DEBUG] Recurrence cycle (length " << cycle.length
+                   << "):\n";
+      for (Operation *op : cycle.operations) {
+        critical_ops.insert(op);
+        llvm::outs() << "  " << *op << "\n";
+      }
+      if (!longest || cycle.length > longest->length) {
+        longest = &cycle;
+      }
+    }
+
+    if (longest) {
+      llvm::outs() << "[MapToAcceleratorPass] Longest recurrence cycle (length "
+                   << longest->length << "):\n";
+      for (Operation *op : longest->operations) {
+        op->print(llvm::outs()), llvm::outs() << "\n";
+      }
+      rec_mii = longest->length;
+    } else {
+      rec_mii = 1; // No recurrence cycles found, set MII to 1.
+    }
+
+    int res_mii = calculateResMii(region, architecture);
+
+    const int possible_min_ii = std::max(rec_mii, res_mii);
+    const int max_ii =
+        max_ctrl_mem_items; // Uses the YAML config (defaults to 20 if unspecified).
+
+    std::vector<Operation *> topologically_sorted_ops =
+        getTopologicallySortedOps(region);
+    if (topologically_sorted_ops.empty()) {
+      assert(false && "Mapping aborted due to empty op list.");
+    }
+
+    // Filters out operations inside fused_op regions.
+    // Maps only the fused_op itself, not the operations within its region.
+    std::vector<Operation *> filtered_ops;
+    int skipped_count = 0;
+    for (Operation *op : topologically_sorted_ops) {
+      Operation *parent_op = op->getParentOp();
+      // Checks if the parent is a fused_op by its operation name.
+      if (parent_op &&
+          parent_op->getName().getStringRef().contains(attr::val::kOpFused)) {
+        // Skips operations inside the fused_op region.
+        llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: "
+                     << *op << "\n";
+        skipped_count++;
+        continue;
+      }
+      filtered_ops.push_back(op);
+    }
+    topologically_sorted_ops = std::move(filtered_ops);
+
+    if (skipped_count > 0) {
+      llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count
+                   << " operations inside fused_op regions\n";
+    }
+
+    for (Operation *op : topologically_sorted_ops) {
+      llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " << *op
+                   << "\n";
+    }
+    std::vector<std::vector<Operation *>> level_buckets =
+        getOpsInAlapLevels(topologically_sorted_ops, critical_ops);
+    for (int level = 0; level < static_cast<int>(level_buckets.size());
+         ++level) {
+      llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level
+                   << ": " << level_buckets[level].size() << " ops\n";
+      for (Operation *op : level_buckets[level]) {
+        llvm::outs() << "  " << *op << "\n";
+      }
+    }
+    std::vector<std::pair<Operation *, int>> sorted_ops_with_alap_levels =
+        flatten_level_buckets(level_buckets, critical_ops);
+    for (const auto &[op, level] : sorted_ops_with_alap_levels) {
+      llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op
+                   << " (ALAP level: " << level << ")\n";
+    }
+    for (int ii = possible_min_ii; ii <= max_ii; ++ii) {
+      llvm::errs() << "[MapToAcceleratorPass] Start mapping with target II of "
+                   << ii << "\n";
+      // Creates a mapping state for the current II.
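[Editor's note] Worked example of the II search that follows: with rec_mii = 2 and res_mii = 1, possible_min_ii = max(2, 1) = 2, so the first MappingState is built for II = 2; each failed attempt retries at II + 1 until max_ii (max_ctrl_mem_items) is exhausted. The MAPPED FileCheck lines of the fir test later in this series record exactly this outcome: rec_mii = 2, res_mii = 1, compiled_ii = 2.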
+ MappingState mapping_state(architecture, ii, is_spatial_only); + if (mapping_strategy->map(sorted_ops_with_alap_levels, critical_ops, + architecture, mapping_state)) { + // success + if (dumpMappingTable) { + // logs to stderr + mapping_state.dumpOpToLocs(); + } + mapping_state.encodeMappingState(); + + // Assigns unique dfg_id to all operations in SSA topological order. + int next_id = 0; + assignDfgIdsInRegion(region, next_id); + + // Sets the mapping_info attribute on the function. + auto ctx = op->getContext(); + SmallVector mapping_attrs; + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kXTiles), + IntegerAttr::get(IntegerType::get(ctx, 32), + architecture.getPerCgraColumns()))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kYTiles), + IntegerAttr::get(IntegerType::get(ctx, 32), + architecture.getPerCgraRows()))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kMappingStrategy), + StringAttr::get(ctx, resolved_mapping_strategy))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kMappingMode), + StringAttr::get(ctx, resolved_mapping_mode))); + mapping_attrs.push_back( + NamedAttribute(StringAttr::get(ctx, attr::kCompiledII), + IntegerAttr::get(IntegerType::get(ctx, 32), ii))); + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(ctx, attr::kRecMII), + IntegerAttr::get(IntegerType::get(ctx, 32), rec_mii))); + mapping_attrs.push_back(NamedAttribute( + StringAttr::get(ctx, attr::kResMII), + IntegerAttr::get(IntegerType::get(ctx, 32), res_mii))); + DictionaryAttr mapping_info = DictionaryAttr::get(ctx, mapping_attrs); + + op->setAttr(attr::kMappingInfo, mapping_info); + return true; + } + llvm::errs() << "[MapToAcceleratorPass] Mapping failed for target II of " + << ii << "\n"; + mapping_state.dumpOpToLocs(); + } + llvm::errs() + << "[MapToAcceleratorPass] Mapping failed for all target II values.\n"; + return false; + } + void runOnOperation() override { ModuleOp module = getOperation(); llvm::errs() << "[MapToAcceleratorPass] Starting mapping pass...\n"; @@ -193,173 +356,103 @@ struct MapToAcceleratorPass const Architecture &architecture = mlir::neura::getArchitecture(); - module.walk([&](func::FuncOp func) { - // Skips functions not targeting the neura accelerator. - auto accel_attr = - func->getAttrOfType(accel::kAcceleratorAttr); - if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + std::string architecture_spec_file = mlir::neura::getArchitectureSpecFile(); + int multi_cgra_rows = kMultiCgraDefaultRows; + int multi_cgra_columns = kMultiCgraDefaultColumns; + int per_cgra_rows = kPerCgraDefaultRows; + int per_cgra_columns = kPerCgraDefaultColumns; + int max_ctrl_mem_items = kDefaultMaxCtrlMemItems; + mlir::neura::TileDefaults tile_defaults; + std::vector tile_overrides; + mlir::neura::LinkDefaults link_defaults; + std::vector link_overrides; + mlir::neura::BaseTopology multi_cgra_base_topology = + mlir::neura::BaseTopology::MESH; + mlir::neura::BaseTopology per_cgra_base_topology = + mlir::neura::BaseTopology::MESH; + + if (!architecture_spec_file.empty()) { + + // Use LLVM YAML parser to validate the YAML syntax (no mapping yet) + llvm::ErrorOr> buffer_or_err = + llvm::MemoryBuffer::getFile(architecture_spec_file); + if (!buffer_or_err) { + llvm::errs() << "[MapToAcceleratorPass] Failed to open architecture " + "specification file: " + << architecture_spec_file << "\n"; return; } - // Checks the dataflow IR mode. 
- auto dataflow_mode_attr = - func->getAttrOfType(attr::kDataflowMode); - bool is_steering_mode = - (dataflow_mode_attr && - dataflow_mode_attr.getValue() == attr::val::kModeSteering); - - // If steering mode, enforce spatial-only mapping. - if (is_steering_mode) { - if (!is_spatial_only) { - func.emitError() << "Steering IR mode requires spatial-only mapping, " - << "but got mapping mode: " << resolved_mapping_mode; - signalPassFailure(); - return; - } - llvm::errs() << "[MapToAcceleratorPass] Using spatial-only mapping for " - "steering mode function: " - << func.getName() << "\n"; - } + llvm::SourceMgr sm; + sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc()); + llvm::yaml::Stream yaml_stream( + sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm); - // Collects and reports recurrence cycles found in the function. - auto recurrence_cycles = collectRecurrenceCycles(func); - std::set critical_ops; - RecurrenceCycle *longest = nullptr; - int rec_mii = 1; - for (auto &cycle : recurrence_cycles) { - llvm::outs() << "[DEBUG] Recurrence cycle (length " << cycle.length - << "):\n"; - for (Operation *op : cycle.operations) { - critical_ops.insert(op); - llvm::outs() << " " << *op << "\n"; - } - if (!longest || cycle.length > longest->length) { - longest = &cycle; - } + bool parse_failed = false; + llvm::yaml::Document &yaml_doc = *yaml_stream.begin(); + (void)yaml_doc; // ensure document is created + if (yaml_stream.failed()) { + parse_failed = true; } - if (longest) { - llvm::outs() - << "[MapToAcceleratorPass] Longest recurrence cycle (length " - << longest->length << "):\n"; - for (Operation *op : longest->operations) { - op->print(llvm::outs()), llvm::outs() << "\n"; - } - rec_mii = longest->length; - } else if (!longest) { - rec_mii = 1; // No recurrence cycles found, set MII to 1. + if (parse_failed) { + llvm::errs() << "[MapToAcceleratorPass] YAML parse error in: " + << architecture_spec_file << "\n"; + return; } - int res_mii = calculateResMii(func, architecture); + // Parses YAML configuration. + if (!parseArchitectureYaml( + yaml_doc, multi_cgra_rows, multi_cgra_columns, + multi_cgra_base_topology, per_cgra_rows, per_cgra_columns, + per_cgra_base_topology, max_ctrl_mem_items, tile_defaults, + tile_overrides, link_defaults, link_overrides)) { + return; + } + } else { + llvm::errs() << "[MapToAcceleratorPass] No architecture specification " + "file provided.\n"; + } - const int possible_min_ii = std::max(rec_mii, res_mii); - const int max_allowed_ii = architecture.getMaxCtrlMemItems(); + // Creates architecture. + Architecture architecture( + multi_cgra_rows, multi_cgra_columns, multi_cgra_base_topology, + per_cgra_rows, per_cgra_columns, per_cgra_base_topology, tile_defaults, + tile_overrides, link_defaults, link_overrides); - std::vector topologically_sorted_ops = - getTopologicallySortedOps(func); - if (topologically_sorted_ops.empty()) { - llvm::errs() - << "[MapToAcceleratorPass] No operations to map in function " - << func.getName() << "\n"; - assert(false && "Mapping aborted due to empty op list."); + // Maps kernels. 
+ module.walk([&](neura::KernelOp kernel_op) { + auto accel_attr = + kernel_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; } - // Filter out operations inside fused_op regions - // Only map the fused_op itself, not the operations within its region - std::vector filtered_ops; - int skipped_count = 0; - for (Operation *op : topologically_sorted_ops) { - Operation *parent_op = op->getParentOp(); - // Check if parent is a fused_op by checking operation name - if (parent_op && - parent_op->getName().getStringRef().contains(attr::val::kOpFused)) { - // Skip operations inside fused_op region - llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: " - << *op << "\n"; - skipped_count++; - continue; - } - filtered_ops.push_back(op); + Region &kernel_region = kernel_op.getBody(); + if (!mapRegion(kernel_op, kernel_region, architecture, + mapping_strategy.get(), is_spatial_only, + max_ctrl_mem_items, resolved_mapping_mode, + resolved_mapping_strategy)) { + llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n"; + signalPassFailure(); } - topologically_sorted_ops = std::move(filtered_ops); + }); - if (skipped_count > 0) { - llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count - << " operations inside fused_op regions\n"; + // Maps functions. + module.walk([&](func::FuncOp func_op) { + auto accel_attr = + func_op->getAttrOfType(accel::kAcceleratorAttr); + if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) { + return; } - for (Operation *op : topologically_sorted_ops) { - llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " - << *op << "\n"; - } - std::vector> level_buckets = - getOpsInAlapLevels(topologically_sorted_ops, critical_ops); - for (int level = 0; level < static_cast(level_buckets.size()); - ++level) { - llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level - << ": " << level_buckets[level].size() << " ops\n"; - for (Operation *op : level_buckets[level]) { - llvm::outs() << " " << *op << "\n"; - } - } - std::vector> sorted_ops_with_alap_levels = - flatten_level_buckets(level_buckets, critical_ops); - for (const auto &[op, level] : sorted_ops_with_alap_levels) { - llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op - << " (ALAP level: " << level << ")\n"; - } - // assert(false); - for (int ii = possible_min_ii; ii <= max_allowed_ii; ++ii) { - llvm::errs() - << "[MapToAcceleratorPass] Start mapping with target II of " << ii - << "\n"; - // Creates a mapping state for the current II. - MappingState mapping_state(architecture, ii, is_spatial_only); - if (mapping_strategy->map(sorted_ops_with_alap_levels, critical_ops, - architecture, mapping_state)) { - // success - if (dumpMappingTable) { - // logs to stderr - mapping_state.dumpOpToLocs(); - } - mapping_state.encodeMappingState(); - - // Assigns unique dfg_id to all operations in SSA topological order. - assignDfgIds(func); - - // Sets the mapping_info attribute on the function. 
- auto ctx = func.getContext(); - SmallVector mapping_attrs; - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kXTiles), - IntegerAttr::get(IntegerType::get(ctx, 32), - architecture.getPerCgraColumns()))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kYTiles), - IntegerAttr::get(IntegerType::get(ctx, 32), - architecture.getPerCgraRows()))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kMappingStrategy), - StringAttr::get(ctx, resolved_mapping_strategy))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kMappingMode), - StringAttr::get(ctx, resolved_mapping_mode))); - mapping_attrs.push_back( - NamedAttribute(StringAttr::get(ctx, attr::kCompiledII), - IntegerAttr::get(IntegerType::get(ctx, 32), ii))); - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kRecMII), - IntegerAttr::get(IntegerType::get(ctx, 32), rec_mii))); - mapping_attrs.push_back(NamedAttribute( - StringAttr::get(ctx, attr::kResMII), - IntegerAttr::get(IntegerType::get(ctx, 32), res_mii))); - DictionaryAttr mapping_info = DictionaryAttr::get(ctx, mapping_attrs); - - func->setAttr(attr::kMappingInfo, mapping_info); - break; - } - llvm::errs() << "[DEBUG] mapping failed for II = " << ii << "\n"; - mapping_state.dumpOpToLocs(); // logs to stderr + Region &func_region = func_op.getBody(); + + if (!mapRegion(func_op, func_region, architecture, mapping_strategy.get(), + is_spatial_only, max_ctrl_mem_items, resolved_mapping_mode, + resolved_mapping_strategy)) { + llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n"; + signalPassFailure(); } }); } diff --git a/test/neura/interpreter/lower_and_interpret.mlir b/test/neura/interpreter/lower_and_interpret.mlir index 9d50c317..2559b89b 100644 --- a/test/neura/interpreter/lower_and_interpret.mlir +++ b/test/neura/interpreter/lower_and_interpret.mlir @@ -19,7 +19,7 @@ // RUN: %t-out.bin > %t-dumped_output.txt -// RUN: mlir-neura-opt --lower-arith-to-neura --insert-data-mov %s \ +// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --insert-data-mov %s \ // RUN: -o %t-neura.mlir // RUN: neura-interpreter %t-neura.mlir >> %t-dumped_output.txt diff --git a/test/neura/interpreter/lower_and_interpret_subf.mlir b/test/neura/interpreter/lower_and_interpret_subf.mlir index a91bed9d..9670adeb 100644 --- a/test/neura/interpreter/lower_and_interpret_subf.mlir +++ b/test/neura/interpreter/lower_and_interpret_subf.mlir @@ -19,7 +19,7 @@ // RUN: %t-out.bin > %t-dumped_output.txt -// RUN: mlir-neura-opt --lower-arith-to-neura --insert-data-mov %s \ +// RUN: mlir-neura-opt --assign-accelerator --lower-arith-to-neura --insert-data-mov %s \ // RUN: -o %t-neura.mlir // RUN: neura-interpreter %t-neura.mlir >> %t-dumped_output.txt From 6f7084d0d722f1b9ffba5046f8de6740f97bc851 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 16:03:05 +0800 Subject: [PATCH 13/25] enable kernel mapping --- .../Transforms/CanonicalizeCastPass.cpp | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp index 18bde2b3..5b06f085 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp @@ -111,31 +111,39 @@ struct CanonicalizeCastPass void runOnOperation() override { auto module_op = getOperation(); - module_op.walk([&](Operation *op) { - Region *region = nullptr; - if (auto 
func_op = dyn_cast<func::FuncOp>(op)) {
-        auto accel_attr =
-            func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
-        if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
-          return;
-        }
-        region = &func_op.getBody();
-      } else if (auto llvm_func = dyn_cast<LLVM::LLVMFuncOp>(op)) {
-        auto accel_attr =
-            llvm_func->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
-        if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
-          return;
-        }
-        region = &llvm_func.getBody();
-      } else {
+    // Processes functions.
+    module_op.walk([&](func::FuncOp func_op) {
+      auto accel_attr =
+          func_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
+        return;
+      }
+      Region &func_region = func_op.getBody();
+
+      if (func_region.empty()) {
+        return;
+      }
+
+      if (failed(canonicalizeCast(func_region))) {
+        signalPassFailure();
+        return;
+      }
+    });
+
+    // Processes neura.kernel ops.
+    module_op.walk([&](neura::KernelOp kernel_op) {
+      auto accel_attr =
+          kernel_op->getAttrOfType<StringAttr>(accel::kAcceleratorAttr);
+      if (!accel_attr || accel_attr.getValue() != accel::kNeuraTarget) {
         return;
       }
+      Region &kernel_region = kernel_op.getBody();
 
-      if (!region || region->empty()) {
+      if (kernel_region.empty()) {
         return;
       }
 
-      if (failed(canonicalizeCast(*region))) {
+      if (failed(canonicalizeCast(kernel_region))) {
         signalPassFailure();
         return;
       }

From 7a06474ada76788e2289740c7d65139d2cfb1ccb Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 24 Jan 2026 19:14:19 +0800
Subject: [PATCH 14/25] distinguish iter_arg_init in fold-constant pass

---
 include/NeuraDialect/NeuraPasses.h            |   1 -
 .../Transforms/MapToAcceleratorPass.cpp       |   3 +
 .../HwAgnosticOpt/FoldConstantPass.cpp        |  17 +-
 .../TransformCtrlToDataFlowPass.cpp           |   2 +
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ------------
 test/multi-cgra/kernel_mapping/fir/fir.mlir   | 202 ++++++++++++++++++
 .../kernel_with_yield/kernel_with_yield.mlir  |  38 ----
 .../kernel_without_yield.mlir                 |  30 ---
 .../multi-kernel/multi-kernel.mlir            |  89 --------
 .../irregular-loop/irregular-loop.mlir        |   6 +-
 10 files changed, 226 insertions(+), 304 deletions(-)
 delete mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
 create mode 100644 test/multi-cgra/kernel_mapping/fir/fir.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir
 delete mode 100644 test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir

diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 0b77521d..75ddbd24 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -30,7 +30,6 @@ std::unique_ptr<Pass> createCanonicalizeLiveInPass();
 std::unique_ptr<Pass> createPromoteInputArgToConstPass();
 std::unique_ptr<Pass> createTransformToSteerControlPass();
 std::unique_ptr<Pass> createRemovePredicatedTypePass();
-std::unique_ptr<Pass> createWrapLoopInKernelPass();
 
 // ====================================
 // Optimization Passes
diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index d8b7ef57..cfe14543 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -225,6 +225,9 @@ struct MapToAcceleratorPass
       rec_mii = 1; // No recurrence cycles found, set MII to 1.
} + llvm::errs() << "[MapToAcceleratorPass] Calculated Recurrence MII: " + << rec_mii << "\n"; + int res_mii = calculateResMii(region, architecture); const int possible_min_ii = std::max(rec_mii, res_mii); diff --git a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp index c50519cf..105f3635 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp +++ b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp @@ -1,6 +1,6 @@ +#include "NeuraDialect/NeuraAttributes.h" #include "NeuraDialect/NeuraOps.h" #include "NeuraDialect/NeuraTypes.h" -#include "NeuraDialect/NeuraAttributes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -22,6 +22,8 @@ using namespace mlir; #include "NeuraDialect/NeuraPasses.h.inc" namespace { +// Attribute name to mark iter_arg init constants. +constexpr const char *kIterArgInitAttr = "is_iter_arg_init"; // ========================================= // Helper Functions @@ -35,6 +37,16 @@ bool isOriginConstantOp(Value value) { return false; } + // Skips constants marked as iter_arg_init. + if (def_op->hasAttr(kIterArgInitAttr)) { + if (auto bool_attr = def_op->getAttrOfType(kIterArgInitAttr)) { + if (bool_attr.getValue()) { + // This constant is an iter_arg_init, should not be folded. + return false; + } + } + } + // Checks if the result type is the original type or the predicated type. Type result_type = value.getType(); if (isa(result_type)) { @@ -434,7 +446,8 @@ struct FuseStoreIndexedConstantPattern LogicalResult matchAndRewrite(neura::StoreIndexedOp op, PatternRewriter &rewriter) const override { // Checks if already folded. 
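[Editor's note] On the is_iter_arg_init guard introduced above: the intended protocol is that whichever pass materializes a loop-carried init constant tags it, and this folder then refuses to treat the tagged op as a plain foldable constant. In sketch form (the attribute name is kIterArgInitAttr as defined above; both call sites are illustrative, though the removeAttr side matches the TransformCtrlToDataFlowPass hunk later in this patch):

// Producer side: tags the init constant so fold-constant leaves it alone.
init_const->setAttr(kIterArgInitAttr, builder.getBoolAttr(true));
// Consumer side: drops the marker once the iter_arg has been wired through
// reserve/phi/ctrl_mov.
init_const->removeAttr(kIterArgInitAttr);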
- if (op->hasAttr(neura::attr::kLhsValue) || op->hasAttr(neura::attr::kRhsValue)) { + if (op->hasAttr(neura::attr::kLhsValue) || + op->hasAttr(neura::attr::kRhsValue)) { return failure(); } diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 556d6181..14257c8c 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -215,6 +215,8 @@ void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block, reserve_op.getResult()); iter_arg_final_values.push_back(feedback_value); + + init_const->removeAttr(kIterArgInitAttr); llvm::errs() << "[iter_args] Created iter_arg with grant_once\n"; } diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp deleted file mode 100644 index ac664382..00000000 --- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "NeuraDialect/NeuraDialect.h" -#include "NeuraDialect/NeuraOps.h" -#include "NeuraDialect/NeuraPasses.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Types.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/TypeID.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/STLExtras.h" -#include - -using namespace mlir; - -namespace { - -static bool isInnermostLoop(affine::AffineForOp for_op) { - bool has_nested_loops = false; - for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; }); - return !has_nested_loops; -} - -// Wraps an innermost affine for loop in a neura.kernel operation. -static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op, - OpBuilder &builder, - unsigned &kernel_id) { - Location loc = for_op.getLoc(); - - // Collects values that need to be captured by the kernel. - llvm::SetVector captured_values; - getUsedValuesDefinedAbove(for_op.getRegion(), captured_values); - - // Checks if the loop has output values. - bool has_outputs = !for_op.getResults().empty(); - - // Creates the neura.kernel operation. - builder.setInsertionPoint(for_op); - SmallVector inputs(captured_values.begin(), captured_values.end()); - SmallVector input_types; - for (Value val : inputs) { - input_types.push_back(val.getType()); - } - - neura::KernelOp kernel_op = builder.create( - loc, /*output_types=*/for_op->getResultTypes(), - /*inputs=*/inputs); - - // Sets kernel name. - std::string kernel_name = "kernel_" + std::to_string(kernel_id++); - kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name)); - - // Creats the kernel body block with arguments for captured values. - Block *kernel_body = new Block(); - kernel_op.getBody().push_back(kernel_body); - - // Replaces uses of the original loop's results with kernel results. - if (has_outputs) { - for (auto [orig_result, kernel_result] : - llvm::zip(for_op->getResults(), kernel_op.getResults())) { - orig_result.replaceAllUsesWith(kernel_result); - } - } - - // Moves the loop directly in to the kernel body. - builder.setInsertionPointToStart(kernel_body); - for_op->moveBefore(kernel_body, kernel_body->end()); - - builder.setInsertionPointToEnd(kernel_body); - // Adds yield operation with proper operands. - if (has_outputs) { - // If the loop has outputs, yield the loop results. 
- SmallVector yield_operands(for_op.getResults()); - builder.create(loc, ValueRange{}, yield_operands); - } else { - // If the loop has no outputs, create an empty yield. - builder.create(loc); - } - - return success(); -} - -struct WrapLoopInKernelPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass) - - StringRef getArgument() const override { return "wrap-loop-in-kernel"; } - StringRef getDescription() const override { - return "Wraps loops in Neura kernel operations."; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - // Skips if function already has kerenls. - bool has_kernels = false; - func_op.walk([&](neura::KernelOp) { has_kernels = true; }); - if (has_kernels) { - return; - } - - // Skips main function. - if (func_op.getName() == "main") { - return; - } - - // Collects all innermost affine for loops in the function. - // TODO: Support more kernel wrapping strategies. - SmallVector innermost_loops; - func_op.walk([&](affine::AffineForOp for_op) { - if (isInnermostLoop(for_op)) { - innermost_loops.push_back(for_op); - } - }); - - if (innermost_loops.empty()) { - return; - } - - // Wraps each innermost affine for loop in a neura.kernel operation. - // TODO: Support more kernel wrapping strategies. - OpBuilder builder(func_op->getContext()); - unsigned kernel_id = 0; - for (affine::AffineForOp loop : innermost_loops) { - if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) { - signalPassFailure(); - return; - } - } - } -}; -} // namespace - -std::unique_ptr mlir::neura::createWrapLoopInKernelPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir new file mode 100644 index 00000000..5924f46c --- /dev/null +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -0,0 +1,202 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: -o %t.taskflow.mlir +// RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: -o %t.canonicalized.mlir +// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: 
--convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + + + +module attributes {} { + func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = affine.for %arg3 = 0 to 32 iter_args(%arg4 = %c0_i32) -> (i32) { + %1 = affine.load %arg0[%arg3] : memref + %2 = affine.load %arg2[%arg3] : memref + %3 = arith.muli %1, %2 : i32 + %4 = arith.addi %arg4, %3 : i32 + affine.yield %4 : i32 + } + return %0 : i32 + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { +// TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: %3 = arith.muli %1, %2 : i32 +// TASKFLOW-NEXT: %4 = arith.addi %arg7, %3 : i32 +// TASKFLOW-NEXT: affine.yield %4 : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// TASKFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// TASKFLOW-NEXT: return %value_outputs : i32 +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// CANONICALIZE: module { +// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 +// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : 
index, step = 1 : index, upper_bound = 32 : index} : index +// CANONICALIZE-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: i32): +// CANONICALIZE-NEXT: %2 = memref.load %arg3[%arg6] : memref +// CANONICALIZE-NEXT: %3 = memref.load %arg4[%arg6] : memref +// CANONICALIZE-NEXT: %4 = arith.muli %2, %3 : i32 +// CANONICALIZE-NEXT: %5 = arith.addi %arg7, %4 : i32 +// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) +// CANONICALIZE-NEXT: }) : (index, i32) -> i32 +// CANONICALIZE-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> i32 +// CANONICALIZE-NEXT: return %value_outputs : i32 +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: } + +// KERNEL: module { +// KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 +// KERNEL-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { +// KERNEL-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// KERNEL-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %3 = memref.load %arg6[%2] : memref +// KERNEL-NEXT: %4 = memref.load %arg7[%2] : memref +// KERNEL-NEXT: %5 = arith.muli %3, %4 : i32 +// KERNEL-NEXT: %6 = arith.addi %arg8, %5 : i32 +// KERNEL-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) +// KERNEL-NEXT: } : i32 +// KERNEL-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// KERNEL-NEXT: }) : (memref, memref, i32) -> i32 +// KERNEL-NEXT: return %value_outputs : i32 +// KERNEL-NEXT: } +// KERNEL-NEXT: } + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// NEURA-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %3 = neura.load_indexed %arg6[%2 : index] memref : i32 +// NEURA-NEXT: %4 = neura.load_indexed %arg7[%2 : index] memref : i32 +// NEURA-NEXT: %5 = "neura.mul"(%3, %4) : (i32, i32) -> i32 +// NEURA-NEXT: %6 = "neura.add"(%arg8, %5) : (i32, i32) -> i32 +// NEURA-NEXT: neura.yield 
iter_args_next(%6 : i32) results(%6 : i32) +// NEURA-NEXT: } : i32 +// NEURA-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: return %value_outputs : i32 +// NEURA-NEXT: } +// NEURA-NEXT: } + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %4 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %5 = "neura.mul"(%3, %4) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %6 = "neura.add"(%5) {lhs_value = "%iter_arg_init0"} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %7 = neura.extract_predicate %2 : !neura.data -> !neura.data +// DATAFLOW-NEXT: %8 = "neura.not"(%7) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %9 = neura.grant_predicate %6, %8 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %9 : !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir deleted file mode 100644 index ad24eac4..00000000 --- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir +++ /dev/null @@ -1,38 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. 
-// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: -o %t-wrapped.mlir -// RUN: FileCheck %s --input-file=%t-wrapped.mlir - -module attributes {} { - func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { - %c0_i32 = arith.constant 0 : i32 - %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { - %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { - %2 = affine.load %arg0[%arg1, %arg3] : memref - %3 = arith.addi %arg4, %2 : i32 - affine.yield %3 : i32 - } - affine.yield %1 : i32 - } - return %0 : i32 - } -} - - // CHECK: module { - // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 - // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { - // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { - // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref - // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32 - // CHECK-NEXT: affine.yield %4 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: neura.yield %2 : i32 - // CHECK-NEXT: } : i32 - // CHECK-NEXT: affine.yield %1 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: return %0 : i32 - // CHECK-NEXT: } - // CHECK-NEXT: } diff --git a/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir b/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir deleted file mode 100644 index 0775cf19..00000000 --- a/test/multi-cgra/neura-kernel/kernel_without_yield/kernel_without_yield.mlir +++ /dev/null @@ -1,30 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. -// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: | FileCheck %s - -module attributes {} { - func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { - affine.for %arg2 = 0 to 128 { - affine.for %arg3 = 0 to 128 { - %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref - affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref - } - } - return - } -} - - // CHECK: module { - // CHECK-NEXT: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: affine.for %arg2 = 0 to 128 { - // CHECK-NEXT: neura.kernel ins(%arg0, %arg1, %arg2 : memref, memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: affine.for %arg3 = 0 to 128 { - // CHECK-NEXT: %0 = affine.load %arg0[0, 0, 0, 0, 0, %arg3] : memref - // CHECK-NEXT: affine.store %0, %arg1[0, 0, %arg2, 0, 0, %arg3] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT: } - // CHECK-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir b/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir deleted file mode 100644 index 12e2846e..00000000 --- a/test/multi-cgra/neura-kernel/multi-kernel/multi-kernel.mlir +++ /dev/null @@ -1,89 +0,0 @@ -// Wraps the innermost loop within neura.kernel operation. -// This function is a convolution followed by ReLU activation. 
- -// RUN: mlir-neura-opt %s \ -// RUN: --wrap-loop-in-kernel \ -// RUN: | FileCheck %s - -module attributes {} { - func.func @_Z17conv3x3_then_reluPA32_A32_KfPA3_A3_A3_S_PS_PA30_A30_fSA_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) attributes {llvm.linkage = #llvm.linkage} { - %cst = arith.constant 0.000000e+00 : f32 - affine.for %arg5 = 0 to 64 { - affine.for %arg6 = 0 to 30 { - affine.for %arg7 = 0 to 30 { - %0 = affine.load %arg2[%arg5] : memref - %1 = affine.for %arg8 = 0 to 3 iter_args(%arg9 = %0) -> (f32) { - %2 = affine.for %arg10 = 0 to 3 iter_args(%arg11 = %arg9) -> (f32) { - %3 = affine.for %arg12 = 0 to 3 iter_args(%arg13 = %arg11) -> (f32) { - %4 = affine.load %arg0[%arg8, %arg6 + %arg10, %arg7 + %arg12] : memref - %5 = affine.load %arg1[%arg5, %arg8, %arg10, %arg12] : memref - %6 = arith.mulf %4, %5 : f32 - %7 = arith.addf %arg13, %6 : f32 - affine.yield %7 : f32 - } - affine.yield %3 : f32 - } - affine.yield %2 : f32 - } - affine.store %1, %arg3[%arg5, %arg6, %arg7] : memref - } - } - } - affine.for %arg5 = 0 to 64 { - affine.for %arg6 = 0 to 30 { - affine.for %arg7 = 0 to 30 { - %0 = affine.load %arg3[%arg5, %arg6, %arg7] : memref - %1 = arith.cmpf ogt, %0, %cst : f32 - %2 = arith.select %1, %0, %cst : f32 - affine.store %2, %arg4[%arg5, %arg6, %arg7] : memref - } - } - } - return - } -} - - - // CHECK: module { - // CHECK-NEXT: func.func @_Z17conv3x3_then_reluPA32_A32_KfPA3_A3_A3_S_PS_PA30_A30_fSA_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref) attributes {llvm.linkage = #llvm.linkage} { - // CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 - // CHECK-NEXT: affine.for %arg5 = 0 to 64 { - // CHECK-NEXT: affine.for %arg6 = 0 to 30 { - // CHECK-NEXT: affine.for %arg7 = 0 to 30 { - // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref - // CHECK-NEXT: %1 = affine.for %arg8 = 0 to 3 iter_args(%arg9 = %0) -> (f32) { - // CHECK-NEXT: %2 = affine.for %arg10 = 0 to 3 iter_args(%arg11 = %arg9) -> (f32) { - // CHECK-NEXT: %3 = neura.kernel ins(%arg0, %arg8, %arg6, %arg10, %arg7, %arg1, %arg5 : memref, index, index, index, index, memref, index) attributes {kernel_name = "kernel_0"} { - // CHECK-NEXT: %4 = affine.for %arg12 = 0 to 3 iter_args(%arg13 = %arg11) -> (f32) { - // CHECK-NEXT: %5 = affine.load %arg0[%arg8, %arg6 + %arg10, %arg7 + %arg12] : memref - // CHECK-NEXT: %6 = affine.load %arg1[%arg5, %arg8, %arg10, %arg12] : memref - // CHECK-NEXT: %7 = arith.mulf %5, %6 : f32 - // CHECK-NEXT: %8 = arith.addf %arg13, %7 : f32 - // CHECK-NEXT: affine.yield %8 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: neura.yield %4 : f32 - // CHECK-NEXT: } : f32 - // CHECK-NEXT: affine.yield %3 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: affine.yield %2 : f32 - // CHECK-NEXT: } - // CHECK-NEXT: affine.store %1, %arg3[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: affine.for %arg5 = 0 to 64 { - // CHECK-NEXT: affine.for %arg6 = 0 to 30 { - // CHECK-NEXT: neura.kernel ins(%arg3, %arg5, %arg6, %cst, %arg4 : memref, index, index, f32, memref) attributes {kernel_name = "kernel_1"} { - // CHECK-NEXT: affine.for %arg7 = 0 to 30 { - // CHECK-NEXT: %0 = affine.load %arg3[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: %1 = arith.cmpf ogt, %0, %cst : f32 - // CHECK-NEXT: %2 = arith.select %1, %0, %cst : f32 - // CHECK-NEXT: affine.store %2, %arg4[%arg5, %arg6, %arg7] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT: } - // 
CHECK-NEXT: } - diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 6ce8e5e6..9d1e6f46 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -107,7 +107,7 @@ module attributes {} { // HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): // HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 // HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield outputs(%4 : i32) +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) // HYPERBLOCK-NEXT: }) : (index, i32) -> i32 // HYPERBLOCK-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () // HYPERBLOCK-NEXT: }) : (i32) -> i32 @@ -151,6 +151,8 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: } + + // CANONICALIZE: module { // CANONICALIZE-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { // CANONICALIZE-NEXT: %c2_i32 = arith.constant 2 : i32 @@ -165,7 +167,7 @@ module attributes {} { // CANONICALIZE-NEXT: ^bb0(%arg1: index, %arg2: i32): // CANONICALIZE-NEXT: %3 = arith.index_cast %arg1 : index to i32 // CANONICALIZE-NEXT: %4 = arith.addi %arg2, %3 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield outputs(%4 : i32) +// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) // CANONICALIZE-NEXT: }) : (index, i32) -> i32 // CANONICALIZE-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () // CANONICALIZE-NEXT: }) : (i32) -> i32 From 8948aee1e00a7c2932241a61f2623ba8b9687f3e Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 24 Jan 2026 19:41:01 +0800 Subject: [PATCH 15/25] add tests for e2e taskflow2neura test --- .../NeuraDialect/Architecture/Architecture.h | 10 +- .../Architecture/ArchitectureSpec.h | 11 +- test/arch_spec/architecture.yaml | 2 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 66 +++- .../loop-in-kernel/loop-in-kernel.mlir | 203 +++++++++++++ test/multi-cgra/kernel_mapping/relu/relu.mlir | 286 ++++++++++++++++++ 6 files changed, 560 insertions(+), 18 deletions(-) create mode 100644 test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir create mode 100644 test/multi-cgra/kernel_mapping/relu/relu.mlir diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 7dcbad9e..a27af7e2 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -88,7 +88,10 @@ enum OperationKind { // Data movement operations. IReserve = 38, IDataMov = 39, - ICtrlMov = 40 + ICtrlMov = 40, + // Counter operations. + ICounter = 41, + IExtractPredicate = 42 }; // Maps hardware resource names to their supported operations. @@ -135,7 +138,10 @@ static const std::map> // Predicate operations. {"grant", {IGrantPredicate, IGrantOnce, IGrantAlways}}, -}; + + // Counter operations. + {"counter", {ICounter}}, + {"extract_predicate", {IExtractPredicate}}}; //===----------------------------------------------------------------------===// // BasicResource: abstract base class for Tile, Link, etc. 
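[Editor's note] The hunk above registers the two new operation kinds (ICounter, IExtractPredicate) and exposes them through the resource table, so a tile advertises counter support simply by listing "counter" among its fu_types. A minimal sketch of the lookup this enables (the table's identifier is cut off in this diff, so resource_operations_map below is a stand-in name; the helper itself is illustrative, not part of the patch):

static bool tileSupports(const std::vector<std::string> &fu_types,
                         OperationKind kind) {
  for (const std::string &fu : fu_types) {
    auto it = resource_operations_map.find(fu);
    if (it == resource_operations_map.end())
      continue;
    // E.g. fu == "counter" now lists ICounter.
    if (llvm::is_contained(it->second, kind))
      return true;
  }
  return false;
}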
diff --git a/include/NeuraDialect/Architecture/ArchitectureSpec.h b/include/NeuraDialect/Architecture/ArchitectureSpec.h index 70ee0033..9cd4bff9 100644 --- a/include/NeuraDialect/Architecture/ArchitectureSpec.h +++ b/include/NeuraDialect/Architecture/ArchitectureSpec.h @@ -21,11 +21,12 @@ struct TileDefaults { // Default function unit types - include all supported function units // types for newbie convenience. std::vector function_units = { - "add", "mul", "div", "fadd", "fmul", - "fdiv", "logic", "cmp", "sel", "type_conv", - "shift", "vfmul", "fadd_fadd", "fmul_fadd", "grant", - "loop_control", "phi", "constant", "mem", "return", - "mem_indexed", "alloca"}; + "add", "mul", "div", "fadd", + "fmul", "fdiv", "logic", "cmp", + "sel", "type_conv", "shift", "vfmul", + "fadd_fadd", "fmul_fadd", "grant", "loop_control", + "phi", "constant", "mem", "return", + "mem_indexed", "alloca", "counter", "extract_predicate"}; }; // Structure for holding memory configuration. diff --git a/test/arch_spec/architecture.yaml b/test/arch_spec/architecture.yaml index 31235dca..cf0730b5 100644 --- a/test/arch_spec/architecture.yaml +++ b/test/arch_spec/architecture.yaml @@ -15,7 +15,7 @@ per_cgra_defaults: tile_defaults: num_registers: 32 - fu_types: ["add", "mul", "div", "fadd", "fmul", "fdiv", "logic", "cmp", "sel", "type_conv", "vfmul", "fadd_fadd", "fmul_fadd", "grant", "loop_control", "phi", "constant", "mem", "return", "mem_indexed", "alloca", "shift"] + fu_types: ["add", "mul", "div", "fadd", "fmul", "fdiv", "logic", "cmp", "sel", "type_conv", "vfmul", "fadd_fadd", "fmul_fadd", "grant", "loop_control", "phi", "constant", "mem", "return", "mem_indexed", "alloca", "shift", "counter", "extract_predicate"] link_defaults: latency: 1 diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index 5924f46c..46f62a2c 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -75,6 +75,7 @@ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ +// RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ // RUN: -o %t.mapped.mlir @@ -184,19 +185,64 @@ module attributes {} { // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { // DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// DATAFLOW-NEXT: %2 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data -// DATAFLOW-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input0"} : !neura.data -// DATAFLOW-NEXT: %4 = neura.load_indexed [%2 : !neura.data] {lhs_value = "%input1"} : !neura.data -// DATAFLOW-NEXT: %5 = "neura.mul"(%3, %4) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %6 = "neura.add"(%5) {lhs_value = "%iter_arg_init0"} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %7 = neura.extract_predicate %2 : !neura.data -> !neura.data -// DATAFLOW-NEXT: %8 = "neura.not"(%7) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %9 = neura.grant_predicate %6, %8 : !neura.data, !neura.data -> !neura.data -// 
DATAFLOW-NEXT: neura.return_value %9 : !neura.data +// DATAFLOW-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %4 = neura.phi_start %2, %3 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %5 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %6 = neura.load_indexed [%5 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %7 = neura.load_indexed [%5 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %8 = "neura.mul"(%6, %7) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.add"(%4, %8) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %9 -> %3 : !neura.data !neura.data +// DATAFLOW-NEXT: %10 = neura.extract_predicate %5 : !neura.data -> !neura.data +// DATAFLOW-NEXT: %11 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 // DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () // DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } -// DATAFLOW-NEXT: } \ No newline at end of file +// DATAFLOW-NEXT: } + +// MAPPED: module { +// MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %3 = neura.reserve {dfg_id = 1 : i32} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 42 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// 
MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}, {id = 16 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 20 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 21 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : 
(!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 15 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 17 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = "neura.data_mov"(%16) {dfg_id = 22 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 256 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 23 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} +// MAPPED-NEXT: } : i32 +// MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: return %value_outputs : i32 +// MAPPED-NEXT: } +// MAPPED-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir new file mode 100644 index 00000000..f926d548 --- /dev/null +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -0,0 +1,203 @@ +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: 
--canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-cast \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --insert-data-mov \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + +module { + func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): + %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { + ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): + %0 = affine.for %arg9 = 0 to 32 iter_args(%arg10 = %arg8) -> (i32) { + %1 = affine.load %arg6[%arg9] : memref + %2 = affine.load %arg7[%arg9] : memref + %3 = arith.muli %1, %2 : i32 + %4 = arith.addi %arg10, %3 : i32 + affine.yield %4 : i32 + } + neura.yield results(%0 : i32) + } : i32 + "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () + }) : (memref, memref, i32) -> i32 + return %value_outputs : i32 + } +} + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// NEURA-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index +// NEURA-NEXT: %2 = "neura.constant"() <{value = 32 : index}> : () -> index +// NEURA-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index +// NEURA-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64 +// NEURA-NEXT: neura.br %4, %arg8 : i64, i32 to ^bb1 +// NEURA-NEXT: ^bb1(%5: i64, %6: i32): // 2 preds: ^bb0, ^bb2 +// NEURA-NEXT: %7 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index +// NEURA-NEXT: %8 = "neura.icmp"(%7, %2) <{cmpType = "slt"}> : (index, index) -> i1 +// NEURA-NEXT: neura.cond_br %8 : i1 then to ^bb2 else to ^bb3 +// NEURA-NEXT: ^bb2: // pred: ^bb1 +// NEURA-NEXT: %9 = neura.load_indexed %arg6[%7 : index] memref : i32 +// NEURA-NEXT: %10 = neura.load_indexed %arg7[%7 : index] memref : i32 +// NEURA-NEXT: %11 = "neura.mul"(%9, %10) : (i32, i32) -> i32 +// NEURA-NEXT: %12 = "neura.add"(%6, %11) : 
(i32, i32) -> i32 +// NEURA-NEXT: %13 = "neura.add"(%7, %1) : (index, index) -> index +// NEURA-NEXT: %14 = "neura.cast"(%13) <{cast_type = "index_to_int"}> : (index) -> i64 +// NEURA-NEXT: neura.br %14, %12 : i64, i32 to ^bb1 +// NEURA-NEXT: ^bb3: // pred: ^bb1 +// NEURA-NEXT: neura.yield results(%6 : i32) +// NEURA-NEXT: } : i32 +// NEURA-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: return %value_outputs : i32 +// NEURA-NEXT: } +// NEURA-NEXT: } + + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data +// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %14 : !neura.data +// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT:} + + +// MAPPED: module { +// MAPPED-NEXT: 
func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// MAPPED-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> {dfg_id = 0 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 1 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %3 = neura.reserve {dfg_id = 2 : i32} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%1) {dfg_id = 5 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 7 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %6 = neura.reserve {dfg_id = 3 : i32} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%2) {dfg_id = 6 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.phi_start %7, %6 {dfg_id = 8 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%8) {dfg_id = 12 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {dfg_id = 13 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = 32 : index} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 11 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 1 : i32}, {id = 33 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 16 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : 
i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = neura.grant_predicate %11, %12 {dfg_id = 19 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 10 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%10) {dfg_id = 15 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 160 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = neura.grant_predicate %14, %15 {dfg_id = 18 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 64 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = "neura.not"(%17) {dfg_id = 17 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%5) {dfg_id = 9 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.data_mov"(%18) {dfg_id = 20 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 2 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = neura.grant_predicate %19, %20 {dfg_id = 25 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%21) {dfg_id = 29 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 12 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %22 : !neura.data {dfg_id = 33 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: %23 = "neura.data_mov"(%13) 
{dfg_id = 24 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %24 = neura.load_indexed [%23 : !neura.data] {dfg_id = 28 : i32, lhs_value = "%input0", mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %25 = "neura.data_mov"(%13) {dfg_id = 23 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %26 = neura.load_indexed [%25 : !neura.data] {dfg_id = 27 : i32, lhs_value = "%input1", mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %27 = "neura.data_mov"(%24) {dfg_id = 32 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %28 = "neura.data_mov"(%26) {dfg_id = 31 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %29 = "neura.mul"(%27, %28) {dfg_id = 34 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %30 = "neura.data_mov"(%16) {dfg_id = 21 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 160 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %31 = "neura.data_mov"(%29) {dfg_id = 35 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %32 = "neura.add"(%30, %31) {dfg_id = 36 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %33 = "neura.data_mov"(%13) {dfg_id = 22 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %34 = "neura.add"(%33) {dfg_id = 26 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = 1 : index} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %34 -> %6 {dfg_id = 30 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, 
resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: neura.ctrl_mov %32 -> %3 {dfg_id = 37 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: neura.yield {dfg_id = 4 : i32} +// MAPPED-NEXT: } : i32 +// MAPPED-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: return %value_outputs : i32 +// MAPPED-NEXT: } +// MAPPED-NEXT: } + + + diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir new file mode 100644 index 00000000..ebede17a --- /dev/null +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -0,0 +1,286 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: -o %t.taskflow.mlir +// RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: -o %t.canonicalized.mlir +// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: -o %t.kernel.mlir +// RUN: FileCheck %s --input-file=%t.kernel.mlir --check-prefixes=KERNEL + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: -o %t.neura.mlir +// RUN: FileCheck %s --input-file=%t.neura.mlir --check-prefixes=NEURA + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: -o %t.dataflow.mlir +// RUN: FileCheck %s --input-file=%t.dataflow.mlir --check-prefixes=DATAFLOW + +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task \ +// RUN: --canonicalize-task \ +// RUN: --classify-counters \ +// RUN: --convert-taskflow-to-neura \ +// RUN: --lower-affine \ +// RUN: --convert-scf-to-cf \ +// RUN: --convert-cf-to-llvm \ +// RUN: --assign-accelerator \ +// RUN: --lower-memref-to-neura \ +// RUN: --lower-arith-to-neura \ +// RUN: --lower-builtin-to-neura \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-input-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: 
--leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --insert-data-mov \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ +// RUN: -o %t.mapped.mlir +// RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED + +module attributes {} { + func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + affine.for %arg2 = 0 to 32 { + %0 = affine.load %arg0[%arg2] : memref + %1 = arith.cmpi sgt, %0, %c0_i32 : i32 + scf.if %1 { + %2 = affine.load %arg0[%arg2] : memref + %3 = affine.load %arg1[%arg2] : memref + %4 = arith.addi %3, %2 : i32 + affine.store %4, %arg1[%arg2] : memref + } else { + %2 = affine.load %arg1[%arg2] : memref + affine.store %2, %arg1[%arg2] : memref + } + } + return + } +} + +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// TASKFLOW-NEXT: affine.for %arg5 = 0 to 32 { +// TASKFLOW-NEXT: %0 = affine.load %arg2[%arg5] : memref +// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg4 : i32 +// TASKFLOW-NEXT: scf.if %1 { +// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg5] : memref +// TASKFLOW-NEXT: %3 = affine.load %arg3[%arg5] : memref +// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg3[%arg5] : memref +// TASKFLOW-NEXT: } else { +// TASKFLOW-NEXT: %2 = affine.load %arg3[%arg5] : memref +// TASKFLOW-NEXT: affine.store %2, %arg3[%arg5] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// TASKFLOW-NEXT: }) : (memref, memref, i32) -> memref +// TASKFLOW-NEXT: return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } + +// CANONICALIZE: module { +// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 +// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ +// CANONICALIZE-NEXT: ^bb0(%arg5: index): +// CANONICALIZE-NEXT: %1 = memref.load %arg2[%arg5] : memref +// CANONICALIZE-NEXT: %2 = arith.cmpi sgt, %1, %arg4 : i32 +// CANONICALIZE-NEXT: scf.if %2 { +// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg5] : memref +// CANONICALIZE-NEXT: %4 = memref.load %arg3[%arg5] : memref +// CANONICALIZE-NEXT: %5 = arith.addi %4, %3 : i32 +// CANONICALIZE-NEXT: memref.store %5, %arg3[%arg5] : memref +// CANONICALIZE-NEXT: } else { +// CANONICALIZE-NEXT: %3 = memref.load %arg3[%arg5] : memref +// CANONICALIZE-NEXT: memref.store %3, %arg3[%arg5] : memref +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: taskflow.hyperblock.yield +// CANONICALIZE-NEXT: }) : (index) -> () +// CANONICALIZE-NEXT: 
"taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> memref +// CANONICALIZE-NEXT: return +// CANONICALIZE-NEXT: } +// CANONICALIZE-NEXT: } + +// KERNEL: module { +// KERNEL-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 +// KERNEL-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) { +// KERNEL-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// KERNEL-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// KERNEL-NEXT: %2 = memref.load %arg5[%1] : memref +// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg6 : i32 +// KERNEL-NEXT: scf.if %3 { +// KERNEL-NEXT: %4 = memref.load %arg5[%1] : memref +// KERNEL-NEXT: %5 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: %6 = arith.addi %5, %4 : i32 +// KERNEL-NEXT: memref.store %6, %arg7[%1] : memref +// KERNEL-NEXT: } else { +// KERNEL-NEXT: %4 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: memref.store %4, %arg7[%1] : memref +// KERNEL-NEXT: } +// KERNEL-NEXT: neura.yield +// KERNEL-NEXT: } +// KERNEL-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// KERNEL-NEXT: }) : (memref, memref, i32) -> memref +// KERNEL-NEXT: return +// KERNEL-NEXT: } +// KERNEL-NEXT: } + +// NEURA: module { +// NEURA-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 +// NEURA-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// NEURA-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// NEURA-NEXT: %2 = neura.load_indexed %arg5[%1 : index] memref : i32 +// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg6) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// NEURA-NEXT: neura.cond_br %3 : i1 then to ^bb1 else to ^bb2 +// NEURA-NEXT: ^bb1: // pred: ^bb0 +// NEURA-NEXT: %4 = neura.load_indexed %arg5[%1 : index] memref : i32 +// NEURA-NEXT: %5 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %6 = "neura.add"(%5, %4) : (i32, i32) -> i32 +// NEURA-NEXT: neura.store_indexed %6 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.br to ^bb3 +// NEURA-NEXT: ^bb2: // pred: ^bb0 +// NEURA-NEXT: %7 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %7 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.br to ^bb3 +// 
NEURA-NEXT: ^bb3: // 2 preds: ^bb1, ^bb2 +// NEURA-NEXT: neura.yield +// NEURA-NEXT: } +// NEURA-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// NEURA-NEXT: }) : (memref, memref, i32) -> memref +// NEURA-NEXT: return +// NEURA-NEXT: } +// NEURA-NEXT: } + +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// DATAFLOW-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data +// DATAFLOW-NEXT: %2 = neura.load_indexed [%1 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %3 = "neura.icmp"(%2) <{cmpType = "sgt"}> {rhs_value = "%input1"} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = neura.grant_predicate %1, %3 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %5 = "neura.not"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %6 = neura.grant_predicate %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.load_indexed [%6 : !neura.data] {lhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: neura.store_indexed %7 to [%6 : !neura.data] {rhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: %8 = neura.load_indexed [%4 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %9 = neura.load_indexed [%4 : !neura.data] {lhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: %10 = "neura.add"(%9, %8) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.store_indexed %10 to [%4 : !neura.data] {rhs_value = "%input2"} : !neura.data +// DATAFLOW-NEXT: neura.yield {yield_type = "void"} +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// DATAFLOW-NEXT: }) : (memref, memref, i32) -> memref +// DATAFLOW-NEXT: return +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } + +// MAPPED: module { +// MAPPED-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 +// MAPPED-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// 
MAPPED-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// MAPPED-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 0 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %2 = "neura.data_mov"(%1) {dfg_id = 2 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {dfg_id = 5 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %4 = "neura.data_mov"(%3) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = "neura.icmp"(%4) <{cmpType = "sgt"}> {dfg_id = 7 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}], rhs_value = "%input1"} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %6 = "neura.data_mov"(%1) {dfg_id = 3 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}, {id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%5) {dfg_id = 9 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.grant_predicate %6, %7 {dfg_id = 11 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%5) {dfg_id = 8 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = "neura.not"(%9) {dfg_id = 10 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%1) {dfg_id = 4 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}, {id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 
3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 12 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = neura.grant_predicate %11, %12 {dfg_id = 16 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%13) {dfg_id = 20 : i32, mapping_locs = [{id = 129 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = neura.load_indexed [%14 : !neura.data] {dfg_id = 23 : i32, lhs_value = "%input2", mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data +// MAPPED-NEXT: %16 = "neura.data_mov"(%15) {dfg_id = 25 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%13) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.store_indexed %16 to [%17 : !neura.data] {dfg_id = 27 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}], rhs_value = "%input2"} : !neura.data +// MAPPED-NEXT: %18 = "neura.data_mov"(%8) {dfg_id = 15 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 3 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %19 = neura.load_indexed [%18 : !neura.data] {dfg_id = 18 : i32, lhs_value = "%input0", mapping_locs = [{id = 2 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %20 = "neura.data_mov"(%8) {dfg_id = 14 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = neura.load_indexed [%20 : !neura.data] {dfg_id = 17 : i32, lhs_value = "%input2", mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%21) {dfg_id = 21 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 96 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = "neura.data_mov"(%19) {dfg_id = 22 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 1 
: i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %24 = "neura.add"(%22, %23) {dfg_id = 24 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %25 = "neura.data_mov"(%24) {dfg_id = 26 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %26 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 18 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 5 : i32}, {id = 224 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.store_indexed %25 to [%26 : !neura.data] {dfg_id = 28 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}], rhs_value = "%input2"} : !neura.data +// MAPPED-NEXT: neura.yield {dfg_id = 1 : i32, yield_type = "void"} +// MAPPED-NEXT: } +// MAPPED-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () +// MAPPED-NEXT: }) : (memref, memref, i32) -> memref +// MAPPED-NEXT: return +// MAPPED-NEXT: } +// MAPPED-NEXT: } + + From b86894d0a16f6a479d3139a9016abe7953c5a001 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Thu, 22 Jan 2026 15:45:11 +0800 Subject: [PATCH 16/25] change the definition of taskflow.hyperblock.yield --- .../Transforms/WrapLoopInKernelPass.cpp | 142 ++++++++++++++++++ .../kernel_with_yield/kernel_with_yield.mlir | 38 +++++ 2 files changed, 180 insertions(+) create mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp create mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp new file mode 100644 index 00000000..ac664382 --- /dev/null +++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp @@ -0,0 +1,142 @@ +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/TypeID.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/STLExtras.h" +#include + +using namespace mlir; + +namespace { + +static bool isInnermostLoop(affine::AffineForOp for_op) { + bool has_nested_loops = false; + for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; }); + return !has_nested_loops; +} + +// Wraps an innermost affine for loop in a neura.kernel operation. +static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op, + OpBuilder &builder, + unsigned &kernel_id) { + Location loc = for_op.getLoc(); + + // Collects values that need to be captured by the kernel. 
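+  // getUsedValuesDefinedAbove gathers every value that is defined
+  // outside the loop's body region but referenced inside it (e.g.
+  // memrefs and induction variables of enclosing loops); these become
+  // the kernel's explicit inputs.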
+  llvm::SetVector captured_values;
+  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
+
+  // Checks if the loop has output values.
+  bool has_outputs = !for_op.getResults().empty();
+
+  // Creates the neura.kernel operation.
+  builder.setInsertionPoint(for_op);
+  SmallVector inputs(captured_values.begin(), captured_values.end());
+  SmallVector input_types;
+  for (Value val : inputs) {
+    input_types.push_back(val.getType());
+  }
+
+  neura::KernelOp kernel_op = builder.create(
+      loc, /*output_types=*/for_op->getResultTypes(),
+      /*inputs=*/inputs);
+
+  // Sets kernel name.
+  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
+  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
+
+  // Creates the kernel body block with arguments for captured values.
+  Block *kernel_body = new Block();
+  kernel_op.getBody().push_back(kernel_body);
+
+  // Replaces uses of the original loop's results with kernel results.
+  if (has_outputs) {
+    for (auto [orig_result, kernel_result] :
+         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
+      orig_result.replaceAllUsesWith(kernel_result);
+    }
+  }
+
+  // Moves the loop directly into the kernel body.
+  builder.setInsertionPointToStart(kernel_body);
+  for_op->moveBefore(kernel_body, kernel_body->end());
+
+  builder.setInsertionPointToEnd(kernel_body);
+  // Adds yield operation with proper operands.
+  if (has_outputs) {
+    // If the loop has outputs, yield the loop results.
+    SmallVector yield_operands(for_op.getResults());
+    builder.create(loc, ValueRange{}, yield_operands);
+  } else {
+    // If the loop has no outputs, create an empty yield.
+    builder.create(loc);
+  }
+
+  return success();
+}
+
+struct WrapLoopInKernelPass
+    : public PassWrapper> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
+
+  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
+  StringRef getDescription() const override {
+    return "Wraps loops in Neura kernel operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+
+    // Skips if function already has kernels.
+    bool has_kernels = false;
+    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
+    if (has_kernels) {
+      return;
+    }
+
+    // Skips main function.
+    if (func_op.getName() == "main") {
+      return;
+    }
+
+    // Collects all innermost affine for loops in the function.
+    // TODO: Support more kernel wrapping strategies.
+    SmallVector innermost_loops;
+    func_op.walk([&](affine::AffineForOp for_op) {
+      if (isInnermostLoop(for_op)) {
+        innermost_loops.push_back(for_op);
+      }
+    });
+
+    if (innermost_loops.empty()) {
+      return;
+    }
+
+    // Wraps each innermost affine for loop in a neura.kernel operation.
+    // TODO: Support more kernel wrapping strategies.
+ OpBuilder builder(func_op->getContext()); + unsigned kernel_id = 0; + for (affine::AffineForOp loop : innermost_loops) { + if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace + +std::unique_ptr mlir::neura::createWrapLoopInKernelPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir new file mode 100644 index 00000000..ad24eac4 --- /dev/null +++ b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir @@ -0,0 +1,38 @@ +// Wraps the innermost loop within neura.kernel operation. +// RUN: mlir-neura-opt %s \ +// RUN: --wrap-loop-in-kernel \ +// RUN: -o %t-wrapped.mlir +// RUN: FileCheck %s --input-file=%t-wrapped.mlir + +module attributes {} { + func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { + %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { + %2 = affine.load %arg0[%arg1, %arg3] : memref + %3 = arith.addi %arg4, %2 : i32 + affine.yield %3 : i32 + } + affine.yield %1 : i32 + } + return %0 : i32 + } +} + + // CHECK: module { + // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { + // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 + // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) { + // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref, index) attributes {kernel_name = "kernel_0"} { + // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) { + // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref + // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32 + // CHECK-NEXT: affine.yield %4 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: neura.yield %2 : i32 + // CHECK-NEXT: } : i32 + // CHECK-NEXT: affine.yield %1 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: return %0 : i32 + // CHECK-NEXT: } + // CHECK-NEXT: } From 10b1076baa3c4490d8821c41fdcde2b6b52ea488 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Mon, 26 Jan 2026 11:33:01 +0800 Subject: [PATCH 17/25] [clean] remove redundant code --- lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 14257c8c..258a4be5 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -86,7 +86,6 @@ void GrantPredicateInEntryBlock(Block *entry_block, OpBuilder &builder, return; } SmallVector live_out_arg_values; - SmallVector live_out_non_arg_values; // Step 1: Collects all live-out values first. 
   for (Operation &op : *entry_block) {

From 2ce803119ea280a768e2589490208eb811a2dcf6 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Mon, 26 Jan 2026 11:45:05 +0800
Subject: [PATCH 18/25] [clean] remove redundant files

---
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ------------------
 .../kernel_with_yield/kernel_with_yield.mlir  |  38 -----
 2 files changed, 180 deletions(-)
 delete mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
 delete mode 100644 test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir

diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
deleted file mode 100644
index ac664382..00000000
--- a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "NeuraDialect/NeuraDialect.h"
-#include "NeuraDialect/NeuraOps.h"
-#include "NeuraDialect/NeuraPasses.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Types.h"
-#include "mlir/IR/Value.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/TypeID.h"
-#include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/STLExtras.h"
-#include <memory>
-
-using namespace mlir;
-
-namespace {
-
-static bool isInnermostLoop(affine::AffineForOp for_op) {
-  bool has_nested_loops = false;
-  for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; });
-  return !has_nested_loops;
-}
-
-// Wraps an innermost affine for loop in a neura.kernel operation.
-static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
-                                               OpBuilder &builder,
-                                               unsigned &kernel_id) {
-  Location loc = for_op.getLoc();
-
-  // Collects values that need to be captured by the kernel.
-  llvm::SetVector<Value> captured_values;
-  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
-
-  // Checks if the loop has output values.
-  bool has_outputs = !for_op.getResults().empty();
-
-  // Creates the neura.kernel operation.
-  builder.setInsertionPoint(for_op);
-  SmallVector<Value> inputs(captured_values.begin(), captured_values.end());
-  SmallVector<Type> input_types;
-  for (Value val : inputs) {
-    input_types.push_back(val.getType());
-  }
-
-  neura::KernelOp kernel_op = builder.create<neura::KernelOp>(
-      loc, /*output_types=*/for_op->getResultTypes(),
-      /*inputs=*/inputs);
-
-  // Sets kernel name.
-  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
-  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
-
-  // Creates the kernel body block with arguments for captured values.
-  Block *kernel_body = new Block();
-  kernel_op.getBody().push_back(kernel_body);
-
-  // Replaces uses of the original loop's results with kernel results.
-  if (has_outputs) {
-    for (auto [orig_result, kernel_result] :
-         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
-      orig_result.replaceAllUsesWith(kernel_result);
-    }
-  }
-
-  // Moves the loop directly into the kernel body.
-  builder.setInsertionPointToStart(kernel_body);
-  for_op->moveBefore(kernel_body, kernel_body->end());
-
-  builder.setInsertionPointToEnd(kernel_body);
-  // Adds yield operation with proper operands.
-  if (has_outputs) {
-    // If the loop has outputs, yield the loop results.
-    SmallVector<Value> yield_operands(for_op.getResults());
-    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
-  } else {
-    // If the loop has no outputs, create an empty yield.
-    builder.create<neura::YieldOp>(loc);
-  }
-
-  return success();
-}
-
-struct WrapLoopInKernelPass
-    : public PassWrapper<WrapLoopInKernelPass, OperationPass<func::FuncOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
-
-  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
-  StringRef getDescription() const override {
-    return "Wraps loops in Neura kernel operations.";
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<neura::NeuraDialect, affine::AffineDialect,
-                    func::FuncDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp func_op = getOperation();
-
-    // Skips if the function already has kernels.
-    bool has_kernels = false;
-    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
-    if (has_kernels) {
-      return;
-    }
-
-    // Skips main function.
-    if (func_op.getName() == "main") {
-      return;
-    }
-
-    // Collects all innermost affine for loops in the function.
-    // TODO: Support more kernel wrapping strategies.
-    SmallVector<affine::AffineForOp> innermost_loops;
-    func_op.walk([&](affine::AffineForOp for_op) {
-      if (isInnermostLoop(for_op)) {
-        innermost_loops.push_back(for_op);
-      }
-    });
-
-    if (innermost_loops.empty()) {
-      return;
-    }
-
-    // Wraps each innermost affine for loop in a neura.kernel operation.
-    // TODO: Support more kernel wrapping strategies.
-    OpBuilder builder(func_op->getContext());
-    unsigned kernel_id = 0;
-    for (affine::AffineForOp loop : innermost_loops) {
-      if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) {
-        signalPassFailure();
-        return;
-      }
-    }
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::neura::createWrapLoopInKernelPass() {
-  return std::make_unique<WrapLoopInKernelPass>();
-}
\ No newline at end of file
diff --git a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir b/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
deleted file mode 100644
index ad24eac4..00000000
--- a/test/multi-cgra/neura-kernel/kernel_with_yield/kernel_with_yield.mlir
+++ /dev/null
@@ -1,38 +0,0 @@
-// Wraps the innermost loop within neura.kernel operation.
-// RUN: mlir-neura-opt %s \
-// RUN:   --wrap-loop-in-kernel \
-// RUN:   -o %t-wrapped.mlir
-// RUN: FileCheck %s --input-file=%t-wrapped.mlir
-
-module attributes {} {
-  func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) {
-      %1 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) {
-        %2 = affine.load %arg0[%arg1, %arg3] : memref<?x128xi32>
-        %3 = arith.addi %arg4, %2 : i32
-        affine.yield %3 : i32
-      }
-      affine.yield %1 : i32
-    }
-    return %0 : i32
-  }
-}
-
-  // CHECK: module {
-  // CHECK-NEXT: func.func @_Z27perfect_nested_reduction_2dPA128_i(%arg0: memref<?x128xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
-  // CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
-  // CHECK-NEXT: %0 = affine.for %arg1 = 0 to 128 iter_args(%arg2 = %c0_i32) -> (i32) {
-  // CHECK-NEXT: %1 = neura.kernel ins(%arg0, %arg1 : memref<?x128xi32>, index) attributes {kernel_name = "kernel_0"} {
-  // CHECK-NEXT: %2 = affine.for %arg3 = 0 to 128 iter_args(%arg4 = %arg2) -> (i32) {
-  // CHECK-NEXT: %3 = affine.load %arg0[%arg1, %arg3] : memref<?x128xi32>
-  // CHECK-NEXT: %4 = arith.addi %arg4, %3 : i32
-  // CHECK-NEXT: affine.yield %4 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: neura.yield %2 : i32
-  // CHECK-NEXT: } : i32
-  // CHECK-NEXT: affine.yield %1 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: return %0 : i32
-  // CHECK-NEXT: }
-  // CHECK-NEXT: }

From d8e7c0fb30b8bbbeab9687ec2a4844a072404aca Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Tue, 27 Jan 2026 11:45:46 +0800
Subject: [PATCH 19/25] sync with main

---
 include/NeuraDialect/NeuraPasses.td        | 8 --------
 lib/NeuraDialect/Transforms/CMakeLists.txt | 1 -
 test/neura/fusion/test.mlir                | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index fc6cec1e..123bf1c8 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -167,14 +167,6 @@ def InitPattern : Pass<"init-pattern", "ModuleOp"> {
   let constructor = "neura::createInitPatternPass()";
 }
 
-def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
-  let summary = "Wrap loops in neura.kernel operations";
-  let description = [{
-    This pass wraps loops in neura.kernel operations to encapsulate loop bodies.
-  }];
-  let constructor = "neura::createWrapLoopInKernelPass()";
-}
-
 def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
   let summary = "Merge and optimize hardware units for pattern execution";
   let description = [{
diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt
index da7056fb..010fc3c7 100644
--- a/lib/NeuraDialect/Transforms/CMakeLists.txt
+++ b/lib/NeuraDialect/Transforms/CMakeLists.txt
@@ -19,7 +19,6 @@ add_mlir_library(
   RemovePredicatedTypePass.cpp
   HardwareMergePass.cpp
   GraphMining/HardwareTemplate.cpp
-  WrapLoopInKernelPass.cpp
 
   DEPENDS
   MLIRNeuraTransformsIncGen
diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir
index 0e6a3dce..63881151 100644
--- a/test/neura/fusion/test.mlir
+++ b/test/neura/fusion/test.mlir
@@ -117,7 +117,7 @@
 // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \
 // RUN:   --assign-accelerator \
 // RUN:   --lower-llvm-to-neura \
-// RUN:   --promote-func-arg-to-const \
+// RUN:   --promote-input-arg-to-const \
 // RUN:   --canonicalize-return \
 // RUN:   --canonicalize-cast \
 // RUN:   --canonicalize-live-in \

From b454a8d03b5eb6db76e7c9afb226733c819b1359 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:20:54 +0800
Subject: [PATCH 20/25] sync with main

---
 .../Transforms/MapToAcceleratorPass.cpp | 75 +------------------
 1 file changed, 4 insertions(+), 71 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
index cfe14543..9b5ee423 100644
--- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
+++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
@@ -178,9 +178,8 @@ struct MapToAcceleratorPass
 
   // Generic mapping function works for both function and kernel mapping.
   template <typename OpType>
-  bool mapRegion(OpType op, Region &region, Architecture &architecture,
+  bool mapRegion(OpType op, Region &region, const Architecture &architecture,
                  Mapping *mapping_strategy, bool is_spatial_only,
-                 int max_ctrl_mem_items,
                  const std::string &resolved_mapping_mode,
                  const std::string &resolved_mapping_strategy) {
     // Checks steering mode compatibility with architecture.
@@ -231,8 +230,7 @@ struct MapToAcceleratorPass
     int res_mii = calculateResMii(region, architecture);
     const int possible_min_ii = std::max(rec_mii, res_mii);
 
-    const int max_ii =
-        max_ctrl_mem_items; // Use YAML config (default 20 if not specified)
+    const int max_ii = architecture.getMaxCtrlMemItems();
 
     std::vector<Operation *> topologically_sorted_ops =
         getTopologicallySortedOps(region);
@@ -359,70 +357,6 @@ struct MapToAcceleratorPass
 
     const Architecture &architecture = mlir::neura::getArchitecture();
 
-    std::string architecture_spec_file = mlir::neura::getArchitectureSpecFile();
-    int multi_cgra_rows = kMultiCgraDefaultRows;
-    int multi_cgra_columns = kMultiCgraDefaultColumns;
-    int per_cgra_rows = kPerCgraDefaultRows;
-    int per_cgra_columns = kPerCgraDefaultColumns;
-    int max_ctrl_mem_items = kDefaultMaxCtrlMemItems;
-    mlir::neura::TileDefaults tile_defaults;
-    std::vector<mlir::neura::TileOverride> tile_overrides;
-    mlir::neura::LinkDefaults link_defaults;
-    std::vector<mlir::neura::LinkOverride> link_overrides;
-    mlir::neura::BaseTopology multi_cgra_base_topology =
-        mlir::neura::BaseTopology::MESH;
-    mlir::neura::BaseTopology per_cgra_base_topology =
-        mlir::neura::BaseTopology::MESH;
-
-    if (!architecture_spec_file.empty()) {
-
-      // Use LLVM YAML parser to validate the YAML syntax (no mapping yet)
-      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> buffer_or_err =
-          llvm::MemoryBuffer::getFile(architecture_spec_file);
-      if (!buffer_or_err) {
-        llvm::errs() << "[MapToAcceleratorPass] Failed to open architecture "
-                        "specification file: "
-                     << architecture_spec_file << "\n";
-        return;
-      }
-
-      llvm::SourceMgr sm;
-      sm.AddNewSourceBuffer(std::move(*buffer_or_err), llvm::SMLoc());
-      llvm::yaml::Stream yaml_stream(
-          sm.getMemoryBuffer(sm.getMainFileID())->getBuffer(), sm);
-
-      bool parse_failed = false;
-      llvm::yaml::Document &yaml_doc = *yaml_stream.begin();
-      (void)yaml_doc; // ensure document is created
-      if (yaml_stream.failed()) {
-        parse_failed = true;
-      }
-
-      if (parse_failed) {
-        llvm::errs() << "[MapToAcceleratorPass] YAML parse error in: "
-                     << architecture_spec_file << "\n";
-        return;
-      }
-
-      // Parses YAML configuration.
-      if (!parseArchitectureYaml(
-              yaml_doc, multi_cgra_rows, multi_cgra_columns,
-              multi_cgra_base_topology, per_cgra_rows, per_cgra_columns,
-              per_cgra_base_topology, max_ctrl_mem_items, tile_defaults,
-              tile_overrides, link_defaults, link_overrides)) {
-        return;
-      }
-    } else {
-      llvm::errs() << "[MapToAcceleratorPass] No architecture specification "
-                      "file provided.\n";
-    }
-
-    // Creates architecture.
-    Architecture architecture(
-        multi_cgra_rows, multi_cgra_columns, multi_cgra_base_topology,
-        per_cgra_rows, per_cgra_columns, per_cgra_base_topology, tile_defaults,
-        tile_overrides, link_defaults, link_overrides);
-
     // Maps kernels.
     module.walk([&](neura::KernelOp kernel_op) {
       auto accel_attr =
@@ -434,8 +368,7 @@ struct MapToAcceleratorPass
         Region &kernel_region = kernel_op.getBody();
         if (!mapRegion(kernel_op, kernel_region, architecture,
                        mapping_strategy.get(), is_spatial_only,
-                       max_ctrl_mem_items, resolved_mapping_mode,
-                       resolved_mapping_strategy)) {
+                       resolved_mapping_mode, resolved_mapping_strategy)) {
           llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n";
           signalPassFailure();
         }
@@ -452,7 +385,7 @@ struct MapToAcceleratorPass
       Region &func_region = func_op.getBody();
       if (!mapRegion(func_op, func_region, architecture, mapping_strategy.get(),
-                     is_spatial_only, max_ctrl_mem_items, resolved_mapping_mode,
+                     is_spatial_only, resolved_mapping_mode,
                      resolved_mapping_strategy)) {
         llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n";
         signalPassFailure();

From 75cbdfe01ae39259cfd3c135ff2899b01abe9201 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:35:13 +0800
Subject: [PATCH 21/25] recover WrapLoopInKernel pass

---
 include/NeuraDialect/NeuraPasses.h            |   1 +
 include/NeuraDialect/NeuraPasses.td           |   8 +
 lib/NeuraDialect/Transforms/CMakeLists.txt    |   1 +
 .../Transforms/WrapLoopInKernelPass.cpp       | 142 ++++++++++++++++++
 test/compiler_e2e/visualize/test.mlir         |   2 +-
 test/visualize/test2.mlir                     |   2 +-
 6 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp

diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 75ddbd24..803cc589 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -37,6 +37,7 @@
 std::unique_ptr<Pass> createRemovePredicatedTypePass();
 
 // Hardware specific optimization passes
 std::unique_ptr<Pass> createFuseLoopControlPass();
 std::unique_ptr<Pass> createFusePatternPass();
+std::unique_ptr<Pass> createWrapLoopInKernelPass();
 
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index 123bf1c8..fc6cec1e 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -167,6 +167,14 @@ def InitPattern : Pass<"init-pattern", "ModuleOp"> {
   let constructor = "neura::createInitPatternPass()";
 }
 
+def WrapLoopInKernelPass : Pass<"wrap-loop-in-kernel", "func::FuncOp">{
+  let summary = "Wrap loops in neura.kernel operations";
+  let description = [{
+    Wraps each innermost affine.for loop in a neura.kernel operation so that
+    the loop body can be mapped to an accelerator. Functions named "main" and
+    functions that already contain kernels are skipped.
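+
+    Example (illustrative; abridged from the kernel_with_yield test, with
+    placeholder SSA names):
+
+      // Before: an innermost affine loop carrying a reduction.
+      %sum = affine.for %j = 0 to 128 iter_args(%acc = %init) -> (i32) { ... }
+
+      // After: the loop is moved into a neura.kernel region and its result
+      // is forwarded through neura.yield.
+      %sum = neura.kernel ins(%A, %i : memref<?x128xi32>, index)
+          attributes {kernel_name = "kernel_0"} {
+        %0 = affine.for %j = 0 to 128 iter_args(%acc = %init) -> (i32) { ... }
+        neura.yield %0 : i32
+      } : i32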
+  }];
+  let constructor = "neura::createWrapLoopInKernelPass()";
+}
+
 def HardwareMerge : Pass<"hardware-merge", "ModuleOp"> {
   let summary = "Merge and optimize hardware units for pattern execution";
   let description = [{
diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt
index 010fc3c7..01df4219 100644
--- a/lib/NeuraDialect/Transforms/CMakeLists.txt
+++ b/lib/NeuraDialect/Transforms/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_library(
   TransformToSteerControlPass.cpp
   RemovePredicatedTypePass.cpp
   HardwareMergePass.cpp
+  WrapLoopInKernelPass.cpp
   GraphMining/HardwareTemplate.cpp
 
   DEPENDS
diff --git a/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
new file mode 100644
index 00000000..ac664382
--- /dev/null
+++ b/lib/NeuraDialect/Transforms/WrapLoopInKernelPass.cpp
@@ -0,0 +1,142 @@
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "NeuraDialect/NeuraPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Types.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/TypeID.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include <memory>
+
+using namespace mlir;
+
+namespace {
+
+static bool isInnermostLoop(affine::AffineForOp for_op) {
+  bool has_nested_loops = false;
+  for_op.getBody()->walk([&](affine::AffineForOp) { has_nested_loops = true; });
+  return !has_nested_loops;
+}
+
+// Wraps an innermost affine for loop in a neura.kernel operation.
+static LogicalResult wrapInnermostLoopAsKernel(affine::AffineForOp for_op,
+                                               OpBuilder &builder,
+                                               unsigned &kernel_id) {
+  Location loc = for_op.getLoc();
+
+  // Collects values that need to be captured by the kernel.
+  llvm::SetVector<Value> captured_values;
+  getUsedValuesDefinedAbove(for_op.getRegion(), captured_values);
+
+  // Checks if the loop has output values.
+  bool has_outputs = !for_op.getResults().empty();
+
+  // Creates the neura.kernel operation.
+  builder.setInsertionPoint(for_op);
+  SmallVector<Value> inputs(captured_values.begin(), captured_values.end());
+  SmallVector<Type> input_types;
+  for (Value val : inputs) {
+    input_types.push_back(val.getType());
+  }
+
+  neura::KernelOp kernel_op = builder.create<neura::KernelOp>(
+      loc, /*output_types=*/for_op->getResultTypes(),
+      /*inputs=*/inputs);
+
+  // Sets kernel name.
+  std::string kernel_name = "kernel_" + std::to_string(kernel_id++);
+  kernel_op.setKernelNameAttr(builder.getStringAttr(kernel_name));
+
+  // Creates the kernel body block with arguments for captured values.
+  Block *kernel_body = new Block();
+  kernel_op.getBody().push_back(kernel_body);
+
+  // Replaces uses of the original loop's results with kernel results.
+  if (has_outputs) {
+    for (auto [orig_result, kernel_result] :
+         llvm::zip(for_op->getResults(), kernel_op.getResults())) {
+      orig_result.replaceAllUsesWith(kernel_result);
+    }
+  }
+
+  // Moves the loop directly into the kernel body.
+  builder.setInsertionPointToStart(kernel_body);
+  for_op->moveBefore(kernel_body, kernel_body->end());
+
+  builder.setInsertionPointToEnd(kernel_body);
+  // Adds yield operation with proper operands.
+  if (has_outputs) {
+    // If the loop has outputs, yield the loop results.
+    SmallVector<Value> yield_operands(for_op.getResults());
+    builder.create<neura::YieldOp>(loc, ValueRange{}, yield_operands);
+  } else {
+    // If the loop has no outputs, create an empty yield.
+    builder.create<neura::YieldOp>(loc);
+  }
+
+  return success();
+}
+
+struct WrapLoopInKernelPass
+    : public PassWrapper<WrapLoopInKernelPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WrapLoopInKernelPass)
+
+  StringRef getArgument() const override { return "wrap-loop-in-kernel"; }
+  StringRef getDescription() const override {
+    return "Wraps loops in Neura kernel operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<neura::NeuraDialect, affine::AffineDialect,
+                    func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+
+    // Skips if the function already has kernels.
+    bool has_kernels = false;
+    func_op.walk([&](neura::KernelOp) { has_kernels = true; });
+    if (has_kernels) {
+      return;
+    }
+
+    // Skips main function.
+    if (func_op.getName() == "main") {
+      return;
+    }
+
+    // Collects all innermost affine for loops in the function.
+    // TODO: Support more kernel wrapping strategies.
+    SmallVector<affine::AffineForOp> innermost_loops;
+    func_op.walk([&](affine::AffineForOp for_op) {
+      if (isInnermostLoop(for_op)) {
+        innermost_loops.push_back(for_op);
+      }
+    });
+
+    if (innermost_loops.empty()) {
+      return;
+    }
+
+    // Wraps each innermost affine for loop in a neura.kernel operation.
+    // TODO: Support more kernel wrapping strategies.
+    OpBuilder builder(func_op->getContext());
+    unsigned kernel_id = 0;
+    for (affine::AffineForOp loop : innermost_loops) {
+      if (failed(wrapInnermostLoopAsKernel(loop, builder, kernel_id))) {
+        signalPassFailure();
+        return;
+      }
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::neura::createWrapLoopInKernelPass() {
+  return std::make_unique<WrapLoopInKernelPass>();
+}
\ No newline at end of file
diff --git a/test/compiler_e2e/visualize/test.mlir b/test/compiler_e2e/visualize/test.mlir
index a07cc79f..50b1c32a 100644
--- a/test/compiler_e2e/visualize/test.mlir
+++ b/test/compiler_e2e/visualize/test.mlir
@@ -162,4 +162,4 @@ func.func @test_print_op_graph(%a: f32, %b: f32) -> f32 {
 // CHECK-GRAPH: label = "neura.fmul : (!neura.data)\n\nrhs_value: 2.000000e+00 : f32", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.data_mov : (!neura.data)\n", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.return_value : ()\n", shape = ellipse, style = filled];
-// CHECK-GRAPH: label = "neura.yield : ()\n", shape = ellipse, style = filled];
+// CHECK-GRAPH: label = "neura.yield : ()\n\noperandSegmentSizes: array<i32: 0, 0>", shape = ellipse, style = filled];
diff --git a/test/visualize/test2.mlir b/test/visualize/test2.mlir
index 7a686e52..69a3756d 100644
--- a/test/visualize/test2.mlir
+++ b/test/visualize/test2.mlir
@@ -29,4 +29,4 @@ func.func @test_print_op_graph(%a: f32, %b: f32) -> f32 {
 // CHECK-GRAPH: label = "neura.fmul : (!neura.data)\n\nrhs_value: 2.000000e+00 : f32", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.data_mov : (!neura.data)\n", shape = ellipse, style = filled];
 // CHECK-GRAPH: label = "neura.return_value : ()\n", shape = ellipse, style = filled];
-// CHECK-GRAPH: label = "neura.yield : ()\n", shape = ellipse, style = filled];
+// CHECK-GRAPH: label = "neura.yield : ()\n\noperandSegmentSizes: array<i32: 0, 0>", shape = ellipse, style = filled];

From 2898f1557d1f778921c56d8f0a0a688b1914ee72 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 31 Jan 2026 14:42:32 +0800
Subject: [PATCH 22/25] [fix] fix bugs in iter_args handling

---
 .../TransformCtrlToDataFlowPass.cpp         | 23 ++++----
 test/multi-cgra/kernel_mapping/fir/fir.mlir | 52 +++++++++----------
 2 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 258a4be5..5fedbec3 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -148,7 +148,7 @@
 //---------------------------------------------------------------------------
 void handleKernelIterArgs(neura::KernelOp kernel_op, Block *entry_block,
                           OpBuilder &builder,
-                          SmallVector<Value> &iter_arg_final_values) {
+                          SmallVector<Value> &iter_arg_phi_values) {
   llvm::errs() << "[iter_args] Handling kernel iter_args...\n";
 
   SmallVector<Operation *> iter_arg_init_ops;
@@ -213,7 +213,7 @@
       builder.create<neura::CtrlMovOp>(yield_op.getLoc(), feedback_value,
                                        reserve_op.getResult());
 
-      iter_arg_final_values.push_back(feedback_value);
+      iter_arg_phi_values.push_back(phi.getResult());
 
      init_const->removeAttr(kIterArgInitAttr);
      llvm::errs() << "[iter_args] Created iter_arg with grant_once\n";
@@ -227,7 +227,7 @@
 //---------------------------------------------------------------------------
 void handleKernelYieldTermination(
     neura::KernelOp kernel_op, Block *entry_block, OpBuilder &builder,
-    bool has_task_counter, const SmallVector<Value> &iter_arg_final_values) {
+    bool has_task_counter, const SmallVector<Value> &iter_arg_phi_values) {
   llvm::errs() << "[yield] ========================================\n";
   llvm::errs() << "[yield] Handling Yield Termination\n";
   llvm::errs() << "[yield] ========================================\n";
@@ -294,9 +294,12 @@
 
   // Gates all results with NOT (counter predicate).
   SmallVector<Value> gated_results;
-  for (Value result : yield_op.getResults()) {
+  for (size_t i = 0; i < yield_op.getResults().size(); ++i) {
+    Value result_to_gate = iter_arg_phi_values[i];
+
     auto gated = builder.create<neura::GrantPredicateOp>(
-        yield_op.getLoc(), result.getType(), result, return_gate);
+        yield_op.getLoc(), result_to_gate.getType(), result_to_gate,
+        return_gate);
     gated_results.push_back(gated.getResult());
 
     llvm::errs() << "[yield] Gated result with NOT(counter_pred)\n";
@@ -1043,12 +1046,12 @@
       return;
     }
 
-    SmallVector<Value> iter_arg_final_values;
+    SmallVector<Value> iter_arg_phi_values;
 
    // STEP 1: Handles iter_args of the neura.kernel.
    llvm::errs() << "[ctrl2data] === STEP 1: Handle iter_args ===\n";
    handleKernelIterArgs(kernel_op, entry_block, builder,
-                         iter_arg_final_values);
+                         iter_arg_phi_values);
 
    // STEP 2: Grants predicates (only if NO task counter).
    llvm::errs() << "[ctrl2data] === STEP 2: Grant predicates ===\n";
@@ -1065,12 +1068,12 @@
    } else {
      llvm::errs() << "[ctrl2data] === STEP 3: Single block (skip) ===\n";
    }
-    convertPhiToPhiStart(kernel_region, builder);
-
    // STEP 4: Handles yield termination in neura.kernel.
llvm::errs() << "[ctrl2data] === STEP 4: Handle yield ===\n"; handleKernelYieldTermination(kernel_op, entry_block, builder, - has_task_counter, iter_arg_final_values); + has_task_counter, iter_arg_phi_values); + + convertPhiToPhiStart(kernel_region, builder); kernel_op->setAttr(neura::attr::kDataflowMode, StringAttr::get(kernel_op.getContext(), diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index 46f62a2c..cc2bf924 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -196,7 +196,7 @@ module attributes {} { // DATAFLOW-NEXT: neura.ctrl_mov %9 -> %3 : !neura.data !neura.data // DATAFLOW-NEXT: %10 = neura.extract_predicate %5 : !neura.data -> !neura.data // DATAFLOW-NEXT: %11 = "neura.not"(%10) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %12 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %4, %11 : !neura.data, !neura.data -> !neura.data // DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 @@ -212,33 +212,33 @@ module attributes {} { // MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPED-NEXT: %2 = "neura.grant_once"() <{constant_value = "%iter_arg_init0"}> {dfg_id = 0 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data // MAPPED-NEXT: %3 = neura.reserve {dfg_id = 1 : i32} : !neura.data -// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 42 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} 
: !neura.data, !neura.data -> !neura.data -// MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data -// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data -// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data -// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 13 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 14 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 2 : i32}, {id = 16 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 20 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 21 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : !neura.data !neura.data -// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = 
"register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data -> !neura.data -// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 15 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 17 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %21 = "neura.data_mov"(%16) {dfg_id = 22 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 3 : i32}, {id = 256 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 23 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}]} +// MAPPED-NEXT: %4 = "neura.data_mov"(%2) {dfg_id = 4 : i32, mapping_locs = [{id = 24 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %5 = neura.phi_start %4, %3 {dfg_id = 8 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %6 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 2 : i32, lower_bound = 0 : index, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data +// MAPPED-NEXT: %7 = "neura.data_mov"(%6) {dfg_id = 5 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 0 : i32}, {id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %8 = neura.load_indexed [%7 : !neura.data] {dfg_id = 9 : i32, lhs_value = "%input0", mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, 
invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %9 = "neura.data_mov"(%6) {dfg_id = 6 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %10 = neura.load_indexed [%9 : !neura.data] {dfg_id = 10 : i32, lhs_value = "%input1", mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}]} : !neura.data +// MAPPED-NEXT: %11 = "neura.data_mov"(%8) {dfg_id = 14 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %12 = "neura.data_mov"(%10) {dfg_id = 15 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %13 = "neura.mul"(%11, %12) {dfg_id = 17 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: %14 = "neura.data_mov"(%5) {dfg_id = 13 : i32, mapping_locs = [{id = 29 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 160 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %15 = "neura.data_mov"(%13) {dfg_id = 19 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %16 = "neura.add"(%14, %15) {dfg_id = 21 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPED-NEXT: neura.ctrl_mov %16 -> %3 {dfg_id = 23 : i32, mapping_locs = [{id = 16 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data +// MAPPED-NEXT: %17 = "neura.data_mov"(%6) {dfg_id = 7 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %18 = neura.extract_predicate %17 {dfg_id = 11 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data -> !neura.data +// MAPPED-NEXT: %19 = "neura.data_mov"(%18) {dfg_id = 16 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %20 = "neura.not"(%19) {dfg_id = 18 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 
: i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %21 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %22 = "neura.data_mov"(%20) {dfg_id = 20 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: %23 = neura.grant_predicate %21, %22 {dfg_id = 22 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPED-NEXT: %24 = "neura.data_mov"(%23) {dfg_id = 24 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 2 : i32}]} // MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} // MAPPED-NEXT: } : i32 // MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () From 565c4fd61f99b255e895652617d9db10f5228fc7 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 15:07:03 +0800 Subject: [PATCH 23/25] sync with main --- include/NeuraDialect/NeuraPasses.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 803cc589..b88bec1f 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -30,6 +30,7 @@ std::unique_ptr createCanonicalizeLiveInPass(); std::unique_ptr createPromoteInputArgToConstPass(); std::unique_ptr createTransformToSteerControlPass(); std::unique_ptr createRemovePredicatedTypePass(); +std::unique_ptr createWrapLoopInKernelPass(); // ==================================== // Optimization Passes From 616da9d3028db3fd0c85d351db54ed781c41fbcc Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 15:45:19 +0800 Subject: [PATCH 24/25] revert the github workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a7a73b93..1703ce3d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: run: | mkdir -p ${{ env.CCACHE_DIR }} git --version - git clone --depth 1 --filter=blob:none --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git + git clone --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git cd llvm-project mkdir build && cd build cmake -G Ninja ../llvm \ From 67ea96b4d948f2ae230fcf4a1ee2313c9cf3d791 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 16:48:48 +0800 Subject: [PATCH 25/25] modify the git clone cmd --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1703ce3d..a7a73b93 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: run: | mkdir -p ${{ env.CCACHE_DIR }} git --version - git clone 
--revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git + git clone --depth 1 --filter=blob:none --revision=${{ env.LLVM_COMMIT }} https://github.com/llvm/llvm-project.git cd llvm-project mkdir build && cd build cmake -G Ninja ../llvm \