diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp
index 307e7d562b9..b23c288a58f 100644
--- a/backends/vulkan/test/custom_ops/utils.cpp
+++ b/backends/vulkan/test/custom_ops/utils.cpp
@@ -9,6 +9,7 @@
 #include <cmath>
 #include <numeric>
 #include <random>
+#include <unordered_map>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
@@ -21,37 +22,55 @@ int get_seed() {
   return seed++;
 }
 
+int get_seed_or_explicit(int explicit_seed) {
+  if (explicit_seed >= 0) {
+    return explicit_seed;
+  }
+  return get_seed();
+}
+
 // Forward declarations for data generation utilities
 void generate_random_float_data(
     std::vector<float>& data,
     float min_val = -1.0f,
-    float max_val = 1.0f);
+    float max_val = 1.0f,
+    int explicit_seed = -1);
 void generate_random_int_data(
     std::vector<int32_t>& data,
     int min_val = -10,
-    int max_val = 10);
+    int max_val = 10,
+    int explicit_seed = -1);
 void generate_randint_float_data(
     std::vector<float>& data,
     int min_val = -10,
-    int max_val = 10);
+    int max_val = 10,
+    int explicit_seed = -1);
 void generate_randint_half_data(
     std::vector<uint16_t>& data,
     int min_val = -10,
-    int max_val = 10);
+    int max_val = 10,
+    int explicit_seed = -1);
 void generate_random_int8_data(
     std::vector<int8_t>& data,
     int8_t min_val = -10,
-    int8_t max_val = 10);
+    int8_t max_val = 10,
+    int explicit_seed = -1);
 void generate_random_uint8_data(
     std::vector<uint8_t>& data,
     uint8_t min_val = 0,
-    uint8_t max_val = 255);
-void generate_random_2xint4_data(std::vector<uint8_t>& data);
-void generate_random_2xint4_data(std::vector<int8_t>& data);
+    uint8_t max_val = 255,
+    int explicit_seed = -1);
+void generate_random_2xint4_data(
+    std::vector<uint8_t>& data,
+    int explicit_seed = -1);
+void generate_random_2xint4_data(
+    std::vector<int8_t>& data,
+    int explicit_seed = -1);
 void generate_random_int4_data(
     std::vector<int8_t>& data,
     int8_t min_val = -8,
-    int8_t max_val = 7);
+    int8_t max_val = 7,
+    int explicit_seed = -1);
 void generate_ones_data(std::vector<float>& data);
 void generate_zeros_data(std::vector<float>& data);
 
@@ -96,7 +115,7 @@ void set_debugging(bool enable_debugging) {
 }
 
 // ValueSpec implementation
-void ValueSpec::generate_tensor_data() {
+void ValueSpec::generate_tensor_data(int seed) {
   if (spec_type != SpecType::Tensor) {
     return;
   }
@@ -107,15 +126,15 @@ void ValueSpec::generate_tensor_data() {
     case vkapi::kFloat: {
       float_data.resize(num_elements);
       if (data_gen_type == DataGenType::RANDOM) {
-        generate_random_float_data(float_data);
+        generate_random_float_data(float_data, -1.0f, 1.0f, seed);
       } else if (data_gen_type == DataGenType::RANDOM_SCALES) {
-        generate_random_float_data(float_data, 0.005, 0.015);
+        generate_random_float_data(float_data, 0.005, 0.015, seed);
       } else if (data_gen_type == DataGenType::RANDINT) {
-        generate_randint_float_data(float_data);
+        generate_randint_float_data(float_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::RANDINT8) {
-        generate_randint_float_data(float_data, -128, 127);
+        generate_randint_float_data(float_data, -128, 127, seed);
       } else if (data_gen_type == DataGenType::RANDINT4) {
-        generate_randint_float_data(float_data, -8, 7);
+        generate_randint_float_data(float_data, -8, 7, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         generate_ones_data(float_data);
       } else if (data_gen_type == DataGenType::ZEROS) {
@@ -130,17 +149,17 @@ void ValueSpec::generate_tensor_data() {
       if (data_gen_type == DataGenType::RANDOM) {
         // Generate random float data first, then convert to half
         std::vector<float> temp_data(num_elements);
-        generate_random_float_data(temp_data);
+        generate_random_float_data(temp_data, -1.0f, 1.0f, seed);
         for (size_t i = 0; i < temp_data.size(); ++i) {
           // Simple conversion to uint16_t representation of half
           half_data[i] = static_cast<uint16_t>(temp_data[i] * 32767.0f);
         }
       } else if (data_gen_type == DataGenType::RANDINT) {
-        generate_randint_half_data(half_data);
+        generate_randint_half_data(half_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::RANDINT8) {
-        generate_randint_half_data(half_data, -128, 127);
+        generate_randint_half_data(half_data, -128, 127, seed);
       } else if (data_gen_type == DataGenType::RANDINT4) {
-        generate_randint_half_data(half_data, -8, 7);
+        generate_randint_half_data(half_data, -8, 7, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         std::fill(
             half_data.begin(),
@@ -162,14 +181,17 @@ void ValueSpec::generate_tensor_data() {
     case vkapi::kInt: {
       int32_data.resize(num_elements);
       if (data_gen_type == DataGenType::RANDOM) {
-        generate_random_int_data(int32_data);
+        generate_random_int_data(int32_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::RANDINT) {
         generate_random_int_data(
-            int32_data); // For int type, RANDINT is same as RANDOM
+            int32_data,
+            -10,
+            10,
+            seed); // For int type, RANDINT is same as RANDOM
       } else if (data_gen_type == DataGenType::RANDINT8) {
-        generate_random_int_data(int32_data, -128, 127);
+        generate_random_int_data(int32_data, -128, 127, seed);
       } else if (data_gen_type == DataGenType::RANDINT4) {
-        generate_random_int_data(int32_data, -8, 7);
+        generate_random_int_data(int32_data, -8, 7, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         std::fill(int32_data.begin(), int32_data.end(), 1);
       } else if (data_gen_type == DataGenType::ZEROS) {
@@ -182,13 +204,13 @@ void ValueSpec::generate_tensor_data() {
     case vkapi::kChar: {
       int8_data.resize(num_elements);
       if (data_gen_type == DataGenType::RANDOM) {
-        generate_random_int8_data(int8_data);
+        generate_random_int8_data(int8_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::RANDINT) {
-        generate_random_int8_data(int8_data);
+        generate_random_int8_data(int8_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::RANDINT8) {
-        generate_random_int8_data(int8_data, -128, 127);
+        generate_random_int8_data(int8_data, -128, 127, seed);
       } else if (data_gen_type == DataGenType::RANDINT4) {
-        generate_random_2xint4_data(int8_data);
+        generate_random_2xint4_data(int8_data, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         std::fill(int8_data.begin(), int8_data.end(), 1);
       } else if (data_gen_type == DataGenType::ONES_INT4) {
@@ -204,13 +226,13 @@ void ValueSpec::generate_tensor_data() {
     case vkapi::kByte: {
       uint8_data.resize(num_elements);
       if (data_gen_type == DataGenType::RANDOM) {
-        generate_random_uint8_data(uint8_data);
+        generate_random_uint8_data(uint8_data, 0, 255, seed);
       } else if (data_gen_type == DataGenType::RANDINT) {
-        generate_random_uint8_data(uint8_data);
+        generate_random_uint8_data(uint8_data, 0, 255, seed);
       } else if (data_gen_type == DataGenType::RANDINT8) {
-        generate_random_uint8_data(uint8_data, 0, 255);
+        generate_random_uint8_data(uint8_data, 0, 255, seed);
       } else if (data_gen_type == DataGenType::RANDINT4) {
-        generate_random_2xint4_data(uint8_data);
+        generate_random_2xint4_data(uint8_data, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         std::fill(uint8_data.begin(), uint8_data.end(), 1);
       } else if (data_gen_type == DataGenType::ONES_INT4) {
@@ -227,9 +249,9 @@ void ValueSpec::generate_tensor_data() {
       // Default to float
       float_data.resize(num_elements);
       if (data_gen_type == DataGenType::RANDOM) {
-        generate_random_float_data(float_data);
+        generate_random_float_data(float_data, -1.0f, 1.0f, seed);
       } else if (data_gen_type == DataGenType::RANDINT) {
-        generate_randint_float_data(float_data);
+        generate_randint_float_data(float_data, -10, 10, seed);
       } else if (data_gen_type == DataGenType::ONES) {
         generate_ones_data(float_data);
       } else if (data_gen_type == DataGenType::ZEROS) {
@@ -316,6 +338,11 @@ std::string ValueSpec::to_string() const {
       result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM";
       result += ")";
       return result;
+    case SpecType::String:
+      result += "type=String, value=\"";
+      result += get_string_value();
+      result += "\")";
+      return result;
   }
 
   for (size_t i = 0; i < sizes.size(); ++i) {
@@ -494,8 +521,9 @@ const void* ValueSpec::get_data_ptr() const {
 void generate_random_float_data(
     std::vector<float>& data,
     float min_val,
-    float max_val) {
-  std::mt19937 gen(get_seed());
+    float max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_real_distribution<float> dis(min_val, max_val);
   for (auto& val : data) {
     val = dis(gen);
@@ -505,8 +533,9 @@ void generate_random_float_data(
 void generate_random_int_data(
     std::vector<int32_t>& data,
     int min_val,
-    int max_val) {
-  std::mt19937 gen(get_seed());
+    int max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int32_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = dis(gen);
@@ -516,8 +545,9 @@ void generate_random_int_data(
 void generate_randint_float_data(
     std::vector<float>& data,
     int min_val,
-    int max_val) {
-  std::mt19937 gen(get_seed());
+    int max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int32_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = static_cast<float>(dis(gen));
@@ -527,8 +557,9 @@ void generate_randint_float_data(
 void generate_randint_half_data(
     std::vector<uint16_t>& data,
     int min_val,
-    int max_val) {
-  std::mt19937 gen(get_seed());
+    int max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int32_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = static_cast<uint16_t>(std::abs(dis(gen)) % 65536);
@@ -542,8 +573,9 @@ void generate_ones_data(std::vector<float>& data) {
 void generate_random_int8_data(
     std::vector<int8_t>& data,
     int8_t min_val,
-    int8_t max_val) {
-  std::mt19937 gen(get_seed());
+    int8_t max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int16_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = static_cast<int8_t>(dis(gen));
@@ -553,8 +585,9 @@ void generate_random_int8_data(
 void generate_random_uint8_data(
     std::vector<uint8_t>& data,
     uint8_t min_val,
-    uint8_t max_val) {
-  std::mt19937 gen(get_seed());
+    uint8_t max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<uint16_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = static_cast<uint8_t>(dis(gen));
@@ -564,16 +597,17 @@ void generate_random_uint8_data(
 void generate_random_int4_data(
     std::vector<int8_t>& data,
     int8_t min_val,
-    int8_t max_val) {
-  std::mt19937 gen(get_seed());
+    int8_t max_val,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int16_t> dis(min_val, max_val);
   for (auto& val : data) {
     val = static_cast<int8_t>(dis(gen));
   }
 }
 
-void generate_random_2xint4_data(std::vector<int8_t>& data) {
-  std::mt19937 gen(get_seed());
+void generate_random_2xint4_data(std::vector<int8_t>& data, int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<int16_t> dis(-8, 7); // Signed 4-bit range
   for (auto& val : data) {
     // Generate two separate 4-bit values
@@ -584,8 +618,10 @@ void generate_random_2xint4_data(std::vector<int8_t>& data) {
   }
 }
 
-void generate_random_2xint4_data(std::vector<uint8_t>& data) {
-  std::mt19937 gen(get_seed());
+void generate_random_2xint4_data(
+    std::vector<uint8_t>& data,
+    int explicit_seed) {
+  std::mt19937 gen(get_seed_or_explicit(explicit_seed));
   std::uniform_int_distribution<uint16_t> dis(0, 15); // Unsigned 4-bit range
   for (auto& val : data) {
     // Generate two separate 4-bit values
@@ -652,6 +688,88 @@ bool ValueSpec::validate_against_reference(
   return true;
 }
 
+// Ensure data is generated for this ValueSpec
+void ValueSpec::ensure_data_generated(int seed) {
+  if (data_generated_) {
+    return;
+  }
+  generate_tensor_data(seed);
+  data_generated_ = true;
+}
+
+// Copy input data from another ValueSpec
+void ValueSpec::copy_data_from(const ValueSpec& other) {
+  if (!is_tensor() || !other.is_tensor()) {
+    return;
+  }
+  // Copy raw data based on dtype
+  float_data = other.float_data;
+  int32_data = other.int32_data;
+  half_data = other.half_data;
+  int8_data = other.int8_data;
+  uint8_data = other.uint8_data;
+  data_generated_ = other.data_generated_;
+}
+
+// ReferenceKey implementation
+ReferenceKey ReferenceKey::from_test_case(const TestCase& tc) {
+  std::ostringstream oss;
+
+  // Serialize inputs that affect reference computation
+  // Skip: storage_type, memory_layout, string values (like impl_selector)
+  for (size_t i = 0; i < tc.inputs().size(); ++i) {
+    const ValueSpec& input = tc.inputs()[i];
+    oss << "i" << i << ":";
+
+    if (input.is_tensor()) {
+      // For tensors: sizes, dtype, data_gen_type, is_constant
+      oss << "T[";
+      for (size_t j = 0; j < input.sizes.size(); ++j) {
+        if (j > 0)
+          oss << ",";
+        oss << input.sizes[j];
+      }
+      oss << "]d" << static_cast<int>(input.dtype);
+      oss << "g" << static_cast<int>(input.data_gen_type);
+      oss << "c" << (input.is_constant() ? 1 : 0);
+      oss << "n" << (input.is_none() ? 1 : 0);
+    } else if (input.is_int()) {
+      oss << "I" << input.get_int_value();
+    } else if (input.is_float()) {
+      oss << "F" << input.get_float_value();
+    } else if (input.is_bool()) {
+      oss << "B" << (input.get_bool_value() ? 1 : 0);
+    } else if (input.is_int_list()) {
+      oss << "L[";
+      const auto& list = input.get_int_list();
+      for (size_t j = 0; j < list.size(); ++j) {
+        if (j > 0)
+          oss << ",";
+        oss << list[j];
+      }
+      oss << "]";
+    }
+    // Skip string inputs (like impl_selector) as they don't affect reference
+    oss << ";";
+  }
+
+  // Also include output shapes for completeness
+  for (size_t i = 0; i < tc.outputs().size(); ++i) {
+    const ValueSpec& output = tc.outputs()[i];
+    oss << "o" << i << ":[";
+    for (size_t j = 0; j < output.sizes.size(); ++j) {
+      if (j > 0)
+        oss << ",";
+      oss << output.sizes[j];
+    }
+    oss << "]d" << static_cast<int>(output.dtype) << ";";
+  }
+
+  ReferenceKey key;
+  key.key_string = oss.str();
+  return key;
+}
+
 // Helper function to collect GPU timing from querypool
 float collect_gpu_timing_us(
     ComputeGraph& graph,
@@ -685,11 +803,68 @@ float collect_gpu_timing_us(
   return 0.0f;
 }
 
+// Helper function to collect per-shader GPU timing from querypool
+// Returns a map of shader_name -> timing_us for non-filtered shaders
+std::unordered_map<std::string, float> collect_per_shader_timing_us(
+    ComputeGraph& graph,
+    const std::vector<std::string>& shader_filter) {
+  std::unordered_map<std::string, float> shader_timings;
+
+  graph.context()->querypool().extract_results();
+  const auto results = graph.context()->querypool().get_shader_timestamp_data();
+  for (const auto& shader_result : results) {
+    bool filtered = false;
+    // Check if this shader matches any filter pattern
+    for (const auto& filter_pattern : shader_filter) {
+      if (shader_result.kernel_name.find(filter_pattern) != std::string::npos) {
+        filtered = true;
+        break;
+      }
+    }
+
+    if (!filtered) {
+      // Calculate duration from start and end times, convert from ns to μs
+      uint64_t duration_ns =
+          shader_result.end_time_ns - shader_result.start_time_ns;
+      float duration_us = static_cast<float>(duration_ns) / 1000.0f;
+      // Accumulate timing for shaders with the same name
+      shader_timings[shader_result.kernel_name] += duration_us;
+    }
+  }
+  return shader_timings;
+}
+
 // BenchmarkResult implementation
 void BenchmarkResult::add_iter_timing(float time_us) {
   iter_timings.push_back(time_us);
 }
 
+void BenchmarkResult::add_shader_timing(
+    const std::string& shader_name,
+    float time_us,
+    const uint32_t global_wg[3],
+    const uint32_t local_wg[3]) {
+  // Find existing shader timing or create new one
+  for (auto& st : shader_timings_) {
+    if (st.shader_name == shader_name) {
+      st.iter_timings_us.push_back(time_us);
+      // Work group sizes should be consistent across iterations
+      return;
+    }
+  }
+  // Not found, create new entry
+  ShaderTiming new_timing;
+  new_timing.shader_name = shader_name;
+  new_timing.iter_timings_us.push_back(time_us);
+  new_timing.global_wg_size[0] = global_wg[0];
+  new_timing.global_wg_size[1] = global_wg[1];
+  new_timing.global_wg_size[2] = global_wg[2];
+  new_timing.local_wg_size[0] = local_wg[0];
+  new_timing.local_wg_size[1] = local_wg[1];
+  new_timing.local_wg_size[2] = local_wg[2];
+  shader_timings_.push_back(std::move(new_timing));
+}
+
 float BenchmarkResult::get_avg_time_us() const {
   if (iter_timings.empty()) {
     return 0.0f;
@@ -739,11 +914,27 @@ void BenchmarkResult::print_summary(
     const std::string& size_info,
     float total_gflops) const {
   static constexpr int OPERATOR_NAME_WIDTH = 50;
-  static constexpr int KERNEL_NAME_WIDTH = 70;
+  static constexpr int GLOBAL_WG_WIDTH = 16;
+  static constexpr int LOCAL_WG_WIDTH = 12;
+  static constexpr int KERNEL_NAME_WIDTH = 80;
   static constexpr int SIZE_INFO_WIDTH = 20;
-  static constexpr int TIMING_WIDTH = 20;
-  static constexpr int GFLOPS_WIDTH = 20;
-  static constexpr int CORRECTNESS_WIDTH = 10;
+  static constexpr int TIMING_WIDTH = 16;
+  static constexpr int GFLOPS_WIDTH = 14;
+  static constexpr int CORRECTNESS_WIDTH = 8;
+
+  // Helper to truncate shader names longer than 46 chars to 44 chars + ".."
+  auto truncate_shader_name = [](const std::string& name) -> std::string {
+    if (name.length() > 46) {
+      return name.substr(0, 44) + "..";
+    }
+    return name;
+  };
+
+  // Helper to format work group size as (x,y,z)
+  auto format_wg_size = [](const uint32_t wg[3]) -> std::string {
+    return "(" + std::to_string(wg[0]) + "," + std::to_string(wg[1]) + "," +
+        std::to_string(wg[2]) + ")";
+  };
 
   std::string correctness_str;
   switch (correctness_status_) {
@@ -758,14 +949,74 @@ void BenchmarkResult::print_summary(
       break;
   }
 
-  std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH)
-            << get_operator_name() << " " << std::left
-            << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right
-            << " " << std::setw(SIZE_INFO_WIDTH) << size_info
-            << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3)
-            << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH)
-            << std::fixed << std::setprecision(3) << total_gflops << " GFLOP/s "
-            << std::setw(CORRECTNESS_WIDTH) << correctness_str << std::endl;
+  // If we have per-shader timing data, print one line per shader plus overall
+  if (!shader_timings_.empty()) {
+    // If only one shader, print a single combined row
+    if (shader_timings_.size() == 1) {
+      const auto& st = shader_timings_[0];
+      std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH)
+                << truncate_shader_name(st.shader_name) << " " << std::left
+                << std::setw(GLOBAL_WG_WIDTH)
+                << format_wg_size(st.global_wg_size) << std::left
+                << std::setw(LOCAL_WG_WIDTH) << format_wg_size(st.local_wg_size)
+                << std::left << std::setw(KERNEL_NAME_WIDTH)
+                << get_kernel_name() << std::right << " "
+                << std::setw(SIZE_INFO_WIDTH) << size_info
+                << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3)
+                << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH)
+                << std::fixed << std::setprecision(3) << total_gflops
+                << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH)
+                << correctness_str << std::endl;
+    } else {
+      // Multiple shaders: print individual shader lines (without GFLOP/s)
+      for (size_t i = 0; i < shader_timings_.size(); ++i) {
+        const auto& st = shader_timings_[i];
+        float shader_avg_time = st.get_avg_time_us();
+
+        // Shader lines don't show test case info
+        std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH)
+                  << truncate_shader_name(st.shader_name) << " " << std::left
+                  << std::setw(GLOBAL_WG_WIDTH)
+                  << format_wg_size(st.global_wg_size) << std::left
+                  << std::setw(LOCAL_WG_WIDTH)
+                  << format_wg_size(st.local_wg_size) << std::left
+                  << std::setw(KERNEL_NAME_WIDTH) << "" << std::right << " "
+                  << std::setw(SIZE_INFO_WIDTH) << "" << std::setw(TIMING_WIDTH)
+                  << std::fixed << std::setprecision(3) << shader_avg_time
+                  << " μs " << std::setw(GFLOPS_WIDTH) << "" << "          "
+                  << std::setw(CORRECTNESS_WIDTH) << "" << std::endl;
+      }
+
+      // Print overall row with operator name, test case info, total time, and
+      // GFLOP/s
+      std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH)
+                << get_operator_name() << " " << std::left
+                << std::setw(GLOBAL_WG_WIDTH) << "" << std::left
+                << std::setw(LOCAL_WG_WIDTH) << "" << std::left
+                << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name()
+                << std::right << " " << std::setw(SIZE_INFO_WIDTH) << size_info
+                << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3)
+                << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH)
+                << std::fixed << std::setprecision(3) << total_gflops
+                << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH)
+                << correctness_str << std::endl;
+    }
+
+    // Print separator line between test cases
+  } else {
+    // No per-shader timing data, use the original format
+    std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH)
+              << get_operator_name() << " " << std::left
+              << std::setw(GLOBAL_WG_WIDTH) << "" << std::left
+              << std::setw(LOCAL_WG_WIDTH) << "" << std::left
+              << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right
+              << " " << std::setw(SIZE_INFO_WIDTH) << size_info
+              << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3)
+              << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH)
+              << std::fixed << std::setprecision(3) << total_gflops
+              << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH) << correctness_str
+              << std::endl;
+  }
 }
 
 // TestResult implementation
@@ -778,7 +1029,7 @@ void TestResult::add_result(BenchmarkResult&& result) {
 }
 
 void TestResult::print_summary() const {
-  static constexpr int CASE_WIDTH = 80;
+  static constexpr int CASE_WIDTH = 100;
   static constexpr int KERNEL_NAME_WIDTH = 20;
   static constexpr int TIMING_WIDTH = 12;
   static constexpr int PASS_WIDTH = 8;
@@ -1069,6 +1320,10 @@ ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name) {
       }
       ValueRef input_value = graph.add_scalar_list(std::move(int64_list));
       input_values.push_back(input_value);
+    } else if (input_spec.is_string()) {
+      std::string str_copy = input_spec.get_string_value();
+      ValueRef input_value = graph.add_string(std::move(str_copy));
+      input_values.push_back(input_value);
     } else if (input_spec.is_constant()) {
       ValueRef input_value = graph.add_tensorref(
           input_spec.get_tensor_sizes(),
@@ -1200,9 +1455,38 @@ execute_test_case(TestCase& test_case, int warmup_runs, int benchmark_runs) {
     float cpu_time_us = static_cast<float>(cpu_duration.count());
     total_cpu_time_us += cpu_time_us;
 
-    // Collect GPU timing using helper function
-    float gpu_time_us =
-        collect_gpu_timing_us(graph, test_case.get_shader_filter());
+    // Collect per-shader GPU timing - get raw shader results to preserve
+    // metadata
+    graph.context()->querypool().extract_results();
+    const auto shader_results =
+        graph.context()->querypool().get_shader_timestamp_data();
+
+    // Calculate total GPU time from per-shader timings
+    float gpu_time_us = 0.0f;
+    for (const auto& shader_result : shader_results) {
+      // Check if this shader matches any filter pattern
+      bool filtered = false;
+      for (const auto& filter_pattern : test_case.get_shader_filter()) {
+        if (shader_result.kernel_name.find(filter_pattern) !=
+            std::string::npos) {
+          filtered = true;
+          break;
+        }
+      }
+
+      if (!filtered) {
+        uint64_t duration_ns =
+            shader_result.end_time_ns - shader_result.start_time_ns;
+        float duration_us = static_cast<float>(duration_ns) / 1000.0f;
+        gpu_time_us += duration_us;
+        // Store per-shader timing with work group sizes
+        result.add_shader_timing(
+            shader_result.kernel_name,
+            duration_us,
+            shader_result.metadata.global_workgroup_size,
+            shader_result.metadata.local_workgroup_size);
+      }
+    }
     total_gpu_time_us += gpu_time_us;
 
     // Add the appropriate timing based on the flag
@@ -1274,110 +1558,177 @@ TestResult execute_test_cases(
             << operation_name << std::endl;
   print_separator();
 
+  // Group test cases by ReferenceKey for caching reference computations
+  // Use a vector to preserve the order in which groups first appear
+  std::vector<ReferenceKey> group_order;
+  std::unordered_map<ReferenceKey, std::vector<size_t>, ReferenceKeyHash>
+      groups;
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    ReferenceKey key = ReferenceKey::from_test_case(test_cases[i]);
+    if (groups.find(key) == groups.end()) {
+      group_order.push_back(key);
+    }
+    groups[key].push_back(i);
+  }
+
   bool any_correctness_failed = false;
   float total_gflops = 0.0f;
+  size_t test_case_counter = 0;
+
+  // Process each group: generate data, compute reference, execute, and print
+  // Iterate in the order groups first appeared in test_cases
+  for (const auto& key : group_order) {
+    const auto& indices = groups[key];
+    if (indices.empty())
+      continue;
+
+    // Get first test case as the "prototype"
+    size_t prototype_idx = indices[0];
+    TestCase& prototype = test_cases[prototype_idx];
+
+    // Generate data for prototype with deterministic seed based on key
+    int group_seed =
+        static_cast<int>(std::hash<std::string>{}(key.key_string) % 10000);
+    for (auto& input : prototype.inputs()) {
+      input.ensure_data_generated(group_seed++);
+    }
 
-  for (size_t i = 0; i < test_cases.size(); ++i) {
-    TestCase& test_case = test_cases[i];
-
-    // Compute reference data if reference function is provided
-    bool skipped_reference_fn = true;
+    // Compute reference once for prototype
+    bool ref_computed = false;
+    std::vector<std::vector<float>> ref_data;
     if (reference_compute_func) {
       try {
-        reference_compute_func(test_case);
-        skipped_reference_fn = false;
-      } catch (const std::invalid_argument& e) {
-        if (debugging()) {
-          std::cout << "Compute reference skipped: " << e.what() << std::endl;
+        reference_compute_func(prototype);
+        ref_computed = true;
+
+        // Cache the reference output for this group
+        for (const auto& output : prototype.outputs()) {
+          ref_data.push_back(output.get_ref_float_data());
         }
+      } catch (const std::invalid_argument& _) {
+        // Reference computation skipped for this group
       }
     }
 
-    // Execute single test case
-    BenchmarkResult result;
-    bool shader_not_supported = false;
-    try {
-      result = execute_test_case(test_case, warmup_runs, benchmark_runs);
-      result.set_operator_name(test_case.operator_name());
-    } catch (const vkcompute::vkapi::ShaderNotSupportedError&) {
-      result = BenchmarkResult(
-          test_case.name().empty() ? "unnamed_test_case" : test_case.name(),
-          test_case.operator_name());
-      shader_not_supported = true;
+    // Copy data and reference to other test cases in group
+    for (size_t i = 1; i < indices.size(); ++i) {
+      size_t tc_idx = indices[i];
+      TestCase& tc = test_cases[tc_idx];
+
+      // Copy input data from prototype
+      for (size_t j = 0;
+           j < tc.inputs().size() && j < prototype.inputs().size();
+           ++j) {
+        auto& dest = tc.inputs()[j];
+        const auto& src = prototype.inputs()[j];
+        if (dest.is_tensor() && src.is_tensor() && dest.sizes == src.sizes &&
+            dest.dtype == src.dtype) {
+          dest.copy_data_from(src);
+        }
+      }
+
+      // Copy reference output data if available
+      if (ref_computed) {
+        for (size_t j = 0; j < tc.outputs().size() && j < ref_data.size();
+             ++j) {
+          tc.outputs()[j].get_ref_float_data() = ref_data[j];
+        }
+      }
     }
 
-    // Determine if this test case passed (has valid timing data)
-    bool vulkan_execute_succeeded =
-        result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f;
+    // Execute and print results for all test cases in this group
+    for (size_t tc_idx : indices) {
+      TestCase& test_case = test_cases[tc_idx];
+      ++test_case_counter;
 
-    if (shader_not_supported) {
-      result.set_correctness_status(CorrectnessStatus::SKIPPED);
-    } else if (!vulkan_execute_succeeded) {
-      result.set_correctness_status(CorrectnessStatus::FAILED);
-    } else if (skipped_reference_fn) {
-      result.set_correctness_status(CorrectnessStatus::SKIPPED);
-    } else {
-      // Reference function provided and succeeded - validate outputs
-      bool correctness_passed = true;
-
-      for (size_t output_idx = 0; output_idx < test_case.num_outputs();
-           ++output_idx) {
-        const ValueSpec& output_spec = test_case.outputs()[output_idx];
-
-        if (!output_spec.validate_against_reference(
-                test_case.get_abs_tolerance(), test_case.get_rel_tolerance())) {
-          correctness_passed = false;
-          std::cout << "  Correctness validation FAILED for test "
-                    << result.get_kernel_name() << std::endl;
-          print_valuespec_data(output_spec, "vulkan output");
-          print_valuespec_data(output_spec, "ref output", true);
-
-          throw std::runtime_error("Correctness validation failed");
-        }
+      // Execute single test case
+      BenchmarkResult result;
+      bool shader_not_supported = false;
+      try {
+        result = execute_test_case(test_case, warmup_runs, benchmark_runs);
+        result.set_operator_name(test_case.operator_name());
+      } catch (const vkcompute::vkapi::ShaderNotSupportedError&) {
+        result = BenchmarkResult(
+            test_case.name().empty() ? "unnamed_test_case" : test_case.name(),
+            test_case.operator_name());
+        shader_not_supported = true;
       }
 
-      if (correctness_passed) {
-        result.set_correctness_status(CorrectnessStatus::PASSED);
-      } else {
-        any_correctness_failed = true;
+      // Determine if this test case passed (has valid timing data)
+      bool vulkan_execute_succeeded =
+          result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f;
+
+      if (shader_not_supported) {
+        result.set_correctness_status(CorrectnessStatus::SKIPPED);
+      } else if (!vulkan_execute_succeeded) {
         result.set_correctness_status(CorrectnessStatus::FAILED);
-      }
-    }
+      } else if (!ref_computed) {
+        result.set_correctness_status(CorrectnessStatus::SKIPPED);
+      } else {
+        // Reference function provided and succeeded - validate outputs
+        bool correctness_passed = true;
+
+        for (size_t output_idx = 0; output_idx < test_case.num_outputs();
+             ++output_idx) {
+          const ValueSpec& output_spec = test_case.outputs()[output_idx];
+
+          if (!output_spec.validate_against_reference(
+                  test_case.get_abs_tolerance(),
+                  test_case.get_rel_tolerance())) {
+            correctness_passed = false;
+            std::cout << "  Correctness validation FAILED for test "
+                      << result.get_kernel_name() << std::endl;
+            print_valuespec_data(output_spec, "vulkan output");
+            print_valuespec_data(output_spec, "ref output", true);
+
+            throw std::runtime_error("Correctness validation failed");
+          }
+        }
 
-    // Calculate GFLOPS for this test case using the provided FLOP calculator
-    float case_gflops = 0.0f;
-    if (vulkan_execute_succeeded) {
-      // Use the provided FLOP calculator to get total FLOPs for this test case
-      int64_t total_flops = flop_calculator(test_case);
-      float flops = static_cast<float>(total_flops);
-      float avg_time_us = result.get_avg_time_us();
-      if (avg_time_us > 0.0f && total_flops > 0) {
-        case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f);
+        if (correctness_passed) {
+          result.set_correctness_status(CorrectnessStatus::PASSED);
+        } else {
+          any_correctness_failed = true;
+          result.set_correctness_status(CorrectnessStatus::FAILED);
+        }
       }
 
-      total_gflops += case_gflops;
-    } else {
-      case_gflops = -1.0f; // Indicate failure
-    }
+      // Calculate GFLOPS for this test case using the provided FLOP calculator
+      float case_gflops = 0.0f;
+      if (vulkan_execute_succeeded) {
+        // Use the provided FLOP calculator to get total FLOPs for this test
+        // case
+        int64_t total_flops = flop_calculator(test_case);
+        float flops = static_cast<float>(total_flops);
+        float avg_time_us = result.get_avg_time_us();
+        if (avg_time_us > 0.0f && total_flops > 0) {
+          case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f);
+        }
 
-    // Calculate tensor info for display
-    std::string size_info = "[";
-    if (!test_case.empty() && test_case.num_inputs() > 0 &&
-        test_case.inputs()[0].is_tensor()) {
-      const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
-      for (size_t j = 0; j < sizes.size(); ++j) {
-        size_info += std::to_string(sizes[j]);
-        if (j < sizes.size() - 1)
-          size_info += "x";
+        total_gflops += case_gflops;
+      } else {
+        case_gflops = -1.0f; // Indicate failure
       }
-    }
-    size_info += "]";
 
-    // Print progress using the BenchmarkResult member function
-    result.print_summary(i + 1, size_info, case_gflops);
+      // Calculate tensor info for display
+      std::string size_info = "[";
+      if (!test_case.empty() && test_case.num_inputs() > 0 &&
+          test_case.inputs()[0].is_tensor()) {
+        const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
+        for (size_t j = 0; j < sizes.size(); ++j) {
+          size_info += std::to_string(sizes[j]);
+          if (j < sizes.size() - 1)
+            size_info += "x";
+        }
+      }
+      size_info += "]";
+
+      // Print progress using the BenchmarkResult member function
+      result.print_summary(test_case_counter, size_info, case_gflops);
 
-    // Add result to collection
-    results.add_result(std::move(result));
+      // Add result to collection
+      results.add_result(std::move(result));
+    }
   }
 
   // Set the overall results on the TestResult
diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h
index 666b2d2e409..9b5b6a46782 100644
--- a/backends/vulkan/test/custom_ops/utils.h
+++ b/backends/vulkan/test/custom_ops/utils.h
@@ -14,6 +14,7 @@
 #include <functional>
 #include <iomanip>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -23,6 +24,29 @@ namespace prototyping {
 
 using namespace vkcompute;
 
+//
+// ReferenceKey for caching reference computations
+//
+
+// Captures the identity of input conditions for test case grouping.
+// Test cases with the same ReferenceKey should produce identical reference
+// outputs, so reference computation can be cached and reused.
+struct ReferenceKey {
+  std::string key_string;
+
+  static ReferenceKey from_test_case(const class TestCase& tc);
+
+  bool operator==(const ReferenceKey& other) const {
+    return key_string == other.key_string;
+  }
+};
+
+struct ReferenceKeyHash {
+  size_t operator()(const ReferenceKey& k) const {
+    return std::hash<std::string>{}(k.key_string);
+  }
+};
+
 //
 // Global configuration options
 //
@@ -57,11 +81,70 @@ inline const std::vector<std::string> kLayoutOnlyShaderFilter = {
     "nchw_to",
     "to_nchw"};
 
+//
+// String utilities
+//
+
+// Helper function to get abbreviated layout names for test case naming
+inline std::string layout_abbrev(utils::GPUMemoryLayout layout) {
+  switch (layout) {
+    case utils::kWidthPacked:
+      return "WP";
+    case utils::kChannelsPacked:
+      return "CP";
+    case utils::kPackedInt8_4W:
+      return "4W";
+    case utils::kPackedInt8_4C:
+      return "4C";
+    case utils::kPackedInt8_4W4C:
+      return "4W4C";
+    case utils::kPackedInt8_4H4W:
+      return "4H4W";
+    case utils::kPackedInt8_4C1W:
+      return "4C1W";
+    default:
+      return "UNK";
+  }
+}
+
+// Helper function to get abbreviated storage type names for test case naming
+inline std::string storage_type_abbrev(utils::StorageType storage_type) {
+  switch (storage_type) {
+    case utils::kTexture3D:
+      return "Tex";
+    case utils::kBuffer:
+      return "Buf";
+    default:
+      return "UNK";
+  }
+}
+
+// Helper function to get combined storage type and layout representation
+// Example: (kBuffer, kPackedInt8_4W4C) -> "Buf_4W4C"
+inline std::string repr_str(
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout layout) {
+  return storage_type_abbrev(storage_type) + "(" + layout_abbrev(layout) + ")";
+}
+
+// Helper function to generate comma-separated shape string for test case naming
+// Example: {1, 128, 56, 56} -> "1,128,56,56"
+inline std::string shape_string(const std::vector<int64_t>& shape) {
+  std::string result;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (i > 0) {
+      result += ",";
+    }
+    result += std::to_string(shape[i]);
+  }
+  return result;
+}
+
 //
 // ValueSpec class
 //
 
-enum class SpecType { Tensor, IntList, Int, Float, Bool };
+enum class SpecType { Tensor, IntList, Int, Float, Bool, String };
 
 // Data generation types
 enum class DataGenType {
@@ -87,12 +170,14 @@ struct ValueSpec {
   bool is_constant_tensor;
   bool is_none_flag;
   bool is_int4_tensor;
+  bool data_generated_ = false;
 
   std::vector<float> float_data;
   std::vector<int32_t> int32_data;
   std::vector<uint16_t> half_data; // Using uint16_t as substitute for half
   std::vector<int8_t> int8_data; // For kChar (signed 8-bit)
   std::vector<uint8_t> uint8_data; // For kByte (unsigned 8-bit)
+  std::string string_data;
 
   std::vector<float> ref_float_data;
   std::vector<int32_t> ref_int32_data;
@@ -113,8 +198,9 @@ struct ValueSpec {
         data_gen_type(DataGenType::ZEROS),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {
-    generate_tensor_data();
+        is_int4_tensor(false),
+        data_generated_(false) {
+    // Data generation is deferred until ensure_data_generated() is called
   }
 
   // Constructor for tensor with custom data generation type
@@ -132,8 +218,9 @@ struct ValueSpec {
         data_gen_type(data_gen_type),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {
-    generate_tensor_data();
+        is_int4_tensor(false),
+        data_generated_(false) {
+    // Data generation is deferred until ensure_data_generated() is called
   }
 
   // Constructor for single int
@@ -146,7 +233,8 @@ struct ValueSpec {
         data_gen_type(DataGenType::FIXED),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {
+        is_int4_tensor(false),
+        data_generated_(true) {
     int32_data.push_back(value);
   }
 
@@ -160,7 +248,8 @@ struct ValueSpec {
         data_gen_type(DataGenType::FIXED),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {
+        is_int4_tensor(false),
+        data_generated_(true) {
     float_data.push_back(value);
   }
 
@@ -174,7 +263,8 @@ struct ValueSpec {
         data_gen_type(DataGenType::FIXED),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {
+        is_int4_tensor(false),
+        data_generated_(true) {
     int32_data.push_back(value ? 1 : 0);
   }
 
@@ -189,8 +279,26 @@ struct ValueSpec {
         is_constant_tensor(false),
         is_none_flag(false),
         is_int4_tensor(false),
+        data_generated_(true),
         int32_data(values) {}
 
+  // Factory method for string (avoids ambiguity with vector constructor)
+  static ValueSpec make_string(const std::string& value) {
+    ValueSpec spec;
+    spec.sizes = {1};
+    spec.dtype = vkapi::kInt;
+    spec.memory_layout = utils::kWidthPacked;
+    spec.storage_type = utils::kTexture3D;
+    spec.spec_type = SpecType::String;
+    spec.data_gen_type = DataGenType::FIXED;
+    spec.is_constant_tensor = false;
+    spec.is_none_flag = false;
+    spec.is_int4_tensor = false;
+    spec.data_generated_ = true;
+    spec.string_data = value;
+    return spec;
+  }
+
   // Default constructor
   ValueSpec()
       : dtype(vkapi::kFloat),
@@ -200,7 +308,8 @@ struct ValueSpec {
         data_gen_type(DataGenType::ZEROS),
         is_constant_tensor(false),
         is_none_flag(false),
-        is_int4_tensor(false) {}
+        is_int4_tensor(false),
+        data_generated_(false) {}
 
   int64_t numel() const;
   size_t nbytes() const;
@@ -221,6 +330,9 @@ struct ValueSpec {
   bool is_bool() const {
     return spec_type == SpecType::Bool;
   }
+  bool is_string() const {
+    return spec_type == SpecType::String;
+  }
 
   int32_t get_int_value() const {
     return int32_data.empty() ? 0 : int32_data[0];
@@ -231,6 +343,9 @@ struct ValueSpec {
   bool get_bool_value() const {
     return int32_data.empty() ? false : (int32_data[0] != 0);
   }
+  const std::string& get_string_value() const {
+    return string_data;
+  }
   const std::vector<int32_t>& get_int_list() const {
     return int32_data;
   }
@@ -306,12 +421,23 @@ struct ValueSpec {
   void* get_mutable_data_ptr();
   float get_element(size_t index) const;
 
+  // Data generation methods for deferred generation and caching
+  bool is_data_generated() const {
+    return data_generated_;
+  }
+  void ensure_data_generated(int seed = -1);
+  void copy_data_from(const ValueSpec& other);
+
   // Set/get constant flag
   bool is_constant() const {
     return is_constant_tensor;
   }
   void set_constant(bool is_constant) {
     is_constant_tensor = is_constant;
+    // Constant tensors need data immediately for test case setup
+    if (is_constant && is_tensor()) {
+      ensure_data_generated();
+    }
   }
 
   // Set/get none flag
@@ -341,7 +467,7 @@ struct ValueSpec {
       float rel_tolerance = 1e-3f) const;
 
  private:
-  void generate_tensor_data();
+  void generate_tensor_data(int seed = -1);
 };
 
 //
@@ -463,6 +589,25 @@ enum class CorrectnessStatus {
   FAILED // Reference function provided but validation failed
 };
 
+// Per-shader timing data for detailed reporting
+struct ShaderTiming {
+  std::string shader_name;
+  std::vector<float> iter_timings_us; // Individual iteration timings
+  uint32_t global_wg_size[3] = {0, 0, 0};
+  uint32_t local_wg_size[3] = {0, 0, 0};
+
+  float get_avg_time_us() const {
+    if (iter_timings_us.empty()) {
+      return 0.0f;
+    }
+    float sum = 0.0f;
+    for (float t : iter_timings_us) {
+      sum += t;
+    }
+    return sum / iter_timings_us.size();
+  }
+};
+
 class BenchmarkResult {
  public:
   BenchmarkResult() : correctness_status_(CorrectnessStatus::SKIPPED) {}
@@ -480,6 +625,18 @@ class BenchmarkResult {
   // Add timing for a single iteration
   void add_iter_timing(float time_us);
 
+  // Add per-shader timing for a single iteration
+  void add_shader_timing(
+      const std::string& shader_name,
+      float time_us,
+      const uint32_t global_wg[3],
+      const uint32_t local_wg[3]);
+
+  // Get per-shader timing data
+  const std::vector<ShaderTiming>& get_shader_timings() const {
+    return shader_timings_;
+  }
+
   // Getters
   const std::string& get_kernel_name() const {
     return kernel_name;
@@ -530,6 +687,7 @@ class BenchmarkResult {
   std::string operator_name;
   std::vector<float>
       iter_timings; // Individual iteration timings in microseconds
+  std::vector<ShaderTiming> shader_timings_; // Per-shader timing data
   CorrectnessStatus correctness_status_;
 };