diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp index 307e7d562b9..b23c288a58f 100644 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -21,37 +22,55 @@ int get_seed() { return seed++; } +int get_seed_or_explicit(int explicit_seed) { + if (explicit_seed >= 0) { + return explicit_seed; + } + return get_seed(); +} + // Forward declarations for data generation utilities void generate_random_float_data( std::vector& data, float min_val = -1.0f, - float max_val = 1.0f); + float max_val = 1.0f, + int explicit_seed = -1); void generate_random_int_data( std::vector& data, int min_val = -10, - int max_val = 10); + int max_val = 10, + int explicit_seed = -1); void generate_randint_float_data( std::vector& data, int min_val = -10, - int max_val = 10); + int max_val = 10, + int explicit_seed = -1); void generate_randint_half_data( std::vector& data, int min_val = -10, - int max_val = 10); + int max_val = 10, + int explicit_seed = -1); void generate_random_int8_data( std::vector& data, int8_t min_val = -10, - int8_t max_val = 10); + int8_t max_val = 10, + int explicit_seed = -1); void generate_random_uint8_data( std::vector& data, uint8_t min_val = 0, - uint8_t max_val = 255); -void generate_random_2xint4_data(std::vector& data); -void generate_random_2xint4_data(std::vector& data); + uint8_t max_val = 255, + int explicit_seed = -1); +void generate_random_2xint4_data( + std::vector& data, + int explicit_seed = -1); +void generate_random_2xint4_data( + std::vector& data, + int explicit_seed = -1); void generate_random_int4_data( std::vector& data, int8_t min_val = -8, - int8_t max_val = 7); + int8_t max_val = 7, + int explicit_seed = -1); void generate_ones_data(std::vector& data); void generate_zeros_data(std::vector& data); @@ -96,7 +115,7 @@ void set_debugging(bool enable_debugging) { } // ValueSpec implementation -void ValueSpec::generate_tensor_data() { +void ValueSpec::generate_tensor_data(int seed) { if (spec_type != SpecType::Tensor) { return; } @@ -107,15 +126,15 @@ void ValueSpec::generate_tensor_data() { case vkapi::kFloat: { float_data.resize(num_elements); if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); + generate_random_float_data(float_data, -1.0f, 1.0f, seed); } else if (data_gen_type == DataGenType::RANDOM_SCALES) { - generate_random_float_data(float_data, 0.005, 0.015); + generate_random_float_data(float_data, 0.005, 0.015, seed); } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); + generate_randint_float_data(float_data, -10, 10, seed); } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_float_data(float_data, -128, 127); + generate_randint_float_data(float_data, -128, 127, seed); } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_float_data(float_data, -8, 7); + generate_randint_float_data(float_data, -8, 7, seed); } else if (data_gen_type == DataGenType::ONES) { generate_ones_data(float_data); } else if (data_gen_type == DataGenType::ZEROS) { @@ -130,17 +149,17 @@ void ValueSpec::generate_tensor_data() { if (data_gen_type == DataGenType::RANDOM) { // Generate random float data first, then convert to half std::vector temp_data(num_elements); - generate_random_float_data(temp_data); + generate_random_float_data(temp_data, -1.0f, 1.0f, seed); for (size_t i = 0; i < temp_data.size(); ++i) { // Simple conversion to uint16_t representation of half half_data[i] = static_cast(temp_data[i] * 32767.0f); } } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_half_data(half_data); + generate_randint_half_data(half_data, -10, 10, seed); } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_half_data(half_data, -128, 127); + generate_randint_half_data(half_data, -128, 127, seed); } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_half_data(half_data, -8, 7); + generate_randint_half_data(half_data, -8, 7, seed); } else if (data_gen_type == DataGenType::ONES) { std::fill( half_data.begin(), @@ -162,14 +181,17 @@ void ValueSpec::generate_tensor_data() { case vkapi::kInt: { int32_data.resize(num_elements); if (data_gen_type == DataGenType::RANDOM) { - generate_random_int_data(int32_data); + generate_random_int_data(int32_data, -10, 10, seed); } else if (data_gen_type == DataGenType::RANDINT) { generate_random_int_data( - int32_data); // For int type, RANDINT is same as RANDOM + int32_data, + -10, + 10, + seed); // For int type, RANDINT is same as RANDOM } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int_data(int32_data, -128, 127); + generate_random_int_data(int32_data, -128, 127, seed); } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_int_data(int32_data, -8, 7); + generate_random_int_data(int32_data, -8, 7, seed); } else if (data_gen_type == DataGenType::ONES) { std::fill(int32_data.begin(), int32_data.end(), 1); } else if (data_gen_type == DataGenType::ZEROS) { @@ -182,13 +204,13 @@ void ValueSpec::generate_tensor_data() { case vkapi::kChar: { int8_data.resize(num_elements); if (data_gen_type == DataGenType::RANDOM) { - generate_random_int8_data(int8_data); + generate_random_int8_data(int8_data, -10, 10, seed); } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_int8_data(int8_data); + generate_random_int8_data(int8_data, -10, 10, seed); } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int8_data(int8_data, -128, 127); + generate_random_int8_data(int8_data, -128, 127, seed); } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(int8_data); + generate_random_2xint4_data(int8_data, seed); } else if (data_gen_type == DataGenType::ONES) { std::fill(int8_data.begin(), int8_data.end(), 1); } else if (data_gen_type == DataGenType::ONES_INT4) { @@ -204,13 +226,13 @@ void ValueSpec::generate_tensor_data() { case vkapi::kByte: { uint8_data.resize(num_elements); if (data_gen_type == DataGenType::RANDOM) { - generate_random_uint8_data(uint8_data); + generate_random_uint8_data(uint8_data, 0, 255, seed); } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_uint8_data(uint8_data); + generate_random_uint8_data(uint8_data, 0, 255, seed); } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_uint8_data(uint8_data, 0, 255); + generate_random_uint8_data(uint8_data, 0, 255, seed); } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(uint8_data); + generate_random_2xint4_data(uint8_data, seed); } else if (data_gen_type == DataGenType::ONES) { std::fill(uint8_data.begin(), uint8_data.end(), 1); } else if (data_gen_type == DataGenType::ONES_INT4) { @@ -227,9 +249,9 @@ void ValueSpec::generate_tensor_data() { // Default to float float_data.resize(num_elements); if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); + generate_random_float_data(float_data, -1.0f, 1.0f, seed); } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); + generate_randint_float_data(float_data, -10, 10, seed); } else if (data_gen_type == DataGenType::ONES) { generate_ones_data(float_data); } else if (data_gen_type == DataGenType::ZEROS) { @@ -316,6 +338,11 @@ std::string ValueSpec::to_string() const { result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; result += ")"; return result; + case SpecType::String: + result += "type=String, value=\""; + result += get_string_value(); + result += "\")"; + return result; } for (size_t i = 0; i < sizes.size(); ++i) { @@ -494,8 +521,9 @@ const void* ValueSpec::get_data_ptr() const { void generate_random_float_data( std::vector& data, float min_val, - float max_val) { - std::mt19937 gen(get_seed()); + float max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_real_distribution dis(min_val, max_val); for (auto& val : data) { val = dis(gen); @@ -505,8 +533,9 @@ void generate_random_float_data( void generate_random_int_data( std::vector& data, int min_val, - int max_val) { - std::mt19937 gen(get_seed()); + int max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = dis(gen); @@ -516,8 +545,9 @@ void generate_random_int_data( void generate_randint_float_data( std::vector& data, int min_val, - int max_val) { - std::mt19937 gen(get_seed()); + int max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = static_cast(dis(gen)); @@ -527,8 +557,9 @@ void generate_randint_float_data( void generate_randint_half_data( std::vector& data, int min_val, - int max_val) { - std::mt19937 gen(get_seed()); + int max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = static_cast(std::abs(dis(gen)) % 65536); @@ -542,8 +573,9 @@ void generate_ones_data(std::vector& data) { void generate_random_int8_data( std::vector& data, int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); + int8_t max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = static_cast(dis(gen)); @@ -553,8 +585,9 @@ void generate_random_int8_data( void generate_random_uint8_data( std::vector& data, uint8_t min_val, - uint8_t max_val) { - std::mt19937 gen(get_seed()); + uint8_t max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = static_cast(dis(gen)); @@ -564,16 +597,17 @@ void generate_random_uint8_data( void generate_random_int4_data( std::vector& data, int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); + int8_t max_val, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(min_val, max_val); for (auto& val : data) { val = static_cast(dis(gen)); } } -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); +void generate_random_2xint4_data(std::vector& data, int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(-8, 7); // Signed 4-bit range for (auto& val : data) { // Generate two separate 4-bit values @@ -584,8 +618,10 @@ void generate_random_2xint4_data(std::vector& data) { } } -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); +void generate_random_2xint4_data( + std::vector& data, + int explicit_seed) { + std::mt19937 gen(get_seed_or_explicit(explicit_seed)); std::uniform_int_distribution dis(0, 15); // Unsigned 4-bit range for (auto& val : data) { // Generate two separate 4-bit values @@ -652,6 +688,88 @@ bool ValueSpec::validate_against_reference( return true; } +// Ensure data is generated for this ValueSpec +void ValueSpec::ensure_data_generated(int seed) { + if (data_generated_) { + return; + } + generate_tensor_data(seed); + data_generated_ = true; +} + +// Copy input data from another ValueSpec +void ValueSpec::copy_data_from(const ValueSpec& other) { + if (!is_tensor() || !other.is_tensor()) { + return; + } + // Copy raw data based on dtype + float_data = other.float_data; + int32_data = other.int32_data; + half_data = other.half_data; + int8_data = other.int8_data; + uint8_data = other.uint8_data; + data_generated_ = other.data_generated_; +} + +// ReferenceKey implementation +ReferenceKey ReferenceKey::from_test_case(const TestCase& tc) { + std::ostringstream oss; + + // Serialize inputs that affect reference computation + // Skip: storage_type, memory_layout, string values (like impl_selector) + for (size_t i = 0; i < tc.inputs().size(); ++i) { + const ValueSpec& input = tc.inputs()[i]; + oss << "i" << i << ":"; + + if (input.is_tensor()) { + // For tensors: sizes, dtype, data_gen_type, is_constant + oss << "T["; + for (size_t j = 0; j < input.sizes.size(); ++j) { + if (j > 0) + oss << ","; + oss << input.sizes[j]; + } + oss << "]d" << static_cast(input.dtype); + oss << "g" << static_cast(input.data_gen_type); + oss << "c" << (input.is_constant() ? 1 : 0); + oss << "n" << (input.is_none() ? 1 : 0); + } else if (input.is_int()) { + oss << "I" << input.get_int_value(); + } else if (input.is_float()) { + oss << "F" << input.get_float_value(); + } else if (input.is_bool()) { + oss << "B" << (input.get_bool_value() ? 1 : 0); + } else if (input.is_int_list()) { + oss << "L["; + const auto& list = input.get_int_list(); + for (size_t j = 0; j < list.size(); ++j) { + if (j > 0) + oss << ","; + oss << list[j]; + } + oss << "]"; + } + // Skip string inputs (like impl_selector) as they don't affect reference + oss << ";"; + } + + // Also include output shapes for completeness + for (size_t i = 0; i < tc.outputs().size(); ++i) { + const ValueSpec& output = tc.outputs()[i]; + oss << "o" << i << ":["; + for (size_t j = 0; j < output.sizes.size(); ++j) { + if (j > 0) + oss << ","; + oss << output.sizes[j]; + } + oss << "]d" << static_cast(output.dtype) << ";"; + } + + ReferenceKey key; + key.key_string = oss.str(); + return key; +} + // Helper function to collect GPU timing from querypool float collect_gpu_timing_us( ComputeGraph& graph, @@ -685,11 +803,68 @@ float collect_gpu_timing_us( return 0.0f; } +// Helper function to collect per-shader GPU timing from querypool +// Returns a map of shader_name -> timing_us for non-filtered shaders +std::unordered_map collect_per_shader_timing_us( + ComputeGraph& graph, + const std::vector& shader_filter) { + std::unordered_map shader_timings; + + graph.context()->querypool().extract_results(); + const auto results = graph.context()->querypool().get_shader_timestamp_data(); + for (const auto& shader_result : results) { + bool filtered = false; + // Check if this shader matches any filter pattern + for (const auto& filter_pattern : shader_filter) { + if (shader_result.kernel_name.find(filter_pattern) != std::string::npos) { + filtered = true; + break; + } + } + + if (!filtered) { + // Calculate duration from start and end times, convert from ns to μs + uint64_t duration_ns = + shader_result.end_time_ns - shader_result.start_time_ns; + float duration_us = static_cast(duration_ns) / 1000.0f; + // Accumulate timing for shaders with the same name + shader_timings[shader_result.kernel_name] += duration_us; + } + } + return shader_timings; +} + // BenchmarkResult implementation void BenchmarkResult::add_iter_timing(float time_us) { iter_timings.push_back(time_us); } +void BenchmarkResult::add_shader_timing( + const std::string& shader_name, + float time_us, + const uint32_t global_wg[3], + const uint32_t local_wg[3]) { + // Find existing shader timing or create new one + for (auto& st : shader_timings_) { + if (st.shader_name == shader_name) { + st.iter_timings_us.push_back(time_us); + // Work group sizes should be consistent across iterations + return; + } + } + // Not found, create new entry + ShaderTiming new_timing; + new_timing.shader_name = shader_name; + new_timing.iter_timings_us.push_back(time_us); + new_timing.global_wg_size[0] = global_wg[0]; + new_timing.global_wg_size[1] = global_wg[1]; + new_timing.global_wg_size[2] = global_wg[2]; + new_timing.local_wg_size[0] = local_wg[0]; + new_timing.local_wg_size[1] = local_wg[1]; + new_timing.local_wg_size[2] = local_wg[2]; + shader_timings_.push_back(std::move(new_timing)); +} + float BenchmarkResult::get_avg_time_us() const { if (iter_timings.empty()) { return 0.0f; @@ -739,11 +914,27 @@ void BenchmarkResult::print_summary( const std::string& size_info, float total_gflops) const { static constexpr int OPERATOR_NAME_WIDTH = 50; - static constexpr int KERNEL_NAME_WIDTH = 70; + static constexpr int GLOBAL_WG_WIDTH = 16; + static constexpr int LOCAL_WG_WIDTH = 12; + static constexpr int KERNEL_NAME_WIDTH = 80; static constexpr int SIZE_INFO_WIDTH = 20; - static constexpr int TIMING_WIDTH = 20; - static constexpr int GFLOPS_WIDTH = 20; - static constexpr int CORRECTNESS_WIDTH = 10; + static constexpr int TIMING_WIDTH = 16; + static constexpr int GFLOPS_WIDTH = 14; + static constexpr int CORRECTNESS_WIDTH = 8; + + // Helper to truncate shader names longer than 46 chars to 44 chars + ".." + auto truncate_shader_name = [](const std::string& name) -> std::string { + if (name.length() > 46) { + return name.substr(0, 44) + ".."; + } + return name; + }; + + // Helper to format work group size as (x,y,z) + auto format_wg_size = [](const uint32_t wg[3]) -> std::string { + return "(" + std::to_string(wg[0]) + "," + std::to_string(wg[1]) + "," + + std::to_string(wg[2]) + ")"; + }; std::string correctness_str; switch (correctness_status_) { @@ -758,14 +949,74 @@ void BenchmarkResult::print_summary( break; } - std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) - << get_operator_name() << " " << std::left - << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right - << " " << std::setw(SIZE_INFO_WIDTH) << size_info - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) - << std::fixed << std::setprecision(3) << total_gflops << " GFLOP/s " - << std::setw(CORRECTNESS_WIDTH) << correctness_str << std::endl; + // If we have per-shader timing data, print one line per shader plus overall + if (!shader_timings_.empty()) { + // If only one shader, print a single combined row + if (shader_timings_.size() == 1) { + const auto& st = shader_timings_[0]; + std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) + << truncate_shader_name(st.shader_name) << " " << std::left + << std::setw(GLOBAL_WG_WIDTH) + << format_wg_size(st.global_wg_size) << std::left + << std::setw(LOCAL_WG_WIDTH) << format_wg_size(st.local_wg_size) + << std::left << std::setw(KERNEL_NAME_WIDTH) + << get_kernel_name() << std::right << " " + << std::setw(SIZE_INFO_WIDTH) << size_info + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) + << std::fixed << std::setprecision(3) << total_gflops + << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH) + << correctness_str << std::endl; + } else { + // Multiple shaders: print individual shader lines (without GFLOP/s) + for (size_t i = 0; i < shader_timings_.size(); ++i) { + const auto& st = shader_timings_[i]; + float shader_avg_time = st.get_avg_time_us(); + + // Shader lines don't show test case info + std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) + << truncate_shader_name(st.shader_name) << " " << std::left + << std::setw(GLOBAL_WG_WIDTH) + << format_wg_size(st.global_wg_size) << std::left + << std::setw(LOCAL_WG_WIDTH) + << format_wg_size(st.local_wg_size) << std::left + << std::setw(KERNEL_NAME_WIDTH) << "" << std::right << " " + << std::setw(SIZE_INFO_WIDTH) << "" << std::setw(TIMING_WIDTH) + << std::fixed << std::setprecision(3) << shader_avg_time + << " μs " << std::setw(GFLOPS_WIDTH) << "" << " " + << std::setw(CORRECTNESS_WIDTH) << "" << std::endl; + } + + // Print overall row with operator name, test case info, total time, and + // GFLOP/s + std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) + << get_operator_name() << " " << std::left + << std::setw(GLOBAL_WG_WIDTH) << "" << std::left + << std::setw(LOCAL_WG_WIDTH) << "" << std::left + << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() + << std::right << " " << std::setw(SIZE_INFO_WIDTH) << size_info + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) + << std::fixed << std::setprecision(3) << total_gflops + << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH) + << correctness_str << std::endl; + } + + // Print separator line between test cases + } else { + // No per-shader timing data, use the original format + std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) + << get_operator_name() << " " << std::left + << std::setw(GLOBAL_WG_WIDTH) << "" << std::left + << std::setw(LOCAL_WG_WIDTH) << "" << std::left + << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right + << " " << std::setw(SIZE_INFO_WIDTH) << size_info + << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) + << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) + << std::fixed << std::setprecision(3) << total_gflops + << " GFLOP/s " << std::setw(CORRECTNESS_WIDTH) << correctness_str + << std::endl; + } } // TestResult implementation @@ -778,7 +1029,7 @@ void TestResult::add_result(BenchmarkResult&& result) { } void TestResult::print_summary() const { - static constexpr int CASE_WIDTH = 80; + static constexpr int CASE_WIDTH = 100; static constexpr int KERNEL_NAME_WIDTH = 20; static constexpr int TIMING_WIDTH = 12; static constexpr int PASS_WIDTH = 8; @@ -1069,6 +1320,10 @@ ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name) { } ValueRef input_value = graph.add_scalar_list(std::move(int64_list)); input_values.push_back(input_value); + } else if (input_spec.is_string()) { + std::string str_copy = input_spec.get_string_value(); + ValueRef input_value = graph.add_string(std::move(str_copy)); + input_values.push_back(input_value); } else if (input_spec.is_constant()) { ValueRef input_value = graph.add_tensorref( input_spec.get_tensor_sizes(), @@ -1200,9 +1455,38 @@ execute_test_case(TestCase& test_case, int warmup_runs, int benchmark_runs) { float cpu_time_us = static_cast(cpu_duration.count()); total_cpu_time_us += cpu_time_us; - // Collect GPU timing using helper function - float gpu_time_us = - collect_gpu_timing_us(graph, test_case.get_shader_filter()); + // Collect per-shader GPU timing - get raw shader results to preserve + // metadata + graph.context()->querypool().extract_results(); + const auto shader_results = + graph.context()->querypool().get_shader_timestamp_data(); + + // Calculate total GPU time from per-shader timings + float gpu_time_us = 0.0f; + for (const auto& shader_result : shader_results) { + // Check if this shader matches any filter pattern + bool filtered = false; + for (const auto& filter_pattern : test_case.get_shader_filter()) { + if (shader_result.kernel_name.find(filter_pattern) != + std::string::npos) { + filtered = true; + break; + } + } + + if (!filtered) { + uint64_t duration_ns = + shader_result.end_time_ns - shader_result.start_time_ns; + float duration_us = static_cast(duration_ns) / 1000.0f; + gpu_time_us += duration_us; + // Store per-shader timing with work group sizes + result.add_shader_timing( + shader_result.kernel_name, + duration_us, + shader_result.metadata.global_workgroup_size, + shader_result.metadata.local_workgroup_size); + } + } total_gpu_time_us += gpu_time_us; // Add the appropriate timing based on the flag @@ -1274,110 +1558,177 @@ TestResult execute_test_cases( << operation_name << std::endl; print_separator(); + // Group test cases by ReferenceKey for caching reference computations + // Use a vector to preserve the order in which groups first appear + std::vector group_order; + std::unordered_map, ReferenceKeyHash> + groups; + for (size_t i = 0; i < test_cases.size(); ++i) { + ReferenceKey key = ReferenceKey::from_test_case(test_cases[i]); + if (groups.find(key) == groups.end()) { + group_order.push_back(key); + } + groups[key].push_back(i); + } + bool any_correctness_failed = false; float total_gflops = 0.0f; + size_t test_case_counter = 0; + + // Process each group: generate data, compute reference, execute, and print + // Iterate in the order groups first appeared in test_cases + for (const auto& key : group_order) { + const auto& indices = groups[key]; + if (indices.empty()) + continue; + + // Get first test case as the "prototype" + size_t prototype_idx = indices[0]; + TestCase& prototype = test_cases[prototype_idx]; + + // Generate data for prototype with deterministic seed based on key + int group_seed = + static_cast(std::hash{}(key.key_string) % 10000); + for (auto& input : prototype.inputs()) { + input.ensure_data_generated(group_seed++); + } - for (size_t i = 0; i < test_cases.size(); ++i) { - TestCase& test_case = test_cases[i]; - - // Compute reference data if reference function is provided - bool skipped_reference_fn = true; + // Compute reference once for prototype + bool ref_computed = false; + std::vector> ref_data; if (reference_compute_func) { try { - reference_compute_func(test_case); - skipped_reference_fn = false; - } catch (const std::invalid_argument& e) { - if (debugging()) { - std::cout << "Compute reference skipped: " << e.what() << std::endl; + reference_compute_func(prototype); + ref_computed = true; + + // Cache the reference output for this group + for (const auto& output : prototype.outputs()) { + ref_data.push_back(output.get_ref_float_data()); } + } catch (const std::invalid_argument& _) { + // Reference computation skipped for this group } } - // Execute single test case - BenchmarkResult result; - bool shader_not_supported = false; - try { - result = execute_test_case(test_case, warmup_runs, benchmark_runs); - result.set_operator_name(test_case.operator_name()); - } catch (const vkcompute::vkapi::ShaderNotSupportedError&) { - result = BenchmarkResult( - test_case.name().empty() ? "unnamed_test_case" : test_case.name(), - test_case.operator_name()); - shader_not_supported = true; + // Copy data and reference to other test cases in group + for (size_t i = 1; i < indices.size(); ++i) { + size_t tc_idx = indices[i]; + TestCase& tc = test_cases[tc_idx]; + + // Copy input data from prototype + for (size_t j = 0; + j < tc.inputs().size() && j < prototype.inputs().size(); + ++j) { + auto& dest = tc.inputs()[j]; + const auto& src = prototype.inputs()[j]; + if (dest.is_tensor() && src.is_tensor() && dest.sizes == src.sizes && + dest.dtype == src.dtype) { + dest.copy_data_from(src); + } + } + + // Copy reference output data if available + if (ref_computed) { + for (size_t j = 0; j < tc.outputs().size() && j < ref_data.size(); + ++j) { + tc.outputs()[j].get_ref_float_data() = ref_data[j]; + } + } } - // Determine if this test case passed (has valid timing data) - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + // Execute and print results for all test cases in this group + for (size_t tc_idx : indices) { + TestCase& test_case = test_cases[tc_idx]; + ++test_case_counter; - if (shader_not_supported) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else if (!vulkan_execute_succeeded) { - result.set_correctness_status(CorrectnessStatus::FAILED); - } else if (skipped_reference_fn) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else { - // Reference function provided and succeeded - validate outputs - bool correctness_passed = true; - - for (size_t output_idx = 0; output_idx < test_case.num_outputs(); - ++output_idx) { - const ValueSpec& output_spec = test_case.outputs()[output_idx]; - - if (!output_spec.validate_against_reference( - test_case.get_abs_tolerance(), test_case.get_rel_tolerance())) { - correctness_passed = false; - std::cout << " Correctness validation FAILED for test " - << result.get_kernel_name() << std::endl; - print_valuespec_data(output_spec, "vulkan output"); - print_valuespec_data(output_spec, "ref output", true); - - throw std::runtime_error("Correctness validation failed"); - } + // Execute single test case + BenchmarkResult result; + bool shader_not_supported = false; + try { + result = execute_test_case(test_case, warmup_runs, benchmark_runs); + result.set_operator_name(test_case.operator_name()); + } catch (const vkcompute::vkapi::ShaderNotSupportedError&) { + result = BenchmarkResult( + test_case.name().empty() ? "unnamed_test_case" : test_case.name(), + test_case.operator_name()); + shader_not_supported = true; } - if (correctness_passed) { - result.set_correctness_status(CorrectnessStatus::PASSED); - } else { - any_correctness_failed = true; + // Determine if this test case passed (has valid timing data) + bool vulkan_execute_succeeded = + result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; + + if (shader_not_supported) { + result.set_correctness_status(CorrectnessStatus::SKIPPED); + } else if (!vulkan_execute_succeeded) { result.set_correctness_status(CorrectnessStatus::FAILED); - } - } + } else if (!ref_computed) { + result.set_correctness_status(CorrectnessStatus::SKIPPED); + } else { + // Reference function provided and succeeded - validate outputs + bool correctness_passed = true; + + for (size_t output_idx = 0; output_idx < test_case.num_outputs(); + ++output_idx) { + const ValueSpec& output_spec = test_case.outputs()[output_idx]; + + if (!output_spec.validate_against_reference( + test_case.get_abs_tolerance(), + test_case.get_rel_tolerance())) { + correctness_passed = false; + std::cout << " Correctness validation FAILED for test " + << result.get_kernel_name() << std::endl; + print_valuespec_data(output_spec, "vulkan output"); + print_valuespec_data(output_spec, "ref output", true); + + throw std::runtime_error("Correctness validation failed"); + } + } - // Calculate GFLOPS for this test case using the provided FLOP calculator - float case_gflops = 0.0f; - if (vulkan_execute_succeeded) { - // Use the provided FLOP calculator to get total FLOPs for this test case - int64_t total_flops = flop_calculator(test_case); - float flops = static_cast(total_flops); - float avg_time_us = result.get_avg_time_us(); - if (avg_time_us > 0.0f && total_flops > 0) { - case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f); + if (correctness_passed) { + result.set_correctness_status(CorrectnessStatus::PASSED); + } else { + any_correctness_failed = true; + result.set_correctness_status(CorrectnessStatus::FAILED); + } } - total_gflops += case_gflops; - } else { - case_gflops = -1.0f; // Indicate failure - } + // Calculate GFLOPS for this test case using the provided FLOP calculator + float case_gflops = 0.0f; + if (vulkan_execute_succeeded) { + // Use the provided FLOP calculator to get total FLOPs for this test + // case + int64_t total_flops = flop_calculator(test_case); + float flops = static_cast(total_flops); + float avg_time_us = result.get_avg_time_us(); + if (avg_time_us > 0.0f && total_flops > 0) { + case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f); + } - // Calculate tensor info for display - std::string size_info = "["; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (size_t j = 0; j < sizes.size(); ++j) { - size_info += std::to_string(sizes[j]); - if (j < sizes.size() - 1) - size_info += "x"; + total_gflops += case_gflops; + } else { + case_gflops = -1.0f; // Indicate failure } - } - size_info += "]"; - // Print progress using the BenchmarkResult member function - result.print_summary(i + 1, size_info, case_gflops); + // Calculate tensor info for display + std::string size_info = "["; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (size_t j = 0; j < sizes.size(); ++j) { + size_info += std::to_string(sizes[j]); + if (j < sizes.size() - 1) + size_info += "x"; + } + } + size_info += "]"; + + // Print progress using the BenchmarkResult member function + result.print_summary(test_case_counter, size_info, case_gflops); - // Add result to collection - results.add_result(std::move(result)); + // Add result to collection + results.add_result(std::move(result)); + } } // Set the overall results on the TestResult diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h index 666b2d2e409..9b5b6a46782 100644 --- a/backends/vulkan/test/custom_ops/utils.h +++ b/backends/vulkan/test/custom_ops/utils.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,29 @@ namespace prototyping { using namespace vkcompute; +// +// ReferenceKey for caching reference computations +// + +// Captures the identity of input conditions for test case grouping. +// Test cases with the same ReferenceKey should produce identical reference +// outputs, so reference computation can be cached and reused. +struct ReferenceKey { + std::string key_string; + + static ReferenceKey from_test_case(const class TestCase& tc); + + bool operator==(const ReferenceKey& other) const { + return key_string == other.key_string; + } +}; + +struct ReferenceKeyHash { + size_t operator()(const ReferenceKey& k) const { + return std::hash{}(k.key_string); + } +}; + // // Global configuration options // @@ -57,11 +81,70 @@ inline const std::vector kLayoutOnlyShaderFilter = { "nchw_to", "to_nchw"}; +// +// String utilities +// + +// Helper function to get abbreviated layout names for test case naming +inline std::string layout_abbrev(utils::GPUMemoryLayout layout) { + switch (layout) { + case utils::kWidthPacked: + return "WP"; + case utils::kChannelsPacked: + return "CP"; + case utils::kPackedInt8_4W: + return "4W"; + case utils::kPackedInt8_4C: + return "4C"; + case utils::kPackedInt8_4W4C: + return "4W4C"; + case utils::kPackedInt8_4H4W: + return "4H4W"; + case utils::kPackedInt8_4C1W: + return "4C1W"; + default: + return "UNK"; + } +} + +// Helper function to get abbreviated storage type names for test case naming +inline std::string storage_type_abbrev(utils::StorageType storage_type) { + switch (storage_type) { + case utils::kTexture3D: + return "Tex"; + case utils::kBuffer: + return "Buf"; + default: + return "UNK"; + } +} + +// Helper function to get combined storage type and layout representation +// Example: (kBuffer, kPackedInt8_4W4C) -> "Buf_4W4C" +inline std::string repr_str( + utils::StorageType storage_type, + utils::GPUMemoryLayout layout) { + return storage_type_abbrev(storage_type) + "(" + layout_abbrev(layout) + ")"; +} + +// Helper function to generate comma-separated shape string for test case naming +// Example: {1, 128, 56, 56} -> "1,128,56,56" +inline std::string shape_string(const std::vector& shape) { + std::string result; + for (size_t i = 0; i < shape.size(); ++i) { + if (i > 0) { + result += ","; + } + result += std::to_string(shape[i]); + } + return result; +} + // // ValueSpec class // -enum class SpecType { Tensor, IntList, Int, Float, Bool }; +enum class SpecType { Tensor, IntList, Int, Float, Bool, String }; // Data generation types enum class DataGenType { @@ -87,12 +170,14 @@ struct ValueSpec { bool is_constant_tensor; bool is_none_flag; bool is_int4_tensor; + bool data_generated_ = false; std::vector float_data; std::vector int32_data; std::vector half_data; // Using uint16_t as substitute for half std::vector int8_data; // For kChar (signed 8-bit) std::vector uint8_data; // For kByte (unsigned 8-bit) + std::string string_data; std::vector ref_float_data; std::vector ref_int32_data; @@ -113,8 +198,9 @@ struct ValueSpec { data_gen_type(DataGenType::ZEROS), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); + is_int4_tensor(false), + data_generated_(false) { + // Data generation is deferred until ensure_data_generated() is called } // Constructor for tensor with custom data generation type @@ -132,8 +218,9 @@ struct ValueSpec { data_gen_type(data_gen_type), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); + is_int4_tensor(false), + data_generated_(false) { + // Data generation is deferred until ensure_data_generated() is called } // Constructor for single int @@ -146,7 +233,8 @@ struct ValueSpec { data_gen_type(DataGenType::FIXED), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) { + is_int4_tensor(false), + data_generated_(true) { int32_data.push_back(value); } @@ -160,7 +248,8 @@ struct ValueSpec { data_gen_type(DataGenType::FIXED), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) { + is_int4_tensor(false), + data_generated_(true) { float_data.push_back(value); } @@ -174,7 +263,8 @@ struct ValueSpec { data_gen_type(DataGenType::FIXED), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) { + is_int4_tensor(false), + data_generated_(true) { int32_data.push_back(value ? 1 : 0); } @@ -189,8 +279,26 @@ struct ValueSpec { is_constant_tensor(false), is_none_flag(false), is_int4_tensor(false), + data_generated_(true), int32_data(values) {} + // Factory method for string (avoids ambiguity with vector constructor) + static ValueSpec make_string(const std::string& value) { + ValueSpec spec; + spec.sizes = {1}; + spec.dtype = vkapi::kInt; + spec.memory_layout = utils::kWidthPacked; + spec.storage_type = utils::kTexture3D; + spec.spec_type = SpecType::String; + spec.data_gen_type = DataGenType::FIXED; + spec.is_constant_tensor = false; + spec.is_none_flag = false; + spec.is_int4_tensor = false; + spec.data_generated_ = true; + spec.string_data = value; + return spec; + } + // Default constructor ValueSpec() : dtype(vkapi::kFloat), @@ -200,7 +308,8 @@ struct ValueSpec { data_gen_type(DataGenType::ZEROS), is_constant_tensor(false), is_none_flag(false), - is_int4_tensor(false) {} + is_int4_tensor(false), + data_generated_(false) {} int64_t numel() const; size_t nbytes() const; @@ -221,6 +330,9 @@ struct ValueSpec { bool is_bool() const { return spec_type == SpecType::Bool; } + bool is_string() const { + return spec_type == SpecType::String; + } int32_t get_int_value() const { return int32_data.empty() ? 0 : int32_data[0]; @@ -231,6 +343,9 @@ struct ValueSpec { bool get_bool_value() const { return int32_data.empty() ? false : (int32_data[0] != 0); } + const std::string& get_string_value() const { + return string_data; + } const std::vector& get_int_list() const { return int32_data; } @@ -306,12 +421,23 @@ struct ValueSpec { void* get_mutable_data_ptr(); float get_element(size_t index) const; + // Data generation methods for deferred generation and caching + bool is_data_generated() const { + return data_generated_; + } + void ensure_data_generated(int seed = -1); + void copy_data_from(const ValueSpec& other); + // Set/get constant flag bool is_constant() const { return is_constant_tensor; } void set_constant(bool is_constant) { is_constant_tensor = is_constant; + // Constant tensors need data immediately for test case setup + if (is_constant && is_tensor()) { + ensure_data_generated(); + } } // Set/get none flag @@ -341,7 +467,7 @@ struct ValueSpec { float rel_tolerance = 1e-3f) const; private: - void generate_tensor_data(); + void generate_tensor_data(int seed = -1); }; // @@ -463,6 +589,25 @@ enum class CorrectnessStatus { FAILED // Reference function provided but validation failed }; +// Per-shader timing data for detailed reporting +struct ShaderTiming { + std::string shader_name; + std::vector iter_timings_us; // Individual iteration timings + uint32_t global_wg_size[3] = {0, 0, 0}; + uint32_t local_wg_size[3] = {0, 0, 0}; + + float get_avg_time_us() const { + if (iter_timings_us.empty()) { + return 0.0f; + } + float sum = 0.0f; + for (float t : iter_timings_us) { + sum += t; + } + return sum / iter_timings_us.size(); + } +}; + class BenchmarkResult { public: BenchmarkResult() : correctness_status_(CorrectnessStatus::SKIPPED) {} @@ -480,6 +625,18 @@ class BenchmarkResult { // Add timing for a single iteration void add_iter_timing(float time_us); + // Add per-shader timing for a single iteration + void add_shader_timing( + const std::string& shader_name, + float time_us, + const uint32_t global_wg[3], + const uint32_t local_wg[3]); + + // Get per-shader timing data + const std::vector& get_shader_timings() const { + return shader_timings_; + } + // Getters const std::string& get_kernel_name() const { return kernel_name; @@ -530,6 +687,7 @@ class BenchmarkResult { std::string operator_name; std::vector iter_timings; // Individual iteration timings in microseconds + std::vector shader_timings_; // Per-shader timing data CorrectnessStatus correctness_status_; };