From 667bec2cf43f26a537c508bcfa22482156a71876 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 15:45:12 +1300 Subject: [PATCH 01/10] [sim] Optionally enable health monitor --- Cargo.lock | 1 + dev-tools/omicron-dev/Cargo.toml | 1 + dev-tools/omicron-dev/src/main.rs | 15 ++++++++++++++- nexus/inventory/src/collector.rs | 3 +++ nexus/test-utils/src/nexus_test.rs | 3 +++ nexus/test-utils/src/starter.rs | 14 ++++++++++++++ nexus/tests/integration_tests/instances.rs | 4 ++++ nexus/tests/integration_tests/sleds.rs | 1 + sled-agent/src/bin/sled-agent-sim.rs | 5 +++++ sled-agent/src/long_running_tasks.rs | 2 +- sled-agent/src/sim/config.rs | 12 ++++++++++++ sled-agent/src/sim/mod.rs | 5 +++-- sled-agent/src/sim/sled_agent.rs | 11 +++++++++-- 13 files changed, 71 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3cf4ebbf0b..c0c43c38aaf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8158,6 +8158,7 @@ dependencies = [ "omicron-dev-lib", "omicron-nexus", "omicron-rpaths", + "omicron-sled-agent", "omicron-test-utils", "omicron-workspace-hack", "oxide-client", diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 21a4bc7210c..971e5af5bfd 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -23,6 +23,7 @@ nexus-test-interface.workspace = true nexus-test-utils = { workspace = true, features = ["omicron-dev"] } omicron-nexus.workspace = true omicron-workspace-hack.workspace = true +omicron-sled-agent.workspace = true oxide-tokio-rt.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 9fa5ac0fc05..a5efe7c766c 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -11,6 +11,7 @@ use libc::SIGINT; use nexus_config::NexusConfig; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::DiskTest; +use omicron_sled_agent::sim::ConfigHealthMonitor; use signal_hook_tokio::Signals; use std::fs; @@ -57,6 +58,9 @@ struct RunAllArgs { /// Override the nexus configuration file. #[clap(long, default_value = DEFAULT_NEXUS_CONFIG)] nexus_config: Utf8PathBuf, + /// Enable the sled agent health monitor + #[clap(long, default_value_t = false, action)] + enable_sled_agent_health_monitor: bool, } impl RunAllArgs { @@ -87,10 +91,19 @@ impl RunAllArgs { .set_port(p); } + let sled_agent_health_monitor = ConfigHealthMonitor { + enabled: self.enable_sled_agent_health_monitor, + }; + println!("omicron-dev: setting up all services ... "); let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< omicron_nexus::Server, - >(&mut config, 0, self.gateway_config.clone()) + >( + &mut config, + 0, + self.gateway_config.clone(), + sled_agent_health_monitor, + ) .await .context("error setting up services")?; diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 915cfa58af3..9dd16fc028c 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -980,6 +980,9 @@ mod test { None, sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, + // For now we disable the health monitor, we can change this preference + // later if necessary. 
+ sim::ConfigHealthMonitor { enabled: false }, ); let agent = diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index a57bf139a9b..65b18c7ed3e 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -90,6 +90,7 @@ impl<'a> ControlPlaneBuilder<'a> { self.nextra_sled_agents, DEFAULT_SP_SIM_CONFIG.into(), false, + sim::ConfigHealthMonitor { enabled: false }, ) .await } @@ -361,6 +362,7 @@ pub async fn omicron_dev_setup_with_config( config: &mut NexusConfig, extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, + sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { let starter = ControlPlaneStarter::::new("omicron-dev", config); @@ -388,6 +390,7 @@ pub async fn omicron_dev_setup_with_config( extra_sled_agents, gateway_config_file, true, + sled_agent_health_monitor, ) .await) } diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 696a40b8e88..fb0c0b848a1 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -880,6 +880,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -896,6 +897,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1000,6 +1002,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -1016,6 +1019,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1542,6 +1546,7 
@@ pub(crate) async fn setup_with_config_impl( extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, second_nexus: bool, + sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> ControlPlaneTestContext { const STEP_TIMEOUT: Duration = Duration::from_secs(600); @@ -1705,6 +1710,7 @@ pub(crate) async fn setup_with_config_impl( // The first and second sled agents have special UUIDs, and any extra ones // after that are random. + let health_monitor = sled_agent_health_monitor.clone(); starter .init_with_steps( vec![( @@ -1715,6 +1721,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT_UUID.parse().unwrap(), 0, sim_mode, + health_monitor, ) .boxed() }), @@ -1723,6 +1730,7 @@ pub(crate) async fn setup_with_config_impl( ) .await; + let health_monitor = sled_agent_health_monitor.clone(); if extra_sled_agents > 0 { starter .init_with_steps( @@ -1734,6 +1742,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT2_UUID.parse().unwrap(), 1, sim_mode, + health_monitor, ) .boxed() }), @@ -1743,7 +1752,9 @@ pub(crate) async fn setup_with_config_impl( .await; } + let health_monitor = sled_agent_health_monitor.clone(); for index in 1..extra_sled_agents { + let health_monitor = health_monitor.clone(); starter .init_with_steps( vec![( @@ -1754,6 +1765,7 @@ pub(crate) async fn setup_with_config_impl( SledUuid::new_v4(), index.checked_add(1).unwrap(), sim_mode, + health_monitor.clone(), ) .boxed() }), @@ -1847,6 +1859,7 @@ pub async fn start_sled_agent( update_directory: &Utf8Path, sim_mode: sim::SimMode, simulated_upstairs: &Arc, + health_monitor: sim::ConfigHealthMonitor, ) -> Result { // Generate a baseboard serial number that matches the SP configuration // (SimGimlet00, SimGimlet01, etc.) 
so that inventory can link sled agents @@ -1861,6 +1874,7 @@ pub async fn start_sled_agent( sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, Some(baseboard_serial), + health_monitor, ); start_sled_agent_with_config(log, &config, sled_index, simulated_upstairs) .await diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 70503aa0c1a..614405701cc 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1160,6 +1160,7 @@ async fn test_instance_migration_compatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; @@ -1349,6 +1350,7 @@ async fn test_instance_migration_incompatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let turin_sled_id = config.id; @@ -1426,6 +1428,7 @@ async fn test_instance_migration_unknown_sled_type( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::Unknown, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; @@ -7125,6 +7128,7 @@ async fn test_can_start_instance_with_cpu_platform( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 53ae7d92394..9518872c385 100644 --- 
a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -79,6 +79,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { &update_directory, sim::SimMode::Explicit, &cptestctx.first_sled_agent().simulated_upstairs, + sim::ConfigHealthMonitor { enabled: false }, ) .await .unwrap(), diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index e18ab69c213..287bb54b45e 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLoggingLevel; use omicron_common::api::internal::nexus::Certificate; use omicron_common::cmd::CmdError; use omicron_common::cmd::fatal; +use omicron_sled_agent::sim::ConfigHealthMonitor; use omicron_sled_agent::sim::RssArgs; use omicron_sled_agent::sim::{ Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, ZpoolConfig, @@ -56,6 +57,9 @@ struct Args { #[clap(action)] nexus_lockstep_port: u16, + #[clap(long, default_value_t = false, action)] + enable_health_monitor: bool, + #[clap(long, name = "NEXUS_EXTERNAL_IP:PORT", action)] /// If specified, when the simulated sled agent initializes the rack, it /// will record the Nexus service running with the specified external IP @@ -127,6 +131,7 @@ async fn do_run() -> Result<(), CmdError> { Some(tmp.path()), ZpoolConfig::TenVirtualU2s, SledCpuFamily::AmdMilan, + ConfigHealthMonitor { enabled: args.enable_health_monitor }, ) }; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 700d4a08f4b..b9fb087f073 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,7 +275,7 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); 
HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 744ebb1bea3..5b9d231cf84 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -66,6 +66,12 @@ pub struct ConfigHardware { pub baseboard: Baseboard, } +/// Configuration for the health monitor. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConfigHealthMonitor { + pub enabled: bool, +} + /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct Config { @@ -83,6 +89,8 @@ pub struct Config { pub updates: ConfigUpdates, /// configuration to emulate the sled agent's hardware pub hardware: ConfigHardware, + /// configuration for the sled agent's health monitor + pub health_monitor: ConfigHealthMonitor, } pub enum ZpoolConfig { @@ -101,6 +109,7 @@ impl Config { update_directory: Option<&Utf8Path>, zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, + health_monitor: ConfigHealthMonitor, ) -> Config { Self::for_testing_with_baseboard( id, @@ -110,6 +119,7 @@ impl Config { zpool_config, cpu_family, None, + health_monitor, ) } @@ -121,6 +131,7 @@ impl Config { zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, baseboard_serial: Option, + health_monitor: ConfigHealthMonitor, ) -> Config { // This IP range is guaranteed by RFC 6666 to discard traffic. 
// For tests that don't use a Nexus, we use this address to simulate a @@ -173,6 +184,7 @@ impl Config { revision: 3, }, }, + health_monitor, } } } diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index ef7915293e8..6662ee2c1ba 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -20,8 +20,9 @@ mod upstairs; pub use crate::updates::ConfigUpdates; pub use config::{ - Baseboard, Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, - TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, ZpoolConfig, + Baseboard, Config, ConfigHardware, ConfigHealthMonitor, ConfigStorage, + ConfigZpool, SimMode, TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, + ZpoolConfig, }; pub use server::{RssArgs, Server, run_standalone_server}; pub use sled_agent::SledAgent; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index bb6e9c028e8..3fc0fd30d61 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -12,9 +12,10 @@ use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::artifact_store::ArtifactStore; +use crate::long_running_tasks::spawn_health_monitor_tasks; use crate::nexus::NexusClient; -use crate::sim::SimulatedUpstairs; use crate::sim::simulatable::Simulatable; +use crate::sim::{ConfigHealthMonitor, SimulatedUpstairs}; use crate::support_bundle::storage::SupportBundleQueryType; use crate::updates::UpdateManager; use anyhow::Context; @@ -168,7 +169,13 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + let ConfigHealthMonitor { enabled } = config.health_monitor; + + let health_monitor = if enabled { + spawn_health_monitor_tasks(&log).await + } else { + HealthMonitorHandle::stub() + }; Arc::new(SledAgent { id, From 893b8501259919c35cf3ee0b0918b63b1b7cb3d7 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 16:08:04 +1300 Subject: [PATCH 02/10] clippy --- 
nexus/test-utils/src/nexus_test.rs | 21 +++++++++++++++------ nexus/test-utils/src/starter.rs | 17 ++++++++++++++--- sled-agent/src/sim/config.rs | 1 + 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 65b18c7ed3e..6e781ef9b67 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -7,6 +7,7 @@ use crate::ControlPlaneStarter; use crate::ControlPlaneTestContextSledAgent; use crate::starter::PopulateCrdb; +use crate::starter::SledAgentOptions; use crate::starter::setup_with_config_impl; #[cfg(feature = "omicron-dev")] use anyhow::Context; @@ -85,12 +86,16 @@ impl<'a> ControlPlaneBuilder<'a> { setup_with_config_impl( starter, PopulateCrdb::FromEnvironmentSeed, - sim::SimMode::Explicit, + SledAgentOptions { + sim_mode: sim::SimMode::Explicit, + extra_sled_agents: self.nextra_sled_agents, + sled_agent_health_monitor: sim::ConfigHealthMonitor { + enabled: false, + }, + }, self.tls_cert, - self.nextra_sled_agents, DEFAULT_SP_SIM_CONFIG.into(), false, - sim::ConfigHealthMonitor { enabled: false }, ) .await } @@ -364,6 +369,8 @@ pub async fn omicron_dev_setup_with_config( gateway_config_file: Utf8PathBuf, sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { + use crate::starter::SledAgentOptions; + let starter = ControlPlaneStarter::::new("omicron-dev", config); let log = &starter.logctx.log; @@ -385,12 +392,14 @@ pub async fn omicron_dev_setup_with_config( Ok(setup_with_config_impl( starter, PopulateCrdb::FromSeed { input_tar: seed_tar }, - sim::SimMode::Auto, + SledAgentOptions { + sim_mode: sim::SimMode::Auto, + extra_sled_agents, + sled_agent_health_monitor, + }, None, - extra_sled_agents, gateway_config_file, true, - sled_agent_health_monitor, ) .await) } diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index fb0c0b848a1..27281b2affc 100644 --- a/nexus/test-utils/src/starter.rs +++ 
b/nexus/test-utils/src/starter.rs @@ -1538,16 +1538,26 @@ impl RackInitRequestBuilder { } } +#[derive(Debug, Clone)] +pub(crate) struct SledAgentOptions { + pub sim_mode: sim::SimMode, + pub extra_sled_agents: u16, + pub sled_agent_health_monitor: sim::ConfigHealthMonitor, +} + pub(crate) async fn setup_with_config_impl( mut starter: ControlPlaneStarter<'_, N>, populate: PopulateCrdb, - sim_mode: sim::SimMode, + sled_agent_opts: SledAgentOptions, initial_cert: Option, - extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, second_nexus: bool, - sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> ControlPlaneTestContext { + let SledAgentOptions { + sim_mode, + extra_sled_agents, + sled_agent_health_monitor, + } = sled_agent_opts; const STEP_TIMEOUT: Duration = Duration::from_secs(600); // All setups will start with CRDB and clickhouse @@ -1851,6 +1861,7 @@ pub(crate) enum PopulateCrdb { /// /// Note: you should probably use the `extra_sled_agents` macro parameter on /// `nexus_test` instead! 
+#[allow(clippy::too_many_arguments)] pub async fn start_sled_agent( log: Logger, nexus_address: SocketAddr, diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 5b9d231cf84..c2e96b54df3 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -123,6 +123,7 @@ impl Config { ) } + #[allow(clippy::too_many_arguments)] pub fn for_testing_with_baseboard( id: SledUuid, sim_mode: SimMode, From b40ee44f4f367b154f0fa1cd868e2365d3e95add Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 16:12:33 +1300 Subject: [PATCH 03/10] clean up --- nexus/test-utils/src/nexus_test.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 6e781ef9b67..090e73dca8f 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -369,8 +369,6 @@ pub async fn omicron_dev_setup_with_config( gateway_config_file: Utf8PathBuf, sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { - use crate::starter::SledAgentOptions; - let starter = ControlPlaneStarter::::new("omicron-dev", config); let log = &starter.logctx.log; From bcdc133611d8e54223a38c7293cb320b7bb1c723 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 15:10:08 +1300 Subject: [PATCH 04/10] get this working with fake data --- Cargo.lock | 1 + illumos-utils/src/svcs.rs | 4 +-- nexus/test-utils/src/starter.rs | 2 ++ sled-agent/health-monitor/Cargo.toml | 1 + sled-agent/health-monitor/src/handle.rs | 18 ++++++++++-- .../health-monitor/src/health_checks.rs | 28 +++++++++++++++++++ sled-agent/src/sim/config.rs | 6 ++++ sled-agent/src/sim/sled_agent.rs | 4 ++- 8 files changed, 58 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eeed5d4d780..d6f65f34f06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13089,6 +13089,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "chrono", "derive_more 0.99.20", "dropshot", "futures", 
diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 17b68412379..d14fbe6819c 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -199,8 +199,8 @@ impl From for SvcState { #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running pub struct SvcInMaintenance { - fmri: String, - zone: String, + pub fmri: String, + pub zone: String, } impl SvcInMaintenance { diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 27281b2affc..97e6b8f2d8f 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -1752,6 +1752,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT2_UUID.parse().unwrap(), 1, sim_mode, + // TODO-K: Don't start the monitor here? health_monitor, ) .boxed() @@ -1775,6 +1776,7 @@ pub(crate) async fn setup_with_config_impl( SledUuid::new_v4(), index.checked_add(1).unwrap(), sim_mode, + // TODO-K: Don't start the monitor here? health_monitor.clone(), ) .boxed() diff --git a/sled-agent/health-monitor/Cargo.toml b/sled-agent/health-monitor/Cargo.toml index 0e034220fd0..319eeb7a474 100644 --- a/sled-agent/health-monitor/Cargo.toml +++ b/sled-agent/health-monitor/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +chrono.workspace = true derive_more.workspace = true dropshot.workspace = true futures.workspace = true diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index 030bdbec630..c4a4de5bd8c 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use crate::health_checks::poll_smf_services_in_maintenance; +use crate::health_checks::sim_poll_smf_services_in_maintenance; use illumos_utils::svcs::SvcsInMaintenanceResult; use sled_agent_types::inventory::HealthMonitorInventory; @@ -22,10 +23,21 @@ pub struct HealthMonitorHandle { impl HealthMonitorHandle { /// Returns a `HealthMonitorHandle` that doesn't monitor health and always - /// reports no problems - pub fn stub() -> Self { - let (_tx, smf_services_in_maintenance_rx) = + /// reports no problems unless a `ConfigSimHealthMonitor` with simulated + /// data is passed. + pub fn spawn_sim(sim_failed_checks: bool) -> Self { + let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = watch::channel(Ok(SvcsInMaintenanceResult::new())); + + if sim_failed_checks { + tokio::spawn(async move { + sim_poll_smf_services_in_maintenance( + smf_services_in_maintenance_tx, + ) + .await + }); + }; + Self { smf_services_in_maintenance_rx } } diff --git a/sled-agent/health-monitor/src/health_checks.rs b/sled-agent/health-monitor/src/health_checks.rs index ec2611ad9ea..33f842efa5d 100644 --- a/sled-agent/health-monitor/src/health_checks.rs +++ b/sled-agent/health-monitor/src/health_checks.rs @@ -4,6 +4,8 @@ //! Helpers for running health checks from the sled agent +use chrono::Utc; +use illumos_utils::svcs::SvcInMaintenance; use illumos_utils::svcs::Svcs; use illumos_utils::svcs::SvcsInMaintenanceResult; use slog::Logger; @@ -43,3 +45,29 @@ pub(crate) async fn poll_smf_services_in_maintenance( }; } } + +pub(crate) async fn sim_poll_smf_services_in_maintenance( + // TODO-K: Add sim config here? 
+ smf_services_in_maintenance_tx: watch::Sender< + Result, + >, +) { + // We poll every minute to mimic what the actual health monitor does + let mut interval = interval(Duration::from_secs(60)); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + loop { + interval.tick().await; + smf_services_in_maintenance_tx.send_modify(|status| { + // TODO-K: Set the config here instead + *status = Ok(SvcsInMaintenanceResult { + services: vec![SvcInMaintenance { + fmri: "fake".to_string(), + zone: "fake-global".to_string(), + }], + errors: vec![], + time_of_status: Some(Utc::now()), + }); + }) + } +} diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index c2e96b54df3..6711afe9460 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -69,9 +69,15 @@ pub struct ConfigHardware { /// Configuration for the health monitor. #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] pub struct ConfigHealthMonitor { + // TODO-K: change name here pub enabled: bool, } +//#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +//pub struct ConfigSimHealthCheckResults { +// +//} + /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct Config { diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 3fc0fd30d61..d066f415f81 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -171,10 +171,12 @@ impl SledAgent { let ConfigHealthMonitor { enabled } = config.health_monitor; + // TODO-K: Take configuration file with values and populate a fake + // health monitor report let health_monitor = if enabled { spawn_health_monitor_tasks(&log).await } else { - HealthMonitorHandle::stub() + HealthMonitorHandle::spawn_sim(true) }; Arc::new(SledAgent { From 003d2469bd713750a92b7d6b00016ecbe62a5b3e Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 16:12:27 +1300 Subject: [PATCH 05/10] plumb the config through --- 
Cargo.lock | 3 +++ dev-tools/omicron-dev/Cargo.toml | 3 +++ dev-tools/omicron-dev/src/main.rs | 14 ++++++++++ nexus/inventory/src/collector.rs | 5 +++- nexus/test-utils/src/nexus_test.rs | 1 + nexus/tests/integration_tests/instances.rs | 20 +++++++++++--- nexus/tests/integration_tests/sleds.rs | 5 +++- sled-agent/health-monitor/src/handle.rs | 13 ++++++--- .../health-monitor/src/health_checks.rs | 27 ++++--------------- sled-agent/src/bin/sled-agent-sim.rs | 6 ++++- sled-agent/src/sim/config.rs | 4 ++- sled-agent/src/sim/sled_agent.rs | 7 +++-- 12 files changed, 70 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d6f65f34f06..5914f8fef3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8148,11 +8148,13 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "chrono", "clap", "dropshot", "expectorate", "futures", "gateway-test-utils", + "illumos-utils", "libc", "nexus-config", "nexus-test-interface", @@ -8167,6 +8169,7 @@ dependencies = [ "oxide-tokio-rt", "pq-sys", "signal-hook-tokio", + "sled-agent-types", "subprocess", "tokio", "tokio-postgres", diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 971e5af5bfd..0e271a6c15f 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -13,10 +13,12 @@ omicron-rpaths.workspace = true [dependencies] anyhow.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true gateway-test-utils.workspace = true +illumos-utils.workspace = true libc.workspace = true nexus-config.workspace = true nexus-test-interface.workspace = true @@ -28,6 +30,7 @@ oxide-tokio-rt.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" signal-hook-tokio.workspace = true +sled-agent-types.workspace = true tokio.workspace = true toml.workspace = true diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index a5efe7c766c..9ccbe1d2c9e 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -4,15 +4,18 @@ use anyhow::Context; use camino::Utf8PathBuf; +use chrono::Utc; use clap::{Args, Parser, Subcommand}; use futures::StreamExt; use gateway_test_utils::setup::DEFAULT_SP_SIM_CONFIG; +use illumos_utils::svcs::{SvcInMaintenance, SvcsInMaintenanceResult}; use libc::SIGINT; use nexus_config::NexusConfig; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::DiskTest; use omicron_sled_agent::sim::ConfigHealthMonitor; use signal_hook_tokio::Signals; +use sled_agent_types::inventory::HealthMonitorInventory; use std::fs; const DEFAULT_NEXUS_CONFIG: &str = @@ -92,7 +95,18 @@ impl RunAllArgs { } let sled_agent_health_monitor = ConfigHealthMonitor { + // TODO-K: parse the TOML instead of hardcoding this here enabled: self.enable_sled_agent_health_monitor, + sim_health_checks: Some(HealthMonitorInventory { + smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { + services: vec![SvcInMaintenance { + fmri: "fake".to_string(), + zone: "bobzone".to_string(), + }], + errors: vec![], + time_of_status: Some(Utc::now()), + }), + }), }; println!("omicron-dev: setting up all services ... "); diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 9dd16fc028c..cdb38d53241 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -982,7 +982,10 @@ mod test { SledCpuFamily::AmdMilan, // For now we disable the health monitor, we can change this preference // later if necessary. 
- sim::ConfigHealthMonitor { enabled: false }, + sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let agent = diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 090e73dca8f..cb38253185f 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -91,6 +91,7 @@ impl<'a> ControlPlaneBuilder<'a> { extra_sled_agents: self.nextra_sled_agents, sled_agent_health_monitor: sim::ConfigHealthMonitor { enabled: false, + sim_health_checks: None, }, }, self.tls_cert, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 614405701cc..fb5373b4407 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1160,7 +1160,10 @@ async fn test_instance_migration_compatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, - omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; @@ -1350,7 +1353,10 @@ async fn test_instance_migration_incompatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, - omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let turin_sled_id = config.id; @@ -1428,7 +1434,10 @@ async fn test_instance_migration_unknown_sled_type( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::Unknown, - omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, + 
omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; @@ -7128,7 +7137,10 @@ async fn test_can_start_instance_with_cpu_platform( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, - omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 9518872c385..afc325ce8a0 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -79,7 +79,10 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { &update_directory, sim::SimMode::Explicit, &cptestctx.first_sled_agent().simulated_upstairs, - sim::ConfigHealthMonitor { enabled: false }, + sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ) .await .unwrap(), diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index c4a4de5bd8c..6bdbdc1d557 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -3,7 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::health_checks::poll_smf_services_in_maintenance; -use crate::health_checks::sim_poll_smf_services_in_maintenance; +use crate::health_checks::sim_smf_services_in_maintenance; use illumos_utils::svcs::SvcsInMaintenanceResult; use sled_agent_types::inventory::HealthMonitorInventory; @@ -25,13 +25,18 @@ impl HealthMonitorHandle { /// Returns a `HealthMonitorHandle` that doesn't monitor health and always /// reports no problems unless a `ConfigSimHealthMonitor` with simulated /// data is passed. 
- pub fn spawn_sim(sim_failed_checks: bool) -> Self { + pub fn spawn_sim( + sim_health_checks: Option<HealthMonitorInventory>, + ) -> Self { let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = watch::channel(Ok(SvcsInMaintenanceResult::new())); - if sim_failed_checks { + if sim_health_checks.is_some() { + let HealthMonitorInventory { smf_services_in_maintenance } = + sim_health_checks.unwrap(); tokio::spawn(async move { - sim_poll_smf_services_in_maintenance( + sim_smf_services_in_maintenance( + smf_services_in_maintenance, smf_services_in_maintenance_tx, ) .await diff --git a/sled-agent/health-monitor/src/health_checks.rs b/sled-agent/health-monitor/src/health_checks.rs index 33f842efa5d..c4df690586a 100644 --- a/sled-agent/health-monitor/src/health_checks.rs +++ b/sled-agent/health-monitor/src/health_checks.rs @@ -4,8 +4,6 @@ //! Helpers for running health checks from the sled agent -use chrono::Utc; -use illumos_utils::svcs::SvcInMaintenance; use illumos_utils::svcs::Svcs; use illumos_utils::svcs::SvcsInMaintenanceResult; use slog::Logger; @@ -46,28 +44,13 @@ pub(crate) async fn poll_smf_services_in_maintenance( } } -pub(crate) async fn sim_poll_smf_services_in_maintenance( - // TODO-K: Add sim config here? 
+pub(crate) async fn sim_smf_services_in_maintenance( + sim_smf_services_in_maintenance: Result, smf_services_in_maintenance_tx: watch::Sender< Result, >, ) { - // We poll every minute to mimic what the actual health monitor does - let mut interval = interval(Duration::from_secs(60)); - interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - loop { - interval.tick().await; - smf_services_in_maintenance_tx.send_modify(|status| { - // TODO-K: Set the config here instead - *status = Ok(SvcsInMaintenanceResult { - services: vec![SvcInMaintenance { - fmri: "fake".to_string(), - zone: "fake-global".to_string(), - }], - errors: vec![], - time_of_status: Some(Utc::now()), - }); - }) - } + smf_services_in_maintenance_tx.send_modify(|status| { + *status = sim_smf_services_in_maintenance; + }) } diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index 287bb54b45e..7fd68eb2d9e 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -131,7 +131,11 @@ async fn do_run() -> Result<(), CmdError> { Some(tmp.path()), ZpoolConfig::TenVirtualU2s, SledCpuFamily::AmdMilan, - ConfigHealthMonitor { enabled: args.enable_health_monitor }, + // TODO-K: Use none for now, we can change later + ConfigHealthMonitor { + enabled: args.enable_health_monitor, + sim_health_checks: None, + }, ) }; diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 6711afe9460..d3bb44b64fd 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -10,6 +10,7 @@ use dropshot::ConfigDropshot; use omicron_uuid_kinds::SledUuid; use serde::Deserialize; use serde::Serialize; +use sled_agent_types::inventory::HealthMonitorInventory; pub use sled_hardware_types::{Baseboard, SledCpuFamily}; use sp_sim::FAKE_GIMLET_MODEL; use std::net::Ipv6Addr; @@ -71,11 +72,12 @@ pub struct ConfigHardware { pub struct ConfigHealthMonitor { // TODO-K: change name here pub enabled: bool, + pub sim_health_checks: 
Option<HealthMonitorInventory>, } //#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] //pub struct ConfigSimHealthCheckResults { -// +// pub smf_services_in_maintenance: SvcsInMaintenanceResult, //} /// Configuration for a sled agent diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d066f415f81..ee7b0e0a0e0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -169,14 +169,13 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let ConfigHealthMonitor { enabled } = config.health_monitor; + let ConfigHealthMonitor { enabled, sim_health_checks } = + config.health_monitor.clone(); - // TODO-K: Take configuration file with values and populate a fake - // health monitor report let health_monitor = if enabled { spawn_health_monitor_tasks(&log).await } else { - HealthMonitorHandle::spawn_sim(true) + HealthMonitorHandle::spawn_sim(sim_health_checks) }; Arc::new(SledAgent { From f4772d5a9e5c079f47c877cce44b01b8c410c24d Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 17:02:40 +1300 Subject: [PATCH 06/10] use the config from the CLI --- dev-tools/omicron-dev/src/main.rs | 46 ++++++++++++------- sled-agent/health-monitor/src/handle.rs | 1 + .../tests/configs/health_monitor_sim.toml | 5 ++ .../configs/health_monitor_sim_enabled.toml | 5 ++ .../configs/health_monitor_sim_unhealthy.toml | 14 ++++++ 5 files changed, 54 insertions(+), 17 deletions(-) create mode 100644 sled-agent/tests/configs/health_monitor_sim.toml create mode 100644 sled-agent/tests/configs/health_monitor_sim_enabled.toml create mode 100644 sled-agent/tests/configs/health_monitor_sim_unhealthy.toml diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 9ccbe1d2c9e..f2405e914a0 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -21,6 +21,9 @@ use std::fs; const DEFAULT_NEXUS_CONFIG: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../nexus/examples/config.toml"); 
+const DEFAULT_HEALTH_MONITOR_CONFIG: &str = + concat!(env!("CARGO_MANIFEST_DIR"), "/../../sled-agent/tests/configs/health_monitor_sim.toml"); + fn main() -> anyhow::Result<()> { oxide_tokio_rt::run(async { let args = OmicronDevApp::parse(); @@ -61,9 +64,13 @@ struct RunAllArgs { /// Override the nexus configuration file. #[clap(long, default_value = DEFAULT_NEXUS_CONFIG)] nexus_config: Utf8PathBuf, - /// Enable the sled agent health monitor - #[clap(long, default_value_t = false, action)] - enable_sled_agent_health_monitor: bool, + ///// Enable the sled agent health monitor + //#[clap(long, default_value_t = false, action)] + //enable_sled_agent_health_monitor: bool, + + /// Override the sled agent health monitor configuration file. + #[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] + health_monitor_config: Utf8PathBuf, } impl RunAllArgs { @@ -94,20 +101,25 @@ impl RunAllArgs { .set_port(p); } - let sled_agent_health_monitor = ConfigHealthMonitor { - // TODO-K: parse the TOML instead of hardcoding this here - enabled: self.enable_sled_agent_health_monitor, - sim_health_checks: Some(HealthMonitorInventory { - smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { - services: vec![SvcInMaintenance { - fmri: "fake".to_string(), - zone: "bobzone".to_string(), - }], - errors: vec![], - time_of_status: Some(Utc::now()), - }), - }), - }; + let health_monitor_config_str = fs::read_to_string(&self.health_monitor_config)?; + let sled_agent_health_monitor: ConfigHealthMonitor = toml::from_str(&health_monitor_config_str).context( + format!("parsing config: {}", self.health_monitor_config.as_str()), + )?; + + //let sled_agent_health_monitor = ConfigHealthMonitor { + // // TODO-K: parse the TOML instead of hardcoding this here + // enabled: self.enable_sled_agent_health_monitor, + // sim_health_checks: Some(HealthMonitorInventory { + // smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { + // services: vec![SvcInMaintenance { + // fmri: "fake".to_string(), 
+ // zone: "bobzone".to_string(), + // }], + // errors: vec![], + // time_of_status: Some(Utc::now()), + // }), + // }), + //}; println!("omicron-dev: setting up all services ... "); let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index 6bdbdc1d557..5d909093b34 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -31,6 +31,7 @@ impl HealthMonitorHandle { let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = watch::channel(Ok(SvcsInMaintenanceResult::new())); + // TODO-K: Don't allow a config to be both "enabled" and with fake data if sim_health_checks.is_some() { let HealthMonitorInventory { smf_services_in_maintenance } = sim_health_checks.unwrap(); diff --git a/sled-agent/tests/configs/health_monitor_sim.toml b/sled-agent/tests/configs/health_monitor_sim.toml new file mode 100644 index 00000000000..5e91f808c0c --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim.toml @@ -0,0 +1,5 @@ +# +# Health monitor: example config file +# + +enabled = false \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_enabled.toml b/sled-agent/tests/configs/health_monitor_sim_enabled.toml new file mode 100644 index 00000000000..92a45cb2245 --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim_enabled.toml @@ -0,0 +1,5 @@ +# +# Health monitor: example config file +# + +enabled = true \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml new file mode 100644 index 00000000000..fe00be21e27 --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml @@ -0,0 +1,14 @@ +# +# Health monitor: example config file +# + +enabled = false + +[sim_health_checks.smf_services_in_maintenance.ok] +services = [ + { fmri = 
"svc:/system/fake-service-1:default", zone = "oxz_fake_zone_1" }, + { fmri = "svc:/network/fake-service-2:default", zone = "oxz_fake_zone_2" }, + { fmri = "svc:/application/fake-service-3:default", zone = "global" } +] +errors = [] +# TODO-K: Add time of status \ No newline at end of file From ff8553c1ce4f227392e08be8def6d3a7a50ed408 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 19:29:28 +1300 Subject: [PATCH 07/10] enable health monitor for sled-agent sim --- dev-tools/omicron-dev/src/main.rs | 36 ++++++------------- nexus/test-utils/src/starter.rs | 2 -- sled-agent/src/bin/sled-agent-sim.rs | 29 +++++++++++---- sled-agent/src/sim/config.rs | 10 ++---- .../tests/configs/health_monitor_sim.toml | 4 ++- .../configs/health_monitor_sim_enabled.toml | 5 ++- .../configs/health_monitor_sim_unhealthy.toml | 9 +++-- 7 files changed, 50 insertions(+), 45 deletions(-) diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index f2405e914a0..28f03361edc 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -4,25 +4,24 @@ use anyhow::Context; use camino::Utf8PathBuf; -use chrono::Utc; use clap::{Args, Parser, Subcommand}; use futures::StreamExt; use gateway_test_utils::setup::DEFAULT_SP_SIM_CONFIG; -use illumos_utils::svcs::{SvcInMaintenance, SvcsInMaintenanceResult}; use libc::SIGINT; use nexus_config::NexusConfig; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::DiskTest; use omicron_sled_agent::sim::ConfigHealthMonitor; use signal_hook_tokio::Signals; -use sled_agent_types::inventory::HealthMonitorInventory; use std::fs; const DEFAULT_NEXUS_CONFIG: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../nexus/examples/config.toml"); -const DEFAULT_HEALTH_MONITOR_CONFIG: &str = - concat!(env!("CARGO_MANIFEST_DIR"), "/../../sled-agent/tests/configs/health_monitor_sim.toml"); +const DEFAULT_HEALTH_MONITOR_CONFIG: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + 
"/../../sled-agent/tests/configs/health_monitor_sim.toml" +); fn main() -> anyhow::Result<()> { oxide_tokio_rt::run(async { @@ -67,7 +66,6 @@ struct RunAllArgs { ///// Enable the sled agent health monitor //#[clap(long, default_value_t = false, action)] //enable_sled_agent_health_monitor: bool, - /// Override the sled agent health monitor configuration file. #[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] health_monitor_config: Utf8PathBuf, @@ -101,25 +99,13 @@ impl RunAllArgs { .set_port(p); } - let health_monitor_config_str = fs::read_to_string(&self.health_monitor_config)?; - let sled_agent_health_monitor: ConfigHealthMonitor = toml::from_str(&health_monitor_config_str).context( - format!("parsing config: {}", self.health_monitor_config.as_str()), - )?; - - //let sled_agent_health_monitor = ConfigHealthMonitor { - // // TODO-K: parse the TOML instead of hardcoding this here - // enabled: self.enable_sled_agent_health_monitor, - // sim_health_checks: Some(HealthMonitorInventory { - // smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { - // services: vec![SvcInMaintenance { - // fmri: "fake".to_string(), - // zone: "bobzone".to_string(), - // }], - // errors: vec![], - // time_of_status: Some(Utc::now()), - // }), - // }), - //}; + let health_monitor_config_str = + fs::read_to_string(&self.health_monitor_config)?; + let sled_agent_health_monitor: ConfigHealthMonitor = + toml::from_str(&health_monitor_config_str).context(format!( + "parsing config: {}", + self.health_monitor_config.as_str() + ))?; println!("omicron-dev: setting up all services ... 
"); let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 97e6b8f2d8f..27281b2affc 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -1752,7 +1752,6 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT2_UUID.parse().unwrap(), 1, sim_mode, - // TODO-K: Don't start the monitor here? health_monitor, ) .boxed() @@ -1776,7 +1775,6 @@ pub(crate) async fn setup_with_config_impl( SledUuid::new_v4(), index.checked_add(1).unwrap(), sim_mode, - // TODO-K: Don't start the monitor here? health_monitor.clone(), ) .boxed() diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index 7fd68eb2d9e..482297be97a 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -23,9 +23,15 @@ use omicron_sled_agent::sim::{ }; use omicron_uuid_kinds::SledUuid; use sled_hardware_types::{Baseboard, SledCpuFamily}; +use std::fs; use std::net::SocketAddr; use std::net::SocketAddrV6; +pub const DEFAULT_HEALTH_MONITOR_CONFIG: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/configs/health_monitor_sim.toml" +); + fn parse_sim_mode(src: &str) -> Result { match src { "auto" => Ok(SimMode::Auto), @@ -57,8 +63,9 @@ struct Args { #[clap(action)] nexus_lockstep_port: u16, - #[clap(long, default_value_t = false, action)] - enable_health_monitor: bool, + /// Override the sled agent health monitor configuration file. 
+ #[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] + health_monitor_config: Utf8PathBuf, #[clap(long, name = "NEXUS_EXTERNAL_IP:PORT", action)] /// If specified, when the simulated sled agent initializes the rack, it @@ -101,6 +108,18 @@ fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); + let health_monitor_config_str = + fs::read_to_string(&args.health_monitor_config) + .context(format!("reading {:?}", &args.health_monitor_config)) + .map_err(CmdError::Failure)?; + let health_monitor: ConfigHealthMonitor = + toml::from_str(&health_monitor_config_str) + .context(format!( + "parsing config: {}", + args.health_monitor_config.as_str() + )) + .map_err(CmdError::Failure)?; + let tmp = camino_tempfile::tempdir() .map_err(|e| CmdError::Failure(anyhow!(e)))?; let config = Config { @@ -131,11 +150,7 @@ async fn do_run() -> Result<(), CmdError> { Some(tmp.path()), ZpoolConfig::TenVirtualU2s, SledCpuFamily::AmdMilan, - // TODO-K: Use none for now, we can change later - ConfigHealthMonitor { - enabled: args.enable_health_monitor, - sim_health_checks: None, - }, + health_monitor, ) }; diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index d3bb44b64fd..f2a47b6824e 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -67,19 +67,15 @@ pub struct ConfigHardware { pub baseboard: Baseboard, } -/// Configuration for the health monitor. +/// Configuration for the simulated health monitor. 
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] pub struct ConfigHealthMonitor { - // TODO-K: change name here + /// Whether the real health monitor is running or not pub enabled: bool, + /// Simulated failed health checks pub sim_health_checks: Option<HealthMonitorInventory>, } -//#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] -//pub struct ConfigSimHealthCheckResults { -// pub smf_services_in_maintenance: SvcsInMaintenanceResult, -//} - /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct Config { diff --git a/sled-agent/tests/configs/health_monitor_sim.toml b/sled-agent/tests/configs/health_monitor_sim.toml index 5e91f808c0c..0ef10f01b28 100644 --- a/sled-agent/tests/configs/health_monitor_sim.toml +++ b/sled-agent/tests/configs/health_monitor_sim.toml @@ -1,5 +1,7 @@ # -# Health monitor: example config file +# Sled agent health monitor: example config file +# +# With this configuration, all health checks will appear as successful. # enabled = false \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_enabled.toml b/sled-agent/tests/configs/health_monitor_sim_enabled.toml index 92a45cb2245..6926edbde82 100644 --- a/sled-agent/tests/configs/health_monitor_sim_enabled.toml +++ b/sled-agent/tests/configs/health_monitor_sim_enabled.toml @@ -1,5 +1,8 @@ # -# Health monitor: example config file +# Sled agent health monitor: example config file +# +# With this configuration, the actual health monitor will be running, performing +# the actual health checks against the machine this is running on. 
# enabled = true \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml index fe00be21e27..2a2dacfcbd1 100644 --- a/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml +++ b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml @@ -1,5 +1,8 @@ # -# Health monitor: example config file +# Sled agent health monitor: example config file +# +# With this configuration, we have injected some dummy failed health check +# results. # enabled = false @@ -10,5 +13,7 @@ services = [ { fmri = "svc:/network/fake-service-2:default", zone = "oxz_fake_zone_2" }, { fmri = "svc:/application/fake-service-3:default", zone = "global" } ] + errors = [] -# TODO-K: Add time of status \ No newline at end of file + +time_of_status = "2026-04-12T23:20:50.52Z" \ No newline at end of file From a93af402977730a8a902e8195a5009d44b72b89b Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 19:42:57 +1300 Subject: [PATCH 08/10] clean up --- sled-agent/health-monitor/src/handle.rs | 6 +++--- sled-agent/src/sim/config.rs | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index 5d909093b34..683e8e79cfc 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -31,10 +31,10 @@ impl HealthMonitorHandle { let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = watch::channel(Ok(SvcsInMaintenanceResult::new())); - // TODO-K: Don't allow a config to be both "enabled" and with fake data - if sim_health_checks.is_some() { + if let Some(results) = sim_health_checks { let HealthMonitorInventory { smf_services_in_maintenance } = - sim_health_checks.unwrap(); + results; + tokio::spawn(async move { sim_smf_services_in_maintenance( smf_services_in_maintenance, diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs 
index f2a47b6824e..37de1cb7823 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -70,7 +70,8 @@ pub struct ConfigHardware { /// Configuration for the simulated health monitor. #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] pub struct ConfigHealthMonitor { - /// Whether the real health monitor is running or not + /// Whether the real health monitor is running or not. + /// If set, it will override any simulated health check results. pub enabled: bool, /// Simulated failed health checks pub sim_health_checks: Option<HealthMonitorInventory>, } From aa4938397c9aefc2d19b4bb08596b9e0217a948c Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 19:49:35 +1300 Subject: [PATCH 09/10] clean up --- dev-tools/omicron-dev/src/main.rs | 3 -- sled-agent/health-monitor/src/handle.rs | 50 ++++++++++++------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 28f03361edc..539432a613d 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -63,9 +63,6 @@ struct RunAllArgs { /// Override the nexus configuration file. #[clap(long, default_value = DEFAULT_NEXUS_CONFIG)] nexus_config: Utf8PathBuf, - ///// Enable the sled agent health monitor - //#[clap(long, default_value_t = false, action)] - //enable_sled_agent_health_monitor: bool, /// Override the sled agent health monitor configuration file. 
#[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] health_monitor_config: Utf8PathBuf, diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index 683e8e79cfc..ac2259f4a72 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -22,31 +22,6 @@ pub struct HealthMonitorHandle { } impl HealthMonitorHandle { - /// Returns a `HealthMonitorHandle` that doesn't monitor health and always - /// reports no problems unless a `ConfigSimHealthMonitor` with simulated - /// data is passed. - pub fn spawn_sim( - sim_health_checks: Option<HealthMonitorInventory>, - ) -> Self { - let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = - watch::channel(Ok(SvcsInMaintenanceResult::new())); - - if let Some(results) = sim_health_checks { - let HealthMonitorInventory { smf_services_in_maintenance } = - results; - - tokio::spawn(async move { - sim_smf_services_in_maintenance( - smf_services_in_maintenance, - smf_services_in_maintenance_tx, - ) - .await - }); - }; - - Self { smf_services_in_maintenance_rx } - } - pub fn spawn(log: Logger) -> Self { // Spawn a task to retrieve information about services in maintenance info!(log, "Starting SMF service health poller"); @@ -73,4 +48,29 @@ impl HealthMonitorHandle { .clone(), } } + + /// Returns a `HealthMonitorHandle` that doesn't monitor health and always + /// reports no problems unless a `ConfigSimHealthMonitor` with simulated + /// data is passed. 
+ pub fn spawn_sim( + sim_health_checks: Option<HealthMonitorInventory>, + ) -> Self { + let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = + watch::channel(Ok(SvcsInMaintenanceResult::new())); + + if let Some(results) = sim_health_checks { + let HealthMonitorInventory { smf_services_in_maintenance } = + results; + + tokio::spawn(async move { + sim_smf_services_in_maintenance( + smf_services_in_maintenance, + smf_services_in_maintenance_tx, + ) + .await + }); + }; + + Self { smf_services_in_maintenance_rx } + } } From 53bfd84cec8f1eeeaba7bcda21748346035549f4 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 22 Jan 2026 20:02:26 +1300 Subject: [PATCH 10/10] make linter happy --- sled-agent/health-monitor/src/lib.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sled-agent/health-monitor/src/lib.rs b/sled-agent/health-monitor/src/lib.rs index fde8d5f55f1..25d313d5d97 100644 --- a/sled-agent/health-monitor/src/lib.rs +++ b/sled-agent/health-monitor/src/lib.rs @@ -4,10 +4,9 @@ //! Machinery for sled-agent to run periodic health checks. //! -//! The initial entry point to this system is [`HealthMonitorHandle::stub()`]. -//! This should be called early in sled-agent startup. Later during the -//! sled-agent start process, sled-agent should spawn each of the polling tasks -//! found in the health_checks module. +//! The initial entry point to this system is [`HealthMonitorHandle::spawn()`]. +//! During the sled-agent start process, sled-agent will spawn each of the +//! polling tasks found in the health_checks module. //! //! The health checks we run are: //!