diff --git a/Cargo.lock b/Cargo.lock index a95ab45a69..1742af434e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1960,7 +1960,6 @@ dependencies = [ name = "carbide-health" version = "0.0.1" dependencies = [ - "arc-swap", "async-trait", "base64", "carbide-health-report", diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index 5423024726..d9b61b9c1b 100644 --- a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -27,7 +27,6 @@ name = "forge-hw-health" path = "src/main.rs" [dependencies] -arc-swap = { workspace = true } async-trait = { workspace = true } base64 = { workspace = true } chrono = { workspace = true } diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 8cf84eef5d..7ef5a9f742 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -24,8 +24,8 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ - CollectorEvent, CompositeDataSink, DataSink, EventContext, FirmwareInfo, LogRecord, - MetricSample, PrometheusSink, + CompositeSyncEventNode, EventContext, FirmwareInfo, HealthEvent, LogRecord, MetricSample, + PrometheusSink, SyncEventNode, }; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use mac_address::MacAddress; @@ -34,14 +34,15 @@ const MACHINE_ID: &str = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6r struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { black_box(context); black_box(event); + Vec::new() } } @@ -66,14 +67,14 @@ fn event_context() -> EventContext { } } -fn 
build_sensor_metric_event(idx: usize, unique_keys: usize) -> CollectorEvent { +fn build_sensor_metric_event(idx: usize, unique_keys: usize) -> HealthEvent { let unique_keys = unique_keys.max(1); let sensor_idx = idx % unique_keys; let sensor_key = format!("sensor-{sensor_idx}"); let machine_idx = idx % 16; let rack_idx = idx % 4; - CollectorEvent::Metric( + HealthEvent::MeasurementObserved( MetricSample { key: sensor_key.clone(), name: "hw_sensor".to_string(), @@ -93,8 +94,8 @@ fn build_sensor_metric_event(idx: usize, unique_keys: usize) -> CollectorEvent { ) } -fn build_nmxt_metric_event(idx: usize) -> CollectorEvent { - CollectorEvent::Metric( +fn build_nmxt_metric_event(idx: usize) -> HealthEvent { + HealthEvent::MeasurementObserved( MetricSample { key: format!("effective_ber:{}", idx % 64), name: "switch_nmxt".to_string(), @@ -113,8 +114,8 @@ fn build_nmxt_metric_event(idx: usize) -> CollectorEvent { ) } -fn build_log_event(idx: usize) -> CollectorEvent { - CollectorEvent::Log( +fn build_log_event(idx: usize) -> HealthEvent { + HealthEvent::LogObserved( LogRecord { body: format!("BMC event line {idx}"), severity: "INFO".to_string(), @@ -129,9 +130,10 @@ fn build_log_event(idx: usize) -> CollectorEvent { ) } -fn build_firmware_event(idx: usize) -> CollectorEvent { +fn build_firmware_event(idx: usize) -> HealthEvent { let component = format!("component-{idx}"); - CollectorEvent::Firmware(FirmwareInfo { + HealthEvent::FirmwareObserved(FirmwareInfo { + id: format!("firmware-{idx}"), component: component.clone(), version: format!("1.0.{}", idx % 100), attributes: vec![ @@ -141,8 +143,8 @@ fn build_firmware_event(idx: usize) -> CollectorEvent { }) } -fn bench_collector_event_build(c: &mut Criterion) { - let mut group = c.benchmark_group("collector_event_build"); +fn bench_health_event_build(c: &mut Criterion) { + let mut group = c.benchmark_group("health_event_build"); let sample_count = 10_000usize; group.throughput(Throughput::Elements(sample_count as u64)); @@ 
-182,12 +184,12 @@ fn bench_collector_event_build(c: &mut Criterion) { } fn emit_metric_batch_building( - sink: &dyn DataSink, + sink: &dyn SyncEventNode, context: &EventContext, batch_size: usize, unique_keys: usize, ) { - let start = CollectorEvent::MetricCollectionStart; + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for idx in 0..batch_size { @@ -195,7 +197,7 @@ fn emit_metric_batch_building( sink.handle_event(context, &event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -225,13 +227,13 @@ fn bench_collector_build_and_emit_prometheus(c: &mut Criterion) { } struct CompositeBuildEmitState { - sink: CompositeDataSink, + sink: CompositeSyncEventNode, context: EventContext, } impl CompositeBuildEmitState { fn new(sink_count: usize) -> Self { - let mut sinks: Vec> = Vec::with_capacity(sink_count); + let mut sinks: Vec> = Vec::with_capacity(sink_count); for _ in 0..sink_count { sinks.push(Arc::new(CountingSink)); } @@ -239,7 +241,7 @@ impl CompositeBuildEmitState { let metrics_manager = Arc::new( MetricsManager::new("bench_collector").expect("metrics manager should initialize"), ); - let sink = CompositeDataSink::new(sinks, metrics_manager); + let sink = CompositeSyncEventNode::new(sinks, metrics_manager); Self { sink, @@ -269,7 +271,7 @@ fn bench_collector_build_and_emit_composite(c: &mut Criterion) { criterion_group!( benches, - bench_collector_event_build, + bench_health_event_build, bench_collector_build_and_emit_prometheus, bench_collector_build_and_emit_composite ); diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index ca98b8630c..de2e8efc2c 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -23,11 +23,11 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use 
carbide_health::metrics::MetricsManager; use carbide_health::processor::{ - EventProcessingPipeline, EventProcessor, HealthReportProcessor, LeakEventProcessor, - RackLeakProcessor, + EventGraph, HealthReportProcessor, LeakSyncEventNode, RackLeakProcessor, }; use carbide_health::sink::{ - CollectorEvent, CompositeDataSink, DataSink, EventContext, MetricSample, SensorThresholdContext, + CompositeSyncEventNode, EventContext, HealthEvent, MetricSample, SensorThresholdContext, + SyncEventNode, }; use carbide_uuid::rack::RackId; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; @@ -38,45 +38,38 @@ const MACHINE_ID: &str = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6r struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { std::hint::black_box(context); std::hint::black_box(event); + Vec::new() } } struct NoopProcessor; -impl EventProcessor for NoopProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for NoopProcessor { + fn node_type(&self) -> &'static str { "noop_processor" } - fn process_event( - &self, - _context: &EventContext, - _event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { Vec::new() } } struct ReemitProcessor; -impl EventProcessor for ReemitProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for ReemitProcessor { + fn node_type(&self) -> &'static str { "reemit_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { vec![event.clone()] } } @@ -102,19 +95,33 @@ fn event_context() -> 
EventContext { } } -fn make_composite_sink(count: usize, metrics_manager: Arc) -> Arc { - let mut sinks: Vec> = Vec::with_capacity(count); +fn make_composite_sink( + count: usize, + metrics_manager: Arc, +) -> Arc { + let mut sinks: Vec> = Vec::with_capacity(count); for _ in 0..count { sinks.push(Arc::new(CountingSink)); } - Arc::new(CompositeDataSink::new(sinks, metrics_manager)) + Arc::new(CompositeSyncEventNode::new(sinks, metrics_manager)) +} + +fn make_event_graph( + sink: Arc, + processors: Vec>, + metrics_manager: Arc, +) -> EventGraph { + let mut nodes = Vec::with_capacity(processors.len() + 1); + nodes.push(sink); + nodes.extend(processors); + EventGraph::new(nodes, metrics_manager) } fn metric_events( batch_size: usize, unique_keys: usize, with_health_context: bool, -) -> Vec { +) -> Vec { let unique_keys = unique_keys.max(1); (0..batch_size) @@ -151,18 +158,18 @@ fn metric_events( bmc_health: BmcHealth::Warning, }); } - CollectorEvent::Metric(metric.into()) + HealthEvent::MeasurementObserved(metric.into()) }) .collect() } -fn emit_metric_batch(sink: &dyn DataSink, context: &EventContext, events: &[CollectorEvent]) { - let start = CollectorEvent::MetricCollectionStart; +fn emit_metric_batch(sink: &dyn SyncEventNode, context: &EventContext, events: &[HealthEvent]) { + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for event in events { sink.handle_event(context, event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -174,18 +181,14 @@ fn bench_pipeline_baseline(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); let sink = make_composite_sink(2, metrics_manager.clone()); - let mut processors: Vec> = Vec::with_capacity(processor_count); + let mut processors: Vec> = Vec::with_capacity(processor_count); for _ in 0..processor_count { 
processors.push(Arc::new(NoopProcessor)); } - let sink: Arc = if processors.is_empty() { + let sink: Arc = if processors.is_empty() { sink } else { - Arc::new(EventProcessingPipeline::new( - processors, - sink, - metrics_manager.clone(), - )) + Arc::new(make_event_graph(sink, processors, metrics_manager.clone())) }; let context = event_context(); let events = metric_events(batch_size, 64, false); @@ -209,13 +212,13 @@ fn bench_pipeline_health_processors(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let processors: Vec> = vec![ + let processors: Vec> = vec![ Arc::new(HealthReportProcessor::default()), - Arc::new(LeakEventProcessor::new(1)), + Arc::new(LeakSyncEventNode::new(1)), ]; - let pipeline = EventProcessingPipeline::new( - processors, + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + processors, metrics_manager, ); let context = event_context(); @@ -241,9 +244,9 @@ fn bench_pipeline_loop_guard(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let pipeline = EventProcessingPipeline::new( - vec![Arc::new(ReemitProcessor)], + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + vec![Arc::new(ReemitProcessor)], metrics_manager, ); let context = event_context(); @@ -288,14 +291,14 @@ fn bench_pipeline_rack_leak(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let processors: Vec> = vec![ + let processors: Vec> = vec![ Arc::new(HealthReportProcessor::default()), - Arc::new(LeakEventProcessor::new(1)), + Arc::new(LeakSyncEventNode::new(1)), Arc::new(RackLeakProcessor::new(2)), ]; - let pipeline = EventProcessingPipeline::new( - processors, + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + processors, 
metrics_manager, ); diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index c8e9e6b2e6..f22f3e2347 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -24,8 +24,8 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ - Classification, CollectorEvent, CompositeDataSink, DataSink, EventContext, HealthReport, - HealthReportSink, LogRecord, MetricSample, PrometheusSink, ReportSource, + Classification, CompositeSyncEventNode, EventContext, HealthEvent, HealthReport, + HealthReportSink, LogRecord, MetricSample, PrometheusSink, ReportSource, SyncEventNode, }; use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use health_report::HealthReport as CarbideHealthReport; @@ -40,14 +40,15 @@ const MACHINE_IDS: [&str; 3] = [ struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { std::hint::black_box(context); std::hint::black_box(event); + Vec::new() } } @@ -76,7 +77,7 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { } } -fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { +fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { let unique_keys = unique_keys.max(1); (0..batch_size) @@ -84,7 +85,7 @@ fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { let sensor_idx = idx % unique_keys; let key = format!("sensor-{sensor_idx}"); - CollectorEvent::Metric( + HealthEvent::MeasurementObserved( MetricSample { key: key.clone(), name: "hw_sensor".to_string(), @@ -100,13 +101,13 @@ fn 
metric_events(batch_size: usize, unique_keys: usize) -> Vec { .collect() } -fn emit_metric_batch(sink: &dyn DataSink, context: &EventContext, events: &[CollectorEvent]) { - let start = CollectorEvent::MetricCollectionStart; +fn emit_metric_batch(sink: &dyn SyncEventNode, context: &EventContext, events: &[HealthEvent]) { + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for event in events { sink.handle_event(context, event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -136,21 +137,21 @@ fn bench_prometheus_sink(c: &mut Criterion) { } struct CompositeBenchState { - sink: CompositeDataSink, + sink: CompositeSyncEventNode, context: EventContext, - events: Vec, + events: Vec, } impl CompositeBenchState { fn new(sink_count: usize, batch_size: usize) -> Self { - let mut sinks: Vec> = Vec::with_capacity(sink_count); + let mut sinks: Vec> = Vec::with_capacity(sink_count); for _ in 0..sink_count { sinks.push(Arc::new(CountingSink)); } let metrics_manager = Arc::new(MetricsManager::new("bench_sink").expect("metrics manager should initialize")); - let sink = CompositeDataSink::new(sinks, metrics_manager); + let sink = CompositeSyncEventNode::new(sinks, metrics_manager); Self { sink, @@ -203,8 +204,8 @@ struct HealthReportBenchState { sink: HealthReportSink, context: EventContext, distinct_contexts: Vec, - sensor_event: CollectorEvent, - leak_event: CollectorEvent, + sensor_event: HealthEvent, + leak_event: HealthEvent, } impl HealthReportBenchState { @@ -215,8 +216,9 @@ impl HealthReportBenchState { .into_iter() .map(event_context_for_machine) .collect(); - let sensor_event = CollectorEvent::HealthReport(Arc::new(health_report_with_alerts(256))); - let leak_event = CollectorEvent::HealthReport(Arc::new(HealthReport { + let sensor_event = + HealthEvent::HealthReportProduced(Arc::new(health_report_with_alerts(256))); + let leak_event = 
HealthEvent::HealthReportProduced(Arc::new(HealthReport { source: ReportSource::TrayLeakDetection, target: Some(carbide_health::sink::HealthReportTarget::Machine), observed_at: Some(chrono::Utc::now()), @@ -241,8 +243,8 @@ impl HealthReportBenchState { fn filled_health_report_sink( contexts: &[EventContext], - event: &CollectorEvent, - leak_event: &CollectorEvent, + event: &HealthEvent, + leak_event: &HealthEvent, ) -> HealthReportSink { let sink = HealthReportSink::new_for_bench().expect("bench sink should initialize"); for context in contexts { @@ -329,11 +331,11 @@ fn bench_health_report_sink(c: &mut Criterion) { group.finish(); } -fn log_events_with_attrs(count: usize, unique_sensors: usize) -> Vec { +fn log_events_with_attrs(count: usize, unique_sensors: usize) -> Vec { (0..count) .map(|idx| { let sensor = format!("HGX_GPU_{}_Temp_1", idx % unique_sensors); - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: format!("{sensor} sensor crossed threshold"), severity: "Warning".to_string(), attributes: vec![ diff --git a/crates/health/src/collectors/discovery.rs b/crates/health/src/collectors/discovery.rs index 0d790bc6d8..9b5a30a8b5 100644 --- a/crates/health/src/collectors/discovery.rs +++ b/crates/health/src/collectors/discovery.rs @@ -24,36 +24,45 @@ use nv_redfish::ServiceRoot; use nv_redfish::core::Bmc; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, EntityInventory, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; /// Configuration for the entity discovery collector pub struct EntityDiscoveryCollectorConfig { - pub(crate) shared: SharedInventory, + pub(crate) data_sink: Option>, pub discovery_concurrency: usize, + pub(crate) _bmc: 
std::marker::PhantomData, } +/// Discovers the entity inventory of a single endpoint and publishes snapshots +/// as [`HealthEvent::InventoryDiscovered`] events. +/// +/// [`HealthEvent::InventoryDiscovered`]: crate::sink::HealthEvent::InventoryDiscovered pub struct EntityDiscoveryCollector { endpoint: Arc, + event_context: EventContext, bmc: Arc, - shared: SharedInventory, + data_sink: Option>, discovery_concurrency: usize, generation: u64, } -impl PeriodicCollector for EntityDiscoveryCollector { - type Config = EntityDiscoveryCollectorConfig; +impl PeriodicCollector for EntityDiscoveryCollector { + type Config = EntityDiscoveryCollectorConfig; fn new_runner( - bmc: Arc, + bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { Ok(Self { + event_context: EventContext::from_endpoint(&endpoint, "entity_discovery_collector"), endpoint, bmc, - shared: config.shared, + data_sink: config.data_sink, discovery_concurrency: config.discovery_concurrency.max(1), generation: 0, }) @@ -65,11 +74,15 @@ impl PeriodicCollector for EntityDiscoveryCollector { let entity_count = entities.len(); self.generation = self.generation.wrapping_add(1); - self.shared.store(Some(Arc::new(EntityInventory { + let inventory = Arc::new(EntityInventory { entities, discovered_at: std::time::Instant::now(), generation: self.generation, - }))); + }); + self.emit_event(HealthEvent::InventoryDiscovered { + endpoint_key: self.event_context.endpoint_key().to_string(), + inventory, + }); tracing::info!( bmc = %self.endpoint.addr.mac, @@ -90,12 +103,18 @@ impl PeriodicCollector for EntityDiscoveryCollector { } async fn stop(&mut self) { - // Clear the snapshot so readers stop emitting for a removed endpoint. - self.shared.store(None); + self.emit_event(HealthEvent::NodeRemoved); } } -impl EntityDiscoveryCollector { +impl EntityDiscoveryCollector { + /// Forwards an event into the configured data sink, if any. 
+ fn emit_event(&self, event: HealthEvent) { + if let Some(data_sink) = &self.data_sink { + data_sink.handle_event(&self.event_context, &event); + } + } + fn record_failure( &self, result: Result, @@ -115,7 +134,7 @@ impl EntityDiscoveryCollector { async fn discover_entities( &self, fetch_failures: &AtomicUsize, - ) -> Result>, HealthError> { + ) -> Result>, HealthError> { let service_root = ServiceRoot::new(self.bmc.clone()).await?; let mut entities = Vec::new(); @@ -155,9 +174,9 @@ impl EntityDiscoveryCollector { async fn discover_processors( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let processors = self @@ -194,9 +213,9 @@ impl EntityDiscoveryCollector { async fn discover_memory( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let memory_modules = self @@ -232,9 +251,9 @@ impl EntityDiscoveryCollector { async fn discover_drives( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let storage_list = self @@ -279,9 +298,9 @@ impl EntityDiscoveryCollector { async fn discover_power_supplies( &self, - chassis: &Arc>, + chassis: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let power_supplies = self @@ -316,9 +335,9 @@ impl EntityDiscoveryCollector { async fn discover_chassis( &self, - chassis: &Arc>, + chassis: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let sensors = match chassis.sensor_links().await { diff --git a/crates/health/src/collectors/entity_metrics.rs b/crates/health/src/collectors/entity_metrics.rs index b776a2a692..a939cb066e 100644 --- a/crates/health/src/collectors/entity_metrics.rs +++ 
b/crates/health/src/collectors/entity_metrics.rs @@ -27,10 +27,11 @@ use nv_redfish::schema::power_supply_metrics::PowerSupplyMetrics; use nv_redfish::schema::processor_metrics::ProcessorMetrics; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; struct MetricField { metric_type: Cow<'static, str>, @@ -356,25 +357,27 @@ fn power_supply_metric_fields(m: &PowerSupplyMetrics) -> Vec { out } +/// Configuration for the entity metrics collector. pub struct MetricsCollectorConfig { - pub data_sink: Option>, - pub(crate) shared: SharedInventory, + pub data_sink: Option>, pub fetch_concurrency: usize, + pub(crate) _bmc: std::marker::PhantomData, } +/// Metrics collector for a single BMC endpoint. 
pub struct MetricsCollector { endpoint: Arc, event_context: EventContext, - shared: SharedInventory, - data_sink: Option>, + latest_inventory: Option>>, + data_sink: Option>, fetch_concurrency: usize, } -impl PeriodicCollector for MetricsCollector { - type Config = MetricsCollectorConfig; +impl PeriodicCollector for MetricsCollector { + type Config = MetricsCollectorConfig; fn new_runner( - _bmc: Arc, + _bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { @@ -382,14 +385,14 @@ impl PeriodicCollector for MetricsCollector { Ok(Self { endpoint, event_context, - shared: config.shared, + latest_inventory: None, data_sink: config.data_sink, fetch_concurrency: config.fetch_concurrency.max(1), }) } async fn run_iteration(&mut self) -> Result { - let Some(inventory) = self.shared.load_full() else { + let Some(inventory) = self.latest_inventory.clone() else { tracing::debug!( bmc_addr = ?self.endpoint.addr, "No entity inventory available yet; skipping metrics iteration" @@ -410,7 +413,14 @@ impl PeriodicCollector for MetricsCollector { ); let fetch_failures = AtomicUsize::new(0); - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); + + // Entity-level derived metrics (drive media life, PSU capacity), once + // per entity. These are hardware metrics, so they flow with the metrics + // collector rather than being tied to the sensor scrape path. 
+ for entity in &inventory.entities { + self.emit_derived_metrics(entity); + } let this = &*self; let failures = &fetch_failures; @@ -427,7 +437,7 @@ impl PeriodicCollector for MetricsCollector { .into_iter() .sum(); - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(IterationResult { refresh_triggered: false, @@ -440,21 +450,65 @@ impl PeriodicCollector for MetricsCollector { "metrics_collector" } + fn wants_events(&self) -> bool { + true + } + + fn handle_event(&mut self, context: &EventContext, event: &HealthEvent) { + match event { + HealthEvent::InventoryDiscovered { inventory, .. } => { + self.latest_inventory = Some(inventory.clone()); + } + HealthEvent::NodeRemoved + if context.endpoint_key() == self.event_context.endpoint_key() => + { + self.latest_inventory = None; + } + _ => {} + } + } + async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } -impl MetricsCollector { - fn emit_event(&self, event: CollectorEvent) { +impl MetricsCollector { + /// Forwards an event into the configured data sink, if any. + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } } + /// Emits the entity-level derived metrics (e.g. drive media life, PSU + /// capacity) for `entity` as measurement events. 
+ fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { + let derived = entity.derived_metrics(); + if derived.is_empty() { + return; + } + let mut attributes = entity.base_attributes(); + attributes.extend(entity.entity_specific_attributes()); + for metric in derived { + self.emit_event(HealthEvent::MeasurementObserved( + MetricSample { + key: format!("{}/{}", entity.key(), metric.metric_type), + name: "hw".to_string(), + metric_type: metric.metric_type.to_string(), + unit: metric.unit.to_string(), + value: metric.value, + labels: attributes.clone(), + context: None, + } + .into(), + )); + } + } + async fn collect_entity( &self, - entity: &DiscoveredEntity, + entity: &DiscoveredEntity, fetch_failures: &AtomicUsize, ) -> usize { let fields = match entity { @@ -502,7 +556,7 @@ impl MetricsCollector { let entity_key = entity.key(); let count = fields.len(); for field in fields { - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("{entity_key}/{}", field.metric_type), name: "hw_metric".to_string(), diff --git a/crates/health/src/collectors/firmware.rs b/crates/health/src/collectors/firmware.rs index 9ce8283a79..1b6c494f91 100644 --- a/crates/health/src/collectors/firmware.rs +++ b/crates/health/src/collectors/firmware.rs @@ -24,16 +24,16 @@ use nv_redfish::core::Bmc; use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, DataSink, EventContext, FirmwareInfo}; +use crate::sink::{EventContext, FirmwareInfo, HealthEvent, SyncEventNode}; pub struct FirmwareCollectorConfig { - pub data_sink: Option>, + pub data_sink: Option>, } pub struct FirmwareCollector { bmc: Arc, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, } impl PeriodicCollector for FirmwareCollector { @@ -61,12 +61,13 @@ impl PeriodicCollector for FirmwareCollector { } async fn stop(&mut self) { - 
self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } impl FirmwareCollector { - fn emit_event(&self, event: CollectorEvent) { + /// Forwards an event into the configured data sink, if any. + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -102,7 +103,8 @@ impl FirmwareCollector { (Cow::Borrowed("version"), version.clone()), ]; - self.emit_event(CollectorEvent::Firmware(FirmwareInfo { + self.emit_event(HealthEvent::FirmwareObserved(FirmwareInfo { + id: firmware_data.base.id.clone(), component, version, attributes, diff --git a/crates/health/src/collectors/inventory.rs b/crates/health/src/collectors/inventory.rs index 3e390fcd2e..f45639e15d 100644 --- a/crates/health/src/collectors/inventory.rs +++ b/crates/health/src/collectors/inventory.rs @@ -19,7 +19,6 @@ use std::borrow::Cow; use std::sync::Arc; use std::time::Instant; -use arc_swap::ArcSwapOption; use nv_redfish::Resource; use nv_redfish::chassis::{Chassis, PowerSupply}; use nv_redfish::computer_system::{ComputerSystem, Drive, Memory, Processor, Storage}; @@ -213,10 +212,15 @@ impl DiscoveredEntity { } } -pub(crate) struct EntityInventory { +/// An immutable snapshot of the entities discovered at an endpoint. +/// +/// Discovery publishes a new snapshot via [`HealthEvent::InventoryDiscovered`]; +/// consumers cache their own `Arc` to it, so there is no shared mutable state. +/// `generation` increases with each snapshot to let consumers detect refreshes. 
+/// +/// [`HealthEvent::InventoryDiscovered`]: crate::sink::HealthEvent::InventoryDiscovered +pub struct EntityInventory { pub(crate) entities: Vec>, pub(crate) discovered_at: Instant, pub(crate) generation: u64, } - -pub(crate) type SharedInventory = Arc>>; diff --git a/crates/health/src/collectors/leak_detector.rs b/crates/health/src/collectors/leak_detector.rs index a3caeea902..bc1bf280dc 100644 --- a/crates/health/src/collectors/leak_detector.rs +++ b/crates/health/src/collectors/leak_detector.rs @@ -26,12 +26,12 @@ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; use crate::sink::{ - Classification, CollectorEvent, DataSink, EventContext, HealthReport, HealthReportAlert, - HealthReportSuccess, Probe, ReportSource, + Classification, EventContext, HealthEvent, HealthReport, HealthReportAlert, + HealthReportSuccess, Probe, ReportSource, SyncEventNode, }; pub struct LeakDetectorCollectorConfig { - pub data_sink: Option>, + pub data_sink: Option>, pub state_refresh_interval: Duration, } @@ -39,7 +39,7 @@ pub struct LeakDetectorCollector { bmc: Arc, event_context: EventContext, state: Option, - data_sink: Option>, + data_sink: Option>, state_refresh_interval: Duration, } @@ -80,7 +80,7 @@ where } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } @@ -89,7 +89,8 @@ where B: Bmc + 'static, B::Error: 'static, { - fn emit_event(&self, event: CollectorEvent) { + /// Forwards an event into the configured data sink, if any. 
+ fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -134,7 +135,7 @@ where let detector_count = detectors.len(); let report = build_health_report(detectors, &self.event_context); - self.emit_event(CollectorEvent::HealthReport(Arc::new(report))); + self.emit_event(HealthEvent::HealthReportProduced(Arc::new(report))); Ok(IterationResult { refresh_triggered, diff --git a/crates/health/src/collectors/logs/periodic.rs b/crates/health/src/collectors/logs/periodic.rs index 2f184592f0..14a7207d26 100644 --- a/crates/health/src/collectors/logs/periodic.rs +++ b/crates/health/src/collectors/logs/periodic.rs @@ -32,13 +32,13 @@ use super::diagnostic::{ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; -use crate::sink::{CollectorEvent, DataSink, EventContext, LogRecord}; +use crate::sink::{EventContext, HealthEvent, LogRecord, SyncEventNode}; /// Configuration for logs collector pub struct LogsCollectorConfig { pub state_file_path: PathBuf, pub service_refresh_interval: Duration, - pub data_sink: Option>, + pub data_sink: Option>, /// Attach Redfish diagnostic payloads to emitted log records. 
pub include_diagnostics: bool, @@ -68,7 +68,7 @@ pub struct LogsCollector { state_file_path: PathBuf, state: Option>, service_refresh_interval: Duration, - data_sink: Option>, + data_sink: Option>, include_diagnostics: bool, } @@ -103,7 +103,7 @@ impl PeriodicCollector for LogsCollector { async fn stop(&mut self) { if let Some(data_sink) = &self.data_sink { - data_sink.handle_event(&self.event_context, &CollectorEvent::CollectorRemoved); + data_sink.handle_event(&self.event_context, &HealthEvent::NodeRemoved); } } } @@ -363,7 +363,7 @@ impl LogsCollector { }) .flatten(); - let log_event = CollectorEvent::Log( + let log_event = HealthEvent::LogObserved( LogRecord { body, severity: severity_text, diff --git a/crates/health/src/collectors/logs/sse.rs b/crates/health/src/collectors/logs/sse.rs index a645072f29..fedf3a0eca 100644 --- a/crates/health/src/collectors/logs/sse.rs +++ b/crates/health/src/collectors/logs/sse.rs @@ -30,7 +30,7 @@ use super::diagnostic::{ use crate::HealthError; use crate::collectors::runtime::{EventStream, StreamingCollector, open_sse_stream}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, LogRecord}; +use crate::sink::{HealthEvent, LogRecord}; /// Configuration for the Redfish SSE log collector. 
pub struct SseLogCollectorConfig { @@ -91,7 +91,7 @@ fn map_payload( result: Result, bmc: &B, include_diagnostics: bool, -) -> Vec> { +) -> Vec> { match result { Ok(EventStreamPayload::Event(event)) => event_to_logs(&event, bmc, include_diagnostics), Ok(EventStreamPayload::MetricReport(_)) => Vec::new(), @@ -104,7 +104,7 @@ fn event_to_logs( event: &Event, bmc: &B, include_diagnostics: bool, -) -> Vec> { +) -> Vec> { event .events .iter() @@ -201,7 +201,7 @@ fn event_to_logs( None }; - Ok(CollectorEvent::Log(Box::new(LogRecord { + Ok(HealthEvent::LogObserved(Box::new(LogRecord { body, severity, attributes, diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 6499644edf..0559c7e490 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -29,7 +29,6 @@ mod sensors; pub use discovery::{EntityDiscoveryCollector, EntityDiscoveryCollectorConfig}; pub use entity_metrics::{MetricsCollector, MetricsCollectorConfig}; pub use firmware::{FirmwareCollector, FirmwareCollectorConfig}; -pub(crate) use inventory::SharedInventory; pub use leak_detector::{LeakDetectorCollector, LeakDetectorCollectorConfig}; pub(crate) use logs::auto::{AutoFailureBudget, BudgetDecision, FailureKind}; pub use logs::{ diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 5f38280c93..5381bca954 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -36,7 +36,7 @@ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; /// default NMX-T port const NMXT_PORT: u16 = 9352; @@ -451,14 +451,14 @@ async fn scrape_switch_nmxt_metrics( pub struct NmxtCollectorConfig { pub nmxt_config: 
NmxtCollectorOptions, - pub data_sink: Option>, + pub data_sink: Option>, } pub struct NmxtCollector { endpoint: Arc, http_client: reqwest::Client, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, } impl PeriodicCollector for NmxtCollector { @@ -502,12 +502,13 @@ impl PeriodicCollector for NmxtCollector { } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } impl NmxtCollector { - fn emit_event(&self, event: CollectorEvent) { + /// Forwards an event into the configured data sink, if any. + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -534,7 +535,7 @@ impl NmxtCollector { let metrics = scrape_switch_nmxt_metrics(&self.http_client, &switch_ip).await?; - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); // Ports already emitted a cable temperature this iteration (one series per port). 
let mut cable_temp_ports: HashSet = HashSet::new(); @@ -559,7 +560,7 @@ impl NmxtCollector { }; if cable_temp_ports.insert(port_num.to_string()) { let labels = self.build_labels(&sample_labels); - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), name: NMXT_METRIC_NAME.to_string(), @@ -585,7 +586,7 @@ impl NmxtCollector { for state in DOWN_BLAME_STATES { let mut labels = base_labels.clone(); labels.push((Cow::Borrowed("state"), (*state).to_string())); - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("down_blame:{}:{}", port_num, state), name: NMXT_METRIC_NAME.to_string(), @@ -618,7 +619,7 @@ impl NmxtCollector { let labels = self.build_labels(&sample_labels); - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: metric_key, name: NMXT_METRIC_NAME.to_string(), @@ -632,7 +633,7 @@ impl NmxtCollector { )); } - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(()) } @@ -938,15 +939,20 @@ Link_Down{Port_Number="1"} 5 samples: StdMutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event( + &self, + _context: &EventContext, + event: &HealthEvent, + ) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { self.samples.lock().unwrap().push((**sample).clone()); } + Vec::new() } } @@ -978,7 +984,7 @@ Link_Down{Port_Number="1"} 5 for state in DOWN_BLAME_STATES { let mut labels = collector.build_labels(&sample.labels); labels.push((Cow::Borrowed("state"), (*state).to_string())); - 
collector.emit_event(CollectorEvent::Metric( + collector.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("down_blame:{}:{}", port_num, state), name: NMXT_METRIC_NAME.to_string(), @@ -1047,15 +1053,20 @@ Link_Down{Port_Number="1"} 5 samples: StdMutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event( + &self, + _context: &EventContext, + event: &HealthEvent, + ) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { self.samples.lock().unwrap().push((**sample).clone()); } + Vec::new() } } @@ -1088,7 +1099,7 @@ Link_Down{Port_Number="1"} 5 }; if cable_temp_ports.insert(port_num.to_string()) { let labels = collector.build_labels(&sample.labels); - collector.emit_event(CollectorEvent::Metric( + collector.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), name: NMXT_METRIC_NAME.to_string(), diff --git a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs index 7aebb6b8f5..6127880694 100644 --- a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs @@ -27,7 +27,7 @@ use super::proto::{self, PathElem}; use super::sample_processor::now_unix_secs; use super::subscriber::GnmiStreamMetrics; use crate::HealthError; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; type ParsedRow = HashMap; type CachedRows = HashMap; @@ -83,7 +83,7 @@ impl OnChangeStreamMetrics { pub(crate) struct GnmiOnChangeProcessor { pub(crate) collector_name: String, pub(crate) 
stream_metrics: OnChangeStreamMetrics, - pub(crate) data_sink: Option>, + pub(crate) data_sink: Option>, pub(crate) event_context: EventContext, pub(crate) switch_id: String, cached_rows: Mutex, @@ -93,7 +93,7 @@ impl GnmiOnChangeProcessor { pub(crate) fn new( collector_name: String, stream_metrics: OnChangeStreamMetrics, - data_sink: Option>, + data_sink: Option>, event_context: EventContext, switch_id: String, ) -> Self { @@ -276,7 +276,7 @@ impl GnmiOnChangeProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: self.collector_name.clone(), metric_type: "on_change_row".to_string(), @@ -337,19 +337,20 @@ mod tests { #[derive(Default)] struct CapturingSink { - events: Mutex>, + events: Mutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { self.events .lock() .expect("lock poisoned") .push((context.clone(), event.clone())); + Vec::new() } } @@ -381,7 +382,7 @@ mod tests { } } - fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { + fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { let registry = prometheus::Registry::new(); let stream_metrics = OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) @@ -617,7 +618,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 4.0); @@ -657,7 +658,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 
3); - let CollectorEvent::Metric(metric) = &events[2].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[2].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -697,7 +698,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -743,7 +744,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -830,7 +831,7 @@ mod tests { assert_eq!(context.switch_slot_number(), Some(7)); assert_eq!(context.switch_tray_index(), Some(3)); assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - let CollectorEvent::Metric(metric) = event else { + let HealthEvent::MeasurementObserved(metric) = event else { panic!("expected metric event"); }; assert_eq!(metric.metric_type, "on_change_row"); diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index b50e1ca748..96cb838bcf 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -23,12 +23,13 @@ use std::time::Instant; use super::client::{typed_value_to_f64, typed_value_to_string}; use super::proto::{self, PathElem}; use super::subscriber::GnmiStreamMetrics; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; +/// process NVUE gNMI SAMPLE notifications and emit them as 
`HealthEvent::MeasurementObserved` pub(crate) struct GnmiSampleProcessor { - pub(crate) data_sink: Option>, + pub(crate) data_sink: Option>, pub(crate) event_context: EventContext, pub(crate) switch_id: String, } @@ -214,7 +215,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -332,7 +333,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key: metric_type.to_string(), name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -357,7 +358,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key: metric_type.to_string(), name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -392,7 +393,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -432,7 +433,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -982,19 +983,20 @@ mod tests { #[derive(Default)] struct CapturingSink { - events: Mutex>, + events: Mutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn 
handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { self.events .lock() .expect("lock poisoned") .push((context.clone(), event.clone())); + Vec::new() } } @@ -1208,7 +1210,7 @@ mod tests { assert_eq!(context.switch_slot_number(), Some(7)); assert_eq!(context.switch_tray_index(), Some(3)); assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - assert!(matches!(event, CollectorEvent::Metric(_))); + assert!(matches!(event, HealthEvent::MeasurementObserved(_))); } } @@ -1579,7 +1581,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 1, "expected exactly one emitted metric"); let (ctx, event) = events[0].clone(); - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; // shared producer invariants for every interface mapping. The @@ -1625,7 +1627,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 1, "expected exactly one emitted metric"); let (ctx, event) = events[0].clone(); - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); @@ -1671,7 +1673,7 @@ mod tests { .expect("lock poisoned") .iter() .map(|(_, event)| { - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; (**sample).clone() @@ -1715,7 +1717,7 @@ mod tests { .expect("lock poisoned") .iter() .map(|(_, event)| { - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; (**sample).clone() @@ -2588,7 +2590,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); 
assert_eq!(events.len(), 1, "expected exactly one emitted metric"); let (ctx, event) = events[0].clone(); - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); @@ -2891,7 +2893,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 1, "expected exactly one emitted metric"); let (ctx, event) = events[0].clone(); - let CollectorEvent::Metric(sample) = event else { + let HealthEvent::MeasurementObserved(sample) = event else { panic!("expected a Metric event"); }; assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index 156aabdb4f..a7c3007c20 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -39,7 +39,7 @@ use crate::collectors::runtime::{BackoffConfig, ExponentialBackoff, StreamingCon use crate::config::NvueGnmiConfig; use crate::endpoint::{BmcAddr, BmcCredentials, BmcEndpoint}; use crate::metrics::CollectorRegistry; -use crate::sink::{CollectorEvent, DataSink, EventContext}; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; // gRPC ConnectivityState values for `connection_state`. 0 (UNKNOWN) is the gauge default. 
const IDLE: i64 = 1; @@ -446,7 +446,7 @@ pub fn spawn_gnmi_collector( gnmi_config: &NvueGnmiConfig, credential_provider: Arc, collector_registry: Arc, - data_sink: Option>, + data_sink: Option>, ) -> Result { let switch_id = endpoint .metadata @@ -557,13 +557,10 @@ pub fn spawn_gnmi_collector( } if let Some(data_sink) = collector_removed_data_sink.as_deref() { - data_sink.handle_event( - &collector_removed_sample_context, - &CollectorEvent::CollectorRemoved, - ); + data_sink.handle_event(&collector_removed_sample_context, &HealthEvent::NodeRemoved); if let Some(event_context) = &collector_removed_on_change_context { - data_sink.handle_event(event_context, &CollectorEvent::CollectorRemoved); + data_sink.handle_event(event_context, &HealthEvent::NodeRemoved); } } })) diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index d133b13c5e..887ee8e0ee 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -16,10 +16,9 @@ */ use std::collections::HashMap; -use std::sync::Arc; +use std::sync::RwLock; use std::time::Duration; -use arc_swap::ArcSwapOption; use reqwest::Client; use reqwest::header::ACCEPT; use serde::Deserialize; @@ -55,7 +54,7 @@ impl std::fmt::Debug for UsernamePassword { pub struct RestClient { pub(crate) switch_id: String, base_url: Url, - credentials: ArcSwapOption, + credentials: RwLock>, paths: NvueRestPaths, client: Client, } @@ -86,22 +85,34 @@ impl RestClient { Ok(Self { switch_id, base_url, - credentials: ArcSwapOption::empty(), + credentials: RwLock::new(None), paths, client, }) } + /// Stores the credentials used for subsequent authenticated requests. pub fn set_credentials(&self, creds: UsernamePassword) { - self.credentials.store(Some(Arc::new(creds))); + *self + .credentials + .write() + .unwrap_or_else(|poisoned| poisoned.into_inner()) = Some(creds); } + /// Drops any stored credentials, e.g. 
after an authentication failure. pub fn clear_credentials(&self) { - self.credentials.store(None); + *self + .credentials + .write() + .unwrap_or_else(|poisoned| poisoned.into_inner()) = None; } + /// Returns whether credentials are currently stored. pub fn has_credentials(&self) -> bool { - self.credentials.load().is_some() + self.credentials + .read() + .unwrap_or_else(|poisoned| poisoned.into_inner()) + .is_some() } pub async fn get_system_health(&self) -> Result, HealthError> { @@ -219,7 +230,12 @@ impl RestClient { request = request.query(extra_query); } - if let Some(creds) = self.credentials.load_full() { + let credentials = self + .credentials + .read() + .unwrap_or_else(|poisoned| poisoned.into_inner()) + .clone(); + if let Some(creds) = credentials { request = request.basic_auth(&creds.username, creds.password.as_ref()); } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 7a487414e9..527513c434 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -24,7 +24,7 @@ use crate::bmc::{CREDENTIAL_REFRESH_TIMEOUT, CredentialProvider, is_auth_error}; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NvueRestConfig; use crate::endpoint::{BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata}; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; const COLLECTOR_NAME: &str = "nvue_rest"; @@ -120,7 +120,7 @@ fn fan_led_to_state(state: Option<&str>) -> Option<&'static str> { pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, - pub data_sink: Option>, + pub data_sink: Option>, pub credential_provider: Arc, } @@ -128,7 +128,7 @@ pub struct NvueRestCollector { client: RestClient, switch_id: String, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, addr: 
BmcAddr, provider: Arc, } @@ -184,7 +184,7 @@ impl PeriodicCollector for NvueRestCollector { }); } - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); let mut entity_count = 0usize; let mut fetch_failures = 0usize; let mut saw_auth_failure = false; @@ -422,7 +422,7 @@ impl PeriodicCollector for NvueRestCollector { self.client.clear_credentials(); } - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); tracing::debug!( switch_id = %self.switch_id, @@ -442,7 +442,7 @@ impl PeriodicCollector for NvueRestCollector { } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } @@ -471,7 +471,8 @@ impl NvueRestCollector { } } - fn emit_event(&self, event: CollectorEvent) { + /// Forwards an event into the configured data sink, if any. + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -496,7 +497,7 @@ impl NvueRestCollector { None => metric_type.to_string(), }; - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key, name: COLLECTOR_NAME.to_string(), @@ -694,15 +695,20 @@ mod tests { samples: StdMutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event( + &self, + _context: &EventContext, + event: &HealthEvent, + ) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { self.samples.lock().unwrap().push((**sample).clone()); } + Vec::new() } } @@ -824,15 +830,20 @@ mod tests { samples: StdMutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> 
&'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event( + &self, + _context: &EventContext, + event: &HealthEvent, + ) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { self.samples.lock().unwrap().push((**sample).clone()); } + Vec::new() } } @@ -984,15 +995,20 @@ mod tests { samples: StdMutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event( + &self, + _context: &EventContext, + event: &HealthEvent, + ) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { self.samples.lock().unwrap().push((**sample).clone()); } + Vec::new() } } diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 50f205390d..6c38d8e554 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -28,6 +28,7 @@ use nv_redfish::core::Bmc; use nv_redfish::event_service::EventStreamPayload; use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntCounter, IntGauge, Opts}; use rand::RngExt; +use tokio::sync::mpsc; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; @@ -38,7 +39,7 @@ use crate::limiter::RateLimiter; use crate::metrics::{ CollectorRegistry, ComponentKind, MetricsManager, operation_duration_buckets_seconds, }; -use crate::sink::{CollectorEvent, DataSink, EventContext}; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; /// Result of a collector iteration #[derive(Debug, Clone)] @@ -51,9 +52,16 @@ pub struct IterationResult { pub fetch_failures: usize, } +/// A 
collector that is polled on a fixed cadence to scrape an endpoint. +/// +/// The runtime owns the timer, rate limiter, and (optionally) an event mailbox; +/// implementors only provide the per-iteration scrape logic and, if they opt in +/// via [`Self::wants_events`], react to routed events. pub trait PeriodicCollector: Send + 'static { + /// Per-collector configuration consumed by [`Self::new_runner`]. type Config: Send + 'static; + /// Builds a runner bound to a specific endpoint and BMC client. fn new_runner( bmc: Arc, endpoint: Arc, @@ -62,6 +70,7 @@ pub trait PeriodicCollector: Send + 'static { where Self: Sized; + /// Performs one scrape pass, emitting events and returning iteration stats. fn run_iteration( &mut self, ) -> impl std::future::Future> + Send; @@ -69,12 +78,25 @@ pub trait PeriodicCollector: Send + 'static { /// Returns the type identifier for this collector fn collector_type(&self) -> &'static str; + /// Whether this collector consumes routed events (via [`Self::handle_event`]). + /// Only collectors that return `true` get an event mailbox exposed through + /// [`Collector::event_node`]. Defaults to `false`. + fn wants_events(&self) -> bool { + false + } + + /// Reacts to an event routed to this collector's mailbox. No-op by default; + /// only invoked for collectors that opt in via [`Self::wants_events`]. + fn handle_event(&mut self, _context: &EventContext, _event: &HealthEvent) {} + + /// Releases any resources when the collector is stopped. No-op by default. fn stop(&mut self) -> impl std::future::Future + Send { async {} } } -pub type EventStream<'a> = BoxStream<'a, Result>; +/// A boxed stream of health events produced by a [`StreamingCollector`]. +pub type EventStream<'a> = BoxStream<'a, Result>; /// Trait for collectors that maintain a long-lived stream (SSE, gRPC, etc.) 
/// runtime.rs creates the BMC client and injects it, the collector opens the stream and maps payloads to events @@ -246,9 +268,55 @@ impl Drop for StreamingConnectionGuard { } } +/// A running periodic collector task plus the handles used to drive and stop it. pub struct Collector { handle: JoinHandle<()>, cancel_token: CancellationToken, + event_node: Option>, +} + +/// A [`SyncEventNode`] that forwards endpoint-addressed events into a running +/// collector task's channel, decoupling the event graph from the collector's +/// async loop. +struct CollectorEventMailbox { + node_type: &'static str, + endpoint_key: String, + sender: mpsc::Sender<(EventContext, HealthEvent)>, +} + +impl SyncEventNode for CollectorEventMailbox { + fn node_type(&self) -> &'static str { + self.node_type + } + + fn interested_in(&self, event: &HealthEvent) -> bool { + // Route every endpoint-addressed control/stage event to the collector's + // mailbox; the runner's `handle_event` decides which ones it acts on. + // The endpoint_key guard applies to all such events. + match event { + HealthEvent::InventoryDiscovered { endpoint_key, .. } + | HealthEvent::InventoryUpdated { endpoint_key, .. } + | HealthEvent::ScrapeRequested { endpoint_key, .. 
} => { + endpoint_key == &self.endpoint_key + } + HealthEvent::NodeRemoved => true, + _ => false, + } + } + + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + // A drop on a full mailbox selfheals: discovery re-emits InventoryDiscovered + // every refresh cycle, so the consumer recovers on the next tick + if let Err(error) = self.sender.try_send((context.clone(), event.clone())) { + tracing::warn!( + ?error, + node_type = self.node_type, + endpoint_key = %self.endpoint_key, + "dropping event for collector mailbox" + ); + } + Vec::new() + } } pub struct CollectorStartContext { @@ -281,6 +349,19 @@ impl Collector { let cancel_token_clone = cancel_token.clone(); let mut runner = C::new_runner(bmc, endpoint.clone(), config)?; + // Only event-consuming collectors get a mailbox; others never receive events, + // so we skip the channel entirely and leave `event_node` as None. + let (event_node, mut event_receiver) = if runner.wants_events() { + let (event_sender, event_receiver) = mpsc::channel(16); + let node: Arc = Arc::new(CollectorEventMailbox { + node_type: runner.collector_type(), + endpoint_key: endpoint.key(), + sender: event_sender, + }); + (Some(node), Some(event_receiver)) + } else { + (None, None) + }; let endpoint_key = endpoint.key(); let const_labels = HashMap::from([ @@ -341,6 +422,7 @@ impl Collector { let handle = tokio::spawn(async move { let collector_type = runner.collector_type(); let _collector_registry = collector_registry; + let mut next_iteration = tokio::time::Instant::now(); loop { tokio::select! { _ = cancel_token_clone.cancelled() => { @@ -348,7 +430,22 @@ impl Collector { runner.stop().await; break; } - _ = async { + maybe_event = async { + match event_receiver.as_mut() { + Some(rx) => rx.recv().await, + // No mailbox: park this branch forever so only the timer/cancel fire. 
+ None => std::future::pending().await, + } + } => { + if let Some((event_context, event)) = maybe_event { + runner.handle_event(&event_context, &event); + } + } + // The scrape runs to completion here (not as a racing select future), + // so an inbound event can never cancel an in-flight iteration or reset + // its cadence. Trade-off: cancellation waits for the current scrape, + // bounded by the BMC request timeout. + _ = tokio::time::sleep_until(next_iteration) => { limiter.acquire().await; let start = Instant::now(); @@ -388,8 +485,7 @@ impl Collector { } } - tokio::time::sleep(iteration_interval).await; - } => { + next_iteration = tokio::time::Instant::now() + iteration_interval; } } } @@ -398,6 +494,7 @@ impl Collector { Ok(Self { handle, cancel_token, + event_node, }) } @@ -405,7 +502,7 @@ impl Collector { endpoint: Arc, bmc: Arc, config: S::Config, - data_sink: Arc, + data_sink: Arc, start_context: StreamingCollectorStartContext, mut on_connect_result: F, ) -> Result @@ -533,6 +630,7 @@ impl Collector { Ok(Self { handle, cancel_token, + event_node: None, }) } @@ -550,6 +648,7 @@ impl Collector { Self { handle, cancel_token, + event_node: None, } } @@ -561,4 +660,10 @@ impl Collector { pub fn is_finished(&self) -> bool { self.handle.is_finished() } + + /// Returns this collector's event mailbox, if it consumes events, so the + /// event graph can route events to it. 
+ pub fn event_node(&self) -> Option> { + self.event_node.clone() + } } diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index 0b4926a5a6..f07840730a 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -24,11 +24,12 @@ use nv_redfish::core::{Bmc, EntityTypeRef, ToSnakeCase}; use nv_redfish::sensor::SensorLink; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; use crate::metrics::{MetricLabel, sanitize_unit}; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample, SensorThresholdContext}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SensorThresholdContext, SyncEventNode}; #[derive(Clone, Copy)] enum SensorRangeKind { @@ -54,27 +55,29 @@ impl SensorRangeKind { /// Configuration for the sensor collector. 
pub struct SensorCollectorConfig { - pub data_sink: Option>, - pub(crate) shared: SharedInventory, + pub data_sink: Option>, pub sensor_fetch_concurrency: usize, pub include_sensor_thresholds: bool, + pub emit_derived_metrics: bool, + pub(crate) _bmc: std::marker::PhantomData, } /// Sensor collector for a single BMC endpoint pub struct SensorCollector { endpoint: Arc, event_context: EventContext, - shared: SharedInventory, - data_sink: Option>, + latest_inventory: Option>>, + data_sink: Option>, sensor_fetch_concurrency: usize, include_sensor_thresholds: bool, + emit_derived_metrics: bool, } -impl PeriodicCollector for SensorCollector { - type Config = SensorCollectorConfig; +impl PeriodicCollector for SensorCollector { + type Config = SensorCollectorConfig; fn new_runner( - _bmc: Arc, + _bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { @@ -82,15 +85,16 @@ impl PeriodicCollector for SensorCollector { Ok(Self { endpoint, event_context, - shared: config.shared, + latest_inventory: None, data_sink: config.data_sink, sensor_fetch_concurrency: config.sensor_fetch_concurrency.max(1), include_sensor_thresholds: config.include_sensor_thresholds, + emit_derived_metrics: config.emit_derived_metrics, }) } async fn run_iteration(&mut self) -> Result { - let Some(inventory) = self.shared.load_full() else { + let Some(inventory) = self.latest_inventory.clone() else { tracing::debug!( bmc_addr = ?self.endpoint.addr, "No entity inventory available yet; skipping sensor iteration" @@ -111,15 +115,15 @@ impl PeriodicCollector for SensorCollector { ); let fetch_failures = AtomicUsize::new(0); - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); - // Entity-level derived metrics (drive media life, PSU capacity), once - // per entity. 
- for entity in &inventory.entities { - self.emit_derived_metrics(entity); + if self.emit_derived_metrics { + for entity in &inventory.entities { + self.emit_derived_metrics(entity); + } } - // Build the fetch futures borrowing from the shared snapshot, then + // Build fetch futures borrowing from the immutable inventory snapshot, then // drive them concurrently. Each future borrows `&self`, the entity, and // its sensor (all alive for as long as `inventory` is held here). let this = &*self; @@ -142,7 +146,7 @@ impl PeriodicCollector for SensorCollector { .into_iter() .sum(); - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(IterationResult { refresh_triggered: false, @@ -155,19 +159,38 @@ impl PeriodicCollector for SensorCollector { "sensor_collector" } + fn wants_events(&self) -> bool { + true + } + + fn handle_event(&mut self, context: &EventContext, event: &HealthEvent) { + match event { + HealthEvent::InventoryDiscovered { inventory, .. } => { + self.latest_inventory = Some(inventory.clone()); + } + HealthEvent::NodeRemoved + if context.endpoint_key() == self.event_context.endpoint_key() => + { + self.latest_inventory = None; + } + _ => {} + } + } + async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } -impl SensorCollector { - fn emit_event(&self, event: CollectorEvent) { +impl SensorCollector { + /// Forwards an event into the configured data sink, if any. 
+ fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } } - fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { + fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { let derived = entity.derived_metrics(); if derived.is_empty() { return; @@ -175,7 +198,7 @@ impl SensorCollector { let mut attributes = entity.base_attributes(); attributes.extend(entity.entity_specific_attributes()); for metric in derived { - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("{}/{}", entity.key(), metric.metric_type), name: "hw".to_string(), @@ -192,8 +215,8 @@ impl SensorCollector { async fn update_sensor( &self, - entity: &DiscoveredEntity, - sensor_link: &SensorLink, + entity: &DiscoveredEntity, + sensor_link: &SensorLink, fetch_failures: &AtomicUsize, ) -> usize { let sensor = match sensor_link.fetch().await { @@ -319,7 +342,7 @@ impl SensorCollector { (None, None, None, None, None, None) }; - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: sensor.odata_id().to_string(), name: "hw_sensor".to_string(), @@ -382,7 +405,7 @@ impl SensorCollector { Cow::Borrowed("sensor_range"), range_kind.label_value().to_string(), )); - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("{sensor_key}/{metric_suffix}"), name: "hw_sensor".to_string(), diff --git a/crates/health/src/discovery/cleanup.rs b/crates/health/src/discovery/cleanup.rs index 5dba8d0728..530b6c8eab 100644 --- a/crates/health/src/discovery/cleanup.rs +++ b/crates/health/src/discovery/cleanup.rs @@ -49,10 +49,6 @@ pub(super) fn stop_removed_bmc_collectors( for kind in CollectorKind::ALL { stop_collectors_for_keys(ctx, kind, &removed_keys); } - for key in &removed_keys { - ctx.collectors.remove_inventory(key); - } - if 
!removed_keys.is_empty() { tracing::info!( removed_count = removed_keys.len(), diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 71adf6f2f8..7d5e2f1c46 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -19,12 +19,10 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arc_swap::ArcSwapOption; use prometheus::{Histogram, HistogramOpts}; use crate::HealthError; -use crate::bmc::BmcClient; -use crate::collectors::{Collector, LogDowngradeRegistry, SharedInventory}; +use crate::collectors::{Collector, LogDowngradeRegistry}; use crate::config::{ Config, Configurable, DiscoveryConfig, FirmwareCollectorConfig as FirmwareCollectorOptions, LeakDetectorCollectorConfig as LeakDetectorCollectorOptions, @@ -92,7 +90,6 @@ pub(super) struct CollectorState { nmxt: HashMap, Collector>, nvue_rest: HashMap, Collector>, nvue_gnmi: HashMap, Collector>, - inventories: HashMap, SharedInventory>, } impl CollectorState { @@ -107,7 +104,6 @@ impl CollectorState { nmxt: HashMap::new(), nvue_rest: HashMap::new(), nvue_gnmi: HashMap::new(), - inventories: HashMap::new(), } } @@ -142,25 +138,20 @@ impl CollectorState { } } - pub(super) fn inventory_for(&mut self, key: &str) -> SharedInventory { - if let Some(shared) = self.inventories.get(key) { - return shared.clone(); - } - let shared = Arc::new(ArcSwapOption::empty()); - self.inventories - .insert(Cow::Owned(key.to_string()), shared.clone()); - shared - } - - /// Drop the shared inventory handle for a removed endpoint. - pub(super) fn remove_inventory(&mut self, key: &str) { - self.inventories.remove(key); - } - pub(super) fn contains(&self, kind: CollectorKind, key: &str) -> bool { self.map(kind).contains_key(key) } + /// Returns the event mailbox of the tracked collector of `kind` for `key`, + /// so newly discovered inventory can be routed to already-running consumers. 
+ pub(super) fn event_node( + &self, + kind: CollectorKind, + key: &str, + ) -> Option> { + self.map(kind).get(key).and_then(Collector::event_node) + } + pub(super) fn insert( &mut self, kind: CollectorKind, diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 3c4fae159c..d961219dd4 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -27,7 +27,7 @@ use super::spawn::spawn_collectors_for_endpoint; use crate::HealthError; use crate::endpoint::{BmcEndpoint, EndpointSource}; use crate::sharding::ShardManager; -use crate::sink::DataSink; +use crate::sink::SyncEventNode; fn active_keys(sharded_endpoints: &[Arc]) -> HashSet> { sharded_endpoints @@ -40,7 +40,7 @@ pub async fn run_discovery_iteration( endpoint_source: Arc, shard_manager: &ShardManager, ctx: &mut DiscoveryLoopContext, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result { let iteration_start = Instant::now(); diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 1dd99330f1..044451ffb5 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -33,7 +33,51 @@ use crate::collectors::{ }; use crate::config::{Configurable, LogCollectionMode, PeriodicLogConfig}; use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; -use crate::sink::DataSink; +use crate::sink::{CompositeSyncEventNode, SyncEventNode}; + +/// Spawns the collector graph for one endpoint: given the discovery context, the +/// endpoint, an optional data sink, and a metrics prefix, it starts the +/// appropriate collectors. +type SpawnGraphFn = fn( + &mut DiscoveryLoopContext, + &Arc, + Option>, + &str, +) -> Result<(), HealthError>; + +/// Declarative binding of an endpoint predicate to the collector graph that +/// should be spawned for endpoints matching it. 
+struct EndpointGraphSpec { + name: &'static str, + applies_to: fn(&BmcEndpoint) -> bool, + spawn: SpawnGraphFn, +} + +/// Returns whether the endpoint is a switch acting in the host role. +fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { + endpoint + .switch_data() + .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) +} + +/// Returns whether the endpoint should use the generic Redfish collector graph +/// (everything that is not a switch host). +fn is_generic_redfish_endpoint(endpoint: &BmcEndpoint) -> bool { + !is_switch_host_endpoint(endpoint) +} + +const ENDPOINT_GRAPH_SPECS: [EndpointGraphSpec; 2] = [ + EndpointGraphSpec { + name: "generic_redfish", + applies_to: is_generic_redfish_endpoint, + spawn: spawn_generic_redfish_collectors, + }, + EndpointGraphSpec { + name: "switch_host", + applies_to: is_switch_host_endpoint, + spawn: spawn_switch_host_collectors, + }, +]; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { PathBuf::from(template.replace("{machine_id}", endpoint_id)) @@ -42,22 +86,27 @@ fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { pub(super) fn spawn_collectors_for_endpoint( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { - let endpoint_role = endpoint.switch_data().map(|switch| switch.endpoint_role); - - if matches!(endpoint_role, Some(SwitchEndpointRole::Host)) { - spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) - } else { - spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + for spec in ENDPOINT_GRAPH_SPECS { + if !(spec.applies_to)(endpoint) { + continue; + } + tracing::debug!( + endpoint_key = %endpoint.key(), + graph_spec = spec.name, + "applying endpoint graph spec" + ); + return (spec.spawn)(ctx, endpoint, data_sink, metrics_prefix); } + Ok(()) } fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, 
endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); @@ -66,52 +115,11 @@ fn spawn_generic_redfish_collectors( let sensors_enabled = matches!(ctx.sensors_config, Configurable::Enabled(_)); let metrics_enabled = matches!(ctx.metrics_config, Configurable::Enabled(_)); - - if (sensors_enabled || metrics_enabled) - && !ctx.collectors.contains(CollectorKind::Discovery, &key) - { - let shared = ctx.collectors.inventory_for(&key); - let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( - format!("entity_discovery_collector_{key}"), - metrics_prefix, - )?); - match Collector::start::>( - endpoint_arc.clone(), - bmc.clone(), - EntityDiscoveryCollectorConfig { - shared, - discovery_concurrency: ctx.discovery_config.discovery_concurrency, - }, - CollectorStartContext { - limiter: ctx.limiter.clone(), - iteration_interval: ctx.discovery_config.refresh_interval, - collector_registry, - metrics_manager: ctx.metrics_manager.clone(), - }, - ) { - Ok(monitor) => { - ctx.collectors - .insert(CollectorKind::Discovery, key.clone().into(), monitor); - tracing::info!( - endpoint_key = %key, - total_collectors = ctx.collectors.len(CollectorKind::Discovery), - "Started entity discovery for BMC endpoint" - ); - } - Err(error) => { - tracing::error!( - ?error, - "Could not start entity discovery collector for: {:?}", - endpoint.addr - ); - } - } - } + let mut inventory_consumer_started = false; if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) { - let shared = ctx.collectors.inventory_for(&key); let collector_registry = Arc::new( ctx.metrics_manager .create_collector_registry(format!("sensor_collector_{key}"), metrics_prefix)?, @@ -121,9 +129,10 @@ fn spawn_generic_redfish_collectors( bmc.clone(), SensorCollectorConfig { data_sink: data_sink.clone(), - shared, sensor_fetch_concurrency: 
sensor_cfg.sensor_fetch_concurrency, include_sensor_thresholds: sensor_cfg.include_sensor_thresholds, + emit_derived_metrics: !metrics_enabled, + _bmc: std::marker::PhantomData, }, CollectorStartContext { limiter: ctx.limiter.clone(), @@ -135,6 +144,7 @@ fn spawn_generic_redfish_collectors( Ok(monitor) => { ctx.collectors .insert(CollectorKind::Sensor, key.clone().into(), monitor); + inventory_consumer_started = true; tracing::info!( endpoint_key = %key, total_collectors = ctx.collectors.len(CollectorKind::Sensor), @@ -154,7 +164,6 @@ fn spawn_generic_redfish_collectors( if let Configurable::Enabled(metrics_cfg) = &ctx.metrics_config && !ctx.collectors.contains(CollectorKind::Metrics, &key) { - let shared = ctx.collectors.inventory_for(&key); let collector_registry = Arc::new( ctx.metrics_manager .create_collector_registry(format!("metrics_collector_{key}"), metrics_prefix)?, @@ -164,8 +173,8 @@ fn spawn_generic_redfish_collectors( bmc.clone(), MetricsCollectorConfig { data_sink: data_sink.clone(), - shared, fetch_concurrency: metrics_cfg.fetch_concurrency, + _bmc: std::marker::PhantomData, }, CollectorStartContext { limiter: ctx.limiter.clone(), @@ -177,6 +186,7 @@ fn spawn_generic_redfish_collectors( Ok(monitor) => { ctx.collectors .insert(CollectorKind::Metrics, key.clone().into(), monitor); + inventory_consumer_started = true; tracing::info!( endpoint_key = %key, total_collectors = ctx.collectors.len(CollectorKind::Metrics), @@ -193,6 +203,87 @@ fn spawn_generic_redfish_collectors( } } + if inventory_consumer_started + && let Some(discovery) = ctx + .collectors + .map_mut(CollectorKind::Discovery) + .remove(key.as_str()) + { + tracing::info!( + endpoint_key = %key, + "Restarting entity discovery to rewire inventory consumers" + ); + tokio::spawn(async move { + discovery.stop().await; + }); + } + + let mut discovery_nodes: Vec> = Vec::new(); + if sensors_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Sensor, &key) + { + 
discovery_nodes.push(event_node); + } + if metrics_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Metrics, &key) + { + discovery_nodes.push(event_node); + } + + if (sensors_enabled || metrics_enabled) + && !discovery_nodes.is_empty() + && !ctx.collectors.contains(CollectorKind::Discovery, &key) + { + if let Some(data_sink) = data_sink.clone() { + discovery_nodes.push(data_sink); + } + let discovery_data_sink = if discovery_nodes.is_empty() { + None + } else { + Some(Arc::new(CompositeSyncEventNode::new( + discovery_nodes, + ctx.metrics_manager.clone(), + )) as Arc) + }; + + let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( + format!("entity_discovery_collector_{key}"), + metrics_prefix, + )?); + match Collector::start::>( + endpoint_arc.clone(), + bmc.clone(), + EntityDiscoveryCollectorConfig { + data_sink: discovery_data_sink, + discovery_concurrency: ctx.discovery_config.discovery_concurrency, + _bmc: std::marker::PhantomData, + }, + CollectorStartContext { + limiter: ctx.limiter.clone(), + iteration_interval: ctx.discovery_config.refresh_interval, + collector_registry, + metrics_manager: ctx.metrics_manager.clone(), + }, + ) { + Ok(monitor) => { + ctx.collectors + .insert(CollectorKind::Discovery, key.clone().into(), monitor); + tracing::info!( + endpoint_key = %key, + total_collectors = ctx.collectors.len(CollectorKind::Discovery), + "Started entity discovery for BMC endpoint" + ); + } + Err(error) => { + tracing::error!( + ?error, + "Could not start entity discovery collector for: {:?}", + endpoint.addr + ); + } + } + } + if let Configurable::Enabled(logs_cfg) = &ctx.logs_config && !ctx.collectors.contains(CollectorKind::Logs, &key) { @@ -210,7 +301,7 @@ fn spawn_generic_redfish_collectors( }; let spawn_periodic_logs = |pcfg: PeriodicLogConfig, - data_sink: Option>, + data_sink: Option>, collector_registry: Arc<_>| -> Option> { let endpoint_id = endpoint.log_identity().into_owned(); @@ -417,7 +508,7 
@@ fn spawn_generic_redfish_collectors( fn spawn_switch_host_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); @@ -568,16 +659,18 @@ mod tests { }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; - use crate::sink::{CollectorEvent, EventContext}; + use crate::sink::{EventContext, HealthEvent}; struct NoopSink; - impl DataSink for NoopSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for NoopSink { + fn node_type(&self) -> &'static str { "noop" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { + Vec::new() + } } fn context_with_config(config: Config, metrics_name: &str) -> DiscoveryLoopContext { diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 51f9b3f4ef..461df1a124 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -44,15 +44,14 @@ use crate::endpoint::{CompositeEndpointSource, EndpointSource, StaticEndpointSou use crate::limiter::{BucketLimiter, NoopLimiter, RateLimiter}; use crate::metrics::{MetricsManager, run_metrics_server}; use crate::processor::{ - BmcIntrusionEventProcessor, EventProcessingPipeline, EventProcessor, HealthReportProcessor, - LeakEventProcessor, RackLeakProcessor, + BmcIntrusionSyncEventNode, EventGraph, HealthReportProcessor, LeakSyncEventNode, + RackLeakProcessor, }; use crate::sharding::ShardManager; use crate::sink::event_mapper::{OpenBmcEventMapper, RedfishEventMapper}; use crate::sink::{ - CompositeDataSink, DataSink, HealthReportSink, LogFileSink, OtlpSink, - PowerShelfHealthReportSink, PrometheusSink, RackHealthReportSink, SwitchHealthReportSink, - TracingSink, + HealthReportSink, LogFileSink, OtlpSink, PowerShelfHealthReportSink, PrometheusSink, + RackHealthReportSink, SwitchHealthReportSink, 
SyncEventNode, TracingSink, }; #[derive(thiserror::Error, Debug)] @@ -112,10 +111,13 @@ impl From> for HealthEr } } +/// The endpoint discovery wiring assembled from configured endpoint sources. struct EndpointWiring { source: Arc, } +/// Assembles the composite endpoint source (static and/or Carbide API) from +/// config, erroring if no sources are configured. fn build_endpoint_wiring(config: &Config) -> Result { let reqwest = ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) .map_err(BmcError::ReqwestError)?; @@ -160,97 +162,121 @@ fn build_endpoint_wiring(config: &Config) -> Result }) } +/// Builds the root event-graph node wiring together all enabled sinks and +/// processors, returning `None` when no nodes are configured. fn build_data_sink( config: &Config, metrics_manager: Arc, -) -> Result>, HealthError> { - let mut sinks: Vec> = Vec::new(); - let mut processors: Vec> = Vec::new(); +) -> Result>, HealthError> { + let mut nodes: Vec> = Vec::new(); + let mut has_terminal_sink = false; if let Configurable::Enabled(sink_cfg) = &config.sinks.tracing { - sinks.push(Arc::new(TracingSink::new(sink_cfg))); + nodes.push(Arc::new(TracingSink::new(sink_cfg))); + has_terminal_sink = true; } if let Configurable::Enabled(_) = &config.sinks.prometheus { - sinks.push(Arc::new(PrometheusSink::new( + nodes.push(Arc::new(PrometheusSink::new( metrics_manager.clone(), &config.metrics.prefix, )?)); + has_terminal_sink = true; } + let emit_empty_sensor_reports = config + .sinks + .health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports) + || config + .sinks + .power_shelf_health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports) + || config + .sinks + .switch_health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports); + if config.sinks.tracing.is_enabled() || config.sinks.health_report.is_enabled() || config.sinks.power_shelf_health_report.is_enabled() || 
config.sinks.switch_health_report.is_enabled() + || config.sinks.otlp.is_enabled() || config.processors.leak_detection.is_enabled() { - processors.push(Arc::new(HealthReportProcessor::new())); + nodes.push(Arc::new(HealthReportProcessor::new( + emit_empty_sensor_reports, + ))); } - if config.sinks.health_report.is_enabled() { - processors.push(Arc::new(BmcIntrusionEventProcessor::new())); + // Intrusion reports target the machine; install whenever any consumer of + // machine-targeted HealthReportProduced events is enabled (tracing/otlp + // forward all reports, the machine health-report sink consumes them directly). + if config.sinks.tracing.is_enabled() + || config.sinks.health_report.is_enabled() + || config.sinks.otlp.is_enabled() + { + nodes.push(Arc::new(BmcIntrusionSyncEventNode::new())); } if let Configurable::Enabled(ref leak_detection_cfg) = config.processors.leak_detection { - processors.push(Arc::new(LeakEventProcessor::new( + nodes.push(Arc::new(LeakSyncEventNode::new( leak_detection_cfg.minimum_alerts_per_report, ))); } if let Configurable::Enabled(ref rack_leak_cfg) = config.processors.rack_leak { - processors.push(Arc::new(RackLeakProcessor::new( + nodes.push(Arc::new(RackLeakProcessor::new( rack_leak_cfg.leaking_tray_threshold, ))); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.log_file { - sinks.push(Arc::new( + nodes.push(Arc::new( LogFileSink::new(sink_cfg).map_err(HealthError::GenericError)?, )); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.health_report { - sinks.push(Arc::new(HealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(HealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.rack_health_report { - sinks.push(Arc::new(RackHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(RackHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = 
config.sinks.switch_health_report { - sinks.push(Arc::new(SwitchHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(SwitchHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.power_shelf_health_report { - sinks.push(Arc::new(PowerShelfHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(PowerShelfHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref otlp_cfg) = config.sinks.otlp { let mapper: Arc = Arc::new(OpenBmcEventMapper); - sinks.push(Arc::new(OtlpSink::new( + nodes.push(Arc::new(OtlpSink::new( otlp_cfg, mapper, &metrics_manager, &config.metrics.prefix, )?)); + has_terminal_sink = true; } - if sinks.is_empty() { + if !has_terminal_sink { return Ok(None); } - let composite_sink: Arc = - Arc::new(CompositeDataSink::new(sinks, metrics_manager.clone())); - - if processors.is_empty() { - return Ok(Some(composite_sink)); - } - - Ok(Some(Arc::new(EventProcessingPipeline::new( - processors, - composite_sink, - metrics_manager, - )))) + Ok(Some(Arc::new(EventGraph::new(nodes, metrics_manager)))) } pub async fn run_service(config: Config) -> Result<(), HealthError> { diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fff6771d02..8b7da19075 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -28,7 +28,7 @@ use super::metrics::{ }; use super::resource::Resource; use crate::endpoint::SwitchEndpointRole; -use crate::sink::{CollectorEvent, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample}; fn severity_text_to_number(severity: &str) -> i32 { match severity.to_uppercase().as_str() { @@ -157,10 +157,12 @@ fn convert_log(log: &crate::sink::LogRecord, observed_nanos: u64) -> OtlpLogReco } } -fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option { +/// Converts a single health event into an OTLP log record, or `None` for 
events +/// (metrics, lifecycle markers) that are not exported as logs. +fn convert_event(event: &HealthEvent, observed_nanos: u64) -> Option { match event { - CollectorEvent::Log(log) => Some(convert_log(log, observed_nanos)), - CollectorEvent::HealthReport(report) => { + HealthEvent::LogObserved(log) => Some(convert_log(log, observed_nanos)), + HealthEvent::HealthReportProduced(report) => { let body = format!( "health report: {} alerts, {} ok (source: {:?})", report.alerts.len(), @@ -182,7 +184,7 @@ fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option { + HealthEvent::FirmwareObserved(info) => { let body = format!("{}: {}", info.component, info.version); Some(OtlpLogRecord { time_unix_nano: observed_nanos, @@ -194,15 +196,18 @@ fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option None, + HealthEvent::MeasurementObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } + | HealthEvent::ScrapeBatchStarted + | HealthEvent::ScrapeBatchFinished + | HealthEvent::NodeRemoved => None, } } /// Builds an OTLP log export request grouped by endpoint. 
-pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportLogsServiceRequest { +pub fn build_export_request(batch: &[(EventContext, HealthEvent)]) -> ExportLogsServiceRequest { let observed_nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap_or_default() @@ -624,7 +629,7 @@ mod tests { #[test] fn log_event_converts_to_otlp_record() { let ctx = test_context(); - let log = CollectorEvent::Log(Box::new(LogRecord { + let log = HealthEvent::LogObserved(Box::new(LogRecord { body: "something happened".to_string(), severity: "WARNING".to_string(), attributes: vec![(Cow::Borrowed("entry_id"), "42".to_string())], @@ -652,7 +657,7 @@ mod tests { r#"{"key":"redfish.parent.log_entry_id","value":"42"}]}"# ); - let log = CollectorEvent::Log(Box::new(LogRecord { + let log = HealthEvent::LogObserved(Box::new(LogRecord { body: body.to_string(), severity: "WARN".to_string(), attributes: vec![ @@ -691,8 +696,8 @@ mod tests { fn metric_events_are_filtered_out() { let ctx = test_context(); let batch = vec![ - (ctx.clone(), CollectorEvent::MetricCollectionStart), - (ctx, CollectorEvent::MetricCollectionEnd), + (ctx.clone(), HealthEvent::ScrapeBatchStarted), + (ctx, HealthEvent::ScrapeBatchFinished), ]; let request = build_export_request(&batch); assert!(request.resource_logs.is_empty()); @@ -701,7 +706,7 @@ mod tests { #[test] fn health_report_converts_with_alert_severity() { let ctx = test_context(); - let report = CollectorEvent::HealthReport( + let report = HealthEvent::HealthReportProduced( HealthReport { source: ReportSource::BmcSensors, target: None, @@ -743,7 +748,7 @@ mod tests { let log = |ctx| { ( ctx, - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: "x".to_string(), severity: "INFO".to_string(), attributes: vec![], diff --git a/crates/health/src/otlp/drain.rs b/crates/health/src/otlp/drain.rs index e1e9408a3a..d128113a80 100644 --- a/crates/health/src/otlp/drain.rs +++ 
b/crates/health/src/otlp/drain.rs @@ -24,7 +24,7 @@ use super::collector_logs::logs_service_client::LogsServiceClient; use super::convert::build_export_request; use crate::collectors::{BackoffConfig, ExponentialBackoff}; use crate::sink::otlp::OtlpQueue; -use crate::sink::{CollectorEvent, EventContext}; +use crate::sink::{EventContext, HealthEvent}; pub(crate) struct OtlpDrainTask { queue: Arc, @@ -48,7 +48,9 @@ impl OtlpDrainTask { } } - fn drain_batch(&self, batch: &mut Vec<(EventContext, CollectorEvent)>) { + /// Pops queued events into `batch` until it reaches `batch_size` or the + /// queue is empty. + fn drain_batch(&self, batch: &mut Vec<(EventContext, HealthEvent)>) { let remaining = self.batch_size.saturating_sub(batch.len()); for _ in 0..remaining { match self.queue.pop() { @@ -127,7 +129,7 @@ impl OtlpDrainTask { async fn flush( &self, client: &mut LogsServiceClient, - batch: &mut Vec<(EventContext, CollectorEvent)>, + batch: &mut Vec<(EventContext, HealthEvent)>, ) { if batch.is_empty() { return; diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs index f8e944f4b7..b86683b65d 100644 --- a/crates/health/src/otlp/metrics_drain.rs +++ b/crates/health/src/otlp/metrics_drain.rs @@ -51,6 +51,8 @@ impl OtlpMetricsDrainTask { } } + /// Pops queued metric samples into `batch` until it reaches `batch_size` or + /// the queue is empty. 
fn drain_batch(&self, batch: &mut Vec<(EventContext, MetricSample)>) { let remaining = self.batch_size.saturating_sub(batch.len()); for _ in 0..remaining { diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index ef0805c430..1215ac1524 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use dashmap::DashMap; use nv_redfish::resource::Health as BmcHealth; -use super::{CollectorEvent, EventContext, EventProcessor}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, HealthReportSuccess, MetricSample, Probe, ReportSource, SensorThresholdContext, @@ -58,15 +58,21 @@ struct HealthReportWindow { alerts: Vec, } +/// Processor node that classifies sensor measurements against their thresholds +/// and, at the end of each scrape batch, emits a single health report +/// summarizing the window (suppressing empty windows unless `emit_empty_reports` is set). #[derive(Default)] pub struct HealthReportProcessor { windows: DashMap, + emit_empty_reports: bool, } impl HealthReportProcessor { - pub fn new() -> Self { + /// Creates a processor with no in-flight scrape windows. 
+ pub fn new(emit_empty_reports: bool) -> Self { Self { windows: DashMap::new(), + emit_empty_reports, } } @@ -198,18 +204,18 @@ impl HealthReportProcessor { } } -impl EventProcessor for HealthReportProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for HealthReportProcessor { + fn node_type(&self) -> &'static str { "health_report_processor" } - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => { self.windows .insert(Self::stream_key(context), HealthReportWindow::default()); } - CollectorEvent::Metric(metric) => { + HealthEvent::MeasurementObserved(metric) => { let Some(health) = metric.context.as_ref() else { return Vec::new(); }; @@ -219,10 +225,21 @@ impl EventProcessor for HealthReportProcessor { SensorHealthResult::Alert(alert) => window.alerts.push(alert), } } - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { let Some((_, window)) = self.windows.remove(&Self::stream_key(context)) else { return Vec::new(); }; + if !self.emit_empty_reports + && window.successes.is_empty() + && window.alerts.is_empty() + { + tracing::debug!( + endpoint = %context.addr.mac, + collector_type = context.collector_type, + "Skipping empty hardware health report" + ); + return Vec::new(); + } let report = HealthReport { source: ReportSource::BmcSensors, target: context.health_report_target(), @@ -233,19 +250,23 @@ impl EventProcessor for HealthReportProcessor { tracing::info!( endpoint = %context.addr.mac, + target = ?report.target, success_count = report.successes.len(), alert_count = report.alerts.len(), "Sending hardware health report" ); - return vec![CollectorEvent::HealthReport(Arc::new(report))]; + return vec![HealthEvent::HealthReportProduced(Arc::new(report))]; } - CollectorEvent::CollectorRemoved => { + 
HealthEvent::NodeRemoved => { self.windows.remove(&Self::stream_key(context)); } - CollectorEvent::Log(_) - | CollectorEvent::Firmware(_) - | CollectorEvent::HealthReport(_) => {} + HealthEvent::LogObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } + | HealthEvent::FirmwareObserved(_) + | HealthEvent::HealthReportProduced(_) => {} } Vec::new() @@ -289,13 +310,13 @@ mod tests { #[test] fn metric_window_emits_abstract_health_report() { - let processor = HealthReportProcessor::new(); + let processor = HealthReportProcessor::new(false); let context = test_context(); - let _ = processor.process_event(&context, &CollectorEvent::MetricCollectionStart); - let _ = processor.process_event( + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let _ = processor.handle_event( &context, - &CollectorEvent::Metric( + &HealthEvent::MeasurementObserved( MetricSample { key: "sensor-1".to_string(), name: "hw_sensor".to_string(), @@ -320,9 +341,9 @@ mod tests { .into(), ), ); - let emitted = processor.process_event(&context, &CollectorEvent::MetricCollectionEnd); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); - let Some(CollectorEvent::HealthReport(report)) = emitted.last() else { + let Some(HealthEvent::HealthReportProduced(report)) = emitted.last() else { panic!("expected health report event"); }; @@ -334,15 +355,42 @@ mod tests { #[test] fn collector_removed_clears_metric_window() { - let processor = HealthReportProcessor::new(); + let processor = HealthReportProcessor::new(false); let context = test_context(); - let _ = processor.process_event(&context, &CollectorEvent::MetricCollectionStart); + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); assert_eq!(processor.windows.len(), 1); - let emitted = processor.process_event(&context, &CollectorEvent::CollectorRemoved); + let emitted = 
processor.handle_event(&context, &HealthEvent::NodeRemoved); + + assert!(emitted.is_empty()); + assert!(processor.windows.is_empty()); + } + + #[test] + fn empty_metric_window_does_not_emit_health_report() { + let processor = HealthReportProcessor::new(false); + let context = test_context(); + + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); assert!(emitted.is_empty()); assert!(processor.windows.is_empty()); } + + #[test] + fn empty_metric_window_can_emit_health_report_when_configured() { + let processor = HealthReportProcessor::new(true); + let context = test_context(); + + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); + + let Some(HealthEvent::HealthReportProduced(report)) = emitted.last() else { + panic!("expected health report event"); + }; + assert!(report.is_empty()); + assert!(processor.windows.is_empty()); + } } diff --git a/crates/health/src/processor/intrusion_events.rs b/crates/health/src/processor/intrusion_events.rs index f1c82afd07..5a13efbcd5 100644 --- a/crates/health/src/processor/intrusion_events.rs +++ b/crates/health/src/processor/intrusion_events.rs @@ -18,7 +18,7 @@ use std::borrow::Cow; use std::sync::Arc; -use super::{CollectorEvent, EventContext, EventProcessor}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, LogRecord, Probe, ReportSource, @@ -33,10 +33,13 @@ enum IntrusionEventState { Clear, } +/// Processor node that turns BMC intrusion log records into machine-targeted +/// health reports (an alert when intrusion is asserted, a success when cleared). 
#[derive(Default)] -pub struct BmcIntrusionEventProcessor; +pub struct BmcIntrusionSyncEventNode; -impl BmcIntrusionEventProcessor { +impl BmcIntrusionSyncEventNode { + /// Creates a new intrusion-event processor. pub fn new() -> Self { Self } @@ -98,17 +101,13 @@ impl BmcIntrusionEventProcessor { } } -impl EventProcessor for BmcIntrusionEventProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for BmcIntrusionSyncEventNode { + fn node_type(&self) -> &'static str { "bmc_intrusion_event_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { - let CollectorEvent::Log(record) = event else { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::LogObserved(record) = event else { return Vec::new(); }; @@ -146,7 +145,7 @@ impl EventProcessor for BmcIntrusionEventProcessor { alerts, }; - vec![CollectorEvent::HealthReport(Arc::new(report))] + vec![HealthEvent::HealthReportProduced(Arc::new(report))] } } @@ -192,13 +191,13 @@ mod tests { } } - fn log(body: &str, severity: &str, message_args: Option<&str>) -> CollectorEvent { + fn log(body: &str, severity: &str, message_args: Option<&str>) -> HealthEvent { let mut attributes = Vec::new(); if let Some(message_args) = message_args { attributes.push((Cow::Borrowed("message_args"), message_args.to_string())); } - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: body.to_string(), severity: severity.to_string(), attributes, @@ -206,12 +205,12 @@ mod tests { })) } - fn emitted_report(event: CollectorEvent) -> Arc { - let processor = BmcIntrusionEventProcessor::new(); - let emitted = processor.process_event(&context(), &event); + fn emitted_report(event: HealthEvent) -> Arc { + let processor = BmcIntrusionSyncEventNode::new(); + let emitted = processor.handle_event(&context(), &event); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(report) = 
&emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; @@ -338,8 +337,8 @@ mod tests { #[test] fn ignores_unrelated_logs() { - let processor = BmcIntrusionEventProcessor::new(); - let emitted = processor.process_event( + let processor = BmcIntrusionSyncEventNode::new(); + let emitted = processor.handle_event( &context(), &log("CPU temperature threshold warning", "Warning", None), ); diff --git a/crates/health/src/processor/leak_events.rs b/crates/health/src/processor/leak_events.rs index a34dadc775..bd49f443d0 100644 --- a/crates/health/src/processor/leak_events.rs +++ b/crates/health/src/processor/leak_events.rs @@ -18,28 +18,34 @@ use std::collections::BTreeSet; use std::sync::Arc; -use super::{EventContext, EventProcessor}; +use super::{EventContext, SyncEventNode}; use crate::sink::{ - Classification, CollectorEvent, HealthReport, HealthReportAlert, HealthReportSuccess, + Classification, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, Probe, ReportSource, }; -pub struct LeakEventProcessor { +/// Processor node that aggregates per-detector BMC leak alerts into a single +/// tray-level leak report, declaring a leak once enough detectors fire. +pub struct LeakSyncEventNode { minimum_alerts_per_report: usize, } -impl LeakEventProcessor { +impl LeakSyncEventNode { + /// Creates a leak processor that declares a leak once at least + /// `minimum_alerts_per_report` leak-detector alerts are seen in a report. pub fn new(minimum_alerts_per_report: usize) -> Self { Self { minimum_alerts_per_report, } } + /// Returns whether `alerts` meets the configured leak threshold. fn is_leaking(&self, alerts: usize) -> bool { alerts >= self.minimum_alerts_per_report } } +/// Returns whether an alert was raised by a leak detector. 
fn is_leak_detector_alert(alert: &HealthReportAlert) -> bool { alert .classifications @@ -47,6 +53,8 @@ fn is_leak_detector_alert(alert: &HealthReportAlert) -> bool { .any(|classification| classification == &Classification::LeakDetector) } +/// Builds a comma-separated, de-duplicated list of the leaking detector targets +/// for inclusion in the report message. fn leak_details(alerts: &[&HealthReportAlert]) -> String { let targets: BTreeSet = alerts .iter() @@ -60,17 +68,13 @@ fn leak_details(alerts: &[&HealthReportAlert]) -> String { targets.iter().cloned().collect::>().join(", ") } -impl EventProcessor for LeakEventProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for LeakSyncEventNode { + fn node_type(&self) -> &'static str { "leak_event_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { - let CollectorEvent::HealthReport(report) = event else { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { return Vec::new(); }; @@ -119,7 +123,7 @@ impl EventProcessor for LeakEventProcessor { alerts, }; - vec![CollectorEvent::HealthReport(Arc::new(leak_report))] + vec![HealthEvent::HealthReportProduced(Arc::new(leak_report))] } } @@ -158,7 +162,7 @@ mod tests { #[test] fn does_not_emit_alert_when_threshold_not_met() { - let processor = LeakEventProcessor::new(2); + let processor = LeakSyncEventNode::new(2); let report = HealthReport { source: ReportSource::BmcLeakDetectors, target: Some(HealthReportTarget::Machine), @@ -167,11 +171,13 @@ mod tests { alerts: vec![leak_alert("LeakDetector_Probe")], }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(derived) = &emitted[0] 
else { + let HealthEvent::HealthReportProduced(derived) = &emitted[0] else { panic!("expected derived health report"); }; @@ -183,7 +189,7 @@ mod tests { #[test] fn emits_derived_leak_report_when_threshold_met() { - let processor = LeakEventProcessor::new(1); + let processor = LeakSyncEventNode::new(1); let report = HealthReport { source: ReportSource::BmcLeakDetectors, target: Some(HealthReportTarget::Machine), @@ -192,11 +198,13 @@ mod tests { alerts: vec![leak_alert("LeakDetector_Probe")], }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(derived) = &emitted[0] else { + let HealthEvent::HealthReportProduced(derived) = &emitted[0] else { panic!("expected derived health report"); }; assert_eq!(derived.source, ReportSource::TrayLeakDetection); @@ -213,8 +221,8 @@ mod tests { #[test] fn ignores_non_health_report_events() { - let processor = LeakEventProcessor::new(1); - let metric_event = CollectorEvent::Metric( + let processor = LeakSyncEventNode::new(1); + let metric_event = HealthEvent::MeasurementObserved( crate::sink::MetricSample { key: "k".to_string(), name: "n".to_string(), @@ -226,13 +234,13 @@ mod tests { } .into(), ); - let emitted = processor.process_event(&context(), &metric_event); + let emitted = processor.handle_event(&context(), &metric_event); assert!(emitted.is_empty()); } #[test] fn ignores_sensor_health_reports() { - let processor = LeakEventProcessor::new(1); + let processor = LeakSyncEventNode::new(1); let report = HealthReport { source: ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), @@ -244,8 +252,10 @@ mod tests { target: Some(HealthReportTarget::Machine), }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = 
processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert!(emitted.is_empty()); } diff --git a/crates/health/src/processor/mod.rs b/crates/health/src/processor/mod.rs index 1e9bbc95eb..1b796c8aa0 100644 --- a/crates/health/src/processor/mod.rs +++ b/crates/health/src/processor/mod.rs @@ -25,63 +25,65 @@ mod intrusion_events; mod leak_events; mod rack_leak; pub use health_report::HealthReportProcessor; -pub use intrusion_events::BmcIntrusionEventProcessor; -pub use leak_events::LeakEventProcessor; +pub use intrusion_events::BmcIntrusionSyncEventNode; +pub use leak_events::LeakSyncEventNode; pub use rack_leak::RackLeakProcessor; use crate::metrics::{ComponentMetrics, MetricsManager}; -use crate::sink::{CollectorEvent, DataSink, EventContext}; - -pub trait EventProcessor: Send + Sync { - fn processor_type(&self) -> &'static str; - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec; -} +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; +/// A queued event plus the set of nodes that may not re-consume it, so a node +/// never re-processes events derived from its own output. struct PendingEvent<'a> { - event: Cow<'a, CollectorEvent>, + event: Cow<'a, HealthEvent>, blocked_processors: Vec, } -pub struct EventProcessingPipeline { - processors: Vec>, - sink: Arc, +/// Runs a pipeline of [`SyncEventNode`]s: each input event is offered to every +/// interested node, and any events a node emits are fed back through the graph +/// (excluding the emitting node) until the work queue drains. +pub struct EventGraph { + nodes: Vec>, component_metrics: Arc, } -impl EventProcessingPipeline { - pub fn new( - processors: Vec>, - sink: Arc, - metrics_manager: Arc, - ) -> Self { +impl EventGraph { + /// Builds a graph over `nodes`. Callers must only construct this when at + /// least one node is configured. 
+ pub fn new(nodes: Vec>, metrics_manager: Arc) -> Self { debug_assert!( - !processors.is_empty(), - "EventProcessingPipeline should only be used when processors are configured" + !nodes.is_empty(), + "EventGraph should only be used when nodes are configured" ); Self { - processors, - sink, + nodes, component_metrics: metrics_manager.component_metrics(), } } + /// Offers `current_event` to every interested, non-blocked node and queues + /// the events they emit for further processing. fn next_events( &self, context: &EventContext, - current_event: &CollectorEvent, - blocked_processors: &[bool], + current_event: &HealthEvent, + blocked_nodes: &[bool], queue: &mut VecDeque, ) { - for (processor_idx, processor) in self.processors.iter().enumerate() { - if blocked_processors[processor_idx] { + for (node_idx, node) in self.nodes.iter().enumerate() { + if blocked_nodes[node_idx] { + continue; + } + + if !node.interested_in(current_event) { continue; } let start = Instant::now(); - let emitted = processor.process_event(context, current_event); + let emitted = node.handle_event(context, current_event); self.component_metrics.record_operation( crate::metrics::ComponentKind::Processor, - processor.processor_type(), + node.node_type(), start.elapsed(), true, ); @@ -90,8 +92,8 @@ impl EventProcessingPipeline { } for event in emitted { - let mut next_blocked_processors = blocked_processors.to_vec(); - next_blocked_processors[processor_idx] = true; + let mut next_blocked_processors = blocked_nodes.to_vec(); + next_blocked_processors[node_idx] = true; queue.push_back(PendingEvent { event: Cow::Owned(event), blocked_processors: next_blocked_processors, @@ -101,19 +103,18 @@ impl EventProcessingPipeline { } } -impl DataSink for EventProcessingPipeline { - fn sink_type(&self) -> &'static str { - "event_processing_pipeline" +impl SyncEventNode for EventGraph { + fn node_type(&self) -> &'static str { + "event_graph" } - fn handle_event(&self, context: &EventContext, event: 
&CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { let mut queue = VecDeque::from(vec![PendingEvent { event: Cow::Borrowed(event), - blocked_processors: vec![false; self.processors.len()], + blocked_processors: vec![false; self.nodes.len()], }]); while let Some(current) = queue.pop_front() { - self.sink.handle_event(context, &current.event); self.next_events( context, &current.event, @@ -121,6 +122,7 @@ impl DataSink for EventProcessingPipeline { &mut queue, ); } + Vec::new() } } @@ -141,13 +143,14 @@ mod tests { counter: Arc, } - impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); + Vec::new() } } @@ -155,16 +158,12 @@ mod tests { counter: Arc, } - impl EventProcessor for SelfReemittingProcessor { - fn processor_type(&self) -> &'static str { + impl SyncEventNode for SelfReemittingProcessor { + fn node_type(&self) -> &'static str { "self_reemitting_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); vec![event.clone()] } @@ -190,17 +189,19 @@ mod tests { let sink_counter = Arc::new(AtomicUsize::new(0)); let metrics_manager = Arc::new(MetricsManager::new("test").expect("should create metrics manager")); - let pipeline = EventProcessingPipeline::new( - vec![Arc::new(SelfReemittingProcessor { - counter: processor_counter.clone(), - })], - Arc::new(CountingSink { - counter: sink_counter.clone(), - }), + let pipeline = EventGraph::new( + vec![ + Arc::new(CountingSink { + counter: sink_counter.clone(), + }), + Arc::new(SelfReemittingProcessor 
{ + counter: processor_counter.clone(), + }), + ], metrics_manager, ); - let event = CollectorEvent::Metric( + let event = HealthEvent::MeasurementObserved( crate::sink::MetricSample { key: "k".to_string(), name: "n".to_string(), diff --git a/crates/health/src/processor/rack_leak.rs b/crates/health/src/processor/rack_leak.rs index 69a2d69cc0..023b5fcc86 100644 --- a/crates/health/src/processor/rack_leak.rs +++ b/crates/health/src/processor/rack_leak.rs @@ -21,22 +21,28 @@ use std::sync::Arc; use carbide_uuid::rack::RackId; use dashmap::DashMap; -use super::{EventContext, EventProcessor}; +use super::{EventContext, SyncEventNode}; use crate::sink::{ - Classification, CollectorEvent, HealthReport, HealthReportAlert, HealthReportSuccess, + Classification, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, Probe, ReportSource, }; +/// Per-rack tally of which trays are currently reporting a leak. struct RackLeakState { leaking_trays: HashSet, } +/// Processor node that rolls up per-tray leak reports into a rack-level leak +/// report, alerting once the number of simultaneously-leaking trays in a rack +/// crosses a threshold. pub struct RackLeakProcessor { racks: DashMap, leaking_tray_threshold: usize, } impl RackLeakProcessor { + /// Creates a rack-leak processor that alerts once `leaking_tray_threshold` + /// trays in a rack are leaking at the same time. pub fn new(leaking_tray_threshold: usize) -> Self { Self { racks: DashMap::new(), @@ -44,6 +50,8 @@ impl RackLeakProcessor { } } + /// Builds the rack-level report (alert or success) for `leaking_count` + /// currently-leaking trays. 
fn build_report(&self, leaking_count: usize) -> HealthReport { if leaking_count >= self.leaking_tray_threshold { HealthReport { @@ -76,24 +84,24 @@ impl RackLeakProcessor { } } -impl EventProcessor for RackLeakProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for RackLeakProcessor { + fn node_type(&self) -> &'static str { "rack_leak_processor" } - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { let Some(rack_id) = context.rack_id() else { return Vec::new(); }; - if matches!(event, CollectorEvent::CollectorRemoved) { + if matches!(event, HealthEvent::NodeRemoved) { if let Some(mut entry) = self.racks.get_mut(rack_id) { entry.leaking_trays.remove(context.endpoint_key()); } return Vec::new(); } - let CollectorEvent::HealthReport(report) = event else { + let HealthEvent::HealthReportProduced(report) = event else { return Vec::new(); }; @@ -124,7 +132,7 @@ impl EventProcessor for RackLeakProcessor { let leaking_count = entry.leaking_trays.len(); let report = self.build_report(leaking_count); - vec![CollectorEvent::HealthReport(Arc::new(report))] + vec![HealthEvent::HealthReportProduced(Arc::new(report))] } } @@ -166,7 +174,7 @@ mod tests { } } - fn tray_leak_report(leaking: bool) -> CollectorEvent { + fn tray_leak_report(leaking: bool) -> HealthEvent { let report = if leaking { HealthReport { source: ReportSource::TrayLeakDetection, @@ -192,7 +200,7 @@ mod tests { alerts: vec![], } }; - CollectorEvent::HealthReport(Arc::new(report)) + HealthEvent::HealthReportProduced(Arc::new(report)) } #[test] @@ -207,7 +215,7 @@ mod tests { alerts: vec![], }; let emitted = - processor.process_event(&ctx, &CollectorEvent::HealthReport(Arc::new(report))); + processor.handle_event(&ctx, &HealthEvent::HealthReportProduced(Arc::new(report))); assert!(emitted.is_empty()); } @@ -215,7 +223,7 @@ mod tests { fn ignores_events_without_rack_id() { let 
processor = RackLeakProcessor::new(2); let ctx = context_without_rack("42:9e:b1:bd:9d:dd"); - let emitted = processor.process_event(&ctx, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx, &tray_leak_report(true)); assert!(emitted.is_empty()); } @@ -224,10 +232,10 @@ mod tests { let processor = RackLeakProcessor::new(2); let ctx = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); - let emitted = processor.process_event(&ctx, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx, &tray_leak_report(true)); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.source, ReportSource::RackLeakDetection); @@ -243,10 +251,10 @@ mod tests { let ctx_a = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx_b, &tray_leak_report(true)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.source, ReportSource::RackLeakDetection); @@ -263,13 +271,13 @@ mod tests { let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); processor - .process_event(&ctx_a, &tray_leak_report(true)) + .handle_event(&ctx_a, &tray_leak_report(true)) .len(); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_a, &tray_leak_report(false)); + let emitted = processor.handle_event(&ctx_a, &tray_leak_report(false)); - let CollectorEvent::HealthReport(report) = &emitted[0] 
else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert!(report.alerts.is_empty()); @@ -284,12 +292,12 @@ mod tests { let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); let ctx_c = context_with_rack("42:9e:b1:bd:9d:ff", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_c, &tray_leak_report(false)); + let emitted = processor.handle_event(&ctx_c, &tray_leak_report(false)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.alerts.len(), 1, "rack should still be in alert"); @@ -302,10 +310,10 @@ mod tests { let ctx_a = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_a, &CollectorEvent::CollectorRemoved); + let emitted = processor.handle_event(&ctx_a, &HealthEvent::NodeRemoved); assert!(emitted.is_empty()); let Some(rack) = processor.racks.get(ctx_a.rack_id().expect("rack id")) else { @@ -322,10 +330,10 @@ mod tests { let ctx_r1 = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_r2 = context_with_rack("42:9e:b1:bd:9d:ee", "rack-2"); - processor.process_event(&ctx_r1, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_r2, &tray_leak_report(true)); + processor.handle_event(&ctx_r1, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx_r2, 
&tray_leak_report(true)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; diff --git a/crates/health/src/sink/composite.rs b/crates/health/src/sink/composite.rs index 7948d94da3..20605e6e8e 100644 --- a/crates/health/src/sink/composite.rs +++ b/crates/health/src/sink/composite.rs @@ -18,42 +18,52 @@ use std::sync::Arc; use std::time::Instant; -use super::{CollectorEvent, DataSink, EventContext}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::metrics::{ComponentKind, ComponentMetrics, MetricsManager}; -pub struct CompositeDataSink { - sinks: Vec>, +/// A [`SyncEventNode`] that fans every event out to a set of inner sinks, +/// recording per-sink timing metrics. Terminal node: it never emits derived +/// events. +pub struct CompositeSyncEventNode { + sinks: Vec>, component_metrics: Arc, } -impl CompositeDataSink { - pub fn new(sinks: Vec>, metrics_manager: Arc) -> Self { +impl CompositeSyncEventNode { + /// Creates a composite over `sinks`, sourcing timing metrics from + /// `metrics_manager`. + pub fn new(sinks: Vec>, metrics_manager: Arc) -> Self { Self { sinks, component_metrics: metrics_manager.component_metrics(), } } - fn record_sink_operation(&self, sink: &dyn DataSink, duration: std::time::Duration) { + /// Records the time a single inner sink spent handling one event. 
+ fn record_sink_operation(&self, sink: &dyn SyncEventNode, duration: std::time::Duration) { self.component_metrics.record_operation( ComponentKind::Sink, - sink.sink_type(), + sink.node_type(), duration, true, ); } } -impl DataSink for CompositeDataSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CompositeSyncEventNode { + fn node_type(&self) -> &'static str { "composite_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { for sink in &self.sinks { + if !sink.interested_in(event) { + continue; + } let start = Instant::now(); sink.handle_event(context, event); self.record_sink_operation(sink.as_ref(), start.elapsed()); } + Vec::new() } } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 48e7844163..e666b6e524 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -30,6 +30,8 @@ use health_report::{ use nv_redfish::resource::Health as BmcHealth; use serde::Serialize; +use crate::bmc::BmcClient; +use crate::collectors::inventory::EntityInventory; use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, MachineData, SwitchEndpointRole}; use crate::metrics::MetricLabel; @@ -322,6 +324,7 @@ struct DiagnosticLogBodyAttribute<'a> { #[derive(Clone, Debug)] pub struct FirmwareInfo { + pub id: String, pub component: String, pub version: String, pub attributes: Vec, @@ -356,15 +359,59 @@ impl HealthReport { } } -#[derive(Clone, Debug)] -pub enum CollectorEvent { - MetricCollectionStart, - Metric(Box), - MetricCollectionEnd, - CollectorRemoved, - Log(Box), - Firmware(FirmwareInfo), - HealthReport(Arc), +/// Canonical event flowing through the health event graph. 
+/// +/// Every collector, processor, and sink communicates exclusively in terms of +/// these events; the variants are domain facts (what was observed) rather than +/// source-specific shapes, so new data sources can reuse them unchanged. +#[derive(Clone)] +pub enum HealthEvent { + /// Request to scrape a specific endpoint for the given kind of data. + ScrapeRequested { + endpoint_key: String, + kind: ScrapeKind, + }, + /// A fresh inventory snapshot was discovered for an endpoint; consumers + /// cache their own copy of the immutable snapshot. + InventoryDiscovered { + endpoint_key: String, + inventory: Arc>, + }, + /// A consumer's cached inventory advanced to `generation`. + InventoryUpdated { + endpoint_key: String, + generation: u64, + }, + /// Marks the start of a scrape batch (used by sinks to window samples). + ScrapeBatchStarted, + /// A single metric measurement was observed. + MeasurementObserved(Box), + /// Marks the end of a scrape batch. + ScrapeBatchFinished, + /// The owning node/endpoint was removed; sinks should drop its state. + NodeRemoved, + /// A log record was observed. + LogObserved(Box), + /// Firmware version information was observed. + FirmwareObserved(FirmwareInfo), + /// A health report was produced by a processor for downstream sinks. + HealthReportProduced(Arc), +} + +/// The category of data a [`HealthEvent::ScrapeRequested`] asks a collector to +/// gather, allowing one collector type to serve multiple data domains. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub enum ScrapeKind { + Inventory, + Sensors, + Metrics, + Logs, + Firmware, + LeakDetectors, + Nmxt, + NvueRest, + NvueGnmi, + Telemetry, } #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] diff --git a/crates/health/src/sink/health_report.rs b/crates/health/src/sink/health_report.rs index 3218b79cbe..402253fbf0 100644 --- a/crates/health/src/sink/health_report.rs +++ b/crates/health/src/sink/health_report.rs @@ -25,7 +25,7 @@ use carbide_uuid::machine::MachineId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -159,17 +159,17 @@ impl HealthReportSink { } } -impl DataSink for HealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for HealthReportSink { + fn node_type(&self) -> &'static str { "health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Machine) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -177,7 +177,7 @@ impl DataSink for HealthReportSink { source = ?report.source, "Skipping empty machine health report" ); - return; + return Vec::new(); } if let Some(machine_id) = context.machine_id() { @@ -201,7 +201,7 @@ impl DataSink for HealthReportSink { machine_id = %key.id, "Suppressing unchanged success-only health report" ); - return; + return Vec::new(); } cache.entries.insert( key.clone(), @@ -224,6 +224,7 @@ impl DataSink for HealthReportSink { "Received machine-target 
HealthReport event without machine_id context" ); } + Vec::new() } } @@ -388,7 +389,7 @@ mod tests { let ctx = machine_context(mid); let sink = make_sink(Some(Duration::from_secs(300))); let report = success_report(ReportSource::BmcSensors); - let event = CollectorEvent::HealthReport(Arc::clone(&report)); + let event = HealthEvent::HealthReportProduced(Arc::clone(&report)); sink.handle_event(&ctx, &event); assert!(sink.queue.pop().is_some(), "first send should go through"); @@ -411,13 +412,19 @@ mod tests { // Send a success first to populate last_sent, then send an alert. // The alert must not be suppressed, and the subsequent success must // also go through (alert clears the suppression entry). - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&success))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&success)), + ); sink.queue.pop(); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&alert))); + sink.handle_event(&ctx, &HealthEvent::HealthReportProduced(Arc::clone(&alert))); assert!(sink.queue.pop().is_some(), "alert should not be suppressed"); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&success))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&success)), + ); assert!( sink.queue.pop().is_some(), "first success after alert should not be suppressed" @@ -431,7 +438,10 @@ mod tests { let sink = make_sink(Some(Duration::from_secs(300))); let report_a = success_report(ReportSource::BmcSensors); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&report_a))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&report_a)), + ); sink.queue.pop(); let report_b = Arc::new(HealthReport { @@ -444,7 +454,10 @@ mod tests { }], alerts: Vec::new(), }); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&report_b))); + sink.handle_event( + &ctx, + 
&HealthEvent::HealthReportProduced(Arc::clone(&report_b)), + ); assert!( sink.queue.pop().is_some(), @@ -458,7 +471,7 @@ mod tests { let ctx = machine_context(mid); let sink = make_sink(None); let report = success_report(ReportSource::BmcSensors); - let event = CollectorEvent::HealthReport(Arc::clone(&report)); + let event = HealthEvent::HealthReportProduced(Arc::clone(&report)); sink.handle_event(&ctx, &event); sink.queue.pop(); diff --git a/crates/health/src/sink/log_file.rs b/crates/health/src/sink/log_file.rs index f0d597c1bf..3ae88fbe62 100644 --- a/crates/health/src/sink/log_file.rs +++ b/crates/health/src/sink/log_file.rs @@ -22,11 +22,11 @@ use std::sync::Mutex; use serde::Serialize; -use super::{CollectorEvent, DataSink, EventContext, LogRecord}; +use super::{EventContext, HealthEvent, LogRecord, SyncEventNode}; use crate::config::LogFileSinkConfig; -/// Durable JSONL log sink. Writes CollectorEvent::Log records to rotating -/// files using sync I/O, safe to call from DataSink::handle_event. +/// Durable JSONL log sink. Writes HealthEvent::LogObserved records to rotating +/// files using sync I/O, safe to call from SyncEventNode::handle_event. pub struct LogFileSink { writer: Mutex, include_diagnostics: bool, @@ -46,14 +46,14 @@ impl LogFileSink { } } -impl DataSink for LogFileSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for LogFileSink { + fn node_type(&self) -> &'static str { "log_file_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::Log(record) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::LogObserved(record) = event else { + return Vec::new(); }; // Diagnostics are opt-in for log files. 
When enabled, fold the @@ -66,18 +66,19 @@ impl DataSink for LogFileSink { Ok(json) => json, Err(e) => { tracing::error!(error = ?e, "failed to serialize log record"); - return; + return Vec::new(); } }; let Ok(mut writer) = self.writer.lock() else { tracing::error!("log file writer lock poisoned"); - return; + return Vec::new(); }; if let Err(e) = writer.write_line(&line) { tracing::error!(error = ?e, "failed to write log record to file"); } + Vec::new() } } @@ -292,7 +293,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let metric_event = CollectorEvent::MetricCollectionStart; + let metric_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&ctx, &metric_event); let log_path = dir.path().join("health_logs.jsonl"); @@ -312,7 +313,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "something happened".to_string(), severity: "INFO".to_string(), @@ -347,7 +348,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "parent log".to_string(), severity: "INFO".to_string(), @@ -397,7 +398,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "parent log".to_string(), severity: "INFO".to_string(), @@ -433,7 +434,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = machine_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "xid event".to_string(), severity: "WARN".to_string(), @@ -476,7 +477,7 @@ mod tests { let ctx = test_context(); for i in 0..5 { - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: 
format!("log entry {i}"), severity: "INFO".to_string(), @@ -508,7 +509,7 @@ mod tests { let ctx = test_context(); for i in 0..5 { - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: format!("entry {i}"), severity: "WARN".to_string(), diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index d79734a75b..275dd5a37f 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -31,11 +31,11 @@ mod rack_health_report; mod switch_health_report; mod tracing; -pub use composite::CompositeDataSink; +pub use composite::CompositeSyncEventNode; pub use events::{ - Classification, CollectorEvent, DiagnosticLogRecord, EventContext, FirmwareInfo, HealthReport, + Classification, DiagnosticLogRecord, EventContext, FirmwareInfo, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, LogRecord, MetricSample, Probe, - ReportSource, SensorThresholdContext, + ReportSource, ScrapeKind, SensorThresholdContext, }; pub use health_report::HealthReportSink; pub use log_file::LogFileSink; @@ -50,9 +50,27 @@ pub(crate) use self::otlp::OtlpSink; #[cfg(feature = "bench-hooks")] pub use self::otlp::OtlpSink; -pub trait DataSink: Send + Sync { - fn sink_type(&self) -> &'static str; - fn handle_event(&self, context: &EventContext, event: &CollectorEvent); +/// A node in the synchronous health event graph. +/// +/// Every processing unit (sinks, transforms, collector mailboxes) implements +/// this single trait. A node receives a [`HealthEvent`], may act on it, and +/// returns any derived events to be fed back into the graph. This unifies what +/// used to be separate "collector", "sink", and "processor" abstractions. +pub trait SyncEventNode: Send + Sync { + /// Stable identifier for this node, used in logs and metrics labels. + fn node_type(&self) -> &'static str; + + /// Returns whether this node wants to receive `event`. 
+ /// + /// Dispatchers consult this before calling [`Self::handle_event`] so nodes + /// can cheaply opt out of events they never act on. Defaults to `true`. + fn interested_in(&self, _event: &HealthEvent) -> bool { + true + } + + /// Processes `event` and returns any derived events to re-feed into the + /// graph (empty when the node is a terminal sink). + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec; } #[cfg(test)] @@ -65,8 +83,8 @@ mod tests { use mac_address::MacAddress; use super::{ - CollectorEvent, CompositeDataSink, DataSink, DiagnosticLogRecord, EventContext, LogRecord, - MetricSample, PrometheusSink, + CompositeSyncEventNode, DiagnosticLogRecord, EventContext, HealthEvent, LogRecord, + MetricSample, PrometheusSink, SyncEventNode, }; use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::metrics::MetricsManager; @@ -75,24 +93,27 @@ mod tests { counter: Arc, } - impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); + Vec::new() } } struct NoopSink; - impl DataSink for NoopSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for NoopSink { + fn node_type(&self) -> &'static str { "noop_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { + Vec::new() + } } #[tokio::test] @@ -110,7 +131,7 @@ mod tests { }); let composite = - CompositeDataSink::new(vec![sink_ok_1, sink_noop, sink_ok_2], metrics_manager); + CompositeSyncEventNode::new(vec![sink_ok_1, sink_noop, sink_ok_2], metrics_manager); let context = EventContext { endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), @@ 
-124,7 +145,7 @@ mod tests { rack_id: None, }; - let event = CollectorEvent::Metric( + let event = HealthEvent::MeasurementObserved( MetricSample { key: "key".to_string(), name: "metric".to_string(), @@ -169,7 +190,7 @@ mod tests { rack_id: None, }; - let log_event = CollectorEvent::Log( + let log_event = HealthEvent::LogObserved( LogRecord { body: "ignored by prometheus sink".to_string(), severity: "INFO".to_string(), @@ -188,7 +209,7 @@ mod tests { .expect("telemetry export should work"); assert!(!export_after_log.contains("test_sink_hw_sensor")); - let metric_event = CollectorEvent::Metric( + let metric_event = HealthEvent::MeasurementObserved( MetricSample { key: "metric_key".to_string(), name: "hw_sensor".to_string(), @@ -242,7 +263,7 @@ mod tests { rack_id: None, }; - let metric_event = CollectorEvent::Metric( + let metric_event = HealthEvent::MeasurementObserved( MetricSample { key: "metric_key".to_string(), name: "hw_sensor".to_string(), @@ -261,7 +282,7 @@ mod tests { .expect("telemetry export should work"); assert!(export_before_remove.contains("test_sink_hw_sensor_temperature_celsius")); - sink.handle_event(&context, &CollectorEvent::CollectorRemoved); + sink.handle_event(&context, &HealthEvent::NodeRemoved); let export_after_remove = metrics_manager .export_telemetry() @@ -298,9 +319,9 @@ mod tests { rack_id: None, }; - let start_event = CollectorEvent::MetricCollectionStart; + let start_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&context, &start_event); - let s1_event = CollectorEvent::Metric( + let s1_event = HealthEvent::MeasurementObserved( MetricSample { key: "s1".to_string(), name: "hw_sensor".to_string(), @@ -313,7 +334,7 @@ mod tests { .into(), ); sink.handle_event(&context, &s1_event); - let end_event = CollectorEvent::MetricCollectionEnd; + let end_event = HealthEvent::ScrapeBatchFinished; sink.handle_event(&context, &end_event); let first_export = metrics_manager @@ -321,9 +342,9 @@ mod tests { .expect("telemetry export should 
work"); assert!(first_export.contains("sensor=\"temp1\"")); - let start_event = CollectorEvent::MetricCollectionStart; + let start_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&context, &start_event); - let s2_event = CollectorEvent::Metric( + let s2_event = HealthEvent::MeasurementObserved( MetricSample { key: "s2".to_string(), name: "hw_sensor".to_string(), @@ -336,7 +357,7 @@ mod tests { .into(), ); sink.handle_event(&context, &s2_event); - let end_event = CollectorEvent::MetricCollectionEnd; + let end_event = HealthEvent::ScrapeBatchFinished; sink.handle_event(&context, &end_event); let second_export = metrics_manager diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index 74cca0f713..99e9d3ea01 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -21,14 +21,16 @@ use prometheus::Counter; use super::dedup_queue::DedupQueue; use super::event_mapper::RedfishEventMapper; -use super::{CollectorEvent, DataSink, EventContext, LogRecord, MetricSample}; +use super::{EventContext, HealthEvent, LogRecord, MetricSample, SyncEventNode}; use crate::HealthError; use crate::config::OtlpSinkConfig; use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; use crate::otlp::metrics_drain::OtlpMetricsDrainTask; -pub(crate) type OtlpQueue = DedupQueue; +/// Dedup queue of log-shaped events awaiting OTLP export, keyed by endpoint. +pub(crate) type OtlpQueue = DedupQueue; +/// Dedup queue of metric samples awaiting OTLP export, keyed by sample identity. pub(crate) type OtlpMetricsQueue = DedupQueue; #[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -62,13 +64,13 @@ pub struct OtlpSink { } /// Returns whether an event belongs in the logs drain. 
-pub(crate) fn is_otlp_log_relevant(event: &CollectorEvent) -> bool { +pub(crate) fn is_otlp_log_relevant(event: &HealthEvent) -> bool { !matches!( event, - CollectorEvent::Metric(_) - | CollectorEvent::MetricCollectionStart - | CollectorEvent::MetricCollectionEnd - | CollectorEvent::CollectorRemoved + HealthEvent::MeasurementObserved(_) + | HealthEvent::ScrapeBatchStarted + | HealthEvent::ScrapeBatchFinished + | HealthEvent::NodeRemoved ) } @@ -153,7 +155,7 @@ impl OtlpSink { let record = record .emitted_log_record(self.include_diagnostics) .into_owned(); - let event = CollectorEvent::Log(Box::new(record)); + let event = HealthEvent::LogObserved(Box::new(record)); if self.queue.save_latest(key, (context.clone(), event)) { self.replaced_total.inc(); @@ -185,22 +187,24 @@ impl OtlpSink { #[cfg(feature = "bench-hooks")] impl OtlpSink { - pub fn pop_for_bench(&self) -> Option<(EventContext, CollectorEvent)> { + /// Pops one queued log event from the sink's internal queue (benchmarks only). + pub fn pop_for_bench(&self) -> Option<(EventContext, HealthEvent)> { self.queue.pop().map(|(_key, value)| value) } + /// Pops one queued metric sample from the sink's internal queue (benchmarks only). 
pub fn pop_metric_for_bench(&self) -> Option<(EventContext, MetricSample)> { self.metrics_queue.pop().map(|(_key, value)| value) } } -impl DataSink for OtlpSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for OtlpSink { + fn node_type(&self) -> &'static str { "otlp_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { let key = metric_queue_key(context, sample); if self @@ -210,19 +214,19 @@ impl DataSink for OtlpSink { self.metrics_replaced_total.inc(); } - return; + return Vec::new(); } if !is_otlp_log_relevant(event) { - return; + return Vec::new(); } let (key, event) = match event { - CollectorEvent::Log(record) => { + HealthEvent::LogObserved(record) => { self.enqueue_log_event(context, record); - return; + return Vec::new(); } - CollectorEvent::HealthReport(report) => { + HealthEvent::HealthReportProduced(report) => { let key = format!( "{}|health_report|{}", context.endpoint_key, @@ -231,16 +235,20 @@ impl DataSink for OtlpSink { (key, event.clone()) } - CollectorEvent::Firmware(info) => { - let key = format!("{}|firmware|{}", context.endpoint_key, info.component); + HealthEvent::FirmwareObserved(info) => { + let key = format!( + "{}|firmware|{}|{}", + context.endpoint_key, info.id, info.component + ); (key, event.clone()) } - _ => return, + _ => return Vec::new(), }; if self.queue.save_latest(key, (context.clone(), event)) { self.replaced_total.inc(); } + Vec::new() } } @@ -253,7 +261,7 @@ mod tests { use super::*; use crate::sink::event_mapper::OpenBmcEventMapper; - use crate::sink::{DiagnosticLogRecord, LogRecord, MetricSample}; + use crate::sink::{DiagnosticLogRecord, FirmwareInfo, LogRecord, MetricSample}; fn test_context() -> EventContext { EventContext { @@ -269,7 +277,7 @@ mod tests { } } - fn log_event(message_id: 
&str, message_args: &str) -> CollectorEvent { + fn log_event(message_id: &str, message_args: &str) -> HealthEvent { log_event_with_diagnostic_record(message_id, message_args, None) } @@ -278,8 +286,8 @@ mod tests { message_id: &str, message_args: &str, diagnostic_record: Option, - ) -> CollectorEvent { - CollectorEvent::Log(Box::new(LogRecord { + ) -> HealthEvent { + HealthEvent::LogObserved(Box::new(LogRecord { body: "test".to_string(), severity: "OK".to_string(), attributes: vec![ @@ -301,21 +309,16 @@ mod tests { } } - fn metric_event() -> CollectorEvent { + fn metric_event() -> HealthEvent { metric_event_with("k", "gauge", "celsius") } - fn metric_event_with(key: &str, metric_type: &str, unit: &str) -> CollectorEvent { + fn metric_event_with(key: &str, metric_type: &str, unit: &str) -> HealthEvent { metric_event_with_name("temp", key, metric_type, unit) } - fn metric_event_with_name( - name: &str, - key: &str, - metric_type: &str, - unit: &str, - ) -> CollectorEvent { - CollectorEvent::Metric(Box::new(MetricSample { + fn metric_event_with_name(name: &str, key: &str, metric_type: &str, unit: &str) -> HealthEvent { + HealthEvent::MeasurementObserved(Box::new(MetricSample { key: key.to_string(), name: name.to_string(), metric_type: metric_type.to_string(), @@ -326,6 +329,15 @@ mod tests { })) } + fn firmware_event(id: &str, component: &str, version: &str) -> HealthEvent { + HealthEvent::FirmwareObserved(FirmwareInfo { + id: id.to_string(), + component: component.to_string(), + version: version.to_string(), + attributes: Vec::new(), + }) + } + fn test_sink() -> OtlpSink { OtlpSink::new_for_bench(Arc::new(OpenBmcEventMapper)) } @@ -333,10 +345,8 @@ mod tests { #[test] fn is_otlp_log_relevant_excludes_metric_events() { assert!(!is_otlp_log_relevant(&metric_event())); - assert!(!is_otlp_log_relevant( - &CollectorEvent::MetricCollectionStart - )); - assert!(!is_otlp_log_relevant(&CollectorEvent::MetricCollectionEnd)); + 
assert!(!is_otlp_log_relevant(&HealthEvent::ScrapeBatchStarted)); + assert!(!is_otlp_log_relevant(&HealthEvent::ScrapeBatchFinished)); } #[test] @@ -360,8 +370,8 @@ mod tests { fn metric_collection_sentinels_are_no_op() { let sink = test_sink(); let ctx = test_context(); - sink.handle_event(&ctx, &CollectorEvent::MetricCollectionStart); - sink.handle_event(&ctx, &CollectorEvent::MetricCollectionEnd); + sink.handle_event(&ctx, &HealthEvent::ScrapeBatchStarted); + sink.handle_event(&ctx, &HealthEvent::ScrapeBatchFinished); assert!(sink.queue.pop().is_none()); assert!(sink.metrics_queue.pop().is_none()); } @@ -380,6 +390,28 @@ mod tests { assert_eq!(sink.metrics_replaced_total.get() as u64, 1); } + #[test] + fn firmware_events_dedup_by_id_and_component() { + let sink = test_sink(); + let ctx = test_context(); + + sink.handle_event(&ctx, &firmware_event("1", "BIOS", "1.0")); + sink.handle_event(&ctx, &firmware_event("2", "BIOS", "1.0")); + sink.handle_event(&ctx, &firmware_event("", "BMC", "1.0")); + sink.handle_event(&ctx, &firmware_event("", "BIOS", "1.0")); + + let mut count = 0; + while sink.queue.pop().is_some() { + count += 1; + } + + assert_eq!( + count, 4, + "id and component are both part of firmware identity" + ); + assert_eq!(sink.replaced_total.get() as u64, 0); + } + #[test] fn metric_events_with_same_sample_key_but_different_type_are_separate_entries() { let sink = test_sink(); @@ -508,7 +540,7 @@ mod tests { ); let mut bodies = Vec::new(); - while let Some((_key, (_context, CollectorEvent::Log(record)))) = sink.queue.pop() { + while let Some((_key, (_context, HealthEvent::LogObserved(record)))) = sink.queue.pop() { bodies.push(record.body); } @@ -548,7 +580,7 @@ mod tests { ); let mut records = Vec::new(); - while let Some((_key, (_context, CollectorEvent::Log(record)))) = sink.queue.pop() { + while let Some((_key, (_context, HealthEvent::LogObserved(record)))) = sink.queue.pop() { records.push(record); } diff --git 
a/crates/health/src/sink/power_shelf_health_report.rs b/crates/health/src/sink/power_shelf_health_report.rs index 4152f586a8..55aefddf86 100644 --- a/crates/health/src/sink/power_shelf_health_report.rs +++ b/crates/health/src/sink/power_shelf_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::power_shelf::PowerShelfId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl PowerShelfHealthReportSink { } } -impl DataSink for PowerShelfHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for PowerShelfHealthReportSink { + fn node_type(&self) -> &'static str { "power_shelf_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::PowerShelf) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for PowerShelfHealthReportSink { source = ?report.source, "Skipping empty power shelf health report" ); - return; + return Vec::new(); } let power_shelf_id = if let Some(power_shelf_id) = context.power_shelf_id() { @@ -126,7 +126,7 @@ impl DataSink for PowerShelfHealthReportSink { endpoint_key = context.endpoint_key(), "Received power-shelf-target HealthReport event without power_shelf_id context" ); - return; + return Vec::new(); }; let key = PowerShelfHealthReportKey { @@ -134,5 +134,6 @@ impl DataSink for PowerShelfHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); 
+ Vec::new() } } diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 4b1679d848..00fb7cd976 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use dashmap::DashMap; -use super::{CollectorEvent, DataSink, EventContext, MetricSample}; +use super::{EventContext, HealthEvent, MetricSample, SyncEventNode}; use crate::HealthError; use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricsManager}; @@ -174,51 +174,51 @@ impl PrometheusSink { } } -impl DataSink for PrometheusSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for PrometheusSink { + fn node_type(&self) -> &'static str { "prometheus_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => match self.get_or_create_stream_metrics(context) { + Ok(stream_metrics) => stream_metrics.begin_update(), + Err(error) => { + tracing::warn!( + ?error, + endpoint_key = context.endpoint_key(), + collector = context.collector_type, + "Failed to initialize Prometheus stream metrics" + ); + } + }, + HealthEvent::MeasurementObserved(sample) => { match self.get_or_create_stream_metrics(context) { - Ok(stream_metrics) => stream_metrics.begin_update(), + Ok(stream_metrics) => { + stream_metrics.record( + GaugeReading::new( + Self::metric_reading_key(sample), + sample.name.clone(), + sample.metric_type.clone(), + sample.unit.clone(), + sample.value, + ) + .with_labels(sample.labels.clone()), + ); + } Err(error) => { tracing::warn!( ?error, endpoint_key = context.endpoint_key(), collector = context.collector_type, - "Failed to initialize Prometheus stream metrics" + metric = sample.name, + metric_type = sample.metric_type, + "Failed to record Prometheus metric sample" ); } } } - 
CollectorEvent::Metric(sample) => match self.get_or_create_stream_metrics(context) { - Ok(stream_metrics) => { - stream_metrics.record( - GaugeReading::new( - Self::metric_reading_key(sample), - sample.name.clone(), - sample.metric_type.clone(), - sample.unit.clone(), - sample.value, - ) - .with_labels(sample.labels.clone()), - ); - } - Err(error) => { - tracing::warn!( - ?error, - endpoint_key = context.endpoint_key(), - collector = context.collector_type, - metric = sample.name, - metric_type = sample.metric_type, - "Failed to record Prometheus metric sample" - ); - } - }, - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { if let Some(endpoint_metrics) = self.stream_metrics.get::(context.endpoint_key()) && let Some(entry) = endpoint_metrics.get(context.collector_type) @@ -226,11 +226,15 @@ impl DataSink for PrometheusSink { entry.value().sweep_stale(); } } - CollectorEvent::CollectorRemoved => self.remove_collector_metrics(context), - CollectorEvent::Log(_) - | CollectorEvent::Firmware(_) - | CollectorEvent::HealthReport(_) => {} + HealthEvent::NodeRemoved => self.remove_collector_metrics(context), + HealthEvent::LogObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. 
} + | HealthEvent::FirmwareObserved(_) + | HealthEvent::HealthReportProduced(_) => {} } + Vec::new() } } diff --git a/crates/health/src/sink/rack_health_report.rs b/crates/health/src/sink/rack_health_report.rs index 9eee20e38d..04a7851e25 100644 --- a/crates/health/src/sink/rack_health_report.rs +++ b/crates/health/src/sink/rack_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::rack::RackId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl RackHealthReportSink { } } -impl DataSink for RackHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for RackHealthReportSink { + fn node_type(&self) -> &'static str { "rack_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Rack) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for RackHealthReportSink { source = ?report.source, "Skipping empty rack health report" ); - return; + return Vec::new(); } let Some(rack_id) = context.rack_id() else { @@ -124,7 +124,7 @@ impl DataSink for RackHealthReportSink { endpoint_key = context.endpoint_key(), "Received rack-target HealthReport event without rack_id context" ); - return; + return Vec::new(); }; let key = RackHealthReportKey { @@ -132,5 +132,6 @@ impl DataSink for RackHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); + Vec::new() } } diff --git 
a/crates/health/src/sink/switch_health_report.rs b/crates/health/src/sink/switch_health_report.rs index 497a555a03..5a9d7558ce 100644 --- a/crates/health/src/sink/switch_health_report.rs +++ b/crates/health/src/sink/switch_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::switch::SwitchId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl SwitchHealthReportSink { } } -impl DataSink for SwitchHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for SwitchHealthReportSink { + fn node_type(&self) -> &'static str { "switch_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Switch) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for SwitchHealthReportSink { source = ?report.source, "Skipping empty switch health report" ); - return; + return Vec::new(); } let switch_id = if let Some(switch_id) = context.switch_id() { @@ -126,7 +126,7 @@ impl DataSink for SwitchHealthReportSink { endpoint_key = context.endpoint_key(), "Received switch-target HealthReport event without switch_id context" ); - return; + return Vec::new(); }; let key = SwitchHealthReportKey { @@ -134,5 +134,6 @@ impl DataSink for SwitchHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); + Vec::new() } } diff --git a/crates/health/src/sink/tracing.rs 
b/crates/health/src/sink/tracing.rs index 34e2f78390..4b3c52873f 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -15,7 +15,7 @@ * limitations under the License. */ -use super::{CollectorEvent, DataSink, EventContext}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::config::TracingSinkConfig; /// Sink that writes health events through the process tracing subscriber. @@ -32,21 +32,21 @@ impl TracingSink { } } -impl DataSink for TracingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for TracingSink { + fn node_type(&self) -> &'static str { "tracing_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Metric collection start" ); } - CollectorEvent::Metric(metric) => { + HealthEvent::MeasurementObserved(metric) => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, @@ -58,21 +58,21 @@ impl DataSink for TracingSink { "Metric event" ); } - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Metric collection end" ); } - CollectorEvent::CollectorRemoved => { + HealthEvent::NodeRemoved => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Collector removed" ); } - CollectorEvent::Log(record) => { + HealthEvent::LogObserved(record) => { let has_included_diagnostics = self.include_diagnostics && record.diagnostic_record.is_some(); @@ -107,7 +107,7 @@ impl DataSink for TracingSink { ); } } - CollectorEvent::Firmware(info) => { + HealthEvent::FirmwareObserved(info) => { tracing::info!( endpoint = 
%context.endpoint_key(), collector = %context.collector_type, @@ -116,7 +116,7 @@ impl DataSink for TracingSink { "Firmware info event" ); } - CollectorEvent::HealthReport(report) => { + HealthEvent::HealthReportProduced(report) => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, @@ -129,6 +129,10 @@ impl DataSink for TracingSink { "Health report event" ); } + HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } => {} } + Vec::new() } }