From 7bab2abcd6ade369d9a5ebad2299ea0cdd9a014d Mon Sep 17 00:00:00 2001 From: ianisimov Date: Fri, 26 Jun 2026 16:40:10 -0700 Subject: [PATCH 1/4] change: hw-health universal stage support Signed-off-by: ianisimov --- Cargo.lock | 1 - crates/health/Cargo.toml | 1 - crates/health/benches/collector_pipeline.rs | 45 +++-- crates/health/benches/processor_pipeline.rs | 93 ++++----- crates/health/benches/sink_pipeline.rs | 46 +++-- crates/health/src/collectors/discovery.rs | 60 +++--- .../health/src/collectors/entity_metrics.rs | 47 +++-- crates/health/src/collectors/firmware.rs | 12 +- crates/health/src/collectors/inventory.rs | 11 +- crates/health/src/collectors/leak_detector.rs | 14 +- crates/health/src/collectors/logs/periodic.rs | 10 +- crates/health/src/collectors/logs/sse.rs | 8 +- crates/health/src/collectors/mod.rs | 1 - crates/health/src/collectors/nmxt.rs | 16 +- .../nvue/gnmi/on_change_processor.rs | 29 +-- .../collectors/nvue/gnmi/sample_processor.rs | 19 +- .../src/collectors/nvue/gnmi/subscriber.rs | 11 +- .../health/src/collectors/nvue/rest/client.rs | 29 ++- .../src/collectors/nvue/rest/collector.rs | 16 +- crates/health/src/collectors/runtime.rs | 92 ++++++++- crates/health/src/collectors/sensors.rs | 55 +++-- crates/health/src/discovery/cleanup.rs | 4 - crates/health/src/discovery/context.rs | 29 +-- crates/health/src/discovery/iteration.rs | 4 +- crates/health/src/discovery/spawn.rs | 191 ++++++++++++------ crates/health/src/lib.rs | 53 ++--- crates/health/src/otlp/convert.rs | 35 ++-- crates/health/src/otlp/drain.rs | 6 +- crates/health/src/processor/health_report.rs | 62 ++++-- .../health/src/processor/intrusion_events.rs | 36 ++-- crates/health/src/processor/leak_events.rs | 56 ++--- crates/health/src/processor/mod.rs | 100 +++++---- crates/health/src/processor/rack_leak.rs | 62 +++--- crates/health/src/sink/composite.rs | 24 ++- crates/health/src/sink/events.rs | 46 ++++- crates/health/src/sink/health_report.rs | 45 +++-- crates/health/src/sink/log_file.rs | 33 +-- crates/health/src/sink/mod.rs | 58 +++--- crates/health/src/sink/otlp.rs | 74 ++++--- .../src/sink/power_shelf_health_report.rs | 19 +- crates/health/src/sink/prometheus.rs | 76 +++---- crates/health/src/sink/rack_health_report.rs | 19 +- .../health/src/sink/switch_health_report.rs | 19 +- crates/health/src/sink/tracing.rs | 26 ++- 44 files changed, 969 insertions(+), 724 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 68993566f6..14f7f0d813 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1946,7 +1946,6 @@ dependencies = [ name = "carbide-health" version = "0.0.1" dependencies = [ - "arc-swap", "async-trait", "base64", "carbide-health-report", diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index 5423024726..d9b61b9c1b 100644 --- a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -27,7 +27,6 @@ name = "forge-hw-health" path = "src/main.rs" [dependencies] -arc-swap = { workspace = true } async-trait = { workspace = true } base64 = { workspace = true } chrono = { workspace = true } diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 273865b714..57f69ae6d4 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -24,8 +24,8 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ - CollectorEvent, CompositeDataSink, DataSink, EventContext, FirmwareInfo, LogRecord, - MetricSample, PrometheusSink, + CompositeSyncEventNode, EventContext, FirmwareInfo, HealthEvent, LogRecord, MetricSample, + PrometheusSink, SyncEventNode, }; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use mac_address::MacAddress; @@ -34,14 +34,15 @@ const MACHINE_ID: &str = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6r struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { black_box(context); black_box(event); + Vec::new() } } @@ -65,14 +66,14 @@ fn event_context() -> EventContext { } } -fn build_sensor_metric_event(idx: usize, unique_keys: usize) -> CollectorEvent { +fn build_sensor_metric_event(idx: usize, unique_keys: usize) -> HealthEvent { let unique_keys = unique_keys.max(1); let sensor_idx = idx % unique_keys; let sensor_key = format!("sensor-{sensor_idx}"); let machine_idx = idx % 16; let rack_idx = idx % 4; - CollectorEvent::Metric( + HealthEvent::MeasurementObserved( MetricSample { key: sensor_key.clone(), name: "hw_sensor".to_string(), @@ -92,8 +93,8 @@ fn build_sensor_metric_event(idx: usize, unique_keys: usize) -> CollectorEvent { ) } -fn build_nmxt_metric_event(idx: usize) -> CollectorEvent { - CollectorEvent::Metric( +fn build_nmxt_metric_event(idx: usize) -> HealthEvent { + HealthEvent::MeasurementObserved( MetricSample { key: format!("effective_ber:{}", idx % 64), name: "switch_nmxt".to_string(), @@ -112,8 +113,8 @@ fn build_nmxt_metric_event(idx: usize) -> CollectorEvent { ) } -fn build_log_event(idx: usize) -> CollectorEvent { - CollectorEvent::Log( +fn build_log_event(idx: usize) -> HealthEvent { + HealthEvent::LogObserved( LogRecord { body: format!("BMC event line {idx}"), severity: "INFO".to_string(), @@ -128,9 +129,9 @@ fn build_log_event(idx: usize) -> CollectorEvent { ) } -fn build_firmware_event(idx: usize) -> CollectorEvent { +fn build_firmware_event(idx: usize) -> HealthEvent { let component = format!("component-{idx}"); - CollectorEvent::Firmware(FirmwareInfo { + HealthEvent::FirmwareObserved(FirmwareInfo { component: component.clone(), version: format!("1.0.{}", idx % 100), attributes: vec![ @@ -140,8 +141,8 @@ fn build_firmware_event(idx: usize) -> CollectorEvent { }) } -fn bench_collector_event_build(c: &mut Criterion) { - let mut group = c.benchmark_group("collector_event_build"); +fn bench_health_event_build(c: &mut Criterion) { + let mut group = c.benchmark_group("health_event_build"); let sample_count = 10_000usize; group.throughput(Throughput::Elements(sample_count as u64)); @@ -181,12 +182,12 @@ fn bench_collector_event_build(c: &mut Criterion) { } fn emit_metric_batch_building( - sink: &dyn DataSink, + sink: &dyn SyncEventNode, context: &EventContext, batch_size: usize, unique_keys: usize, ) { - let start = CollectorEvent::MetricCollectionStart; + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for idx in 0..batch_size { @@ -194,7 +195,7 @@ fn emit_metric_batch_building( sink.handle_event(context, &event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -224,13 +225,13 @@ fn bench_collector_build_and_emit_prometheus(c: &mut Criterion) { } struct CompositeBuildEmitState { - sink: CompositeDataSink, + sink: CompositeSyncEventNode, context: EventContext, } impl CompositeBuildEmitState { fn new(sink_count: usize) -> Self { - let mut sinks: Vec> = Vec::with_capacity(sink_count); + let mut sinks: Vec> = Vec::with_capacity(sink_count); for _ in 0..sink_count { sinks.push(Arc::new(CountingSink)); } @@ -238,7 +239,7 @@ impl CompositeBuildEmitState { let metrics_manager = Arc::new( MetricsManager::new("bench_collector").expect("metrics manager should initialize"), ); - let sink = CompositeDataSink::new(sinks, metrics_manager); + let sink = CompositeSyncEventNode::new(sinks, metrics_manager); Self { sink, @@ -268,7 +269,7 @@ fn bench_collector_build_and_emit_composite(c: &mut Criterion) { criterion_group!( benches, - bench_collector_event_build, + bench_health_event_build, bench_collector_build_and_emit_prometheus, bench_collector_build_and_emit_composite ); diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index d005eac8df..0379e42963 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -23,11 +23,11 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::processor::{ - EventProcessingPipeline, EventProcessor, HealthReportProcessor, LeakEventProcessor, - RackLeakProcessor, + EventGraph, HealthReportProcessor, LeakSyncEventNode, RackLeakProcessor, }; use carbide_health::sink::{ - CollectorEvent, CompositeDataSink, DataSink, EventContext, MetricSample, SensorThresholdContext, + CompositeSyncEventNode, EventContext, HealthEvent, MetricSample, SensorThresholdContext, + SyncEventNode, }; use carbide_uuid::rack::RackId; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; @@ -38,45 +38,38 @@ const MACHINE_ID: &str = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6r struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { std::hint::black_box(context); std::hint::black_box(event); + Vec::new() } } struct NoopProcessor; -impl EventProcessor for NoopProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for NoopProcessor { + fn node_type(&self) -> &'static str { "noop_processor" } - fn process_event( - &self, - _context: &EventContext, - _event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { Vec::new() } } struct ReemitProcessor; -impl EventProcessor for ReemitProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for ReemitProcessor { + fn node_type(&self) -> &'static str { "reemit_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { vec![event.clone()] } } @@ -101,19 +94,33 @@ fn event_context() -> EventContext { } } -fn make_composite_sink(count: usize, metrics_manager: Arc) -> Arc { - let mut sinks: Vec> = Vec::with_capacity(count); +fn make_composite_sink( + count: usize, + metrics_manager: Arc, +) -> Arc { + let mut sinks: Vec> = Vec::with_capacity(count); for _ in 0..count { sinks.push(Arc::new(CountingSink)); } - Arc::new(CompositeDataSink::new(sinks, metrics_manager)) + Arc::new(CompositeSyncEventNode::new(sinks, metrics_manager)) +} + +fn make_event_graph( + sink: Arc, + processors: Vec>, + metrics_manager: Arc, +) -> EventGraph { + let mut nodes = Vec::with_capacity(processors.len() + 1); + nodes.push(sink); + nodes.extend(processors); + EventGraph::new(nodes, metrics_manager) } fn metric_events( batch_size: usize, unique_keys: usize, with_health_context: bool, -) -> Vec { +) -> Vec { let unique_keys = unique_keys.max(1); (0..batch_size) @@ -150,18 +157,18 @@ fn metric_events( bmc_health: BmcHealth::Warning, }); } - CollectorEvent::Metric(metric.into()) + HealthEvent::MeasurementObserved(metric.into()) }) .collect() } -fn emit_metric_batch(sink: &dyn DataSink, context: &EventContext, events: &[CollectorEvent]) { - let start = CollectorEvent::MetricCollectionStart; +fn emit_metric_batch(sink: &dyn SyncEventNode, context: &EventContext, events: &[HealthEvent]) { + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for event in events { sink.handle_event(context, event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -173,18 +180,14 @@ fn bench_pipeline_baseline(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); let sink = make_composite_sink(2, metrics_manager.clone()); - let mut processors: Vec> = Vec::with_capacity(processor_count); + let mut processors: Vec> = Vec::with_capacity(processor_count); for _ in 0..processor_count { processors.push(Arc::new(NoopProcessor)); } - let sink: Arc = if processors.is_empty() { + let sink: Arc = if processors.is_empty() { sink } else { - Arc::new(EventProcessingPipeline::new( - processors, - sink, - metrics_manager.clone(), - )) + Arc::new(make_event_graph(sink, processors, metrics_manager.clone())) }; let context = event_context(); let events = metric_events(batch_size, 64, false); @@ -208,13 +211,13 @@ fn bench_pipeline_health_processors(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let processors: Vec> = vec![ + let processors: Vec> = vec![ Arc::new(HealthReportProcessor::default()), - Arc::new(LeakEventProcessor::new(1)), + Arc::new(LeakSyncEventNode::new(1)), ]; - let pipeline = EventProcessingPipeline::new( - processors, + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + processors, metrics_manager, ); let context = event_context(); @@ -240,9 +243,9 @@ fn bench_pipeline_loop_guard(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let pipeline = EventProcessingPipeline::new( - vec![Arc::new(ReemitProcessor)], + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + vec![Arc::new(ReemitProcessor)], metrics_manager, ); let context = event_context(); @@ -286,14 +289,14 @@ fn bench_pipeline_rack_leak(c: &mut Criterion) { let metrics_manager: Arc = Arc::new(MetricsManager::new("bench").expect("metrics manager should initialize")); - let processors: Vec> = vec![ + let processors: Vec> = vec![ Arc::new(HealthReportProcessor::default()), - Arc::new(LeakEventProcessor::new(1)), + Arc::new(LeakSyncEventNode::new(1)), Arc::new(RackLeakProcessor::new(2)), ]; - let pipeline = EventProcessingPipeline::new( - processors, + let pipeline = make_event_graph( make_composite_sink(2, metrics_manager.clone()), + processors, metrics_manager, ); diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 0e8e9d730f..9a146a84c5 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -24,8 +24,8 @@ use std::sync::Arc; use carbide_health::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use carbide_health::metrics::MetricsManager; use carbide_health::sink::{ - Classification, CollectorEvent, CompositeDataSink, DataSink, EventContext, HealthReport, - HealthReportSink, LogRecord, MetricSample, PrometheusSink, ReportSource, + Classification, CompositeSyncEventNode, EventContext, HealthEvent, HealthReport, + HealthReportSink, LogRecord, MetricSample, PrometheusSink, ReportSource, SyncEventNode, }; use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use health_report::HealthReport as CarbideHealthReport; @@ -40,14 +40,15 @@ const MACHINE_IDS: [&str; 3] = [ struct CountingSink; -impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { std::hint::black_box(context); std::hint::black_box(event); + Vec::new() } } @@ -75,7 +76,7 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { } } -fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { +fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { let unique_keys = unique_keys.max(1); (0..batch_size) @@ -83,7 +84,7 @@ fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { let sensor_idx = idx % unique_keys; let key = format!("sensor-{sensor_idx}"); - CollectorEvent::Metric( + HealthEvent::MeasurementObserved( MetricSample { key: key.clone(), name: "hw_sensor".to_string(), @@ -99,13 +100,13 @@ fn metric_events(batch_size: usize, unique_keys: usize) -> Vec { .collect() } -fn emit_metric_batch(sink: &dyn DataSink, context: &EventContext, events: &[CollectorEvent]) { - let start = CollectorEvent::MetricCollectionStart; +fn emit_metric_batch(sink: &dyn SyncEventNode, context: &EventContext, events: &[HealthEvent]) { + let start = HealthEvent::ScrapeBatchStarted; sink.handle_event(context, &start); for event in events { sink.handle_event(context, event); } - let end = CollectorEvent::MetricCollectionEnd; + let end = HealthEvent::ScrapeBatchFinished; sink.handle_event(context, &end); } @@ -135,21 +136,21 @@ fn bench_prometheus_sink(c: &mut Criterion) { } struct CompositeBenchState { - sink: CompositeDataSink, + sink: CompositeSyncEventNode, context: EventContext, - events: Vec, + events: Vec, } impl CompositeBenchState { fn new(sink_count: usize, batch_size: usize) -> Self { - let mut sinks: Vec> = Vec::with_capacity(sink_count); + let mut sinks: Vec> = Vec::with_capacity(sink_count); for _ in 0..sink_count { sinks.push(Arc::new(CountingSink)); } let metrics_manager = Arc::new(MetricsManager::new("bench_sink").expect("metrics manager should initialize")); - let sink = CompositeDataSink::new(sinks, metrics_manager); + let sink = CompositeSyncEventNode::new(sinks, metrics_manager); Self { sink, @@ -202,8 +203,8 @@ struct HealthReportBenchState { sink: HealthReportSink, context: EventContext, distinct_contexts: Vec, - sensor_event: CollectorEvent, - leak_event: CollectorEvent, + sensor_event: HealthEvent, + leak_event: HealthEvent, } impl HealthReportBenchState { @@ -214,8 +215,9 @@ impl HealthReportBenchState { .into_iter() .map(event_context_for_machine) .collect(); - let sensor_event = CollectorEvent::HealthReport(Arc::new(health_report_with_alerts(256))); - let leak_event = CollectorEvent::HealthReport(Arc::new(HealthReport { + let sensor_event = + HealthEvent::HealthReportProduced(Arc::new(health_report_with_alerts(256))); + let leak_event = HealthEvent::HealthReportProduced(Arc::new(HealthReport { source: ReportSource::TrayLeakDetection, target: Some(carbide_health::sink::HealthReportTarget::Machine), observed_at: Some(chrono::Utc::now()), @@ -240,8 +242,8 @@ impl HealthReportBenchState { fn filled_health_report_sink( contexts: &[EventContext], - event: &CollectorEvent, - leak_event: &CollectorEvent, + event: &HealthEvent, + leak_event: &HealthEvent, ) -> HealthReportSink { let sink = HealthReportSink::new_for_bench().expect("bench sink should initialize"); for context in contexts { @@ -328,11 +330,11 @@ fn bench_health_report_sink(c: &mut Criterion) { group.finish(); } -fn log_events_with_attrs(count: usize, unique_sensors: usize) -> Vec { +fn log_events_with_attrs(count: usize, unique_sensors: usize) -> Vec { (0..count) .map(|idx| { let sensor = format!("HGX_GPU_{}_Temp_1", idx % unique_sensors); - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: format!("{sensor} sensor crossed threshold"), severity: "Warning".to_string(), attributes: vec![ diff --git a/crates/health/src/collectors/discovery.rs b/crates/health/src/collectors/discovery.rs index 0d790bc6d8..6e761f77d1 100644 --- a/crates/health/src/collectors/discovery.rs +++ b/crates/health/src/collectors/discovery.rs @@ -24,36 +24,41 @@ use nv_redfish::ServiceRoot; use nv_redfish::core::Bmc; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, EntityInventory, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; /// Configuration for the entity discovery collector pub struct EntityDiscoveryCollectorConfig { - pub(crate) shared: SharedInventory, + pub(crate) data_sink: Option>, pub discovery_concurrency: usize, + pub(crate) _bmc: std::marker::PhantomData, } pub struct EntityDiscoveryCollector { endpoint: Arc, + event_context: EventContext, bmc: Arc, - shared: SharedInventory, + data_sink: Option>, discovery_concurrency: usize, generation: u64, } -impl PeriodicCollector for EntityDiscoveryCollector { - type Config = EntityDiscoveryCollectorConfig; +impl PeriodicCollector for EntityDiscoveryCollector { + type Config = EntityDiscoveryCollectorConfig; fn new_runner( - bmc: Arc, + bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { Ok(Self { + event_context: EventContext::from_endpoint(&endpoint, "entity_discovery_collector"), endpoint, bmc, - shared: config.shared, + data_sink: config.data_sink, discovery_concurrency: config.discovery_concurrency.max(1), generation: 0, }) @@ -65,11 +70,15 @@ impl PeriodicCollector for EntityDiscoveryCollector { let entity_count = entities.len(); self.generation = self.generation.wrapping_add(1); - self.shared.store(Some(Arc::new(EntityInventory { + let inventory = Arc::new(EntityInventory { entities, discovered_at: std::time::Instant::now(), generation: self.generation, - }))); + }); + self.emit_event(HealthEvent::InventoryDiscovered { + endpoint_key: self.event_context.endpoint_key().to_string(), + inventory, + }); tracing::info!( bmc = %self.endpoint.addr.mac, @@ -90,12 +99,17 @@ impl PeriodicCollector for EntityDiscoveryCollector { } async fn stop(&mut self) { - // Clear the snapshot so readers stop emitting for a removed endpoint. - self.shared.store(None); + self.emit_event(HealthEvent::NodeRemoved); } } -impl EntityDiscoveryCollector { +impl EntityDiscoveryCollector { + fn emit_event(&self, event: HealthEvent) { + if let Some(data_sink) = &self.data_sink { + data_sink.handle_event(&self.event_context, &event); + } + } + fn record_failure( &self, result: Result, @@ -115,7 +129,7 @@ impl EntityDiscoveryCollector { async fn discover_entities( &self, fetch_failures: &AtomicUsize, - ) -> Result>, HealthError> { + ) -> Result>, HealthError> { let service_root = ServiceRoot::new(self.bmc.clone()).await?; let mut entities = Vec::new(); @@ -155,9 +169,9 @@ impl EntityDiscoveryCollector { async fn discover_processors( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let processors = self @@ -194,9 +208,9 @@ impl EntityDiscoveryCollector { async fn discover_memory( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let memory_modules = self @@ -232,9 +246,9 @@ impl EntityDiscoveryCollector { async fn discover_drives( &self, - system: &Arc>, + system: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let storage_list = self @@ -279,9 +293,9 @@ impl EntityDiscoveryCollector { async fn discover_power_supplies( &self, - chassis: &Arc>, + chassis: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let power_supplies = self @@ -316,9 +330,9 @@ impl EntityDiscoveryCollector { async fn discover_chassis( &self, - chassis: &Arc>, + chassis: &Arc>, fetch_failures: &AtomicUsize, - entities: &mut Vec>, + entities: &mut Vec>, sensor_ids: &mut HashSet, ) { let sensors = match chassis.sensor_links().await { diff --git a/crates/health/src/collectors/entity_metrics.rs b/crates/health/src/collectors/entity_metrics.rs index b776a2a692..c3c5c4eef5 100644 --- a/crates/health/src/collectors/entity_metrics.rs +++ b/crates/health/src/collectors/entity_metrics.rs @@ -27,10 +27,11 @@ use nv_redfish::schema::power_supply_metrics::PowerSupplyMetrics; use nv_redfish::schema::processor_metrics::ProcessorMetrics; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; struct MetricField { metric_type: Cow<'static, str>, @@ -357,24 +358,24 @@ fn power_supply_metric_fields(m: &PowerSupplyMetrics) -> Vec { } pub struct MetricsCollectorConfig { - pub data_sink: Option>, - pub(crate) shared: SharedInventory, + pub data_sink: Option>, pub fetch_concurrency: usize, + pub(crate) _bmc: std::marker::PhantomData, } pub struct MetricsCollector { endpoint: Arc, event_context: EventContext, - shared: SharedInventory, - data_sink: Option>, + latest_inventory: Option>>, + data_sink: Option>, fetch_concurrency: usize, } -impl PeriodicCollector for MetricsCollector { - type Config = MetricsCollectorConfig; +impl PeriodicCollector for MetricsCollector { + type Config = MetricsCollectorConfig; fn new_runner( - _bmc: Arc, + _bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { @@ -382,14 +383,14 @@ impl PeriodicCollector for MetricsCollector { Ok(Self { endpoint, event_context, - shared: config.shared, + latest_inventory: None, data_sink: config.data_sink, fetch_concurrency: config.fetch_concurrency.max(1), }) } async fn run_iteration(&mut self) -> Result { - let Some(inventory) = self.shared.load_full() else { + let Some(inventory) = self.latest_inventory.clone() else { tracing::debug!( bmc_addr = ?self.endpoint.addr, "No entity inventory available yet; skipping metrics iteration" @@ -410,7 +411,7 @@ impl PeriodicCollector for MetricsCollector { ); let fetch_failures = AtomicUsize::new(0); - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); let this = &*self; let failures = &fetch_failures; @@ -427,7 +428,7 @@ impl PeriodicCollector for MetricsCollector { .into_iter() .sum(); - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(IterationResult { refresh_triggered: false, @@ -440,13 +441,23 @@ impl PeriodicCollector for MetricsCollector { "metrics_collector" } + fn wants_events(&self) -> bool { + true + } + + fn handle_event(&mut self, _context: &EventContext, event: &HealthEvent) { + if let HealthEvent::InventoryDiscovered { inventory, .. } = event { + self.latest_inventory = Some(inventory.clone()); + } + } + async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } -impl MetricsCollector { - fn emit_event(&self, event: CollectorEvent) { +impl MetricsCollector { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -454,7 +465,7 @@ impl MetricsCollector { async fn collect_entity( &self, - entity: &DiscoveredEntity, + entity: &DiscoveredEntity, fetch_failures: &AtomicUsize, ) -> usize { let fields = match entity { @@ -502,7 +513,7 @@ impl MetricsCollector { let entity_key = entity.key(); let count = fields.len(); for field in fields { - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("{entity_key}/{}", field.metric_type), name: "hw_metric".to_string(), diff --git a/crates/health/src/collectors/firmware.rs b/crates/health/src/collectors/firmware.rs index 9ce8283a79..55df7c8b62 100644 --- a/crates/health/src/collectors/firmware.rs +++ b/crates/health/src/collectors/firmware.rs @@ -24,16 +24,16 @@ use nv_redfish::core::Bmc; use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, DataSink, EventContext, FirmwareInfo}; +use crate::sink::{EventContext, FirmwareInfo, HealthEvent, SyncEventNode}; pub struct FirmwareCollectorConfig { - pub data_sink: Option>, + pub data_sink: Option>, } pub struct FirmwareCollector { bmc: Arc, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, } impl PeriodicCollector for FirmwareCollector { @@ -61,12 +61,12 @@ impl PeriodicCollector for FirmwareCollector { } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } impl FirmwareCollector { - fn emit_event(&self, event: CollectorEvent) { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -102,7 +102,7 @@ impl FirmwareCollector { (Cow::Borrowed("version"), version.clone()), ]; - self.emit_event(CollectorEvent::Firmware(FirmwareInfo { + self.emit_event(HealthEvent::FirmwareObserved(FirmwareInfo { component, version, attributes, diff --git a/crates/health/src/collectors/inventory.rs b/crates/health/src/collectors/inventory.rs index 3e390fcd2e..df33b74094 100644 --- a/crates/health/src/collectors/inventory.rs +++ b/crates/health/src/collectors/inventory.rs @@ -19,7 +19,6 @@ use std::borrow::Cow; use std::sync::Arc; use std::time::Instant; -use arc_swap::ArcSwapOption; use nv_redfish::Resource; use nv_redfish::chassis::{Chassis, PowerSupply}; use nv_redfish::computer_system::{ComputerSystem, Drive, Memory, Processor, Storage}; @@ -132,10 +131,10 @@ impl DiscoveredEntity { let mut attrs = Vec::new(); match self { DiscoveredEntity::Processor { entity, .. } => { - if let Some(processor_type) = entity.raw().processor_type.flatten() { + if let Some(node_type) = entity.raw().processor_type.flatten() { attrs.push(( - Cow::Borrowed("processor_type"), - processor_type.to_snake_case().to_string(), + Cow::Borrowed("node_type"), + node_type.to_snake_case().to_string(), )); } if let Some(model) = entity.raw().model.clone().flatten() { @@ -213,10 +212,8 @@ impl DiscoveredEntity { } } -pub(crate) struct EntityInventory { +pub struct EntityInventory { pub(crate) entities: Vec>, pub(crate) discovered_at: Instant, pub(crate) generation: u64, } - -pub(crate) type SharedInventory = Arc>>; diff --git a/crates/health/src/collectors/leak_detector.rs b/crates/health/src/collectors/leak_detector.rs index a3caeea902..d37ec7b1f2 100644 --- a/crates/health/src/collectors/leak_detector.rs +++ b/crates/health/src/collectors/leak_detector.rs @@ -26,12 +26,12 @@ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; use crate::sink::{ - Classification, CollectorEvent, DataSink, EventContext, HealthReport, HealthReportAlert, - HealthReportSuccess, Probe, ReportSource, + Classification, EventContext, HealthEvent, HealthReport, HealthReportAlert, + HealthReportSuccess, Probe, ReportSource, SyncEventNode, }; pub struct LeakDetectorCollectorConfig { - pub data_sink: Option>, + pub data_sink: Option>, pub state_refresh_interval: Duration, } @@ -39,7 +39,7 @@ pub struct LeakDetectorCollector { bmc: Arc, event_context: EventContext, state: Option, - data_sink: Option>, + data_sink: Option>, state_refresh_interval: Duration, } @@ -80,7 +80,7 @@ where } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } @@ -89,7 +89,7 @@ where B: Bmc + 'static, B::Error: 'static, { - fn emit_event(&self, event: CollectorEvent) { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -134,7 +134,7 @@ where let detector_count = detectors.len(); let report = build_health_report(detectors, &self.event_context); - self.emit_event(CollectorEvent::HealthReport(Arc::new(report))); + self.emit_event(HealthEvent::HealthReportProduced(Arc::new(report))); Ok(IterationResult { refresh_triggered, diff --git a/crates/health/src/collectors/logs/periodic.rs b/crates/health/src/collectors/logs/periodic.rs index 2f184592f0..14a7207d26 100644 --- a/crates/health/src/collectors/logs/periodic.rs +++ b/crates/health/src/collectors/logs/periodic.rs @@ -32,13 +32,13 @@ use super::diagnostic::{ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; -use crate::sink::{CollectorEvent, DataSink, EventContext, LogRecord}; +use crate::sink::{EventContext, HealthEvent, LogRecord, SyncEventNode}; /// Configuration for logs collector pub struct LogsCollectorConfig { pub state_file_path: PathBuf, pub service_refresh_interval: Duration, - pub data_sink: Option>, + pub data_sink: Option>, /// Attach Redfish diagnostic payloads to emitted log records. pub include_diagnostics: bool, @@ -68,7 +68,7 @@ pub struct LogsCollector { state_file_path: PathBuf, state: Option>, service_refresh_interval: Duration, - data_sink: Option>, + data_sink: Option>, include_diagnostics: bool, } @@ -103,7 +103,7 @@ impl PeriodicCollector for LogsCollector { async fn stop(&mut self) { if let Some(data_sink) = &self.data_sink { - data_sink.handle_event(&self.event_context, &CollectorEvent::CollectorRemoved); + data_sink.handle_event(&self.event_context, &HealthEvent::NodeRemoved); } } } @@ -363,7 +363,7 @@ impl LogsCollector { }) .flatten(); - let log_event = CollectorEvent::Log( + let log_event = HealthEvent::LogObserved( LogRecord { body, severity: severity_text, diff --git a/crates/health/src/collectors/logs/sse.rs b/crates/health/src/collectors/logs/sse.rs index a645072f29..fedf3a0eca 100644 --- a/crates/health/src/collectors/logs/sse.rs +++ b/crates/health/src/collectors/logs/sse.rs @@ -30,7 +30,7 @@ use super::diagnostic::{ use crate::HealthError; use crate::collectors::runtime::{EventStream, StreamingCollector, open_sse_stream}; use crate::endpoint::BmcEndpoint; -use crate::sink::{CollectorEvent, LogRecord}; +use crate::sink::{HealthEvent, LogRecord}; /// Configuration for the Redfish SSE log collector. pub struct SseLogCollectorConfig { @@ -91,7 +91,7 @@ fn map_payload( result: Result, bmc: &B, include_diagnostics: bool, -) -> Vec> { +) -> Vec> { match result { Ok(EventStreamPayload::Event(event)) => event_to_logs(&event, bmc, include_diagnostics), Ok(EventStreamPayload::MetricReport(_)) => Vec::new(), @@ -104,7 +104,7 @@ fn event_to_logs( event: &Event, bmc: &B, include_diagnostics: bool, -) -> Vec> { +) -> Vec> { event .events .iter() @@ -201,7 +201,7 @@ fn event_to_logs( None }; - Ok(CollectorEvent::Log(Box::new(LogRecord { + Ok(HealthEvent::LogObserved(Box::new(LogRecord { body, severity, attributes, diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 6499644edf..0559c7e490 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -29,7 +29,6 @@ mod sensors; pub use discovery::{EntityDiscoveryCollector, EntityDiscoveryCollectorConfig}; pub use entity_metrics::{MetricsCollector, MetricsCollectorConfig}; pub use firmware::{FirmwareCollector, FirmwareCollectorConfig}; -pub(crate) use inventory::SharedInventory; pub use leak_detector::{LeakDetectorCollector, LeakDetectorCollectorConfig}; pub(crate) use logs::auto::{AutoFailureBudget, BudgetDecision, FailureKind}; pub use logs::{ diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 7f762a2ed8..2f685342de 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -29,7 +29,7 @@ use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; /// default NMX-T port const NMXT_PORT: u16 = 9352; @@ -139,7 +139,7 @@ async fn scrape_switch_nmxt_metrics( pub struct NmxtCollectorConfig { pub nmxt_config: NmxtCollectorOptions, - pub data_sink: Option>, + pub data_sink: Option>, } /// NMX-T collector for a single switch/endpoint @@ -148,7 +148,7 @@ pub struct NmxtCollector { switch_id: String, http_client: reqwest::Client, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, } impl PeriodicCollector for NmxtCollector { @@ -196,12 +196,12 @@ impl PeriodicCollector for NmxtCollector { } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } impl NmxtCollector { - fn emit_event(&self, event: CollectorEvent) { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -212,7 +212,7 @@ impl NmxtCollector { let metrics = scrape_switch_nmxt_metrics(&self.http_client, &switch_ip).await?; - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); for sample in metrics { let NmxtMetricSample { @@ -242,7 +242,7 @@ impl NmxtCollector { (Cow::Borrowed("port_num"), port_num), ]; - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: metric_key, name: "switch_nmxt".to_string(), @@ -256,7 +256,7 @@ impl NmxtCollector { )); } - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(()) } diff --git a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs index 7aebb6b8f5..6127880694 100644 --- a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs @@ -27,7 +27,7 @@ use super::proto::{self, PathElem}; use super::sample_processor::now_unix_secs; use super::subscriber::GnmiStreamMetrics; use crate::HealthError; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; type ParsedRow = HashMap; type CachedRows = HashMap; @@ -83,7 +83,7 @@ impl OnChangeStreamMetrics { pub(crate) struct GnmiOnChangeProcessor { pub(crate) collector_name: String, pub(crate) stream_metrics: OnChangeStreamMetrics, - pub(crate) data_sink: Option>, + pub(crate) data_sink: Option>, pub(crate) event_context: EventContext, pub(crate) switch_id: String, cached_rows: Mutex, @@ -93,7 +93,7 @@ impl GnmiOnChangeProcessor { pub(crate) fn new( collector_name: String, stream_metrics: OnChangeStreamMetrics, - data_sink: Option>, + data_sink: Option>, event_context: EventContext, switch_id: String, ) -> Self { @@ -276,7 +276,7 @@ impl GnmiOnChangeProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: self.collector_name.clone(), metric_type: "on_change_row".to_string(), @@ -337,19 +337,20 @@ mod tests { #[derive(Default)] struct CapturingSink { - events: Mutex>, + events: Mutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { self.events .lock() .expect("lock poisoned") .push((context.clone(), event.clone())); + Vec::new() } } @@ -381,7 +382,7 @@ mod tests { } } - fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { + fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { let registry = prometheus::Registry::new(); let stream_metrics = OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) @@ -617,7 +618,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 4.0); @@ -657,7 +658,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 3); - let CollectorEvent::Metric(metric) = &events[2].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[2].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -697,7 +698,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -743,7 +744,7 @@ mod tests { let events = sink.events.lock().expect("lock poisoned"); assert_eq!(events.len(), 2); - let CollectorEvent::Metric(metric) = &events[1].1 else { + let HealthEvent::MeasurementObserved(metric) = &events[1].1 else { panic!("expected metric event"); }; assert_eq!(metric.value, 0.0); @@ -830,7 +831,7 @@ mod tests { assert_eq!(context.switch_slot_number(), Some(7)); assert_eq!(context.switch_tray_index(), Some(3)); assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - let CollectorEvent::Metric(metric) = event else { + let HealthEvent::MeasurementObserved(metric) = event else { panic!("expected metric event"); }; assert_eq!(metric.metric_type, "on_change_row"); diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 20c06e3854..896085eb05 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -23,13 +23,13 @@ use std::time::Instant; use super::client::{typed_value_to_f64, typed_value_to_string}; use super::proto::{self, PathElem}; use super::subscriber::GnmiStreamMetrics; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; -/// process NVUE gNMI SAMPLE notifications and emit them as `CollectorEvent::Metric` +/// process NVUE gNMI SAMPLE notifications and emit them as `HealthEvent::MeasurementObserved` pub(crate) struct GnmiSampleProcessor { - pub(crate) data_sink: Option>, + pub(crate) data_sink: Option>, pub(crate) event_context: EventContext, pub(crate) switch_id: String, } @@ -235,7 +235,7 @@ impl GnmiSampleProcessor { sink.handle_event( &self.event_context, - &CollectorEvent::Metric(Box::new(MetricSample { + &HealthEvent::MeasurementObserved(Box::new(MetricSample { key, name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), metric_type: metric_type.to_string(), @@ -305,19 +305,20 @@ mod tests { #[derive(Default)] struct CapturingSink { - events: Mutex>, + events: Mutex>, } - impl DataSink for CapturingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CapturingSink { + fn node_type(&self) -> &'static str { "capturing_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { self.events .lock() .expect("lock poisoned") .push((context.clone(), event.clone())); + Vec::new() } } @@ -526,7 +527,7 @@ mod tests { assert_eq!(context.switch_slot_number(), Some(7)); assert_eq!(context.switch_tray_index(), Some(3)); assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - assert!(matches!(event, CollectorEvent::Metric(_))); + assert!(matches!(event, HealthEvent::MeasurementObserved(_))); } #[test] diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index 7c4f8e4bfb..ea9abdf715 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -39,7 +39,7 @@ use crate::collectors::runtime::{BackoffConfig, ExponentialBackoff, StreamingCon use crate::config::NvueGnmiConfig; use crate::endpoint::{BmcAddr, BmcCredentials, BmcEndpoint}; use crate::metrics::CollectorRegistry; -use crate::sink::{CollectorEvent, DataSink, EventContext}; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; // gRPC ConnectivityState values for `connection_state`. 0 (UNKNOWN) is the gauge default. const IDLE: i64 = 1; @@ -444,7 +444,7 @@ pub fn spawn_gnmi_collector( gnmi_config: &NvueGnmiConfig, credential_provider: Arc, collector_registry: Arc, - data_sink: Option>, + data_sink: Option>, ) -> Result { let switch_id = endpoint .metadata @@ -554,13 +554,10 @@ pub fn spawn_gnmi_collector( } if let Some(data_sink) = collector_removed_data_sink.as_deref() { - data_sink.handle_event( - &collector_removed_sample_context, - &CollectorEvent::CollectorRemoved, - ); + data_sink.handle_event(&collector_removed_sample_context, &HealthEvent::NodeRemoved); if let Some(event_context) = &collector_removed_on_change_context { - data_sink.handle_event(event_context, &CollectorEvent::CollectorRemoved); + data_sink.handle_event(event_context, &HealthEvent::NodeRemoved); } } })) diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index d9f53dd5a3..9c680b9b71 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -16,10 +16,9 @@ */ use std::collections::HashMap; -use std::sync::Arc; +use std::sync::RwLock; use std::time::Duration; -use arc_swap::ArcSwapOption; use reqwest::Client; use reqwest::header::ACCEPT; use serde::Deserialize; @@ -52,7 +51,7 @@ impl std::fmt::Debug for UsernamePassword { pub struct RestClient { pub(crate) switch_id: String, base_url: Url, - credentials: ArcSwapOption, + credentials: RwLock>, paths: NvueRestPaths, client: Client, } @@ -83,22 +82,31 @@ impl RestClient { Ok(Self { switch_id, base_url, - credentials: ArcSwapOption::empty(), + credentials: RwLock::new(None), paths, client, }) } pub fn set_credentials(&self, creds: UsernamePassword) { - self.credentials.store(Some(Arc::new(creds))); + *self + .credentials + .write() + .unwrap_or_else(|poisoned| poisoned.into_inner()) = Some(creds); } pub fn clear_credentials(&self) { - self.credentials.store(None); + *self + .credentials + .write() + .unwrap_or_else(|poisoned| poisoned.into_inner()) = None; } pub fn has_credentials(&self) -> bool { - self.credentials.load().is_some() + self.credentials + .read() + .unwrap_or_else(|poisoned| poisoned.into_inner()) + .is_some() } pub async fn get_system_health(&self) -> Result, HealthError> { @@ -186,7 +194,12 @@ impl RestClient { request = request.query(extra_query); } - if let Some(creds) = self.credentials.load_full() { + let credentials = self + .credentials + .read() + .unwrap_or_else(|poisoned| poisoned.into_inner()) + .clone(); + if let Some(creds) = credentials { request = request.basic_auth(&creds.username, creds.password.as_ref()); } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 2165a5f9d2..4c2a0a951f 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -24,7 +24,7 @@ use crate::bmc::{CREDENTIAL_REFRESH_TIMEOUT, CredentialProvider, is_auth_error}; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NvueRestConfig; use crate::endpoint::{BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata}; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SyncEventNode}; const COLLECTOR_NAME: &str = "nvue_rest"; @@ -64,7 +64,7 @@ fn diagnostic_opcode_to_f64(code: &str) -> f64 { pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, - pub data_sink: Option>, + pub data_sink: Option>, pub credential_provider: Arc, } @@ -72,7 +72,7 @@ pub struct NvueRestCollector { client: RestClient, switch_id: String, event_context: EventContext, - data_sink: Option>, + data_sink: Option>, addr: BmcAddr, provider: Arc, } @@ -128,7 +128,7 @@ impl PeriodicCollector for NvueRestCollector { }); } - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); let mut entity_count = 0usize; let mut fetch_failures = 0usize; let mut saw_auth_failure = false; @@ -254,7 +254,7 @@ impl PeriodicCollector for NvueRestCollector { self.client.clear_credentials(); } - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); tracing::debug!( switch_id = %self.switch_id, @@ -274,7 +274,7 @@ impl PeriodicCollector for NvueRestCollector { } async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } @@ -303,7 +303,7 @@ impl NvueRestCollector { } } - fn emit_event(&self, event: CollectorEvent) { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } @@ -328,7 +328,7 @@ impl NvueRestCollector { None => metric_type.to_string(), }; - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key, name: COLLECTOR_NAME.to_string(), diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 50f205390d..fb6c031a37 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -28,6 +28,7 @@ use nv_redfish::core::Bmc; use nv_redfish::event_service::EventStreamPayload; use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntCounter, IntGauge, Opts}; use rand::RngExt; +use tokio::sync::mpsc; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; @@ -38,7 +39,7 @@ use crate::limiter::RateLimiter; use crate::metrics::{ CollectorRegistry, ComponentKind, MetricsManager, operation_duration_buckets_seconds, }; -use crate::sink::{CollectorEvent, DataSink, EventContext}; +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; /// Result of a collector iteration #[derive(Debug, Clone)] @@ -69,12 +70,21 @@ pub trait PeriodicCollector: Send + 'static { /// Returns the type identifier for this collector fn collector_type(&self) -> &'static str; + /// Whether this collector consumes routed events (via [`Self::handle_event`]). + /// Only collectors that return `true` get an event mailbox exposed through + /// [`Collector::event_node`]. Defaults to `false`. + fn wants_events(&self) -> bool { + false + } + + fn handle_event(&mut self, _context: &EventContext, _event: &HealthEvent) {} + fn stop(&mut self) -> impl std::future::Future + Send { async {} } } -pub type EventStream<'a> = BoxStream<'a, Result>; +pub type EventStream<'a> = BoxStream<'a, Result>; /// Trait for collectors that maintain a long-lived stream (SSE, gRPC, etc.) /// runtime.rs creates the BMC client and injects it, the collector opens the stream and maps payloads to events @@ -249,6 +259,41 @@ impl Drop for StreamingConnectionGuard { pub struct Collector { handle: JoinHandle<()>, cancel_token: CancellationToken, + event_node: Option>, +} + +struct CollectorEventMailbox { + node_type: &'static str, + endpoint_key: String, + sender: mpsc::Sender<(EventContext, HealthEvent)>, +} + +impl SyncEventNode for CollectorEventMailbox { + fn node_type(&self) -> &'static str { + self.node_type + } + + fn interested_in(&self, event: &HealthEvent) -> bool { + matches!( + event, + HealthEvent::InventoryDiscovered { endpoint_key, .. } + if endpoint_key == &self.endpoint_key + ) + } + + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + // A drop on a full mailbox selfheals: discovery re-emits InventoryDiscovered + // every refresh cycle, so the consumer recovers on the next tick + if let Err(error) = self.sender.try_send((context.clone(), event.clone())) { + tracing::warn!( + ?error, + node_type = self.node_type, + endpoint_key = %self.endpoint_key, + "dropping event for collector mailbox" + ); + } + Vec::new() + } } pub struct CollectorStartContext { @@ -281,6 +326,19 @@ impl Collector { let cancel_token_clone = cancel_token.clone(); let mut runner = C::new_runner(bmc, endpoint.clone(), config)?; + // Only event-consuming collectors get a mailbox; others never receive events, + // so we skip the channel entirely and leave `event_node` as None. + let (event_node, mut event_receiver) = if runner.wants_events() { + let (event_sender, event_receiver) = mpsc::channel(16); + let node: Arc = Arc::new(CollectorEventMailbox { + node_type: runner.collector_type(), + endpoint_key: endpoint.key(), + sender: event_sender, + }); + (Some(node), Some(event_receiver)) + } else { + (None, None) + }; let endpoint_key = endpoint.key(); let const_labels = HashMap::from([ @@ -341,6 +399,7 @@ impl Collector { let handle = tokio::spawn(async move { let collector_type = runner.collector_type(); let _collector_registry = collector_registry; + let mut next_iteration = tokio::time::Instant::now(); loop { tokio::select! { _ = cancel_token_clone.cancelled() => { @@ -348,7 +407,22 @@ impl Collector { runner.stop().await; break; } - _ = async { + maybe_event = async { + match event_receiver.as_mut() { + Some(rx) => rx.recv().await, + // No mailbox: park this branch forever so only the timer/cancel fire. + None => std::future::pending().await, + } + } => { + if let Some((event_context, event)) = maybe_event { + runner.handle_event(&event_context, &event); + } + } + // The scrape runs to completion here (not as a racing select future), + // so an inbound event can never cancel an in-flight iteration or reset + // its cadence. Trade-off: cancellation waits for the current scrape, + // bounded by the BMC request timeout. + _ = tokio::time::sleep_until(next_iteration) => { limiter.acquire().await; let start = Instant::now(); @@ -388,8 +462,7 @@ impl Collector { } } - tokio::time::sleep(iteration_interval).await; - } => { + next_iteration = tokio::time::Instant::now() + iteration_interval; } } } @@ -398,6 +471,7 @@ impl Collector { Ok(Self { handle, cancel_token, + event_node, }) } @@ -405,7 +479,7 @@ impl Collector { endpoint: Arc, bmc: Arc, config: S::Config, - data_sink: Arc, + data_sink: Arc, start_context: StreamingCollectorStartContext, mut on_connect_result: F, ) -> Result @@ -533,6 +607,7 @@ impl Collector { Ok(Self { handle, cancel_token, + event_node: None, }) } @@ -550,6 +625,7 @@ impl Collector { Self { handle, cancel_token, + event_node: None, } } @@ -561,4 +637,8 @@ impl Collector { pub fn is_finished(&self) -> bool { self.handle.is_finished() } + + pub fn event_node(&self) -> Option> { + self.event_node.clone() + } } diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index d05275d05c..026338475f 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -24,35 +24,36 @@ use nv_redfish::core::{Bmc, EntityTypeRef, ToSnakeCase}; use nv_redfish::sensor::SensorLink; use crate::HealthError; -use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; +use crate::bmc::BmcClient; +use crate::collectors::inventory::{DiscoveredEntity, EntityInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; use crate::metrics::sanitize_unit; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample, SensorThresholdContext}; +use crate::sink::{EventContext, HealthEvent, MetricSample, SensorThresholdContext, SyncEventNode}; /// Configuration for the sensor collector. pub struct SensorCollectorConfig { - pub data_sink: Option>, - pub(crate) shared: SharedInventory, + pub data_sink: Option>, pub sensor_fetch_concurrency: usize, pub include_sensor_thresholds: bool, + pub(crate) _bmc: std::marker::PhantomData, } /// Sensor collector for a single BMC endpoint pub struct SensorCollector { endpoint: Arc, event_context: EventContext, - shared: SharedInventory, - data_sink: Option>, + latest_inventory: Option>>, + data_sink: Option>, sensor_fetch_concurrency: usize, include_sensor_thresholds: bool, } -impl PeriodicCollector for SensorCollector { - type Config = SensorCollectorConfig; +impl PeriodicCollector for SensorCollector { + type Config = SensorCollectorConfig; fn new_runner( - _bmc: Arc, + _bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { @@ -60,7 +61,7 @@ impl PeriodicCollector for SensorCollector { Ok(Self { endpoint, event_context, - shared: config.shared, + latest_inventory: None, data_sink: config.data_sink, sensor_fetch_concurrency: config.sensor_fetch_concurrency.max(1), include_sensor_thresholds: config.include_sensor_thresholds, @@ -68,7 +69,7 @@ impl PeriodicCollector for SensorCollector { } async fn run_iteration(&mut self) -> Result { - let Some(inventory) = self.shared.load_full() else { + let Some(inventory) = self.latest_inventory.clone() else { tracing::debug!( bmc_addr = ?self.endpoint.addr, "No entity inventory available yet; skipping sensor iteration" @@ -89,7 +90,7 @@ impl PeriodicCollector for SensorCollector { ); let fetch_failures = AtomicUsize::new(0); - self.emit_event(CollectorEvent::MetricCollectionStart); + self.emit_event(HealthEvent::ScrapeBatchStarted); // Entity-level derived metrics (drive media life, PSU capacity), once // per entity. @@ -97,7 +98,7 @@ impl PeriodicCollector for SensorCollector { self.emit_derived_metrics(entity); } - // Build the fetch futures borrowing from the shared snapshot, then + // Build fetch futures borrowing from the immutable inventory snapshot, then // drive them concurrently. Each future borrows `&self`, the entity, and // its sensor (all alive for as long as `inventory` is held here). let this = &*self; @@ -120,7 +121,7 @@ impl PeriodicCollector for SensorCollector { .into_iter() .sum(); - self.emit_event(CollectorEvent::MetricCollectionEnd); + self.emit_event(HealthEvent::ScrapeBatchFinished); Ok(IterationResult { refresh_triggered: false, @@ -133,19 +134,29 @@ impl PeriodicCollector for SensorCollector { "sensor_collector" } + fn wants_events(&self) -> bool { + true + } + + fn handle_event(&mut self, _context: &EventContext, event: &HealthEvent) { + if let HealthEvent::InventoryDiscovered { inventory, .. } = event { + self.latest_inventory = Some(inventory.clone()); + } + } + async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); + self.emit_event(HealthEvent::NodeRemoved); } } -impl SensorCollector { - fn emit_event(&self, event: CollectorEvent) { +impl SensorCollector { + fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } } - fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { + fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { let derived = entity.derived_metrics(); if derived.is_empty() { return; @@ -153,7 +164,7 @@ impl SensorCollector { let mut attributes = entity.base_attributes(); attributes.extend(entity.entity_specific_attributes()); for metric in derived { - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: format!("{}/{}", entity.key(), metric.metric_type), name: "hw".to_string(), @@ -170,8 +181,8 @@ impl SensorCollector { async fn update_sensor( &self, - entity: &DiscoveredEntity, - sensor_link: &SensorLink, + entity: &DiscoveredEntity, + sensor_link: &SensorLink, fetch_failures: &AtomicUsize, ) -> usize { let sensor = match sensor_link.fetch().await { @@ -295,7 +306,7 @@ impl SensorCollector { (None, None, None, None, None, None) }; - self.emit_event(CollectorEvent::Metric( + self.emit_event(HealthEvent::MeasurementObserved( MetricSample { key: sensor.odata_id().to_string(), name: "hw_sensor".to_string(), diff --git a/crates/health/src/discovery/cleanup.rs b/crates/health/src/discovery/cleanup.rs index 5dba8d0728..530b6c8eab 100644 --- a/crates/health/src/discovery/cleanup.rs +++ b/crates/health/src/discovery/cleanup.rs @@ -49,10 +49,6 @@ pub(super) fn stop_removed_bmc_collectors( for kind in CollectorKind::ALL { stop_collectors_for_keys(ctx, kind, &removed_keys); } - for key in &removed_keys { - ctx.collectors.remove_inventory(key); - } - if !removed_keys.is_empty() { tracing::info!( removed_count = removed_keys.len(), diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 71adf6f2f8..3377222cd8 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -19,12 +19,10 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arc_swap::ArcSwapOption; use prometheus::{Histogram, HistogramOpts}; use crate::HealthError; -use crate::bmc::BmcClient; -use crate::collectors::{Collector, LogDowngradeRegistry, SharedInventory}; +use crate::collectors::{Collector, LogDowngradeRegistry}; use crate::config::{ Config, Configurable, DiscoveryConfig, FirmwareCollectorConfig as FirmwareCollectorOptions, LeakDetectorCollectorConfig as LeakDetectorCollectorOptions, @@ -92,7 +90,6 @@ pub(super) struct CollectorState { nmxt: HashMap, Collector>, nvue_rest: HashMap, Collector>, nvue_gnmi: HashMap, Collector>, - inventories: HashMap, SharedInventory>, } impl CollectorState { @@ -107,7 +104,6 @@ impl CollectorState { nmxt: HashMap::new(), nvue_rest: HashMap::new(), nvue_gnmi: HashMap::new(), - inventories: HashMap::new(), } } @@ -142,25 +138,18 @@ impl CollectorState { } } - pub(super) fn inventory_for(&mut self, key: &str) -> SharedInventory { - if let Some(shared) = self.inventories.get(key) { - return shared.clone(); - } - let shared = Arc::new(ArcSwapOption::empty()); - self.inventories - .insert(Cow::Owned(key.to_string()), shared.clone()); - shared - } - - /// Drop the shared inventory handle for a removed endpoint. - pub(super) fn remove_inventory(&mut self, key: &str) { - self.inventories.remove(key); - } - pub(super) fn contains(&self, kind: CollectorKind, key: &str) -> bool { self.map(kind).contains_key(key) } + pub(super) fn event_node( + &self, + kind: CollectorKind, + key: &str, + ) -> Option> { + self.map(kind).get(key).and_then(Collector::event_node) + } + pub(super) fn insert( &mut self, kind: CollectorKind, diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 3c4fae159c..d961219dd4 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -27,7 +27,7 @@ use super::spawn::spawn_collectors_for_endpoint; use crate::HealthError; use crate::endpoint::{BmcEndpoint, EndpointSource}; use crate::sharding::ShardManager; -use crate::sink::DataSink; +use crate::sink::SyncEventNode; fn active_keys(sharded_endpoints: &[Arc]) -> HashSet> { sharded_endpoints @@ -40,7 +40,7 @@ pub async fn run_discovery_iteration( endpoint_source: Arc, shard_manager: &ShardManager, ctx: &mut DiscoveryLoopContext, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result { let iteration_start = Instant::now(); diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d0c045dba1..efe0b38c6f 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -33,7 +33,43 @@ use crate::collectors::{ }; use crate::config::{Configurable, LogCollectionMode, PeriodicLogConfig}; use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; -use crate::sink::DataSink; +use crate::sink::{CompositeSyncEventNode, SyncEventNode}; + +type SpawnGraphFn = fn( + &mut DiscoveryLoopContext, + &Arc, + Option>, + &str, +) -> Result<(), HealthError>; + +struct EndpointGraphSpec { + name: &'static str, + applies_to: fn(&BmcEndpoint) -> bool, + spawn: SpawnGraphFn, +} + +fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { + endpoint + .switch_data() + .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) +} + +fn is_generic_redfish_endpoint(endpoint: &BmcEndpoint) -> bool { + !is_switch_host_endpoint(endpoint) +} + +const ENDPOINT_GRAPH_SPECS: [EndpointGraphSpec; 2] = [ + EndpointGraphSpec { + name: "generic_redfish", + applies_to: is_generic_redfish_endpoint, + spawn: spawn_generic_redfish_collectors, + }, + EndpointGraphSpec { + name: "switch_host", + applies_to: is_switch_host_endpoint, + spawn: spawn_switch_host_collectors, + }, +]; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { PathBuf::from(template.replace("{machine_id}", endpoint_id)) @@ -42,22 +78,27 @@ fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { pub(super) fn spawn_collectors_for_endpoint( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { - let endpoint_role = endpoint.switch_data().map(|switch| switch.endpoint_role); - - if matches!(endpoint_role, Some(SwitchEndpointRole::Host)) { - spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) - } else { - spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + for spec in ENDPOINT_GRAPH_SPECS { + if !(spec.applies_to)(endpoint) { + continue; + } + tracing::debug!( + endpoint_key = %endpoint.key(), + graph_spec = spec.name, + "applying endpoint graph spec" + ); + return (spec.spawn)(ctx, endpoint, data_sink, metrics_prefix); } + Ok(()) } fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); @@ -67,51 +108,9 @@ fn spawn_generic_redfish_collectors( let sensors_enabled = matches!(ctx.sensors_config, Configurable::Enabled(_)); let metrics_enabled = matches!(ctx.metrics_config, Configurable::Enabled(_)); - if (sensors_enabled || metrics_enabled) - && !ctx.collectors.contains(CollectorKind::Discovery, &key) - { - let shared = ctx.collectors.inventory_for(&key); - let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( - format!("entity_discovery_collector_{key}"), - metrics_prefix, - )?); - match Collector::start::>( - endpoint_arc.clone(), - bmc.clone(), - EntityDiscoveryCollectorConfig { - shared, - discovery_concurrency: ctx.discovery_config.discovery_concurrency, - }, - CollectorStartContext { - limiter: ctx.limiter.clone(), - iteration_interval: ctx.discovery_config.refresh_interval, - collector_registry, - metrics_manager: ctx.metrics_manager.clone(), - }, - ) { - Ok(monitor) => { - ctx.collectors - .insert(CollectorKind::Discovery, key.clone().into(), monitor); - tracing::info!( - endpoint_key = %key, - total_collectors = ctx.collectors.len(CollectorKind::Discovery), - "Started entity discovery for BMC endpoint" - ); - } - Err(error) => { - tracing::error!( - ?error, - "Could not start entity discovery collector for: {:?}", - endpoint.addr - ); - } - } - } - if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) { - let shared = ctx.collectors.inventory_for(&key); let collector_registry = Arc::new( ctx.metrics_manager .create_collector_registry(format!("sensor_collector_{key}"), metrics_prefix)?, @@ -121,9 +120,9 @@ fn spawn_generic_redfish_collectors( bmc.clone(), SensorCollectorConfig { data_sink: data_sink.clone(), - shared, sensor_fetch_concurrency: sensor_cfg.sensor_fetch_concurrency, include_sensor_thresholds: sensor_cfg.include_sensor_thresholds, + _bmc: std::marker::PhantomData, }, CollectorStartContext { limiter: ctx.limiter.clone(), @@ -154,7 +153,6 @@ fn spawn_generic_redfish_collectors( if let Configurable::Enabled(metrics_cfg) = &ctx.metrics_config && !ctx.collectors.contains(CollectorKind::Metrics, &key) { - let shared = ctx.collectors.inventory_for(&key); let collector_registry = Arc::new( ctx.metrics_manager .create_collector_registry(format!("metrics_collector_{key}"), metrics_prefix)?, @@ -164,8 +162,8 @@ fn spawn_generic_redfish_collectors( bmc.clone(), MetricsCollectorConfig { data_sink: data_sink.clone(), - shared, fetch_concurrency: metrics_cfg.fetch_concurrency, + _bmc: std::marker::PhantomData, }, CollectorStartContext { limiter: ctx.limiter.clone(), @@ -193,6 +191,77 @@ fn spawn_generic_redfish_collectors( } } + // Discovery's inventory fanout is captured when it starts, so only start it once + // every enabled consumer is up. Otherwise a consumer that starts in a later + // iteration (e.g. after a transient start failure) would never be wired in. + let sensor_ready = !sensors_enabled || ctx.collectors.contains(CollectorKind::Sensor, &key); + let metrics_ready = !metrics_enabled || ctx.collectors.contains(CollectorKind::Metrics, &key); + if (sensors_enabled || metrics_enabled) + && sensor_ready + && metrics_ready + && !ctx.collectors.contains(CollectorKind::Discovery, &key) + { + let mut discovery_nodes: Vec> = Vec::new(); + if sensors_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Sensor, &key) + { + discovery_nodes.push(event_node); + } + if metrics_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Metrics, &key) + { + discovery_nodes.push(event_node); + } + if let Some(data_sink) = data_sink.clone() { + discovery_nodes.push(data_sink); + } + let discovery_data_sink = if discovery_nodes.is_empty() { + None + } else { + Some(Arc::new(CompositeSyncEventNode::new( + discovery_nodes, + ctx.metrics_manager.clone(), + )) as Arc) + }; + + let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( + format!("entity_discovery_collector_{key}"), + metrics_prefix, + )?); + match Collector::start::>( + endpoint_arc.clone(), + bmc.clone(), + EntityDiscoveryCollectorConfig { + data_sink: discovery_data_sink, + discovery_concurrency: ctx.discovery_config.discovery_concurrency, + _bmc: std::marker::PhantomData, + }, + CollectorStartContext { + limiter: ctx.limiter.clone(), + iteration_interval: ctx.discovery_config.refresh_interval, + collector_registry, + metrics_manager: ctx.metrics_manager.clone(), + }, + ) { + Ok(monitor) => { + ctx.collectors + .insert(CollectorKind::Discovery, key.clone().into(), monitor); + tracing::info!( + endpoint_key = %key, + total_collectors = ctx.collectors.len(CollectorKind::Discovery), + "Started entity discovery for BMC endpoint" + ); + } + Err(error) => { + tracing::error!( + ?error, + "Could not start entity discovery collector for: {:?}", + endpoint.addr + ); + } + } + } + if let Configurable::Enabled(logs_cfg) = &ctx.logs_config && !ctx.collectors.contains(CollectorKind::Logs, &key) { @@ -210,7 +279,7 @@ fn spawn_generic_redfish_collectors( }; let spawn_periodic_logs = |pcfg: PeriodicLogConfig, - data_sink: Option>, + data_sink: Option>, collector_registry: Arc<_>| -> Option> { let endpoint_id = endpoint.log_identity().into_owned(); @@ -417,7 +486,7 @@ fn spawn_generic_redfish_collectors( fn spawn_switch_host_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, - data_sink: Option>, + data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); @@ -568,16 +637,18 @@ mod tests { }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; - use crate::sink::{CollectorEvent, EventContext}; + use crate::sink::{EventContext, HealthEvent}; struct NoopSink; - impl DataSink for NoopSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for NoopSink { + fn node_type(&self) -> &'static str { "noop" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { + Vec::new() + } } fn context_with_config(config: Config, metrics_name: &str) -> DiscoveryLoopContext { diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 51f9b3f4ef..870575540c 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -44,15 +44,14 @@ use crate::endpoint::{CompositeEndpointSource, EndpointSource, StaticEndpointSou use crate::limiter::{BucketLimiter, NoopLimiter, RateLimiter}; use crate::metrics::{MetricsManager, run_metrics_server}; use crate::processor::{ - BmcIntrusionEventProcessor, EventProcessingPipeline, EventProcessor, HealthReportProcessor, - LeakEventProcessor, RackLeakProcessor, + BmcIntrusionSyncEventNode, EventGraph, HealthReportProcessor, LeakSyncEventNode, + RackLeakProcessor, }; use crate::sharding::ShardManager; use crate::sink::event_mapper::{OpenBmcEventMapper, RedfishEventMapper}; use crate::sink::{ - CompositeDataSink, DataSink, HealthReportSink, LogFileSink, OtlpSink, - PowerShelfHealthReportSink, PrometheusSink, RackHealthReportSink, SwitchHealthReportSink, - TracingSink, + HealthReportSink, LogFileSink, OtlpSink, PowerShelfHealthReportSink, PrometheusSink, + RackHealthReportSink, SwitchHealthReportSink, SyncEventNode, TracingSink, }; #[derive(thiserror::Error, Debug)] @@ -163,16 +162,15 @@ fn build_endpoint_wiring(config: &Config) -> Result fn build_data_sink( config: &Config, metrics_manager: Arc, -) -> Result>, HealthError> { - let mut sinks: Vec> = Vec::new(); - let mut processors: Vec> = Vec::new(); +) -> Result>, HealthError> { + let mut nodes: Vec> = Vec::new(); if let Configurable::Enabled(sink_cfg) = &config.sinks.tracing { - sinks.push(Arc::new(TracingSink::new(sink_cfg))); + nodes.push(Arc::new(TracingSink::new(sink_cfg))); } if let Configurable::Enabled(_) = &config.sinks.prometheus { - sinks.push(Arc::new(PrometheusSink::new( + nodes.push(Arc::new(PrometheusSink::new( metrics_manager.clone(), &config.metrics.prefix, )?)); @@ -184,50 +182,50 @@ fn build_data_sink( || config.sinks.switch_health_report.is_enabled() || config.processors.leak_detection.is_enabled() { - processors.push(Arc::new(HealthReportProcessor::new())); + nodes.push(Arc::new(HealthReportProcessor::new())); } if config.sinks.health_report.is_enabled() { - processors.push(Arc::new(BmcIntrusionEventProcessor::new())); + nodes.push(Arc::new(BmcIntrusionSyncEventNode::new())); } if let Configurable::Enabled(ref leak_detection_cfg) = config.processors.leak_detection { - processors.push(Arc::new(LeakEventProcessor::new( + nodes.push(Arc::new(LeakSyncEventNode::new( leak_detection_cfg.minimum_alerts_per_report, ))); } if let Configurable::Enabled(ref rack_leak_cfg) = config.processors.rack_leak { - processors.push(Arc::new(RackLeakProcessor::new( + nodes.push(Arc::new(RackLeakProcessor::new( rack_leak_cfg.leaking_tray_threshold, ))); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.log_file { - sinks.push(Arc::new( + nodes.push(Arc::new( LogFileSink::new(sink_cfg).map_err(HealthError::GenericError)?, )); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.health_report { - sinks.push(Arc::new(HealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(HealthReportSink::new(sink_cfg)?)); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.rack_health_report { - sinks.push(Arc::new(RackHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(RackHealthReportSink::new(sink_cfg)?)); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.switch_health_report { - sinks.push(Arc::new(SwitchHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(SwitchHealthReportSink::new(sink_cfg)?)); } if let Configurable::Enabled(ref sink_cfg) = config.sinks.power_shelf_health_report { - sinks.push(Arc::new(PowerShelfHealthReportSink::new(sink_cfg)?)); + nodes.push(Arc::new(PowerShelfHealthReportSink::new(sink_cfg)?)); } if let Configurable::Enabled(ref otlp_cfg) = config.sinks.otlp { let mapper: Arc = Arc::new(OpenBmcEventMapper); - sinks.push(Arc::new(OtlpSink::new( + nodes.push(Arc::new(OtlpSink::new( otlp_cfg, mapper, &metrics_manager, @@ -235,22 +233,11 @@ fn build_data_sink( )?)); } - if sinks.is_empty() { + if nodes.is_empty() { return Ok(None); } - let composite_sink: Arc = - Arc::new(CompositeDataSink::new(sinks, metrics_manager.clone())); - - if processors.is_empty() { - return Ok(Some(composite_sink)); - } - - Ok(Some(Arc::new(EventProcessingPipeline::new( - processors, - composite_sink, - metrics_manager, - )))) + Ok(Some(Arc::new(EventGraph::new(nodes, metrics_manager)))) } pub async fn run_service(config: Config) -> Result<(), HealthError> { diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fb71066fb9..0efa0893dd 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -28,7 +28,7 @@ use super::metrics::{ }; use super::resource::Resource; use crate::endpoint::SwitchEndpointRole; -use crate::sink::{CollectorEvent, EventContext, MetricSample}; +use crate::sink::{EventContext, HealthEvent, MetricSample}; fn severity_text_to_number(severity: &str) -> i32 { match severity.to_uppercase().as_str() { @@ -148,10 +148,10 @@ fn convert_log(log: &crate::sink::LogRecord, observed_nanos: u64) -> OtlpLogReco } } -fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option { +fn convert_event(event: &HealthEvent, observed_nanos: u64) -> Option { match event { - CollectorEvent::Log(log) => Some(convert_log(log, observed_nanos)), - CollectorEvent::HealthReport(report) => { + HealthEvent::LogObserved(log) => Some(convert_log(log, observed_nanos)), + HealthEvent::HealthReportProduced(report) => { let body = format!( "health report: {} alerts, {} ok (source: {:?})", report.alerts.len(), @@ -173,7 +173,7 @@ fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option { + HealthEvent::FirmwareObserved(info) => { let body = format!("{}: {}", info.component, info.version); Some(OtlpLogRecord { time_unix_nano: observed_nanos, @@ -185,15 +185,18 @@ fn convert_event(event: &CollectorEvent, observed_nanos: u64) -> Option None, + HealthEvent::MeasurementObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } + | HealthEvent::ScrapeBatchStarted + | HealthEvent::ScrapeBatchFinished + | HealthEvent::NodeRemoved => None, } } /// Builds an OTLP log export request grouped by endpoint. -pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportLogsServiceRequest { +pub fn build_export_request(batch: &[(EventContext, HealthEvent)]) -> ExportLogsServiceRequest { let observed_nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap_or_default() @@ -529,7 +532,7 @@ mod tests { #[test] fn log_event_converts_to_otlp_record() { let ctx = test_context(); - let log = CollectorEvent::Log(Box::new(LogRecord { + let log = HealthEvent::LogObserved(Box::new(LogRecord { body: "something happened".to_string(), severity: "WARNING".to_string(), attributes: vec![(Cow::Borrowed("entry_id"), "42".to_string())], @@ -557,7 +560,7 @@ mod tests { r#"{"key":"redfish.parent.log_entry_id","value":"42"}]}"# ); - let log = CollectorEvent::Log(Box::new(LogRecord { + let log = HealthEvent::LogObserved(Box::new(LogRecord { body: body.to_string(), severity: "WARN".to_string(), attributes: vec![ @@ -596,8 +599,8 @@ mod tests { fn metric_events_are_filtered_out() { let ctx = test_context(); let batch = vec![ - (ctx.clone(), CollectorEvent::MetricCollectionStart), - (ctx, CollectorEvent::MetricCollectionEnd), + (ctx.clone(), HealthEvent::ScrapeBatchStarted), + (ctx, HealthEvent::ScrapeBatchFinished), ]; let request = build_export_request(&batch); assert!(request.resource_logs.is_empty()); @@ -606,7 +609,7 @@ mod tests { #[test] fn health_report_converts_with_alert_severity() { let ctx = test_context(); - let report = CollectorEvent::HealthReport( + let report = HealthEvent::HealthReportProduced( HealthReport { source: ReportSource::BmcSensors, target: None, @@ -648,7 +651,7 @@ mod tests { let log = |ctx| { ( ctx, - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: "x".to_string(), severity: "INFO".to_string(), attributes: vec![], diff --git a/crates/health/src/otlp/drain.rs b/crates/health/src/otlp/drain.rs index e1e9408a3a..0d1d9aa892 100644 --- a/crates/health/src/otlp/drain.rs +++ b/crates/health/src/otlp/drain.rs @@ -24,7 +24,7 @@ use super::collector_logs::logs_service_client::LogsServiceClient; use super::convert::build_export_request; use crate::collectors::{BackoffConfig, ExponentialBackoff}; use crate::sink::otlp::OtlpQueue; -use crate::sink::{CollectorEvent, EventContext}; +use crate::sink::{EventContext, HealthEvent}; pub(crate) struct OtlpDrainTask { queue: Arc, @@ -48,7 +48,7 @@ impl OtlpDrainTask { } } - fn drain_batch(&self, batch: &mut Vec<(EventContext, CollectorEvent)>) { + fn drain_batch(&self, batch: &mut Vec<(EventContext, HealthEvent)>) { let remaining = self.batch_size.saturating_sub(batch.len()); for _ in 0..remaining { match self.queue.pop() { @@ -127,7 +127,7 @@ impl OtlpDrainTask { async fn flush( &self, client: &mut LogsServiceClient, - batch: &mut Vec<(EventContext, CollectorEvent)>, + batch: &mut Vec<(EventContext, HealthEvent)>, ) { if batch.is_empty() { return; diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 4ccfc23089..22e070447a 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use dashmap::DashMap; use nv_redfish::resource::Health as BmcHealth; -use super::{CollectorEvent, EventContext, EventProcessor}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, HealthReportSuccess, MetricSample, Probe, ReportSource, SensorThresholdContext, @@ -198,18 +198,18 @@ impl HealthReportProcessor { } } -impl EventProcessor for HealthReportProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for HealthReportProcessor { + fn node_type(&self) -> &'static str { "health_report_processor" } - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => { self.windows .insert(Self::stream_key(context), HealthReportWindow::default()); } - CollectorEvent::Metric(metric) => { + HealthEvent::MeasurementObserved(metric) => { let Some(health) = metric.context.as_ref() else { return Vec::new(); }; @@ -219,10 +219,18 @@ impl EventProcessor for HealthReportProcessor { SensorHealthResult::Alert(alert) => window.alerts.push(alert), } } - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { let Some((_, window)) = self.windows.remove(&Self::stream_key(context)) else { return Vec::new(); }; + if window.successes.is_empty() && window.alerts.is_empty() { + tracing::debug!( + endpoint = %context.addr.mac, + collector_type = context.collector_type, + "Skipping empty hardware health report" + ); + return Vec::new(); + } let report = HealthReport { source: ReportSource::BmcSensors, target: context.health_report_target(), @@ -233,19 +241,23 @@ impl EventProcessor for HealthReportProcessor { tracing::info!( endpoint = %context.addr.mac, + target = ?report.target, success_count = report.successes.len(), alert_count = report.alerts.len(), "Sending hardware health report" ); - return vec![CollectorEvent::HealthReport(Arc::new(report))]; + return vec![HealthEvent::HealthReportProduced(Arc::new(report))]; } - CollectorEvent::CollectorRemoved => { + HealthEvent::NodeRemoved => { self.windows.remove(&Self::stream_key(context)); } - CollectorEvent::Log(_) - | CollectorEvent::Firmware(_) - | CollectorEvent::HealthReport(_) => {} + HealthEvent::LogObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } + | HealthEvent::FirmwareObserved(_) + | HealthEvent::HealthReportProduced(_) => {} } Vec::new() @@ -291,10 +303,10 @@ mod tests { let processor = HealthReportProcessor::new(); let context = test_context(); - let _ = processor.process_event(&context, &CollectorEvent::MetricCollectionStart); - let _ = processor.process_event( + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let _ = processor.handle_event( &context, - &CollectorEvent::Metric( + &HealthEvent::MeasurementObserved( MetricSample { key: "sensor-1".to_string(), name: "hw_sensor".to_string(), @@ -319,9 +331,9 @@ mod tests { .into(), ), ); - let emitted = processor.process_event(&context, &CollectorEvent::MetricCollectionEnd); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); - let Some(CollectorEvent::HealthReport(report)) = emitted.last() else { + let Some(HealthEvent::HealthReportProduced(report)) = emitted.last() else { panic!("expected health report event"); }; @@ -336,10 +348,22 @@ mod tests { let processor = HealthReportProcessor::new(); let context = test_context(); - let _ = processor.process_event(&context, &CollectorEvent::MetricCollectionStart); + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); assert_eq!(processor.windows.len(), 1); - let emitted = processor.process_event(&context, &CollectorEvent::CollectorRemoved); + let emitted = processor.handle_event(&context, &HealthEvent::NodeRemoved); + + assert!(emitted.is_empty()); + assert!(processor.windows.is_empty()); + } + + #[test] + fn empty_metric_window_does_not_emit_health_report() { + let processor = HealthReportProcessor::new(); + let context = test_context(); + + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); assert!(emitted.is_empty()); assert!(processor.windows.is_empty()); diff --git a/crates/health/src/processor/intrusion_events.rs b/crates/health/src/processor/intrusion_events.rs index f1c82afd07..5de2ab59c1 100644 --- a/crates/health/src/processor/intrusion_events.rs +++ b/crates/health/src/processor/intrusion_events.rs @@ -18,7 +18,7 @@ use std::borrow::Cow; use std::sync::Arc; -use super::{CollectorEvent, EventContext, EventProcessor}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, LogRecord, Probe, ReportSource, @@ -34,9 +34,9 @@ enum IntrusionEventState { } #[derive(Default)] -pub struct BmcIntrusionEventProcessor; +pub struct BmcIntrusionSyncEventNode; -impl BmcIntrusionEventProcessor { +impl BmcIntrusionSyncEventNode { pub fn new() -> Self { Self } @@ -98,17 +98,13 @@ impl BmcIntrusionEventProcessor { } } -impl EventProcessor for BmcIntrusionEventProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for BmcIntrusionSyncEventNode { + fn node_type(&self) -> &'static str { "bmc_intrusion_event_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { - let CollectorEvent::Log(record) = event else { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::LogObserved(record) = event else { return Vec::new(); }; @@ -146,7 +142,7 @@ impl EventProcessor for BmcIntrusionEventProcessor { alerts, }; - vec![CollectorEvent::HealthReport(Arc::new(report))] + vec![HealthEvent::HealthReportProduced(Arc::new(report))] } } @@ -192,13 +188,13 @@ mod tests { } } - fn log(body: &str, severity: &str, message_args: Option<&str>) -> CollectorEvent { + fn log(body: &str, severity: &str, message_args: Option<&str>) -> HealthEvent { let mut attributes = Vec::new(); if let Some(message_args) = message_args { attributes.push((Cow::Borrowed("message_args"), message_args.to_string())); } - CollectorEvent::Log(Box::new(LogRecord { + HealthEvent::LogObserved(Box::new(LogRecord { body: body.to_string(), severity: severity.to_string(), attributes, @@ -206,12 +202,12 @@ mod tests { })) } - fn emitted_report(event: CollectorEvent) -> Arc { - let processor = BmcIntrusionEventProcessor::new(); - let emitted = processor.process_event(&context(), &event); + fn emitted_report(event: HealthEvent) -> Arc { + let processor = BmcIntrusionSyncEventNode::new(); + let emitted = processor.handle_event(&context(), &event); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; @@ -338,8 +334,8 @@ mod tests { #[test] fn ignores_unrelated_logs() { - let processor = BmcIntrusionEventProcessor::new(); - let emitted = processor.process_event( + let processor = BmcIntrusionSyncEventNode::new(); + let emitted = processor.handle_event( &context(), &log("CPU temperature threshold warning", "Warning", None), ); diff --git a/crates/health/src/processor/leak_events.rs b/crates/health/src/processor/leak_events.rs index a34dadc775..2ce547d8a8 100644 --- a/crates/health/src/processor/leak_events.rs +++ b/crates/health/src/processor/leak_events.rs @@ -18,17 +18,17 @@ use std::collections::BTreeSet; use std::sync::Arc; -use super::{EventContext, EventProcessor}; +use super::{EventContext, SyncEventNode}; use crate::sink::{ - Classification, CollectorEvent, HealthReport, HealthReportAlert, HealthReportSuccess, + Classification, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, Probe, ReportSource, }; -pub struct LeakEventProcessor { +pub struct LeakSyncEventNode { minimum_alerts_per_report: usize, } -impl LeakEventProcessor { +impl LeakSyncEventNode { pub fn new(minimum_alerts_per_report: usize) -> Self { Self { minimum_alerts_per_report, @@ -60,17 +60,13 @@ fn leak_details(alerts: &[&HealthReportAlert]) -> String { targets.iter().cloned().collect::>().join(", ") } -impl EventProcessor for LeakEventProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for LeakSyncEventNode { + fn node_type(&self) -> &'static str { "leak_event_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { - let CollectorEvent::HealthReport(report) = event else { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { return Vec::new(); }; @@ -119,7 +115,7 @@ impl EventProcessor for LeakEventProcessor { alerts, }; - vec![CollectorEvent::HealthReport(Arc::new(leak_report))] + vec![HealthEvent::HealthReportProduced(Arc::new(leak_report))] } } @@ -158,7 +154,7 @@ mod tests { #[test] fn does_not_emit_alert_when_threshold_not_met() { - let processor = LeakEventProcessor::new(2); + let processor = LeakSyncEventNode::new(2); let report = HealthReport { source: ReportSource::BmcLeakDetectors, target: Some(HealthReportTarget::Machine), @@ -167,11 +163,13 @@ mod tests { alerts: vec![leak_alert("LeakDetector_Probe")], }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(derived) = &emitted[0] else { + let HealthEvent::HealthReportProduced(derived) = &emitted[0] else { panic!("expected derived health report"); }; @@ -183,7 +181,7 @@ mod tests { #[test] fn emits_derived_leak_report_when_threshold_met() { - let processor = LeakEventProcessor::new(1); + let processor = LeakSyncEventNode::new(1); let report = HealthReport { source: ReportSource::BmcLeakDetectors, target: Some(HealthReportTarget::Machine), @@ -192,11 +190,13 @@ mod tests { alerts: vec![leak_alert("LeakDetector_Probe")], }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(derived) = &emitted[0] else { + let HealthEvent::HealthReportProduced(derived) = &emitted[0] else { panic!("expected derived health report"); }; assert_eq!(derived.source, ReportSource::TrayLeakDetection); @@ -213,8 +213,8 @@ mod tests { #[test] fn ignores_non_health_report_events() { - let processor = LeakEventProcessor::new(1); - let metric_event = CollectorEvent::Metric( + let processor = LeakSyncEventNode::new(1); + let metric_event = HealthEvent::MeasurementObserved( crate::sink::MetricSample { key: "k".to_string(), name: "n".to_string(), @@ -226,13 +226,13 @@ mod tests { } .into(), ); - let emitted = processor.process_event(&context(), &metric_event); + let emitted = processor.handle_event(&context(), &metric_event); assert!(emitted.is_empty()); } #[test] fn ignores_sensor_health_reports() { - let processor = LeakEventProcessor::new(1); + let processor = LeakSyncEventNode::new(1); let report = HealthReport { source: ReportSource::BmcSensors, observed_at: Some(chrono::Utc::now()), @@ -244,8 +244,10 @@ mod tests { target: Some(HealthReportTarget::Machine), }; - let emitted = - processor.process_event(&context(), &CollectorEvent::HealthReport(Arc::new(report))); + let emitted = processor.handle_event( + &context(), + &HealthEvent::HealthReportProduced(Arc::new(report)), + ); assert!(emitted.is_empty()); } diff --git a/crates/health/src/processor/mod.rs b/crates/health/src/processor/mod.rs index 1e9bbc95eb..1c487543ca 100644 --- a/crates/health/src/processor/mod.rs +++ b/crates/health/src/processor/mod.rs @@ -25,42 +25,31 @@ mod intrusion_events; mod leak_events; mod rack_leak; pub use health_report::HealthReportProcessor; -pub use intrusion_events::BmcIntrusionEventProcessor; -pub use leak_events::LeakEventProcessor; +pub use intrusion_events::BmcIntrusionSyncEventNode; +pub use leak_events::LeakSyncEventNode; pub use rack_leak::RackLeakProcessor; use crate::metrics::{ComponentMetrics, MetricsManager}; -use crate::sink::{CollectorEvent, DataSink, EventContext}; - -pub trait EventProcessor: Send + Sync { - fn processor_type(&self) -> &'static str; - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec; -} +use crate::sink::{EventContext, HealthEvent, SyncEventNode}; struct PendingEvent<'a> { - event: Cow<'a, CollectorEvent>, + event: Cow<'a, HealthEvent>, blocked_processors: Vec, } -pub struct EventProcessingPipeline { - processors: Vec>, - sink: Arc, +pub struct EventGraph { + nodes: Vec>, component_metrics: Arc, } -impl EventProcessingPipeline { - pub fn new( - processors: Vec>, - sink: Arc, - metrics_manager: Arc, - ) -> Self { +impl EventGraph { + pub fn new(nodes: Vec>, metrics_manager: Arc) -> Self { debug_assert!( - !processors.is_empty(), - "EventProcessingPipeline should only be used when processors are configured" + !nodes.is_empty(), + "EventGraph should only be used when nodes are configured" ); Self { - processors, - sink, + nodes, component_metrics: metrics_manager.component_metrics(), } } @@ -68,20 +57,24 @@ impl EventProcessingPipeline { fn next_events( &self, context: &EventContext, - current_event: &CollectorEvent, - blocked_processors: &[bool], + current_event: &HealthEvent, + blocked_nodes: &[bool], queue: &mut VecDeque, ) { - for (processor_idx, processor) in self.processors.iter().enumerate() { - if blocked_processors[processor_idx] { + for (node_idx, node) in self.nodes.iter().enumerate() { + if blocked_nodes[node_idx] { + continue; + } + + if !node.interested_in(current_event) { continue; } let start = Instant::now(); - let emitted = processor.process_event(context, current_event); + let emitted = node.handle_event(context, current_event); self.component_metrics.record_operation( crate::metrics::ComponentKind::Processor, - processor.processor_type(), + node.node_type(), start.elapsed(), true, ); @@ -90,8 +83,8 @@ impl EventProcessingPipeline { } for event in emitted { - let mut next_blocked_processors = blocked_processors.to_vec(); - next_blocked_processors[processor_idx] = true; + let mut next_blocked_processors = blocked_nodes.to_vec(); + next_blocked_processors[node_idx] = true; queue.push_back(PendingEvent { event: Cow::Owned(event), blocked_processors: next_blocked_processors, @@ -101,19 +94,18 @@ impl EventProcessingPipeline { } } -impl DataSink for EventProcessingPipeline { - fn sink_type(&self) -> &'static str { - "event_processing_pipeline" +impl SyncEventNode for EventGraph { + fn node_type(&self) -> &'static str { + "event_graph" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { let mut queue = VecDeque::from(vec![PendingEvent { event: Cow::Borrowed(event), - blocked_processors: vec![false; self.processors.len()], + blocked_processors: vec![false; self.nodes.len()], }]); while let Some(current) = queue.pop_front() { - self.sink.handle_event(context, ¤t.event); self.next_events( context, ¤t.event, @@ -121,6 +113,7 @@ impl DataSink for EventProcessingPipeline { &mut queue, ); } + Vec::new() } } @@ -141,13 +134,14 @@ mod tests { counter: Arc, } - impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); + Vec::new() } } @@ -155,16 +149,12 @@ mod tests { counter: Arc, } - impl EventProcessor for SelfReemittingProcessor { - fn processor_type(&self) -> &'static str { + impl SyncEventNode for SelfReemittingProcessor { + fn node_type(&self) -> &'static str { "self_reemitting_processor" } - fn process_event( - &self, - _context: &EventContext, - event: &CollectorEvent, - ) -> Vec { + fn handle_event(&self, _context: &EventContext, event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); vec![event.clone()] } @@ -190,17 +180,19 @@ mod tests { let sink_counter = Arc::new(AtomicUsize::new(0)); let metrics_manager = Arc::new(MetricsManager::new("test").expect("should create metrics manager")); - let pipeline = EventProcessingPipeline::new( - vec![Arc::new(SelfReemittingProcessor { - counter: processor_counter.clone(), - })], - Arc::new(CountingSink { - counter: sink_counter.clone(), - }), + let pipeline = EventGraph::new( + vec![ + Arc::new(CountingSink { + counter: sink_counter.clone(), + }), + Arc::new(SelfReemittingProcessor { + counter: processor_counter.clone(), + }), + ], metrics_manager, ); - let event = CollectorEvent::Metric( + let event = HealthEvent::MeasurementObserved( crate::sink::MetricSample { key: "k".to_string(), name: "n".to_string(), diff --git a/crates/health/src/processor/rack_leak.rs b/crates/health/src/processor/rack_leak.rs index 69a2d69cc0..4bcc2c4c13 100644 --- a/crates/health/src/processor/rack_leak.rs +++ b/crates/health/src/processor/rack_leak.rs @@ -21,9 +21,9 @@ use std::sync::Arc; use carbide_uuid::rack::RackId; use dashmap::DashMap; -use super::{EventContext, EventProcessor}; +use super::{EventContext, SyncEventNode}; use crate::sink::{ - Classification, CollectorEvent, HealthReport, HealthReportAlert, HealthReportSuccess, + Classification, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, Probe, ReportSource, }; @@ -76,24 +76,24 @@ impl RackLeakProcessor { } } -impl EventProcessor for RackLeakProcessor { - fn processor_type(&self) -> &'static str { +impl SyncEventNode for RackLeakProcessor { + fn node_type(&self) -> &'static str { "rack_leak_processor" } - fn process_event(&self, context: &EventContext, event: &CollectorEvent) -> Vec { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { let Some(rack_id) = context.rack_id() else { return Vec::new(); }; - if matches!(event, CollectorEvent::CollectorRemoved) { + if matches!(event, HealthEvent::NodeRemoved) { if let Some(mut entry) = self.racks.get_mut(rack_id) { entry.leaking_trays.remove(context.endpoint_key()); } return Vec::new(); } - let CollectorEvent::HealthReport(report) = event else { + let HealthEvent::HealthReportProduced(report) = event else { return Vec::new(); }; @@ -124,7 +124,7 @@ impl EventProcessor for RackLeakProcessor { let leaking_count = entry.leaking_trays.len(); let report = self.build_report(leaking_count); - vec![CollectorEvent::HealthReport(Arc::new(report))] + vec![HealthEvent::HealthReportProduced(Arc::new(report))] } } @@ -166,7 +166,7 @@ mod tests { } } - fn tray_leak_report(leaking: bool) -> CollectorEvent { + fn tray_leak_report(leaking: bool) -> HealthEvent { let report = if leaking { HealthReport { source: ReportSource::TrayLeakDetection, @@ -192,7 +192,7 @@ mod tests { alerts: vec![], } }; - CollectorEvent::HealthReport(Arc::new(report)) + HealthEvent::HealthReportProduced(Arc::new(report)) } #[test] @@ -207,7 +207,7 @@ mod tests { alerts: vec![], }; let emitted = - processor.process_event(&ctx, &CollectorEvent::HealthReport(Arc::new(report))); + processor.handle_event(&ctx, &HealthEvent::HealthReportProduced(Arc::new(report))); assert!(emitted.is_empty()); } @@ -215,7 +215,7 @@ mod tests { fn ignores_events_without_rack_id() { let processor = RackLeakProcessor::new(2); let ctx = context_without_rack("42:9e:b1:bd:9d:dd"); - let emitted = processor.process_event(&ctx, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx, &tray_leak_report(true)); assert!(emitted.is_empty()); } @@ -224,10 +224,10 @@ mod tests { let processor = RackLeakProcessor::new(2); let ctx = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); - let emitted = processor.process_event(&ctx, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx, &tray_leak_report(true)); assert_eq!(emitted.len(), 1); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.source, ReportSource::RackLeakDetection); @@ -243,10 +243,10 @@ mod tests { let ctx_a = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx_b, &tray_leak_report(true)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.source, ReportSource::RackLeakDetection); @@ -263,13 +263,13 @@ mod tests { let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); processor - .process_event(&ctx_a, &tray_leak_report(true)) + .handle_event(&ctx_a, &tray_leak_report(true)) .len(); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_a, &tray_leak_report(false)); + let emitted = processor.handle_event(&ctx_a, &tray_leak_report(false)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert!(report.alerts.is_empty()); @@ -284,12 +284,12 @@ mod tests { let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); let ctx_c = context_with_rack("42:9e:b1:bd:9d:ff", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_c, &tray_leak_report(false)); + let emitted = processor.handle_event(&ctx_c, &tray_leak_report(false)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; assert_eq!(report.alerts.len(), 1, "rack should still be in alert"); @@ -302,10 +302,10 @@ mod tests { let ctx_a = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_b = context_with_rack("42:9e:b1:bd:9d:ee", "rack-1"); - processor.process_event(&ctx_a, &tray_leak_report(true)); - processor.process_event(&ctx_b, &tray_leak_report(true)); + processor.handle_event(&ctx_a, &tray_leak_report(true)); + processor.handle_event(&ctx_b, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_a, &CollectorEvent::CollectorRemoved); + let emitted = processor.handle_event(&ctx_a, &HealthEvent::NodeRemoved); assert!(emitted.is_empty()); let Some(rack) = processor.racks.get(ctx_a.rack_id().expect("rack id")) else { @@ -322,10 +322,10 @@ mod tests { let ctx_r1 = context_with_rack("42:9e:b1:bd:9d:dd", "rack-1"); let ctx_r2 = context_with_rack("42:9e:b1:bd:9d:ee", "rack-2"); - processor.process_event(&ctx_r1, &tray_leak_report(true)); - let emitted = processor.process_event(&ctx_r2, &tray_leak_report(true)); + processor.handle_event(&ctx_r1, &tray_leak_report(true)); + let emitted = processor.handle_event(&ctx_r2, &tray_leak_report(true)); - let CollectorEvent::HealthReport(report) = &emitted[0] else { + let HealthEvent::HealthReportProduced(report) = &emitted[0] else { panic!("expected health report"); }; diff --git a/crates/health/src/sink/composite.rs b/crates/health/src/sink/composite.rs index 7948d94da3..0c279a6e65 100644 --- a/crates/health/src/sink/composite.rs +++ b/crates/health/src/sink/composite.rs @@ -18,42 +18,46 @@ use std::sync::Arc; use std::time::Instant; -use super::{CollectorEvent, DataSink, EventContext}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::metrics::{ComponentKind, ComponentMetrics, MetricsManager}; -pub struct CompositeDataSink { - sinks: Vec>, +pub struct CompositeSyncEventNode { + sinks: Vec>, component_metrics: Arc, } -impl CompositeDataSink { - pub fn new(sinks: Vec>, metrics_manager: Arc) -> Self { +impl CompositeSyncEventNode { + pub fn new(sinks: Vec>, metrics_manager: Arc) -> Self { Self { sinks, component_metrics: metrics_manager.component_metrics(), } } - fn record_sink_operation(&self, sink: &dyn DataSink, duration: std::time::Duration) { + fn record_sink_operation(&self, sink: &dyn SyncEventNode, duration: std::time::Duration) { self.component_metrics.record_operation( ComponentKind::Sink, - sink.sink_type(), + sink.node_type(), duration, true, ); } } -impl DataSink for CompositeDataSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for CompositeSyncEventNode { + fn node_type(&self) -> &'static str { "composite_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { for sink in &self.sinks { + if !sink.interested_in(event) { + continue; + } let start = Instant::now(); sink.handle_event(context, event); self.record_sink_operation(sink.as_ref(), start.elapsed()); } + Vec::new() } } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 7b9a507c05..fc184985e5 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -30,6 +30,8 @@ use health_report::{ use nv_redfish::resource::Health as BmcHealth; use serde::Serialize; +use crate::bmc::BmcClient; +use crate::collectors::inventory::EntityInventory; use crate::endpoint::{BmcAddr, BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; use crate::metrics::MetricLabel; @@ -335,15 +337,41 @@ impl HealthReport { } } -#[derive(Clone, Debug)] -pub enum CollectorEvent { - MetricCollectionStart, - Metric(Box), - MetricCollectionEnd, - CollectorRemoved, - Log(Box), - Firmware(FirmwareInfo), - HealthReport(Arc), +#[derive(Clone)] +pub enum HealthEvent { + ScrapeRequested { + endpoint_key: String, + kind: ScrapeKind, + }, + InventoryDiscovered { + endpoint_key: String, + inventory: Arc>, + }, + InventoryUpdated { + endpoint_key: String, + generation: u64, + }, + ScrapeBatchStarted, + MeasurementObserved(Box), + ScrapeBatchFinished, + NodeRemoved, + LogObserved(Box), + FirmwareObserved(FirmwareInfo), + HealthReportProduced(Arc), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub enum ScrapeKind { + Inventory, + Sensors, + Metrics, + Logs, + Firmware, + LeakDetectors, + Nmxt, + NvueRest, + NvueGnmi, + Telemetry, } #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] diff --git a/crates/health/src/sink/health_report.rs b/crates/health/src/sink/health_report.rs index 73c5558f1c..8c3f3b722b 100644 --- a/crates/health/src/sink/health_report.rs +++ b/crates/health/src/sink/health_report.rs @@ -25,7 +25,7 @@ use carbide_uuid::machine::MachineId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -159,17 +159,17 @@ impl HealthReportSink { } } -impl DataSink for HealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for HealthReportSink { + fn node_type(&self) -> &'static str { "health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Machine) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -177,7 +177,7 @@ impl DataSink for HealthReportSink { source = ?report.source, "Skipping empty machine health report" ); - return; + return Vec::new(); } if let Some(machine_id) = context.machine_id() { @@ -201,7 +201,7 @@ impl DataSink for HealthReportSink { machine_id = %key.id, "Suppressing unchanged success-only health report" ); - return; + return Vec::new(); } cache.entries.insert( key.clone(), @@ -224,6 +224,7 @@ impl DataSink for HealthReportSink { "Received machine-target HealthReport event without machine_id context" ); } + Vec::new() } } @@ -387,7 +388,7 @@ mod tests { let ctx = machine_context(mid); let sink = make_sink(Some(Duration::from_secs(300))); let report = success_report(ReportSource::BmcSensors); - let event = CollectorEvent::HealthReport(Arc::clone(&report)); + let event = HealthEvent::HealthReportProduced(Arc::clone(&report)); sink.handle_event(&ctx, &event); assert!(sink.queue.pop().is_some(), "first send should go through"); @@ -410,13 +411,19 @@ mod tests { // Send a success first to populate last_sent, then send an alert. // The alert must not be suppressed, and the subsequent success must // also go through (alert clears the suppression entry). - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&success))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&success)), + ); sink.queue.pop(); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&alert))); + sink.handle_event(&ctx, &HealthEvent::HealthReportProduced(Arc::clone(&alert))); assert!(sink.queue.pop().is_some(), "alert should not be suppressed"); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&success))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&success)), + ); assert!( sink.queue.pop().is_some(), "first success after alert should not be suppressed" @@ -430,7 +437,10 @@ mod tests { let sink = make_sink(Some(Duration::from_secs(300))); let report_a = success_report(ReportSource::BmcSensors); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&report_a))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&report_a)), + ); sink.queue.pop(); let report_b = Arc::new(HealthReport { @@ -443,7 +453,10 @@ mod tests { }], alerts: Vec::new(), }); - sink.handle_event(&ctx, &CollectorEvent::HealthReport(Arc::clone(&report_b))); + sink.handle_event( + &ctx, + &HealthEvent::HealthReportProduced(Arc::clone(&report_b)), + ); assert!( sink.queue.pop().is_some(), @@ -457,7 +470,7 @@ mod tests { let ctx = machine_context(mid); let sink = make_sink(None); let report = success_report(ReportSource::BmcSensors); - let event = CollectorEvent::HealthReport(Arc::clone(&report)); + let event = HealthEvent::HealthReportProduced(Arc::clone(&report)); sink.handle_event(&ctx, &event); sink.queue.pop(); diff --git a/crates/health/src/sink/log_file.rs b/crates/health/src/sink/log_file.rs index 5d6f44ada9..826a1b1427 100644 --- a/crates/health/src/sink/log_file.rs +++ b/crates/health/src/sink/log_file.rs @@ -22,11 +22,11 @@ use std::sync::Mutex; use serde::Serialize; -use super::{CollectorEvent, DataSink, EventContext, LogRecord}; +use super::{EventContext, HealthEvent, LogRecord, SyncEventNode}; use crate::config::LogFileSinkConfig; -/// Durable JSONL log sink. Writes CollectorEvent::Log records to rotating -/// files using sync I/O, safe to call from DataSink::handle_event. +/// Durable JSONL log sink. Writes HealthEvent::LogObserved records to rotating +/// files using sync I/O, safe to call from SyncEventNode::handle_event. pub struct LogFileSink { writer: Mutex, include_diagnostics: bool, @@ -46,14 +46,14 @@ impl LogFileSink { } } -impl DataSink for LogFileSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for LogFileSink { + fn node_type(&self) -> &'static str { "log_file_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::Log(record) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::LogObserved(record) = event else { + return Vec::new(); }; // Diagnostics are opt-in for log files. When enabled, fold the @@ -66,18 +66,19 @@ impl DataSink for LogFileSink { Ok(json) => json, Err(e) => { tracing::error!(error = ?e, "failed to serialize log record"); - return; + return Vec::new(); } }; let Ok(mut writer) = self.writer.lock() else { tracing::error!("log file writer lock poisoned"); - return; + return Vec::new(); }; if let Err(e) = writer.write_line(&line) { tracing::error!(error = ?e, "failed to write log record to file"); } + Vec::new() } } @@ -257,7 +258,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let metric_event = CollectorEvent::MetricCollectionStart; + let metric_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&ctx, &metric_event); let log_path = dir.path().join("health_logs.jsonl"); @@ -277,7 +278,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "something happened".to_string(), severity: "INFO".to_string(), @@ -312,7 +313,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "parent log".to_string(), severity: "INFO".to_string(), @@ -362,7 +363,7 @@ mod tests { let sink = LogFileSink::new(&config).expect("sink"); let ctx = test_context(); - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: "parent log".to_string(), severity: "INFO".to_string(), @@ -399,7 +400,7 @@ mod tests { let ctx = test_context(); for i in 0..5 { - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: format!("log entry {i}"), severity: "INFO".to_string(), @@ -431,7 +432,7 @@ mod tests { let ctx = test_context(); for i in 0..5 { - let event = CollectorEvent::Log( + let event = HealthEvent::LogObserved( LogRecord { body: format!("entry {i}"), severity: "WARN".to_string(), diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index 9df648faf8..d474a8378f 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -31,11 +31,11 @@ mod rack_health_report; mod switch_health_report; mod tracing; -pub use composite::CompositeDataSink; +pub use composite::CompositeSyncEventNode; pub use events::{ - Classification, CollectorEvent, DiagnosticLogRecord, EventContext, FirmwareInfo, HealthReport, + Classification, DiagnosticLogRecord, EventContext, FirmwareInfo, HealthEvent, HealthReport, HealthReportAlert, HealthReportSuccess, HealthReportTarget, LogRecord, MetricSample, Probe, - ReportSource, SensorThresholdContext, + ReportSource, ScrapeKind, SensorThresholdContext, }; pub use health_report::HealthReportSink; pub use log_file::LogFileSink; @@ -50,9 +50,12 @@ pub(crate) use self::otlp::OtlpSink; #[cfg(feature = "bench-hooks")] pub use self::otlp::OtlpSink; -pub trait DataSink: Send + Sync { - fn sink_type(&self) -> &'static str; - fn handle_event(&self, context: &EventContext, event: &CollectorEvent); +pub trait SyncEventNode: Send + Sync { + fn node_type(&self) -> &'static str; + fn interested_in(&self, _event: &HealthEvent) -> bool { + true + } + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec; } #[cfg(test)] @@ -65,8 +68,8 @@ mod tests { use mac_address::MacAddress; use super::{ - CollectorEvent, CompositeDataSink, DataSink, DiagnosticLogRecord, EventContext, LogRecord, - MetricSample, PrometheusSink, + CompositeSyncEventNode, DiagnosticLogRecord, EventContext, HealthEvent, LogRecord, + MetricSample, PrometheusSink, SyncEventNode, }; use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::metrics::MetricsManager; @@ -75,24 +78,27 @@ mod tests { counter: Arc, } - impl DataSink for CountingSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for CountingSink { + fn node_type(&self) -> &'static str { "counting_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) { + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { self.counter.fetch_add(1, Ordering::SeqCst); + Vec::new() } } struct NoopSink; - impl DataSink for NoopSink { - fn sink_type(&self) -> &'static str { + impl SyncEventNode for NoopSink { + fn node_type(&self) -> &'static str { "noop_sink" } - fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} + fn handle_event(&self, _context: &EventContext, _event: &HealthEvent) -> Vec { + Vec::new() + } } #[tokio::test] @@ -110,7 +116,7 @@ mod tests { }); let composite = - CompositeDataSink::new(vec![sink_ok_1, sink_noop, sink_ok_2], metrics_manager); + CompositeSyncEventNode::new(vec![sink_ok_1, sink_noop, sink_ok_2], metrics_manager); let context = EventContext { endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), @@ -124,7 +130,7 @@ mod tests { rack_id: None, }; - let event = CollectorEvent::Metric( + let event = HealthEvent::MeasurementObserved( MetricSample { key: "key".to_string(), name: "metric".to_string(), @@ -168,7 +174,7 @@ mod tests { rack_id: None, }; - let log_event = CollectorEvent::Log( + let log_event = HealthEvent::LogObserved( LogRecord { body: "ignored by prometheus sink".to_string(), severity: "INFO".to_string(), @@ -187,7 +193,7 @@ mod tests { .expect("telemetry export should work"); assert!(!export_after_log.contains("test_sink_hw_sensor")); - let metric_event = CollectorEvent::Metric( + let metric_event = HealthEvent::MeasurementObserved( MetricSample { key: "metric_key".to_string(), name: "hw_sensor".to_string(), @@ -240,7 +246,7 @@ mod tests { rack_id: None, }; - let metric_event = CollectorEvent::Metric( + let metric_event = HealthEvent::MeasurementObserved( MetricSample { key: "metric_key".to_string(), name: "hw_sensor".to_string(), @@ -259,7 +265,7 @@ mod tests { .expect("telemetry export should work"); assert!(export_before_remove.contains("test_sink_hw_sensor_temperature_celsius")); - sink.handle_event(&context, &CollectorEvent::CollectorRemoved); + sink.handle_event(&context, &HealthEvent::NodeRemoved); let export_after_remove = metrics_manager .export_telemetry() @@ -295,9 +301,9 @@ mod tests { rack_id: None, }; - let start_event = CollectorEvent::MetricCollectionStart; + let start_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&context, &start_event); - let s1_event = CollectorEvent::Metric( + let s1_event = HealthEvent::MeasurementObserved( MetricSample { key: "s1".to_string(), name: "hw_sensor".to_string(), @@ -310,7 +316,7 @@ mod tests { .into(), ); sink.handle_event(&context, &s1_event); - let end_event = CollectorEvent::MetricCollectionEnd; + let end_event = HealthEvent::ScrapeBatchFinished; sink.handle_event(&context, &end_event); let first_export = metrics_manager @@ -318,9 +324,9 @@ mod tests { .expect("telemetry export should work"); assert!(first_export.contains("sensor=\"temp1\"")); - let start_event = CollectorEvent::MetricCollectionStart; + let start_event = HealthEvent::ScrapeBatchStarted; sink.handle_event(&context, &start_event); - let s2_event = CollectorEvent::Metric( + let s2_event = HealthEvent::MeasurementObserved( MetricSample { key: "s2".to_string(), name: "hw_sensor".to_string(), @@ -333,7 +339,7 @@ mod tests { .into(), ); sink.handle_event(&context, &s2_event); - let end_event = CollectorEvent::MetricCollectionEnd; + let end_event = HealthEvent::ScrapeBatchFinished; sink.handle_event(&context, &end_event); let second_export = metrics_manager diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index 4f422d9a8b..410787680d 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -21,14 +21,14 @@ use prometheus::Counter; use super::dedup_queue::DedupQueue; use super::event_mapper::RedfishEventMapper; -use super::{CollectorEvent, DataSink, EventContext, LogRecord, MetricSample}; +use super::{EventContext, HealthEvent, LogRecord, MetricSample, SyncEventNode}; use crate::HealthError; use crate::config::OtlpSinkConfig; use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; use crate::otlp::metrics_drain::OtlpMetricsDrainTask; -pub(crate) type OtlpQueue = DedupQueue; +pub(crate) type OtlpQueue = DedupQueue; pub(crate) type OtlpMetricsQueue = DedupQueue; #[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -62,13 +62,13 @@ pub struct OtlpSink { } /// Returns whether an event belongs in the logs drain. -pub(crate) fn is_otlp_log_relevant(event: &CollectorEvent) -> bool { +pub(crate) fn is_otlp_log_relevant(event: &HealthEvent) -> bool { !matches!( event, - CollectorEvent::Metric(_) - | CollectorEvent::MetricCollectionStart - | CollectorEvent::MetricCollectionEnd - | CollectorEvent::CollectorRemoved + HealthEvent::MeasurementObserved(_) + | HealthEvent::ScrapeBatchStarted + | HealthEvent::ScrapeBatchFinished + | HealthEvent::NodeRemoved ) } @@ -152,7 +152,7 @@ impl OtlpSink { let record = record .emitted_log_record(self.include_diagnostics) .into_owned(); - let event = CollectorEvent::Log(Box::new(record)); + let event = HealthEvent::LogObserved(Box::new(record)); if self.queue.save_latest(key, (context.clone(), event)) { self.replaced_total.inc(); @@ -184,7 +184,7 @@ impl OtlpSink { #[cfg(feature = "bench-hooks")] impl OtlpSink { - pub fn pop_for_bench(&self) -> Option<(EventContext, CollectorEvent)> { + pub fn pop_for_bench(&self) -> Option<(EventContext, HealthEvent)> { self.queue.pop().map(|(_key, value)| value) } @@ -193,13 +193,13 @@ impl OtlpSink { } } -impl DataSink for OtlpSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for OtlpSink { + fn node_type(&self) -> &'static str { "otlp_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - if let CollectorEvent::Metric(sample) = event { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + if let HealthEvent::MeasurementObserved(sample) = event { let key = metric_queue_key(context, sample); if self @@ -209,19 +209,19 @@ impl DataSink for OtlpSink { self.metrics_replaced_total.inc(); } - return; + return Vec::new(); } if !is_otlp_log_relevant(event) { - return; + return Vec::new(); } let (key, event) = match event { - CollectorEvent::Log(record) => { + HealthEvent::LogObserved(record) => { self.enqueue_log_event(context, record); - return; + return Vec::new(); } - CollectorEvent::HealthReport(report) => { + HealthEvent::HealthReportProduced(report) => { let key = format!( "{}|health_report|{}", context.endpoint_key, @@ -230,16 +230,17 @@ impl DataSink for OtlpSink { (key, event.clone()) } - CollectorEvent::Firmware(info) => { + HealthEvent::FirmwareObserved(info) => { let key = format!("{}|firmware|{}", context.endpoint_key, info.component); (key, event.clone()) } - _ => return, + _ => return Vec::new(), }; if self.queue.save_latest(key, (context.clone(), event)) { self.replaced_total.inc(); } + Vec::new() } } @@ -268,7 +269,7 @@ mod tests { } } - fn log_event(message_id: &str, message_args: &str) -> CollectorEvent { + fn log_event(message_id: &str, message_args: &str) -> HealthEvent { log_event_with_diagnostic_record(message_id, message_args, None) } @@ -277,8 +278,8 @@ mod tests { message_id: &str, message_args: &str, diagnostic_record: Option, - ) -> CollectorEvent { - CollectorEvent::Log(Box::new(LogRecord { + ) -> HealthEvent { + HealthEvent::LogObserved(Box::new(LogRecord { body: "test".to_string(), severity: "OK".to_string(), attributes: vec![ @@ -300,21 +301,16 @@ mod tests { } } - fn metric_event() -> CollectorEvent { + fn metric_event() -> HealthEvent { metric_event_with("k", "gauge", "celsius") } - fn metric_event_with(key: &str, metric_type: &str, unit: &str) -> CollectorEvent { + fn metric_event_with(key: &str, metric_type: &str, unit: &str) -> HealthEvent { metric_event_with_name("temp", key, metric_type, unit) } - fn metric_event_with_name( - name: &str, - key: &str, - metric_type: &str, - unit: &str, - ) -> CollectorEvent { - CollectorEvent::Metric(Box::new(MetricSample { + fn metric_event_with_name(name: &str, key: &str, metric_type: &str, unit: &str) -> HealthEvent { + HealthEvent::MeasurementObserved(Box::new(MetricSample { key: key.to_string(), name: name.to_string(), metric_type: metric_type.to_string(), @@ -332,10 +328,8 @@ mod tests { #[test] fn is_otlp_log_relevant_excludes_metric_events() { assert!(!is_otlp_log_relevant(&metric_event())); - assert!(!is_otlp_log_relevant( - &CollectorEvent::MetricCollectionStart - )); - assert!(!is_otlp_log_relevant(&CollectorEvent::MetricCollectionEnd)); + assert!(!is_otlp_log_relevant(&HealthEvent::ScrapeBatchStarted)); + assert!(!is_otlp_log_relevant(&HealthEvent::ScrapeBatchFinished)); } #[test] @@ -359,8 +353,8 @@ mod tests { fn metric_collection_sentinels_are_no_op() { let sink = test_sink(); let ctx = test_context(); - sink.handle_event(&ctx, &CollectorEvent::MetricCollectionStart); - sink.handle_event(&ctx, &CollectorEvent::MetricCollectionEnd); + sink.handle_event(&ctx, &HealthEvent::ScrapeBatchStarted); + sink.handle_event(&ctx, &HealthEvent::ScrapeBatchFinished); assert!(sink.queue.pop().is_none()); assert!(sink.metrics_queue.pop().is_none()); } @@ -507,7 +501,7 @@ mod tests { ); let mut bodies = Vec::new(); - while let Some((_key, (_context, CollectorEvent::Log(record)))) = sink.queue.pop() { + while let Some((_key, (_context, HealthEvent::LogObserved(record)))) = sink.queue.pop() { bodies.push(record.body); } @@ -547,7 +541,7 @@ mod tests { ); let mut records = Vec::new(); - while let Some((_key, (_context, CollectorEvent::Log(record)))) = sink.queue.pop() { + while let Some((_key, (_context, HealthEvent::LogObserved(record)))) = sink.queue.pop() { records.push(record); } diff --git a/crates/health/src/sink/power_shelf_health_report.rs b/crates/health/src/sink/power_shelf_health_report.rs index 4152f586a8..55aefddf86 100644 --- a/crates/health/src/sink/power_shelf_health_report.rs +++ b/crates/health/src/sink/power_shelf_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::power_shelf::PowerShelfId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl PowerShelfHealthReportSink { } } -impl DataSink for PowerShelfHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for PowerShelfHealthReportSink { + fn node_type(&self) -> &'static str { "power_shelf_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::PowerShelf) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for PowerShelfHealthReportSink { source = ?report.source, "Skipping empty power shelf health report" ); - return; + return Vec::new(); } let power_shelf_id = if let Some(power_shelf_id) = context.power_shelf_id() { @@ -126,7 +126,7 @@ impl DataSink for PowerShelfHealthReportSink { endpoint_key = context.endpoint_key(), "Received power-shelf-target HealthReport event without power_shelf_id context" ); - return; + return Vec::new(); }; let key = PowerShelfHealthReportKey { @@ -134,5 +134,6 @@ impl DataSink for PowerShelfHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); + Vec::new() } } diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 3822b24d31..56af2f30f9 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use dashmap::DashMap; -use super::{CollectorEvent, DataSink, EventContext, MetricSample}; +use super::{EventContext, HealthEvent, MetricSample, SyncEventNode}; use crate::HealthError; use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricsManager}; @@ -174,51 +174,51 @@ impl PrometheusSink { } } -impl DataSink for PrometheusSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for PrometheusSink { + fn node_type(&self) -> &'static str { "prometheus_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => match self.get_or_create_stream_metrics(context) { + Ok(stream_metrics) => stream_metrics.begin_update(), + Err(error) => { + tracing::warn!( + ?error, + endpoint_key = context.endpoint_key(), + collector = context.collector_type, + "Failed to initialize Prometheus stream metrics" + ); + } + }, + HealthEvent::MeasurementObserved(sample) => { match self.get_or_create_stream_metrics(context) { - Ok(stream_metrics) => stream_metrics.begin_update(), + Ok(stream_metrics) => { + stream_metrics.record( + GaugeReading::new( + Self::metric_reading_key(sample), + sample.name.clone(), + sample.metric_type.clone(), + sample.unit.clone(), + sample.value, + ) + .with_labels(sample.labels.clone()), + ); + } Err(error) => { tracing::warn!( ?error, endpoint_key = context.endpoint_key(), collector = context.collector_type, - "Failed to initialize Prometheus stream metrics" + metric = sample.name, + metric_type = sample.metric_type, + "Failed to record Prometheus metric sample" ); } } } - CollectorEvent::Metric(sample) => match self.get_or_create_stream_metrics(context) { - Ok(stream_metrics) => { - stream_metrics.record( - GaugeReading::new( - Self::metric_reading_key(sample), - sample.name.clone(), - sample.metric_type.clone(), - sample.unit.clone(), - sample.value, - ) - .with_labels(sample.labels.clone()), - ); - } - Err(error) => { - tracing::warn!( - ?error, - endpoint_key = context.endpoint_key(), - collector = context.collector_type, - metric = sample.name, - metric_type = sample.metric_type, - "Failed to record Prometheus metric sample" - ); - } - }, - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { if let Some(endpoint_metrics) = self.stream_metrics.get::(context.endpoint_key()) && let Some(entry) = endpoint_metrics.get(context.collector_type) @@ -226,11 +226,15 @@ impl DataSink for PrometheusSink { entry.value().sweep_stale(); } } - CollectorEvent::CollectorRemoved => self.remove_collector_metrics(context), - CollectorEvent::Log(_) - | CollectorEvent::Firmware(_) - | CollectorEvent::HealthReport(_) => {} + HealthEvent::NodeRemoved => self.remove_collector_metrics(context), + HealthEvent::LogObserved(_) + | HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } + | HealthEvent::FirmwareObserved(_) + | HealthEvent::HealthReportProduced(_) => {} } + Vec::new() } } diff --git a/crates/health/src/sink/rack_health_report.rs b/crates/health/src/sink/rack_health_report.rs index 9eee20e38d..04a7851e25 100644 --- a/crates/health/src/sink/rack_health_report.rs +++ b/crates/health/src/sink/rack_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::rack::RackId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl RackHealthReportSink { } } -impl DataSink for RackHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for RackHealthReportSink { + fn node_type(&self) -> &'static str { "rack_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Rack) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for RackHealthReportSink { source = ?report.source, "Skipping empty rack health report" ); - return; + return Vec::new(); } let Some(rack_id) = context.rack_id() else { @@ -124,7 +124,7 @@ impl DataSink for RackHealthReportSink { endpoint_key = context.endpoint_key(), "Received rack-target HealthReport event without rack_id context" ); - return; + return Vec::new(); }; let key = RackHealthReportKey { @@ -132,5 +132,6 @@ impl DataSink for RackHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); + Vec::new() } } diff --git a/crates/health/src/sink/switch_health_report.rs b/crates/health/src/sink/switch_health_report.rs index 497a555a03..5a9d7558ce 100644 --- a/crates/health/src/sink/switch_health_report.rs +++ b/crates/health/src/sink/switch_health_report.rs @@ -21,7 +21,7 @@ use carbide_uuid::switch::SwitchId; use super::dedup_queue::DedupQueue; use super::{ - CollectorEvent, DataSink, EventContext, HealthReport, HealthReportTarget, ReportSource, + EventContext, HealthEvent, HealthReport, HealthReportTarget, ReportSource, SyncEventNode, }; use crate::HealthError; use crate::api_client::ApiClientWrapper; @@ -97,18 +97,18 @@ impl SwitchHealthReportSink { } } -impl DataSink for SwitchHealthReportSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for SwitchHealthReportSink { + fn node_type(&self) -> &'static str { "switch_health_report_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - let CollectorEvent::HealthReport(report) = event else { - return; + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { + let HealthEvent::HealthReportProduced(report) = event else { + return Vec::new(); }; if report.target != Some(HealthReportTarget::Switch) { - return; + return Vec::new(); } if self.skip_empty_reports && report.is_empty() { @@ -116,7 +116,7 @@ impl DataSink for SwitchHealthReportSink { source = ?report.source, "Skipping empty switch health report" ); - return; + return Vec::new(); } let switch_id = if let Some(switch_id) = context.switch_id() { @@ -126,7 +126,7 @@ impl DataSink for SwitchHealthReportSink { endpoint_key = context.endpoint_key(), "Received switch-target HealthReport event without switch_id context" ); - return; + return Vec::new(); }; let key = SwitchHealthReportKey { @@ -134,5 +134,6 @@ impl DataSink for SwitchHealthReportSink { source: report.source, }; self.queue.save_latest(key, Arc::clone(report)); + Vec::new() } } diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index a8ba685d2d..33b26c69ea 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -15,7 +15,7 @@ * limitations under the License. */ -use super::{CollectorEvent, DataSink, EventContext}; +use super::{EventContext, HealthEvent, SyncEventNode}; use crate::config::TracingSinkConfig; /// Sink that writes health events through the process tracing subscriber. @@ -32,21 +32,21 @@ impl TracingSink { } } -impl DataSink for TracingSink { - fn sink_type(&self) -> &'static str { +impl SyncEventNode for TracingSink { + fn node_type(&self) -> &'static str { "tracing_sink" } - fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { match event { - CollectorEvent::MetricCollectionStart => { + HealthEvent::ScrapeBatchStarted => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Metric collection start" ); } - CollectorEvent::Metric(metric) => { + HealthEvent::MeasurementObserved(metric) => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, @@ -58,21 +58,21 @@ impl DataSink for TracingSink { "Metric event" ); } - CollectorEvent::MetricCollectionEnd => { + HealthEvent::ScrapeBatchFinished => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Metric collection end" ); } - CollectorEvent::CollectorRemoved => { + HealthEvent::NodeRemoved => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, "Collector removed" ); } - CollectorEvent::Log(record) => { + HealthEvent::LogObserved(record) => { let has_included_diagnostics = self.include_diagnostics && record.diagnostic_record.is_some(); @@ -97,7 +97,7 @@ impl DataSink for TracingSink { ); } } - CollectorEvent::Firmware(info) => { + HealthEvent::FirmwareObserved(info) => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, @@ -106,7 +106,7 @@ impl DataSink for TracingSink { "Firmware info event" ); } - CollectorEvent::HealthReport(report) => { + HealthEvent::HealthReportProduced(report) => { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, @@ -119,6 +119,10 @@ impl DataSink for TracingSink { "Health report event" ); } + HealthEvent::ScrapeRequested { .. } + | HealthEvent::InventoryDiscovered { .. } + | HealthEvent::InventoryUpdated { .. } => {} } + Vec::new() } } From 21bd45705df0d0a62761855e5c4a056b048b3e45 Mon Sep 17 00:00:00 2001 From: ianisimov Date: Fri, 26 Jun 2026 18:49:20 -0700 Subject: [PATCH 2/4] change: hw-health universal stage support Signed-off-by: ianisimov --- crates/health/src/collectors/discovery.rs | 5 +++ .../health/src/collectors/entity_metrics.rs | 35 +++++++++++++++++++ crates/health/src/collectors/firmware.rs | 1 + crates/health/src/collectors/inventory.rs | 7 ++++ crates/health/src/collectors/leak_detector.rs | 1 + crates/health/src/collectors/nmxt.rs | 1 + .../health/src/collectors/nvue/rest/client.rs | 3 ++ .../src/collectors/nvue/rest/collector.rs | 1 + crates/health/src/collectors/runtime.rs | 32 ++++++++++++++--- crates/health/src/collectors/sensors.rs | 30 +--------------- crates/health/src/discovery/context.rs | 2 ++ crates/health/src/discovery/spawn.rs | 8 +++++ crates/health/src/lib.rs | 14 +++++++- crates/health/src/otlp/convert.rs | 2 ++ crates/health/src/otlp/drain.rs | 2 ++ crates/health/src/otlp/metrics_drain.rs | 2 ++ crates/health/src/processor/health_report.rs | 4 +++ .../health/src/processor/intrusion_events.rs | 3 ++ crates/health/src/processor/leak_events.rs | 8 +++++ crates/health/src/processor/mod.rs | 9 +++++ crates/health/src/processor/rack_leak.rs | 8 +++++ crates/health/src/sink/composite.rs | 6 ++++ crates/health/src/sink/events.rs | 18 ++++++++++ crates/health/src/sink/mod.rs | 15 ++++++++ crates/health/src/sink/otlp.rs | 4 +++ 25 files changed, 187 insertions(+), 34 deletions(-) diff --git a/crates/health/src/collectors/discovery.rs b/crates/health/src/collectors/discovery.rs index 6e761f77d1..9b5a30a8b5 100644 --- a/crates/health/src/collectors/discovery.rs +++ b/crates/health/src/collectors/discovery.rs @@ -37,6 +37,10 @@ pub struct EntityDiscoveryCollectorConfig { pub(crate) _bmc: std::marker::PhantomData, } +/// Discovers the entity inventory of a single endpoint and publishes snapshots +/// as [`HealthEvent::InventoryDiscovered`] events. +/// +/// [`HealthEvent::InventoryDiscovered`]: crate::sink::HealthEvent::InventoryDiscovered pub struct EntityDiscoveryCollector { endpoint: Arc, event_context: EventContext, @@ -104,6 +108,7 @@ impl PeriodicCollector for EntityDiscoveryCollector { } impl EntityDiscoveryCollector { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); diff --git a/crates/health/src/collectors/entity_metrics.rs b/crates/health/src/collectors/entity_metrics.rs index c3c5c4eef5..bef92810c5 100644 --- a/crates/health/src/collectors/entity_metrics.rs +++ b/crates/health/src/collectors/entity_metrics.rs @@ -357,12 +357,14 @@ fn power_supply_metric_fields(m: &PowerSupplyMetrics) -> Vec { out } +/// Configuration for the entity metrics collector. pub struct MetricsCollectorConfig { pub data_sink: Option>, pub fetch_concurrency: usize, pub(crate) _bmc: std::marker::PhantomData, } +/// Metrics collector for a single BMC endpoint. pub struct MetricsCollector { endpoint: Arc, event_context: EventContext, @@ -413,6 +415,13 @@ impl PeriodicCollector for MetricsCollector { let fetch_failures = AtomicUsize::new(0); self.emit_event(HealthEvent::ScrapeBatchStarted); + // Entity-level derived metrics (drive media life, PSU capacity), once + // per entity. These are hardware metrics, so they flow with the metrics + // collector rather than being tied to the sensor scrape path. + for entity in &inventory.entities { + self.emit_derived_metrics(entity); + } + let this = &*self; let failures = &fetch_failures; let futures: Vec<_> = inventory @@ -457,12 +466,38 @@ impl PeriodicCollector for MetricsCollector { } impl MetricsCollector { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } } + /// Emits the entity-level derived metrics (e.g. drive media life, PSU + /// capacity) for `entity` as measurement events. + fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { + let derived = entity.derived_metrics(); + if derived.is_empty() { + return; + } + let mut attributes = entity.base_attributes(); + attributes.extend(entity.entity_specific_attributes()); + for metric in derived { + self.emit_event(HealthEvent::MeasurementObserved( + MetricSample { + key: format!("{}/{}", entity.key(), metric.metric_type), + name: "hw".to_string(), + metric_type: metric.metric_type.to_string(), + unit: metric.unit.to_string(), + value: metric.value, + labels: attributes.clone(), + context: None, + } + .into(), + )); + } + } + async fn collect_entity( &self, entity: &DiscoveredEntity, diff --git a/crates/health/src/collectors/firmware.rs b/crates/health/src/collectors/firmware.rs index 55df7c8b62..0a06ae370b 100644 --- a/crates/health/src/collectors/firmware.rs +++ b/crates/health/src/collectors/firmware.rs @@ -66,6 +66,7 @@ impl PeriodicCollector for FirmwareCollector { } impl FirmwareCollector { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); diff --git a/crates/health/src/collectors/inventory.rs b/crates/health/src/collectors/inventory.rs index df33b74094..2c9117c9db 100644 --- a/crates/health/src/collectors/inventory.rs +++ b/crates/health/src/collectors/inventory.rs @@ -212,6 +212,13 @@ impl DiscoveredEntity { } } +/// An immutable snapshot of the entities discovered at an endpoint. +/// +/// Discovery publishes a new snapshot via [`HealthEvent::InventoryDiscovered`]; +/// consumers cache their own `Arc` to it, so there is no shared mutable state. +/// `generation` increases with each snapshot to let consumers detect refreshes. +/// +/// [`HealthEvent::InventoryDiscovered`]: crate::sink::HealthEvent::InventoryDiscovered pub struct EntityInventory { pub(crate) entities: Vec>, pub(crate) discovered_at: Instant, diff --git a/crates/health/src/collectors/leak_detector.rs b/crates/health/src/collectors/leak_detector.rs index d37ec7b1f2..bc1bf280dc 100644 --- a/crates/health/src/collectors/leak_detector.rs +++ b/crates/health/src/collectors/leak_detector.rs @@ -89,6 +89,7 @@ where B: Bmc + 'static, B::Error: 'static, { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 2f685342de..baa16fddbb 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -201,6 +201,7 @@ impl PeriodicCollector for NmxtCollector { } impl NmxtCollector { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 9c680b9b71..8112f38a2c 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -88,6 +88,7 @@ impl RestClient { }) } + /// Stores the credentials used for subsequent authenticated requests. pub fn set_credentials(&self, creds: UsernamePassword) { *self .credentials @@ -95,6 +96,7 @@ impl RestClient { .unwrap_or_else(|poisoned| poisoned.into_inner()) = Some(creds); } + /// Drops any stored credentials, e.g. after an authentication failure. pub fn clear_credentials(&self) { *self .credentials @@ -102,6 +104,7 @@ impl RestClient { .unwrap_or_else(|poisoned| poisoned.into_inner()) = None; } + /// Returns whether credentials are currently stored. pub fn has_credentials(&self) -> bool { self.credentials .read() diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 4c2a0a951f..2a5059e7d9 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -303,6 +303,7 @@ impl NvueRestCollector { } } + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index fb6c031a37..310a4d95ef 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -52,9 +52,16 @@ pub struct IterationResult { pub fetch_failures: usize, } +/// A collector that is polled on a fixed cadence to scrape an endpoint. +/// +/// The runtime owns the timer, rate limiter, and (optionally) an event mailbox; +/// implementors only provide the per-iteration scrape logic and, if they opt in +/// via [`Self::wants_events`], react to routed events. pub trait PeriodicCollector: Send + 'static { + /// Per-collector configuration consumed by [`Self::new_runner`]. type Config: Send + 'static; + /// Builds a runner bound to a specific endpoint and BMC client. fn new_runner( bmc: Arc, endpoint: Arc, @@ -63,6 +70,7 @@ pub trait PeriodicCollector: Send + 'static { where Self: Sized; + /// Performs one scrape pass, emitting events and returning iteration stats. fn run_iteration( &mut self, ) -> impl std::future::Future> + Send; @@ -77,13 +85,17 @@ pub trait PeriodicCollector: Send + 'static { false } + /// Reacts to an event routed to this collector's mailbox. No-op by default; + /// only invoked for collectors that opt in via [`Self::wants_events`]. fn handle_event(&mut self, _context: &EventContext, _event: &HealthEvent) {} + /// Releases any resources when the collector is stopped. No-op by default. fn stop(&mut self) -> impl std::future::Future + Send { async {} } } +/// A boxed stream of health events produced by a [`StreamingCollector`]. pub type EventStream<'a> = BoxStream<'a, Result>; /// Trait for collectors that maintain a long-lived stream (SSE, gRPC, etc.) @@ -256,12 +268,16 @@ impl Drop for StreamingConnectionGuard { } } +/// A running periodic collector task plus the handles used to drive and stop it. pub struct Collector { handle: JoinHandle<()>, cancel_token: CancellationToken, event_node: Option>, } +/// A [`SyncEventNode`] that forwards endpoint-addressed events into a running +/// collector task's channel, decoupling the event graph from the collector's +/// async loop. struct CollectorEventMailbox { node_type: &'static str, endpoint_key: String, @@ -274,11 +290,17 @@ impl SyncEventNode for CollectorEventMailbox { } fn interested_in(&self, event: &HealthEvent) -> bool { - matches!( - event, + // Route every endpoint-addressed control/stage event to the collector's + // mailbox; the runner's `handle_event` decides which ones it acts on. + // The endpoint_key guard applies to all such events. + match event { HealthEvent::InventoryDiscovered { endpoint_key, .. } - if endpoint_key == &self.endpoint_key - ) + | HealthEvent::InventoryUpdated { endpoint_key, .. } + | HealthEvent::ScrapeRequested { endpoint_key, .. } => { + endpoint_key == &self.endpoint_key + } + _ => false, + } } fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec { @@ -638,6 +660,8 @@ impl Collector { self.handle.is_finished() } + /// Returns this collector's event mailbox, if it consumes events, so the + /// event graph can route events to it. pub fn event_node(&self) -> Option> { self.event_node.clone() } diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index 026338475f..77853dc51f 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -92,12 +92,6 @@ impl PeriodicCollector for SensorCollector { let fetch_failures = AtomicUsize::new(0); self.emit_event(HealthEvent::ScrapeBatchStarted); - // Entity-level derived metrics (drive media life, PSU capacity), once - // per entity. - for entity in &inventory.entities { - self.emit_derived_metrics(entity); - } - // Build fetch futures borrowing from the immutable inventory snapshot, then // drive them concurrently. Each future borrows `&self`, the entity, and // its sensor (all alive for as long as `inventory` is held here). @@ -150,35 +144,13 @@ impl PeriodicCollector for SensorCollector { } impl SensorCollector { + /// Forwards an event into the configured data sink, if any. fn emit_event(&self, event: HealthEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); } } - fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { - let derived = entity.derived_metrics(); - if derived.is_empty() { - return; - } - let mut attributes = entity.base_attributes(); - attributes.extend(entity.entity_specific_attributes()); - for metric in derived { - self.emit_event(HealthEvent::MeasurementObserved( - MetricSample { - key: format!("{}/{}", entity.key(), metric.metric_type), - name: "hw".to_string(), - metric_type: metric.metric_type.to_string(), - unit: metric.unit.to_string(), - value: metric.value, - labels: attributes.clone(), - context: None, - } - .into(), - )); - } - } - async fn update_sensor( &self, entity: &DiscoveredEntity, diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 3377222cd8..7d5e2f1c46 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -142,6 +142,8 @@ impl CollectorState { self.map(kind).contains_key(key) } + /// Returns the event mailbox of the tracked collector of `kind` for `key`, + /// so newly discovered inventory can be routed to already-running consumers. pub(super) fn event_node( &self, kind: CollectorKind, diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index efe0b38c6f..d897a6a5e6 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -35,6 +35,9 @@ use crate::config::{Configurable, LogCollectionMode, PeriodicLogConfig}; use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; use crate::sink::{CompositeSyncEventNode, SyncEventNode}; +/// Spawns the collector graph for one endpoint: given the discovery context, the +/// endpoint, an optional data sink, and a metrics prefix, it starts the +/// appropriate collectors. type SpawnGraphFn = fn( &mut DiscoveryLoopContext, &Arc, @@ -42,18 +45,23 @@ type SpawnGraphFn = fn( &str, ) -> Result<(), HealthError>; +/// Declarative binding of an endpoint predicate to the collector graph that +/// should be spawned for endpoints matching it. struct EndpointGraphSpec { name: &'static str, applies_to: fn(&BmcEndpoint) -> bool, spawn: SpawnGraphFn, } +/// Returns whether the endpoint is a switch acting in the host role. fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { endpoint .switch_data() .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) } +/// Returns whether the endpoint should use the generic Redfish collector graph +/// (everything that is not a switch host). fn is_generic_redfish_endpoint(endpoint: &BmcEndpoint) -> bool { !is_switch_host_endpoint(endpoint) } diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 870575540c..568b6c2f4d 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -111,10 +111,13 @@ impl From> for HealthEr } } +/// The endpoint discovery wiring assembled from configured endpoint sources. struct EndpointWiring { source: Arc, } +/// Assembles the composite endpoint source (static and/or Carbide API) from +/// config, erroring if no sources are configured. fn build_endpoint_wiring(config: &Config) -> Result { let reqwest = ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) .map_err(BmcError::ReqwestError)?; @@ -159,6 +162,8 @@ fn build_endpoint_wiring(config: &Config) -> Result }) } +/// Builds the root event-graph node wiring together all enabled sinks and +/// processors, returning `None` when no nodes are configured. fn build_data_sink( config: &Config, metrics_manager: Arc, @@ -180,12 +185,19 @@ fn build_data_sink( || config.sinks.health_report.is_enabled() || config.sinks.power_shelf_health_report.is_enabled() || config.sinks.switch_health_report.is_enabled() + || config.sinks.otlp.is_enabled() || config.processors.leak_detection.is_enabled() { nodes.push(Arc::new(HealthReportProcessor::new())); } - if config.sinks.health_report.is_enabled() { + // Intrusion reports target the machine; install whenever any consumer of + // machine-targeted HealthReportProduced events is enabled (tracing/otlp + // forward all reports, the machine health-report sink consumes them directly). + if config.sinks.tracing.is_enabled() + || config.sinks.health_report.is_enabled() + || config.sinks.otlp.is_enabled() + { nodes.push(Arc::new(BmcIntrusionSyncEventNode::new())); } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index 0efa0893dd..4c0496d272 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -148,6 +148,8 @@ fn convert_log(log: &crate::sink::LogRecord, observed_nanos: u64) -> OtlpLogReco } } +/// Converts a single health event into an OTLP log record, or `None` for events +/// (metrics, lifecycle markers) that are not exported as logs. fn convert_event(event: &HealthEvent, observed_nanos: u64) -> Option { match event { HealthEvent::LogObserved(log) => Some(convert_log(log, observed_nanos)), diff --git a/crates/health/src/otlp/drain.rs b/crates/health/src/otlp/drain.rs index 0d1d9aa892..d128113a80 100644 --- a/crates/health/src/otlp/drain.rs +++ b/crates/health/src/otlp/drain.rs @@ -48,6 +48,8 @@ impl OtlpDrainTask { } } + /// Pops queued events into `batch` until it reaches `batch_size` or the + /// queue is empty. fn drain_batch(&self, batch: &mut Vec<(EventContext, HealthEvent)>) { let remaining = self.batch_size.saturating_sub(batch.len()); for _ in 0..remaining { diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs index bb04dac56a..76ec92324c 100644 --- a/crates/health/src/otlp/metrics_drain.rs +++ b/crates/health/src/otlp/metrics_drain.rs @@ -48,6 +48,8 @@ impl OtlpMetricsDrainTask { } } + /// Pops queued metric samples into `batch` until it reaches `batch_size` or + /// the queue is empty. fn drain_batch(&self, batch: &mut Vec<(EventContext, MetricSample)>) { let remaining = self.batch_size.saturating_sub(batch.len()); for _ in 0..remaining { diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 22e070447a..661a8f349d 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -58,12 +58,16 @@ struct HealthReportWindow { alerts: Vec, } +/// Processor node that classifies sensor measurements against their thresholds +/// and, at the end of each scrape batch, emits a single health report +/// summarizing the window (suppressing empty windows). #[derive(Default)] pub struct HealthReportProcessor { windows: DashMap, } impl HealthReportProcessor { + /// Creates a processor with no in-flight scrape windows. pub fn new() -> Self { Self { windows: DashMap::new(), diff --git a/crates/health/src/processor/intrusion_events.rs b/crates/health/src/processor/intrusion_events.rs index 5de2ab59c1..5a13efbcd5 100644 --- a/crates/health/src/processor/intrusion_events.rs +++ b/crates/health/src/processor/intrusion_events.rs @@ -33,10 +33,13 @@ enum IntrusionEventState { Clear, } +/// Processor node that turns BMC intrusion log records into machine-targeted +/// health reports (an alert when intrusion is asserted, a success when cleared). #[derive(Default)] pub struct BmcIntrusionSyncEventNode; impl BmcIntrusionSyncEventNode { + /// Creates a new intrusion-event processor. pub fn new() -> Self { Self } diff --git a/crates/health/src/processor/leak_events.rs b/crates/health/src/processor/leak_events.rs index 2ce547d8a8..bd49f443d0 100644 --- a/crates/health/src/processor/leak_events.rs +++ b/crates/health/src/processor/leak_events.rs @@ -24,22 +24,28 @@ use crate::sink::{ HealthReportTarget, Probe, ReportSource, }; +/// Processor node that aggregates per-detector BMC leak alerts into a single +/// tray-level leak report, declaring a leak once enough detectors fire. pub struct LeakSyncEventNode { minimum_alerts_per_report: usize, } impl LeakSyncEventNode { + /// Creates a leak processor that declares a leak once at least + /// `minimum_alerts_per_report` leak-detector alerts are seen in a report. pub fn new(minimum_alerts_per_report: usize) -> Self { Self { minimum_alerts_per_report, } } + /// Returns whether `alerts` meets the configured leak threshold. fn is_leaking(&self, alerts: usize) -> bool { alerts >= self.minimum_alerts_per_report } } +/// Returns whether an alert was raised by a leak detector. fn is_leak_detector_alert(alert: &HealthReportAlert) -> bool { alert .classifications @@ -47,6 +53,8 @@ fn is_leak_detector_alert(alert: &HealthReportAlert) -> bool { .any(|classification| classification == &Classification::LeakDetector) } +/// Builds a comma-separated, de-duplicated list of the leaking detector targets +/// for inclusion in the report message. fn leak_details(alerts: &[&HealthReportAlert]) -> String { let targets: BTreeSet = alerts .iter() diff --git a/crates/health/src/processor/mod.rs b/crates/health/src/processor/mod.rs index 1c487543ca..1b796c8aa0 100644 --- a/crates/health/src/processor/mod.rs +++ b/crates/health/src/processor/mod.rs @@ -32,17 +32,24 @@ pub use rack_leak::RackLeakProcessor; use crate::metrics::{ComponentMetrics, MetricsManager}; use crate::sink::{EventContext, HealthEvent, SyncEventNode}; +/// A queued event plus the set of nodes that may not re-consume it, so a node +/// never re-processes events derived from its own output. struct PendingEvent<'a> { event: Cow<'a, HealthEvent>, blocked_processors: Vec, } +/// Runs a pipeline of [`SyncEventNode`]s: each input event is offered to every +/// interested node, and any events a node emits are fed back through the graph +/// (excluding the emitting node) until the work queue drains. pub struct EventGraph { nodes: Vec>, component_metrics: Arc, } impl EventGraph { + /// Builds a graph over `nodes`. Callers must only construct this when at + /// least one node is configured. pub fn new(nodes: Vec>, metrics_manager: Arc) -> Self { debug_assert!( !nodes.is_empty(), @@ -54,6 +61,8 @@ impl EventGraph { } } + /// Offers `current_event` to every interested, non-blocked node and queues + /// the events they emit for further processing. fn next_events( &self, context: &EventContext, diff --git a/crates/health/src/processor/rack_leak.rs b/crates/health/src/processor/rack_leak.rs index 4bcc2c4c13..023b5fcc86 100644 --- a/crates/health/src/processor/rack_leak.rs +++ b/crates/health/src/processor/rack_leak.rs @@ -27,16 +27,22 @@ use crate::sink::{ HealthReportTarget, Probe, ReportSource, }; +/// Per-rack tally of which trays are currently reporting a leak. struct RackLeakState { leaking_trays: HashSet, } +/// Processor node that rolls up per-tray leak reports into a rack-level leak +/// report, alerting once the number of simultaneously-leaking trays in a rack +/// crosses a threshold. pub struct RackLeakProcessor { racks: DashMap, leaking_tray_threshold: usize, } impl RackLeakProcessor { + /// Creates a rack-leak processor that alerts once `leaking_tray_threshold` + /// trays in a rack are leaking at the same time. pub fn new(leaking_tray_threshold: usize) -> Self { Self { racks: DashMap::new(), @@ -44,6 +50,8 @@ impl RackLeakProcessor { } } + /// Builds the rack-level report (alert or success) for `leaking_count` + /// currently-leaking trays. fn build_report(&self, leaking_count: usize) -> HealthReport { if leaking_count >= self.leaking_tray_threshold { HealthReport { diff --git a/crates/health/src/sink/composite.rs b/crates/health/src/sink/composite.rs index 0c279a6e65..20605e6e8e 100644 --- a/crates/health/src/sink/composite.rs +++ b/crates/health/src/sink/composite.rs @@ -21,12 +21,17 @@ use std::time::Instant; use super::{EventContext, HealthEvent, SyncEventNode}; use crate::metrics::{ComponentKind, ComponentMetrics, MetricsManager}; +/// A [`SyncEventNode`] that fans every event out to a set of inner sinks, +/// recording per-sink timing metrics. Terminal node: it never emits derived +/// events. pub struct CompositeSyncEventNode { sinks: Vec>, component_metrics: Arc, } impl CompositeSyncEventNode { + /// Creates a composite over `sinks`, sourcing timing metrics from + /// `metrics_manager`. pub fn new(sinks: Vec>, metrics_manager: Arc) -> Self { Self { sinks, @@ -34,6 +39,7 @@ impl CompositeSyncEventNode { } } + /// Records the time a single inner sink spent handling one event. fn record_sink_operation(&self, sink: &dyn SyncEventNode, duration: std::time::Duration) { self.component_metrics.record_operation( ComponentKind::Sink, diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index fc184985e5..86167f646c 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -337,29 +337,47 @@ impl HealthReport { } } +/// Canonical event flowing through the health event graph. +/// +/// Every collector, processor, and sink communicates exclusively in terms of +/// these events; the variants are domain facts (what was observed) rather than +/// source-specific shapes, so new data sources can reuse them unchanged. #[derive(Clone)] pub enum HealthEvent { + /// Request to scrape a specific endpoint with the given cadence/kind. ScrapeRequested { endpoint_key: String, kind: ScrapeKind, }, + /// A fresh inventory snapshot was discovered for an endpoint; consumers + /// cache their own copy of the immutable snapshot. InventoryDiscovered { endpoint_key: String, inventory: Arc>, }, + /// A consumer's cached inventory advanced to `generation`. InventoryUpdated { endpoint_key: String, generation: u64, }, + /// Marks the start of a scrape batch (used by sinks to window samples). ScrapeBatchStarted, + /// A single metric measurement was observed. MeasurementObserved(Box), + /// Marks the end of a scrape batch. ScrapeBatchFinished, + /// The owning node/endpoint was removed; sinks should drop its state. NodeRemoved, + /// A log record was observed. LogObserved(Box), + /// Firmware version information was observed. FirmwareObserved(FirmwareInfo), + /// A health report was produced by a processor for downstream sinks. HealthReportProduced(Arc), } +/// The category of data a [`HealthEvent::ScrapeRequested`] asks a collector to +/// gather, allowing one collector type to serve multiple data domains. #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub enum ScrapeKind { Inventory, diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index d474a8378f..78dda2b747 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -50,11 +50,26 @@ pub(crate) use self::otlp::OtlpSink; #[cfg(feature = "bench-hooks")] pub use self::otlp::OtlpSink; +/// A node in the synchronous health event graph. +/// +/// Every processing unit (sinks, transforms, collector mailboxes) implements +/// this single trait. A node receives a [`HealthEvent`], may act on it, and +/// returns any derived events to be fed back into the graph. This unifies what +/// used to be separate "collector", "sink", and "processor" abstractions. pub trait SyncEventNode: Send + Sync { + /// Stable identifier for this node, used in logs and metrics labels. fn node_type(&self) -> &'static str; + + /// Returns whether this node wants to receive `event`. + /// + /// Dispatchers consult this before calling [`Self::handle_event`] so nodes + /// can cheaply opt out of events they never act on. Defaults to `true`. fn interested_in(&self, _event: &HealthEvent) -> bool { true } + + /// Processes `event` and returns any derived events to re-feed into the + /// graph (empty when the node is a terminal sink). fn handle_event(&self, context: &EventContext, event: &HealthEvent) -> Vec; } diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index 410787680d..def84139cc 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -28,7 +28,9 @@ use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; use crate::otlp::metrics_drain::OtlpMetricsDrainTask; +/// Dedup queue of log-shaped events awaiting OTLP export, keyed by endpoint. pub(crate) type OtlpQueue = DedupQueue; +/// Dedup queue of metric samples awaiting OTLP export, keyed by sample identity. pub(crate) type OtlpMetricsQueue = DedupQueue; #[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -184,10 +186,12 @@ impl OtlpSink { #[cfg(feature = "bench-hooks")] impl OtlpSink { + /// Pops one queued log event from the sink's internal queue (benchmarks only). pub fn pop_for_bench(&self) -> Option<(EventContext, HealthEvent)> { self.queue.pop().map(|(_key, value)| value) } + /// Pops one queued metric sample from the sink's internal queue (benchmarks only). pub fn pop_metric_for_bench(&self) -> Option<(EventContext, MetricSample)> { self.metrics_queue.pop().map(|(_key, value)| value) } From 37f998a1303dc7eef46d16d737e02910a3abf708 Mon Sep 17 00:00:00 2001 From: ianisimov Date: Fri, 26 Jun 2026 19:11:32 -0700 Subject: [PATCH 3/4] change: hw-health universal stage support Signed-off-by: ianisimov --- crates/health/src/collectors/firmware.rs | 1 + crates/health/src/collectors/inventory.rs | 6 +++--- crates/health/src/sink/events.rs | 1 + crates/health/src/sink/otlp.rs | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/health/src/collectors/firmware.rs b/crates/health/src/collectors/firmware.rs index 0a06ae370b..1b6c494f91 100644 --- a/crates/health/src/collectors/firmware.rs +++ b/crates/health/src/collectors/firmware.rs @@ -104,6 +104,7 @@ impl FirmwareCollector { ]; self.emit_event(HealthEvent::FirmwareObserved(FirmwareInfo { + id: firmware_data.base.id.clone(), component, version, attributes, diff --git a/crates/health/src/collectors/inventory.rs b/crates/health/src/collectors/inventory.rs index 2c9117c9db..f45639e15d 100644 --- a/crates/health/src/collectors/inventory.rs +++ b/crates/health/src/collectors/inventory.rs @@ -131,10 +131,10 @@ impl DiscoveredEntity { let mut attrs = Vec::new(); match self { DiscoveredEntity::Processor { entity, .. } => { - if let Some(node_type) = entity.raw().processor_type.flatten() { + if let Some(processor_type) = entity.raw().processor_type.flatten() { attrs.push(( - Cow::Borrowed("node_type"), - node_type.to_snake_case().to_string(), + Cow::Borrowed("processor_type"), + processor_type.to_snake_case().to_string(), )); } if let Some(model) = entity.raw().model.clone().flatten() { diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 86167f646c..8e99fb34b5 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -303,6 +303,7 @@ struct DiagnosticLogBodyAttribute<'a> { #[derive(Clone, Debug)] pub struct FirmwareInfo { + pub id: String, pub component: String, pub version: String, pub attributes: Vec, diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index def84139cc..b919266f8f 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -235,7 +235,7 @@ impl SyncEventNode for OtlpSink { (key, event.clone()) } HealthEvent::FirmwareObserved(info) => { - let key = format!("{}|firmware|{}", context.endpoint_key, info.component); + let key = format!("{}|firmware|{}", context.endpoint_key, info.id); (key, event.clone()) } _ => return Vec::new(), From fc7c56a8ed97f1043fbcb782b0b72a459075ced4 Mon Sep 17 00:00:00 2001 From: ianisimov Date: Fri, 26 Jun 2026 19:21:59 -0700 Subject: [PATCH 4/4] change: hw-health universal stage support Signed-off-by: ianisimov --- crates/health/benches/collector_pipeline.rs | 1 + .../health/src/collectors/entity_metrics.rs | 14 ++++-- crates/health/src/collectors/runtime.rs | 1 + crates/health/src/collectors/sensors.rs | 46 +++++++++++++++-- crates/health/src/discovery/spawn.rs | 50 ++++++++++++------- crates/health/src/lib.rs | 31 +++++++++++- crates/health/src/processor/health_report.rs | 30 +++++++++-- crates/health/src/sink/otlp.rs | 38 +++++++++++++- 8 files changed, 178 insertions(+), 33 deletions(-) diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index 57f69ae6d4..4b1bcee0f7 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -132,6 +132,7 @@ fn build_log_event(idx: usize) -> HealthEvent { fn build_firmware_event(idx: usize) -> HealthEvent { let component = format!("component-{idx}"); HealthEvent::FirmwareObserved(FirmwareInfo { + id: format!("firmware-{idx}"), component: component.clone(), version: format!("1.0.{}", idx % 100), attributes: vec![ diff --git a/crates/health/src/collectors/entity_metrics.rs b/crates/health/src/collectors/entity_metrics.rs index bef92810c5..a939cb066e 100644 --- a/crates/health/src/collectors/entity_metrics.rs +++ b/crates/health/src/collectors/entity_metrics.rs @@ -454,9 +454,17 @@ impl PeriodicCollector for MetricsCollector { true } - fn handle_event(&mut self, _context: &EventContext, event: &HealthEvent) { - if let HealthEvent::InventoryDiscovered { inventory, .. } = event { - self.latest_inventory = Some(inventory.clone()); + fn handle_event(&mut self, context: &EventContext, event: &HealthEvent) { + match event { + HealthEvent::InventoryDiscovered { inventory, .. } => { + self.latest_inventory = Some(inventory.clone()); + } + HealthEvent::NodeRemoved + if context.endpoint_key() == self.event_context.endpoint_key() => + { + self.latest_inventory = None; + } + _ => {} } } diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 310a4d95ef..6c38d8e554 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -299,6 +299,7 @@ impl SyncEventNode for CollectorEventMailbox { | HealthEvent::ScrapeRequested { endpoint_key, .. } => { endpoint_key == &self.endpoint_key } + HealthEvent::NodeRemoved => true, _ => false, } } diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index 77853dc51f..47877a6f6c 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -36,6 +36,7 @@ pub struct SensorCollectorConfig { pub data_sink: Option>, pub sensor_fetch_concurrency: usize, pub include_sensor_thresholds: bool, + pub emit_derived_metrics: bool, pub(crate) _bmc: std::marker::PhantomData, } @@ -47,6 +48,7 @@ pub struct SensorCollector { data_sink: Option>, sensor_fetch_concurrency: usize, include_sensor_thresholds: bool, + emit_derived_metrics: bool, } impl PeriodicCollector for SensorCollector { @@ -65,6 +67,7 @@ impl PeriodicCollector for SensorCollector { data_sink: config.data_sink, sensor_fetch_concurrency: config.sensor_fetch_concurrency.max(1), include_sensor_thresholds: config.include_sensor_thresholds, + emit_derived_metrics: config.emit_derived_metrics, }) } @@ -92,6 +95,12 @@ impl PeriodicCollector for SensorCollector { let fetch_failures = AtomicUsize::new(0); self.emit_event(HealthEvent::ScrapeBatchStarted); + if self.emit_derived_metrics { + for entity in &inventory.entities { + self.emit_derived_metrics(entity); + } + } + // Build fetch futures borrowing from the immutable inventory snapshot, then // drive them concurrently. Each future borrows `&self`, the entity, and // its sensor (all alive for as long as `inventory` is held here). @@ -132,9 +141,17 @@ impl PeriodicCollector for SensorCollector { true } - fn handle_event(&mut self, _context: &EventContext, event: &HealthEvent) { - if let HealthEvent::InventoryDiscovered { inventory, .. } = event { - self.latest_inventory = Some(inventory.clone()); + fn handle_event(&mut self, context: &EventContext, event: &HealthEvent) { + match event { + HealthEvent::InventoryDiscovered { inventory, .. } => { + self.latest_inventory = Some(inventory.clone()); + } + HealthEvent::NodeRemoved + if context.endpoint_key() == self.event_context.endpoint_key() => + { + self.latest_inventory = None; + } + _ => {} } } @@ -151,6 +168,29 @@ impl SensorCollector { } } + fn emit_derived_metrics(&self, entity: &DiscoveredEntity) { + let derived = entity.derived_metrics(); + if derived.is_empty() { + return; + } + let mut attributes = entity.base_attributes(); + attributes.extend(entity.entity_specific_attributes()); + for metric in derived { + self.emit_event(HealthEvent::MeasurementObserved( + MetricSample { + key: format!("{}/{}", entity.key(), metric.metric_type), + name: "hw".to_string(), + metric_type: metric.metric_type.to_string(), + unit: metric.unit.to_string(), + value: metric.value, + labels: attributes.clone(), + context: None, + } + .into(), + )); + } + } + async fn update_sensor( &self, entity: &DiscoveredEntity, diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d897a6a5e6..3c880766f8 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -115,6 +115,7 @@ fn spawn_generic_redfish_collectors( let sensors_enabled = matches!(ctx.sensors_config, Configurable::Enabled(_)); let metrics_enabled = matches!(ctx.metrics_config, Configurable::Enabled(_)); + let mut inventory_consumer_started = false; if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) @@ -130,6 +131,7 @@ fn spawn_generic_redfish_collectors( data_sink: data_sink.clone(), sensor_fetch_concurrency: sensor_cfg.sensor_fetch_concurrency, include_sensor_thresholds: sensor_cfg.include_sensor_thresholds, + emit_derived_metrics: !metrics_enabled, _bmc: std::marker::PhantomData, }, CollectorStartContext { @@ -142,6 +144,7 @@ fn spawn_generic_redfish_collectors( Ok(monitor) => { ctx.collectors .insert(CollectorKind::Sensor, key.clone().into(), monitor); + inventory_consumer_started = true; tracing::info!( endpoint_key = %key, total_collectors = ctx.collectors.len(CollectorKind::Sensor), @@ -183,6 +186,7 @@ fn spawn_generic_redfish_collectors( Ok(monitor) => { ctx.collectors .insert(CollectorKind::Metrics, key.clone().into(), monitor); + inventory_consumer_started = true; tracing::info!( endpoint_key = %key, total_collectors = ctx.collectors.len(CollectorKind::Metrics), @@ -199,27 +203,37 @@ fn spawn_generic_redfish_collectors( } } - // Discovery's inventory fanout is captured when it starts, so only start it once - // every enabled consumer is up. Otherwise a consumer that starts in a later - // iteration (e.g. after a transient start failure) would never be wired in. - let sensor_ready = !sensors_enabled || ctx.collectors.contains(CollectorKind::Sensor, &key); - let metrics_ready = !metrics_enabled || ctx.collectors.contains(CollectorKind::Metrics, &key); + if inventory_consumer_started + && let Some(discovery) = ctx + .collectors + .map_mut(CollectorKind::Discovery) + .remove(key.as_str()) + { + tracing::info!( + endpoint_key = %key, + "Restarting entity discovery to rewire inventory consumers" + ); + tokio::spawn(async move { + discovery.stop().await; + }); + } + + let mut discovery_nodes: Vec> = Vec::new(); + if sensors_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Sensor, &key) + { + discovery_nodes.push(event_node); + } + if metrics_enabled + && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Metrics, &key) + { + discovery_nodes.push(event_node); + } + if (sensors_enabled || metrics_enabled) - && sensor_ready - && metrics_ready + && !discovery_nodes.is_empty() && !ctx.collectors.contains(CollectorKind::Discovery, &key) { - let mut discovery_nodes: Vec> = Vec::new(); - if sensors_enabled - && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Sensor, &key) - { - discovery_nodes.push(event_node); - } - if metrics_enabled - && let Some(event_node) = ctx.collectors.event_node(CollectorKind::Metrics, &key) - { - discovery_nodes.push(event_node); - } if let Some(data_sink) = data_sink.clone() { discovery_nodes.push(data_sink); } diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 568b6c2f4d..461df1a124 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -169,9 +169,11 @@ fn build_data_sink( metrics_manager: Arc, ) -> Result>, HealthError> { let mut nodes: Vec> = Vec::new(); + let mut has_terminal_sink = false; if let Configurable::Enabled(sink_cfg) = &config.sinks.tracing { nodes.push(Arc::new(TracingSink::new(sink_cfg))); + has_terminal_sink = true; } if let Configurable::Enabled(_) = &config.sinks.prometheus { @@ -179,8 +181,25 @@ fn build_data_sink( metrics_manager.clone(), &config.metrics.prefix, )?)); + has_terminal_sink = true; } + let emit_empty_sensor_reports = config + .sinks + .health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports) + || config + .sinks + .power_shelf_health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports) + || config + .sinks + .switch_health_report + .as_option() + .is_some_and(|cfg| !cfg.skip_empty_reports); + if config.sinks.tracing.is_enabled() || config.sinks.health_report.is_enabled() || config.sinks.power_shelf_health_report.is_enabled() @@ -188,7 +207,9 @@ fn build_data_sink( || config.sinks.otlp.is_enabled() || config.processors.leak_detection.is_enabled() { - nodes.push(Arc::new(HealthReportProcessor::new())); + nodes.push(Arc::new(HealthReportProcessor::new( + emit_empty_sensor_reports, + ))); } // Intrusion reports target the machine; install whenever any consumer of @@ -217,22 +238,27 @@ fn build_data_sink( nodes.push(Arc::new( LogFileSink::new(sink_cfg).map_err(HealthError::GenericError)?, )); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.health_report { nodes.push(Arc::new(HealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.rack_health_report { nodes.push(Arc::new(RackHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.switch_health_report { nodes.push(Arc::new(SwitchHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref sink_cfg) = config.sinks.power_shelf_health_report { nodes.push(Arc::new(PowerShelfHealthReportSink::new(sink_cfg)?)); + has_terminal_sink = true; } if let Configurable::Enabled(ref otlp_cfg) = config.sinks.otlp { @@ -243,9 +269,10 @@ fn build_data_sink( &metrics_manager, &config.metrics.prefix, )?)); + has_terminal_sink = true; } - if nodes.is_empty() { + if !has_terminal_sink { return Ok(None); } diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index 661a8f349d..221f61f3ea 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -64,13 +64,15 @@ struct HealthReportWindow { #[derive(Default)] pub struct HealthReportProcessor { windows: DashMap, + emit_empty_reports: bool, } impl HealthReportProcessor { /// Creates a processor with no in-flight scrape windows. - pub fn new() -> Self { + pub fn new(emit_empty_reports: bool) -> Self { Self { windows: DashMap::new(), + emit_empty_reports, } } @@ -227,7 +229,10 @@ impl SyncEventNode for HealthReportProcessor { let Some((_, window)) = self.windows.remove(&Self::stream_key(context)) else { return Vec::new(); }; - if window.successes.is_empty() && window.alerts.is_empty() { + if !self.emit_empty_reports + && window.successes.is_empty() + && window.alerts.is_empty() + { tracing::debug!( endpoint = %context.addr.mac, collector_type = context.collector_type, @@ -304,7 +309,7 @@ mod tests { #[test] fn metric_window_emits_abstract_health_report() { - let processor = HealthReportProcessor::new(); + let processor = HealthReportProcessor::new(false); let context = test_context(); let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); @@ -349,7 +354,7 @@ mod tests { #[test] fn collector_removed_clears_metric_window() { - let processor = HealthReportProcessor::new(); + let processor = HealthReportProcessor::new(false); let context = test_context(); let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); @@ -363,7 +368,7 @@ mod tests { #[test] fn empty_metric_window_does_not_emit_health_report() { - let processor = HealthReportProcessor::new(); + let processor = HealthReportProcessor::new(false); let context = test_context(); let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); @@ -372,4 +377,19 @@ mod tests { assert!(emitted.is_empty()); assert!(processor.windows.is_empty()); } + + #[test] + fn empty_metric_window_can_emit_health_report_when_configured() { + let processor = HealthReportProcessor::new(true); + let context = test_context(); + + let _ = processor.handle_event(&context, &HealthEvent::ScrapeBatchStarted); + let emitted = processor.handle_event(&context, &HealthEvent::ScrapeBatchFinished); + + let Some(HealthEvent::HealthReportProduced(report)) = emitted.last() else { + panic!("expected health report event"); + }; + assert!(report.is_empty()); + assert!(processor.windows.is_empty()); + } } diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index b919266f8f..da5fdb644a 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -235,7 +235,10 @@ impl SyncEventNode for OtlpSink { (key, event.clone()) } HealthEvent::FirmwareObserved(info) => { - let key = format!("{}|firmware|{}", context.endpoint_key, info.id); + let key = format!( + "{}|firmware|{}|{}", + context.endpoint_key, info.id, info.component + ); (key, event.clone()) } _ => return Vec::new(), @@ -257,7 +260,7 @@ mod tests { use super::*; use crate::sink::event_mapper::OpenBmcEventMapper; - use crate::sink::{DiagnosticLogRecord, LogRecord, MetricSample}; + use crate::sink::{DiagnosticLogRecord, FirmwareInfo, LogRecord, MetricSample}; fn test_context() -> EventContext { EventContext { @@ -325,6 +328,15 @@ mod tests { })) } + fn firmware_event(id: &str, component: &str, version: &str) -> HealthEvent { + HealthEvent::FirmwareObserved(FirmwareInfo { + id: id.to_string(), + component: component.to_string(), + version: version.to_string(), + attributes: Vec::new(), + }) + } + fn test_sink() -> OtlpSink { OtlpSink::new_for_bench(Arc::new(OpenBmcEventMapper)) } @@ -377,6 +389,28 @@ mod tests { assert_eq!(sink.metrics_replaced_total.get() as u64, 1); } + #[test] + fn firmware_events_dedup_by_id_and_component() { + let sink = test_sink(); + let ctx = test_context(); + + sink.handle_event(&ctx, &firmware_event("1", "BIOS", "1.0")); + sink.handle_event(&ctx, &firmware_event("2", "BIOS", "1.0")); + sink.handle_event(&ctx, &firmware_event("", "BMC", "1.0")); + sink.handle_event(&ctx, &firmware_event("", "BIOS", "1.0")); + + let mut count = 0; + while sink.queue.pop().is_some() { + count += 1; + } + + assert_eq!( + count, 4, + "id and component are both part of firmware identity" + ); + assert_eq!(sink.replaced_total.get() as u64, 0); + } + #[test] fn metric_events_with_same_sample_key_but_different_type_are_separate_entries() { let sink = test_sink();