Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default hysteresis gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support the StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -4,16 +4,13 @@ use crossterm::{
execute,
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
};
use ratatui::{
backend::CrosstermBackend,
Terminal,
};
use ratatui::{backend::CrosstermBackend, Terminal};
use std::io;
use std::time::{Duration, Instant};
use tracing::{info, error, debug, warn};
use tracing::{debug, error, info, warn};
use crate::communication::{AgentCommand, ZmqCommandSender, ZmqConsumer};
use crate::config::DashboardConfig;
use crate::communication::{ZmqConsumer, ZmqCommandSender, AgentCommand};
use crate::metrics::MetricStore;
use crate::ui::TuiApp;
@@ -30,14 +27,14 @@ pub struct Dashboard {
impl Dashboard {
pub async fn new(config_path: Option<String>, headless: bool) -> Result<Self> {
info!("Initializing dashboard");
// Load configuration
let config = if let Some(path) = config_path {
DashboardConfig::load_from_file(&path)?
} else {
DashboardConfig::default()
};
// Initialize ZMQ consumer
let mut zmq_consumer = match ZmqConsumer::new(&config.zmq).await {
Ok(consumer) => consumer,
@@ -46,7 +43,7 @@ impl Dashboard {
return Err(e);
}
};
// Initialize ZMQ command sender
let zmq_command_sender = match ZmqCommandSender::new(&config.zmq) {
Ok(sender) => sender,
@@ -55,22 +52,25 @@ impl Dashboard {
return Err(e);
}
};
// Connect to predefined hosts from configuration
let hosts = config.hosts.predefined_hosts.clone();
// Try to connect to hosts but don't fail if none are available
match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
Ok(_) => info!("Successfully connected to ZMQ hosts"),
Err(e) => {
warn!("Failed to connect to hosts (this is normal if no agents are running): {}", e);
warn!(
"Failed to connect to hosts (this is normal if no agents are running): {}",
e
);
info!("Dashboard will start anyway and connect when agents become available");
}
}
// Initialize metric store
let metric_store = MetricStore::new(10000, 24); // 10k metrics, 24h retention
// Initialize TUI components only if not headless
let (tui_app, terminal) = if headless {
info!("Running in headless mode (no TUI)");
@@ -78,22 +78,24 @@ impl Dashboard {
} else {
// Initialize TUI app
let tui_app = TuiApp::new();
// Setup terminal
if let Err(e) = enable_raw_mode() {
error!("Failed to enable raw mode: {}", e);
error!("This usually means the dashboard is being run without a proper terminal (TTY)");
error!(
"This usually means the dashboard is being run without a proper terminal (TTY)"
);
error!("Try running with --headless flag or in a proper terminal");
return Err(e.into());
}
let mut stdout = io::stdout();
if let Err(e) = execute!(stdout, EnterAlternateScreen) {
error!("Failed to enter alternate screen: {}", e);
let _ = disable_raw_mode();
return Err(e.into());
}
let backend = CrosstermBackend::new(stdout);
let terminal = match Terminal::new(backend) {
Ok(term) => term,
@@ -103,12 +105,12 @@ impl Dashboard {
return Err(e.into());
}
};
(Some(tui_app), Some(terminal))
};
info!("Dashboard initialization complete");
Ok(Self {
zmq_consumer,
zmq_command_sender,
@@ -119,66 +121,65 @@ impl Dashboard {
initial_commands_sent: std::collections::HashSet::new(),
})
}
/// Send a command to a specific agent
pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
self.zmq_command_sender.send_command(hostname, command).await
self.zmq_command_sender
.send_command(hostname, command)
.await
}
pub async fn run(&mut self) -> Result<()> {
info!("Starting dashboard main loop");
let mut last_metrics_check = Instant::now();
let metrics_check_interval = Duration::from_millis(100); // Check for metrics every 100ms
loop {
// Handle terminal events (keyboard input) only if not headless
if !self.headless {
match event::poll(Duration::from_millis(50)) {
Ok(true) => {
match event::read() {
Ok(Event::Key(key)) => {
match key.code {
KeyCode::Char('q') => {
info!("Quit key pressed, exiting dashboard");
break;
}
KeyCode::Left => {
debug!("Navigate left");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling left navigation: {}", e);
}
}
}
KeyCode::Right => {
debug!("Navigate right");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling right navigation: {}", e);
}
}
}
KeyCode::Char('r') => {
debug!("Refresh requested");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling refresh: {}", e);
}
}
}
KeyCode::Tab => {
debug!("Tab pressed - next host");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling tab navigation: {}", e);
}
}
}
_ => {}
Ok(Event::Key(key)) => match key.code {
KeyCode::Char('q') => {
info!("Quit key pressed, exiting dashboard");
break;
}
}
KeyCode::Left => {
debug!("Navigate left");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling left navigation: {}", e);
}
}
}
KeyCode::Right => {
debug!("Navigate right");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling right navigation: {}", e);
}
}
}
KeyCode::Char('r') => {
debug!("Refresh requested");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling refresh: {}", e);
}
}
}
KeyCode::Tab => {
debug!("Tab pressed - next host");
if let Some(ref mut tui_app) = self.tui_app {
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
error!("Error handling tab navigation: {}", e);
}
}
}
_ => {}
},
Ok(_) => {} // Other events (mouse, resize, etc.)
Err(e) => {
error!("Error reading terminal event: {}", e);
@@ -193,44 +194,67 @@ impl Dashboard {
}
}
}
// Check for new metrics
if last_metrics_check.elapsed() >= metrics_check_interval {
if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
debug!("Received metrics from {}: {} metrics",
metric_message.hostname, metric_message.metrics.len());
debug!(
"Received metrics from {}: {} metrics",
metric_message.hostname,
metric_message.metrics.len()
);
// Check if this is the first time we've seen this host
let is_new_host = !self.initial_commands_sent.contains(&metric_message.hostname);
let is_new_host = !self
.initial_commands_sent
.contains(&metric_message.hostname);
if is_new_host {
info!("First contact with host {}, sending initial CollectNow command", metric_message.hostname);
info!(
"First contact with host {}, sending initial CollectNow command",
metric_message.hostname
);
// Send CollectNow command for immediate refresh
if let Err(e) = self.send_command(&metric_message.hostname, AgentCommand::CollectNow).await {
error!("Failed to send initial CollectNow command to {}: {}", metric_message.hostname, e);
if let Err(e) = self
.send_command(&metric_message.hostname, AgentCommand::CollectNow)
.await
{
error!(
"Failed to send initial CollectNow command to {}: {}",
metric_message.hostname, e
);
} else {
info!("✓ Sent initial CollectNow command to {}", metric_message.hostname);
self.initial_commands_sent.insert(metric_message.hostname.clone());
info!(
"✓ Sent initial CollectNow command to {}",
metric_message.hostname
);
self.initial_commands_sent
.insert(metric_message.hostname.clone());
}
}
// Update metric store
self.metric_store.update_metrics(&metric_message.hostname, metric_message.metrics);
self.metric_store
.update_metrics(&metric_message.hostname, metric_message.metrics);
// Update TUI with new hosts and metrics (only if not headless)
if let Some(ref mut tui_app) = self.tui_app {
let connected_hosts = self.metric_store.get_connected_hosts(Duration::from_secs(30));
let connected_hosts = self
.metric_store
.get_connected_hosts(Duration::from_secs(30));
tui_app.update_hosts(connected_hosts);
tui_app.update_metrics(&self.metric_store);
}
}
last_metrics_check = Instant::now();
}
// Render TUI (only if not headless)
if !self.headless {
if let (Some(ref mut terminal), Some(ref mut tui_app)) = (&mut self.terminal, &mut self.tui_app) {
if let (Some(ref mut terminal), Some(ref mut tui_app)) =
(&mut self.terminal, &mut self.tui_app)
{
if let Err(e) = terminal.draw(|frame| {
tui_app.render(frame, &self.metric_store);
}) {
@@ -239,11 +263,11 @@ impl Dashboard {
}
}
}
// Small sleep to prevent excessive CPU usage
tokio::time::sleep(Duration::from_millis(10)).await;
}
info!("Dashboard main loop ended");
Ok(())
}
@@ -255,12 +279,9 @@ impl Drop for Dashboard {
if !self.headless {
let _ = disable_raw_mode();
if let Some(ref mut terminal) = self.terminal {
let _ = execute!(
terminal.backend_mut(),
LeaveAlternateScreen
);
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
let _ = terminal.show_cursor();
}
}
}
}
}