Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting. Key Features: - HysteresisThresholds with configurable upper/lower limits - StatusTracker for per-metric status history - Default gaps: CPU load 10%, memory 5%, disk temp 5°C Updated Components: - CPU load collector (5-minute average with hysteresis) - Memory usage collector (percentage-based thresholds) - Disk temperature collector (SMART data monitoring) - All collectors updated to support StatusTracker interface Cache Interval Adjustments: - Service status: 60s → 10s (faster response) - Disk usage: 300s → 60s (more frequent checks) - Backup status: 900s → 60s (quicker updates) - SMART data: moved to 600s tier (10 minutes) Architecture: - Individual metric status calculation in collectors - Centralized StatusTracker in MetricCollectionManager - Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -4,16 +4,13 @@ use crossterm::{
|
||||
execute,
|
||||
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
|
||||
};
|
||||
use ratatui::{
|
||||
backend::CrosstermBackend,
|
||||
Terminal,
|
||||
};
|
||||
use ratatui::{backend::CrosstermBackend, Terminal};
|
||||
use std::io;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{info, error, debug, warn};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::communication::{AgentCommand, ZmqCommandSender, ZmqConsumer};
|
||||
use crate::config::DashboardConfig;
|
||||
use crate::communication::{ZmqConsumer, ZmqCommandSender, AgentCommand};
|
||||
use crate::metrics::MetricStore;
|
||||
use crate::ui::TuiApp;
|
||||
|
||||
@@ -30,14 +27,14 @@ pub struct Dashboard {
|
||||
impl Dashboard {
|
||||
pub async fn new(config_path: Option<String>, headless: bool) -> Result<Self> {
|
||||
info!("Initializing dashboard");
|
||||
|
||||
|
||||
// Load configuration
|
||||
let config = if let Some(path) = config_path {
|
||||
DashboardConfig::load_from_file(&path)?
|
||||
} else {
|
||||
DashboardConfig::default()
|
||||
};
|
||||
|
||||
|
||||
// Initialize ZMQ consumer
|
||||
let mut zmq_consumer = match ZmqConsumer::new(&config.zmq).await {
|
||||
Ok(consumer) => consumer,
|
||||
@@ -46,7 +43,7 @@ impl Dashboard {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Initialize ZMQ command sender
|
||||
let zmq_command_sender = match ZmqCommandSender::new(&config.zmq) {
|
||||
Ok(sender) => sender,
|
||||
@@ -55,22 +52,25 @@ impl Dashboard {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Connect to predefined hosts from configuration
|
||||
let hosts = config.hosts.predefined_hosts.clone();
|
||||
|
||||
|
||||
// Try to connect to hosts but don't fail if none are available
|
||||
match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
|
||||
Ok(_) => info!("Successfully connected to ZMQ hosts"),
|
||||
Err(e) => {
|
||||
warn!("Failed to connect to hosts (this is normal if no agents are running): {}", e);
|
||||
warn!(
|
||||
"Failed to connect to hosts (this is normal if no agents are running): {}",
|
||||
e
|
||||
);
|
||||
info!("Dashboard will start anyway and connect when agents become available");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Initialize metric store
|
||||
let metric_store = MetricStore::new(10000, 24); // 10k metrics, 24h retention
|
||||
|
||||
|
||||
// Initialize TUI components only if not headless
|
||||
let (tui_app, terminal) = if headless {
|
||||
info!("Running in headless mode (no TUI)");
|
||||
@@ -78,22 +78,24 @@ impl Dashboard {
|
||||
} else {
|
||||
// Initialize TUI app
|
||||
let tui_app = TuiApp::new();
|
||||
|
||||
|
||||
// Setup terminal
|
||||
if let Err(e) = enable_raw_mode() {
|
||||
error!("Failed to enable raw mode: {}", e);
|
||||
error!("This usually means the dashboard is being run without a proper terminal (TTY)");
|
||||
error!(
|
||||
"This usually means the dashboard is being run without a proper terminal (TTY)"
|
||||
);
|
||||
error!("Try running with --headless flag or in a proper terminal");
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
|
||||
let mut stdout = io::stdout();
|
||||
if let Err(e) = execute!(stdout, EnterAlternateScreen) {
|
||||
error!("Failed to enter alternate screen: {}", e);
|
||||
let _ = disable_raw_mode();
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
|
||||
let backend = CrosstermBackend::new(stdout);
|
||||
let terminal = match Terminal::new(backend) {
|
||||
Ok(term) => term,
|
||||
@@ -103,12 +105,12 @@ impl Dashboard {
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
(Some(tui_app), Some(terminal))
|
||||
};
|
||||
|
||||
|
||||
info!("Dashboard initialization complete");
|
||||
|
||||
|
||||
Ok(Self {
|
||||
zmq_consumer,
|
||||
zmq_command_sender,
|
||||
@@ -119,66 +121,65 @@ impl Dashboard {
|
||||
initial_commands_sent: std::collections::HashSet::new(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Send a command to a specific agent
|
||||
pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||
self.zmq_command_sender.send_command(hostname, command).await
|
||||
self.zmq_command_sender
|
||||
.send_command(hostname, command)
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub async fn run(&mut self) -> Result<()> {
|
||||
info!("Starting dashboard main loop");
|
||||
|
||||
|
||||
let mut last_metrics_check = Instant::now();
|
||||
let metrics_check_interval = Duration::from_millis(100); // Check for metrics every 100ms
|
||||
|
||||
|
||||
loop {
|
||||
// Handle terminal events (keyboard input) only if not headless
|
||||
if !self.headless {
|
||||
match event::poll(Duration::from_millis(50)) {
|
||||
Ok(true) => {
|
||||
match event::read() {
|
||||
Ok(Event::Key(key)) => {
|
||||
match key.code {
|
||||
KeyCode::Char('q') => {
|
||||
info!("Quit key pressed, exiting dashboard");
|
||||
break;
|
||||
}
|
||||
KeyCode::Left => {
|
||||
debug!("Navigate left");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling left navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Right => {
|
||||
debug!("Navigate right");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling right navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Char('r') => {
|
||||
debug!("Refresh requested");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling refresh: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
debug!("Tab pressed - next host");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling tab navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
Ok(Event::Key(key)) => match key.code {
|
||||
KeyCode::Char('q') => {
|
||||
info!("Quit key pressed, exiting dashboard");
|
||||
break;
|
||||
}
|
||||
}
|
||||
KeyCode::Left => {
|
||||
debug!("Navigate left");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling left navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Right => {
|
||||
debug!("Navigate right");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling right navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Char('r') => {
|
||||
debug!("Refresh requested");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling refresh: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
debug!("Tab pressed - next host");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling tab navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
},
|
||||
Ok(_) => {} // Other events (mouse, resize, etc.)
|
||||
Err(e) => {
|
||||
error!("Error reading terminal event: {}", e);
|
||||
@@ -193,44 +194,67 @@ impl Dashboard {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check for new metrics
|
||||
if last_metrics_check.elapsed() >= metrics_check_interval {
|
||||
if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
|
||||
debug!("Received metrics from {}: {} metrics",
|
||||
metric_message.hostname, metric_message.metrics.len());
|
||||
|
||||
debug!(
|
||||
"Received metrics from {}: {} metrics",
|
||||
metric_message.hostname,
|
||||
metric_message.metrics.len()
|
||||
);
|
||||
|
||||
// Check if this is the first time we've seen this host
|
||||
let is_new_host = !self.initial_commands_sent.contains(&metric_message.hostname);
|
||||
|
||||
let is_new_host = !self
|
||||
.initial_commands_sent
|
||||
.contains(&metric_message.hostname);
|
||||
|
||||
if is_new_host {
|
||||
info!("First contact with host {}, sending initial CollectNow command", metric_message.hostname);
|
||||
|
||||
info!(
|
||||
"First contact with host {}, sending initial CollectNow command",
|
||||
metric_message.hostname
|
||||
);
|
||||
|
||||
// Send CollectNow command for immediate refresh
|
||||
if let Err(e) = self.send_command(&metric_message.hostname, AgentCommand::CollectNow).await {
|
||||
error!("Failed to send initial CollectNow command to {}: {}", metric_message.hostname, e);
|
||||
if let Err(e) = self
|
||||
.send_command(&metric_message.hostname, AgentCommand::CollectNow)
|
||||
.await
|
||||
{
|
||||
error!(
|
||||
"Failed to send initial CollectNow command to {}: {}",
|
||||
metric_message.hostname, e
|
||||
);
|
||||
} else {
|
||||
info!("✓ Sent initial CollectNow command to {}", metric_message.hostname);
|
||||
self.initial_commands_sent.insert(metric_message.hostname.clone());
|
||||
info!(
|
||||
"✓ Sent initial CollectNow command to {}",
|
||||
metric_message.hostname
|
||||
);
|
||||
self.initial_commands_sent
|
||||
.insert(metric_message.hostname.clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Update metric store
|
||||
self.metric_store.update_metrics(&metric_message.hostname, metric_message.metrics);
|
||||
|
||||
self.metric_store
|
||||
.update_metrics(&metric_message.hostname, metric_message.metrics);
|
||||
|
||||
// Update TUI with new hosts and metrics (only if not headless)
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self.metric_store.get_connected_hosts(Duration::from_secs(30));
|
||||
let connected_hosts = self
|
||||
.metric_store
|
||||
.get_connected_hosts(Duration::from_secs(30));
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
tui_app.update_metrics(&self.metric_store);
|
||||
}
|
||||
}
|
||||
last_metrics_check = Instant::now();
|
||||
}
|
||||
|
||||
|
||||
// Render TUI (only if not headless)
|
||||
if !self.headless {
|
||||
if let (Some(ref mut terminal), Some(ref mut tui_app)) = (&mut self.terminal, &mut self.tui_app) {
|
||||
if let (Some(ref mut terminal), Some(ref mut tui_app)) =
|
||||
(&mut self.terminal, &mut self.tui_app)
|
||||
{
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
@@ -239,11 +263,11 @@ impl Dashboard {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Small sleep to prevent excessive CPU usage
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
|
||||
|
||||
info!("Dashboard main loop ended");
|
||||
Ok(())
|
||||
}
|
||||
@@ -255,12 +279,9 @@ impl Drop for Dashboard {
|
||||
if !self.headless {
|
||||
let _ = disable_raw_mode();
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
let _ = execute!(
|
||||
terminal.backend_mut(),
|
||||
LeaveAlternateScreen
|
||||
);
|
||||
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
|
||||
let _ = terminal.show_cursor();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user