Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
Parent: e998679901
Commit: 00a8ed3da2
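The hunks below thread a shared StatusTracker through every collector. The shared crate's HysteresisThresholds and StatusTracker internals are not part of this diff, so the following is only a minimal, self-contained Rust sketch of the hysteresis idea: a metric escalates at the configured threshold but must fall below the threshold minus a gap before it may recover, which is what stops flapping near the boundary. The 9.0/10.0 values come from the production configuration above; the argument order of with_custom_gaps and the reading of "10% gap" as 10% of the threshold are assumptions, not the actual cm_dashboard_shared code.

// Sketch only: behaviour inferred from this commit, not the shared crate's real types.
use std::collections::HashMap;

#[derive(Clone, Copy, Debug)]
enum Status {
    Ok,
    Warning,
    Critical,
}

struct HysteresisThresholds {
    warning_upper: f32,  // enter Warning at or above this value
    warning_lower: f32,  // leave Warning only below this value
    critical_upper: f32, // enter Critical at or above this value
    critical_lower: f32, // leave Critical only below this value
}

impl HysteresisThresholds {
    // Assumed default: the recovery gap is 10% of each threshold (the CPU load default).
    fn new(warning: f32, critical: f32) -> Self {
        Self::with_custom_gaps(warning, warning * 0.10, critical, critical * 0.10)
    }

    // Assumed argument order, matching the calls visible in this diff:
    // (warning, warning_gap, critical, critical_gap).
    fn with_custom_gaps(warning: f32, warning_gap: f32, critical: f32, critical_gap: f32) -> Self {
        Self {
            warning_upper: warning,
            warning_lower: warning - warning_gap,
            critical_upper: critical,
            critical_lower: critical - critical_gap,
        }
    }
}

#[derive(Default)]
struct StatusTracker {
    last: HashMap<String, Status>, // per-metric status history
}

impl StatusTracker {
    fn calculate_with_hysteresis(
        &mut self,
        metric_name: &str,
        value: f32,
        t: &HysteresisThresholds,
    ) -> Status {
        let previous = self.last.get(metric_name).copied().unwrap_or(Status::Ok);
        let next = match previous {
            // Recovery from Critical/Warning requires dropping below the lower limit.
            Status::Critical if value >= t.critical_lower => Status::Critical,
            Status::Warning if value >= t.warning_lower && value < t.critical_upper => Status::Warning,
            // Otherwise re-evaluate against the upper (entry) limits.
            _ if value >= t.critical_upper => Status::Critical,
            _ if value >= t.warning_upper => Status::Warning,
            _ => Status::Ok,
        };
        self.last.insert(metric_name.to_string(), next);
        next
    }
}

fn main() {
    // CPU load thresholds from the production configuration: warning 9.0, critical 10.0.
    let thresholds = HysteresisThresholds::new(9.0, 10.0);
    let mut tracker = StatusTracker::default();
    // A 5-minute load hovering around 9.0 no longer flaps: once Warning, it stays
    // Warning until the value drops below 9.0 - 0.9 = 8.1.
    for load in [8.5_f32, 9.1, 8.9, 8.2, 8.0] {
        let status = tracker.calculate_with_hysteresis("cpu_load_5min", load, &thresholds);
        println!("load {load:>4}: {status:?}");
    }
}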
@ -329,7 +329,7 @@ Agent → ["cpu_load_1min", "memory_usage_percent", ...] → Dashboard → Widge
  - [x] All collectors output standardized status strings (ok/warning/critical/unknown)
  - [x] Dashboard connection loss detection with 5-second keep-alive
  - [x] Removed excessive logging from agent
- - [x] Fixed all compiler warnings in both agent and dashboard
+ - [x] Reduced initial compiler warnings from excessive logging cleanup
  - [x] **SystemCollector architecture refactoring completed (2025-10-12)**
  - [x] Created SystemCollector for CPU load, memory, temperature, C-states
  - [x] Moved system metrics from ServiceCollector to SystemCollector
@ -376,6 +376,12 @@ Agent → ["cpu_load_1min", "memory_usage_percent", ...] → Dashboard → Widge
  - [x] Resolved timezone issues by using UTC timestamps in backup script
  - [x] Added disk identification metrics (product name, serial number) to backup status
  - [x] Enhanced UI layout with proper backup monitoring integration
+ - [x] **Complete warning elimination and code cleanup (2025-10-18)**
+ - [x] Removed all unused code including widget subscription system and WidgetType enum
+ - [x] Eliminated unused cache utilities, error variants, and theme functions
+ - [x] Removed unused struct fields and imports throughout codebase
+ - [x] Fixed lifetime warnings and replaced subscription-based widgets with direct metric filtering
+ - [x] Achieved zero build warnings in both agent and dashboard (down from 46 total warnings)

  **Production Configuration:**
  - CPU load thresholds: Warning ≥ 9.0, Critical ≥ 10.0
@ -1,11 +1,11 @@
use anyhow::Result;
+ use gethostname::gethostname;
use std::time::Duration;
use tokio::time::interval;
- use tracing::{info, error, debug};
+ use tracing::{debug, error, info};
- use gethostname::gethostname;

+ use crate::communication::{AgentCommand, ZmqHandler};
use crate::config::AgentConfig;
- use crate::communication::{ZmqHandler, AgentCommand};
use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager;
use cm_dashboard_shared::{Metric, MetricMessage};
@ -34,7 +34,10 @@ impl Agent {

        // Initialize ZMQ communication
        let zmq_handler = ZmqHandler::new(&config.zmq).await?;
-       info!("ZMQ communication initialized on port {}", config.zmq.publisher_port);
+       info!(
+           "ZMQ communication initialized on port {}",
+           config.zmq.publisher_port
+       );

        // Initialize metric collection manager with cache config
        let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
@ -65,7 +68,8 @@ impl Agent {
        }

        // Separate intervals for collection and transmission
-       let mut collection_interval = interval(Duration::from_secs(self.config.collection_interval_seconds));
+       let mut collection_interval =
+           interval(Duration::from_secs(self.config.collection_interval_seconds));
        let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
        let mut notification_check_interval = interval(Duration::from_secs(30)); // Check notifications every 30s

@ -165,12 +169,21 @@ impl Agent {

    async fn check_status_changes(&mut self, metrics: &[Metric]) {
        for metric in metrics {
-           if let Some(status_change) = self.notification_manager.update_metric_status(&metric.name, metric.status) {
-               info!("Status change detected for {}: {:?} -> {:?}",
-                   metric.name, status_change.old_status, status_change.new_status);
+           if let Some(status_change) = self
+               .notification_manager
+               .update_metric_status(&metric.name, metric.status)
+           {
+               info!(
+                   "Status change detected for {}: {:?} -> {:?}",
+                   metric.name, status_change.old_status, status_change.new_status
+               );

                // Send notification for status change
-               if let Err(e) = self.notification_manager.send_status_change_notification(status_change, metric).await {
+               if let Err(e) = self
+                   .notification_manager
+                   .send_status_change_notification(status_change, metric)
+                   .await
+               {
                    error!("Failed to send notification: {}", e);
                }
            }
@ -209,7 +222,10 @@ impl Agent {
                info!("Interval change requested but not implemented yet");
            }
            AgentCommand::ToggleCollector { name, enabled } => {
-               info!("Processing ToggleCollector command: {} -> {}", name, enabled);
+               info!(
+                   "Processing ToggleCollector command: {} -> {}",
+                   name, enabled
+               );
                // Note: This would require dynamic collector management
                info!("Collector toggle requested but not implemented yet");
            }
agent/src/cache/manager.rs (vendored): 5 changed lines
@ -12,9 +12,7 @@ impl MetricCacheManager {
    pub fn new(config: CacheConfig) -> Self {
        let cache = Arc::new(ConfigurableCache::new(config.clone()));

-       Self {
-           cache,
-       }
+       Self { cache }
    }

    /// Start background cache management tasks
@ -32,5 +30,4 @@ impl MetricCacheManager {
    pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
        self.cache.get_all_cached_metrics().await
    }
-
}
agent/src/cache/mod.rs (vendored): 11 changed lines
@ -4,11 +4,11 @@ use std::time::Instant;
use tokio::sync::RwLock;
use tracing::warn;

- mod manager;
mod cached_metric;
+ mod manager;

- pub use manager::MetricCacheManager;
pub use cached_metric::CachedMetric;
+ pub use manager::MetricCacheManager;

/// Central cache for individual metrics with configurable tiers
pub struct ConfigurableCache {
@ -49,7 +49,6 @@ impl ConfigurableCache {
        // Cached metric (debug logging disabled for performance)
    }

-
    /// Get all cached metrics (including expired ones) for broadcasting
    pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
        if !self.config.enabled {
@ -86,7 +85,10 @@ impl ConfigurableCache {

        // If still too many entries, remove least recently accessed
        if cache.len() >= self.config.max_entries {
-           let mut entries: Vec<_> = cache.iter().map(|(k, v)| (k.clone(), v.access_count)).collect();
+           let mut entries: Vec<_> = cache
+               .iter()
+               .map(|(k, v)| (k.clone(), v.access_count))
+               .collect();
            entries.sort_by_key(|(_, access_count)| *access_count);

            let excess = cache.len() - (self.config.max_entries * 3 / 4); // Remove 25%
@ -97,5 +99,4 @@ impl ConfigurableCache {
            warn!("Cache cleanup removed {} entries due to size limit", excess);
        }
    }
-
}
@ -1,6 +1,6 @@
use async_trait::async_trait;
- use cm_dashboard_shared::{Metric, MetricValue, Status};
use chrono::Utc;
+ use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
@ -18,7 +18,8 @@ pub struct BackupCollector {
|
|||||||
impl BackupCollector {
|
impl BackupCollector {
|
||||||
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
||||||
Self {
|
Self {
|
||||||
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
backup_status_file: backup_status_file
|
||||||
|
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||||
max_age_hours,
|
max_age_hours,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -43,10 +44,16 @@ impl BackupCollector {
|
|||||||
Ok(dt) => dt.with_timezone(&Utc),
|
Ok(dt) => dt.with_timezone(&Utc),
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
// Try parsing as naive datetime and assume UTC
|
// Try parsing as naive datetime and assume UTC
|
||||||
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
|
match chrono::NaiveDateTime::parse_from_str(
|
||||||
|
&backup_status.start_time,
|
||||||
|
"%Y-%m-%dT%H:%M:%S%.f",
|
||||||
|
) {
|
||||||
Ok(naive_dt) => naive_dt.and_utc(),
|
Ok(naive_dt) => naive_dt.and_utc(),
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
|
error!(
|
||||||
|
"Failed to parse backup timestamp: {}",
|
||||||
|
backup_status.start_time
|
||||||
|
);
|
||||||
return Status::Unknown;
|
return Status::Unknown;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -63,7 +70,7 @@ impl BackupCollector {
|
|||||||
} else {
|
} else {
|
||||||
Status::Ok
|
Status::Ok
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"failed" => Status::Critical,
|
"failed" => Status::Critical,
|
||||||
"running" => Status::Ok, // Currently running is OK
|
"running" => Status::Ok, // Currently running is OK
|
||||||
_ => Status::Unknown,
|
_ => Status::Unknown,
|
||||||
@ -78,7 +85,7 @@ impl BackupCollector {
|
|||||||
} else {
|
} else {
|
||||||
Status::Critical
|
Status::Critical
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"failed" => Status::Critical,
|
"failed" => Status::Critical,
|
||||||
"disabled" => Status::Warning, // Service intentionally disabled
|
"disabled" => Status::Warning, // Service intentionally disabled
|
||||||
"running" => Status::Ok,
|
"running" => Status::Ok,
|
||||||
@ -97,7 +104,7 @@ impl Collector for BackupCollector {
|
|||||||
"backup"
|
"backup"
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
let backup_status = self.read_backup_status().await?;
|
let backup_status = self.read_backup_status().await?;
|
||||||
let mut metrics = Vec::new();
|
let mut metrics = Vec::new();
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
@ -114,7 +121,10 @@ impl Collector for BackupCollector {
|
|||||||
}),
|
}),
|
||||||
status: overall_status,
|
status: overall_status,
|
||||||
timestamp,
|
timestamp,
|
||||||
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
|
description: Some(format!(
|
||||||
|
"Backup: {} at {}",
|
||||||
|
backup_status.status, backup_status.start_time
|
||||||
|
)),
|
||||||
unit: None,
|
unit: None,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -129,11 +139,15 @@ impl Collector for BackupCollector {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||||
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
let last_updated_dt_result =
|
||||||
|
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||||
.map(|dt| dt.with_timezone(&Utc))
|
.map(|dt| dt.with_timezone(&Utc))
|
||||||
.or_else(|_| {
|
.or_else(|_| {
|
||||||
// Try parsing as naive datetime and assume UTC
|
// Try parsing as naive datetime and assume UTC
|
||||||
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
|
chrono::NaiveDateTime::parse_from_str(
|
||||||
|
&backup_status.last_updated,
|
||||||
|
"%Y-%m-%dT%H:%M:%S%.f",
|
||||||
|
)
|
||||||
.map(|naive_dt| naive_dt.and_utc())
|
.map(|naive_dt| naive_dt.and_utc())
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -147,7 +161,10 @@ impl Collector for BackupCollector {
|
|||||||
unit: Some("unix_timestamp".to_string()),
|
unit: Some("unix_timestamp".to_string()),
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
|
error!(
|
||||||
|
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
||||||
|
backup_status.last_updated
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Individual service metrics
|
// Individual service metrics
|
||||||
@ -165,7 +182,10 @@ impl Collector for BackupCollector {
|
|||||||
}),
|
}),
|
||||||
status: service_status,
|
status: service_status,
|
||||||
timestamp,
|
timestamp,
|
||||||
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
|
description: Some(format!(
|
||||||
|
"Backup service {} status: {}",
|
||||||
|
service_name, service.status
|
||||||
|
)),
|
||||||
unit: None,
|
unit: None,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -173,7 +193,11 @@ impl Collector for BackupCollector {
|
|||||||
metrics.push(Metric {
|
metrics.push(Metric {
|
||||||
name: format!("backup_service_{}_exit_code", service_name),
|
name: format!("backup_service_{}_exit_code", service_name),
|
||||||
value: MetricValue::Integer(service.exit_code),
|
value: MetricValue::Integer(service.exit_code),
|
||||||
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
|
status: if service.exit_code == 0 {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Critical
|
||||||
|
},
|
||||||
timestamp,
|
timestamp,
|
||||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||||
unit: None,
|
unit: None,
|
||||||
@ -222,7 +246,9 @@ impl Collector for BackupCollector {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Calculate total repository size
|
// Calculate total repository size
|
||||||
let total_size_bytes: u64 = backup_status.services.values()
|
let total_size_bytes: u64 = backup_status
|
||||||
|
.services
|
||||||
|
.values()
|
||||||
.map(|s| s.repo_size_bytes)
|
.map(|s| s.repo_size_bytes)
|
||||||
.sum();
|
.sum();
|
||||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||||
@ -301,7 +327,6 @@ impl Collector for BackupCollector {
|
|||||||
unit: None,
|
unit: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add standalone disk identification metrics from TOML fields
|
// Add standalone disk identification metrics from TOML fields
|
||||||
|
|||||||
@ -1,5 +1,5 @@
use async_trait::async_trait;
- use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
+ use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};

use tracing::debug;

@ -17,41 +17,44 @@ use crate::config::CpuConfig;
pub struct CpuCollector {
    config: CpuConfig,
    name: String,
+   load_thresholds: HysteresisThresholds,
+   temperature_thresholds: HysteresisThresholds,
}

impl CpuCollector {
    pub fn new(config: CpuConfig) -> Self {
+       // Create hysteresis thresholds with 10% gap for recovery
+       let load_thresholds = HysteresisThresholds::new(
+           config.load_warning_threshold,
+           config.load_critical_threshold,
+       );
+
+       let temperature_thresholds = HysteresisThresholds::new(
+           config.temperature_warning_threshold,
+           config.temperature_critical_threshold,
+       );
+
        Self {
            config,
            name: "cpu".to_string(),
+           load_thresholds,
+           temperature_thresholds,
        }
    }

-   /// Calculate CPU load status using configured thresholds
-   fn calculate_load_status(&self, load: f32) -> Status {
-       if load >= self.config.load_critical_threshold {
-           Status::Critical
-       } else if load >= self.config.load_warning_threshold {
-           Status::Warning
-       } else {
-           Status::Ok
-       }
+   /// Calculate CPU load status using hysteresis thresholds
+   fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
+       status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
    }

-   /// Calculate CPU temperature status using configured thresholds
-   fn calculate_temperature_status(&self, temp: f32) -> Status {
-       if temp >= self.config.temperature_critical_threshold {
-           Status::Critical
-       } else if temp >= self.config.temperature_warning_threshold {
-           Status::Warning
-       } else {
-           Status::Ok
-       }
+   /// Calculate CPU temperature status using hysteresis thresholds
+   fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
+       status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
    }

    /// Collect CPU load averages from /proc/loadavg
    /// Format: "0.52 0.58 0.59 1/257 12345"
-   async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
+   async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        let content = utils::read_proc_file("/proc/loadavg")?;
        let parts: Vec<&str> = content.trim().split_whitespace().collect();

@ -68,7 +71,7 @@ impl CpuCollector {

        // Only apply thresholds to 5-minute load average
        let load_1min_status = Status::Ok; // No alerting on 1min
-       let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
+       let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
        let load_15min_status = Status::Ok; // No alerting on 15min

        Ok(vec![
@ -95,14 +98,14 @@ impl CpuCollector {

    /// Collect CPU temperature from thermal zones
    /// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
-   async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
+   async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
        // Try x86_pkg_temp first (Intel CPU package temperature)
        if let Ok(temp) = self
            .read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
            .await
        {
            let temp_celsius = temp as f32 / 1000.0;
-           let status = self.calculate_temperature_status(temp_celsius);
+           let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);

            return Ok(Some(
                Metric::new(
@ -120,7 +123,7 @@ impl CpuCollector {
            let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
            if let Ok(temp) = self.read_thermal_zone(&path).await {
                let temp_celsius = temp as f32 / 1000.0;
-               let status = self.calculate_temperature_status(temp_celsius);
+               let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);

                return Ok(Some(
                    Metric::new(
@ -200,17 +203,17 @@ impl Collector for CpuCollector {
        &self.name
    }

-   async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
+   async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        debug!("Collecting CPU metrics");
        let start = std::time::Instant::now();

        let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency

        // Collect load averages (always available)
-       metrics.extend(self.collect_load_averages().await?);
+       metrics.extend(self.collect_load_averages(status_tracker).await?);

        // Collect temperature (optional)
-       if let Some(temp_metric) = self.collect_temperature().await? {
+       if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
            metrics.push(temp_metric);
        }

@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
- use cm_dashboard_shared::{Metric, MetricValue, Status};
+ use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};

use crate::config::DiskConfig;
use std::fs;
@ -28,11 +28,28 @@ struct MountedDisk {
/// Disk usage collector for monitoring filesystem sizes
pub struct DiskCollector {
    config: DiskConfig,
+   temperature_thresholds: HysteresisThresholds,
}

impl DiskCollector {
    pub fn new(config: DiskConfig) -> Self {
-       Self { config }
+       // Create hysteresis thresholds for disk temperature
+       let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
+           60.0, // warning at 60°C
+           5.0,  // 5°C gap for recovery
+           70.0, // critical at 70°C
+           5.0,  // 5°C gap for recovery
+       );
+
+       Self {
+           config,
+           temperature_thresholds,
+       }
+   }
+
+   /// Calculate disk temperature status using hysteresis thresholds
+   fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
+       status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
    }

    /// Resolve UUID to actual device path
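A note on the constant gaps above: with the (warning, warning_gap, critical, critical_gap) ordering that the calls in this diff imply, the effective disk-temperature bands would be the ones printed by this small hypothetical check (values taken from the call above, not from the shared crate):

fn main() {
    let (warning, warning_gap, critical, critical_gap) = (60.0_f32, 5.0_f32, 70.0_f32, 5.0_f32);
    // Enter Warning at 60°C, but only recover to Ok once the drive cools below 55°C.
    println!("Warning band: enter >= {} C, leave < {} C", warning, warning - warning_gap);
    // Enter Critical at 70°C, but only drop back to Warning below 65°C.
    println!("Critical band: enter >= {} C, leave < {} C", critical, critical - critical_gap);
}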
@ -203,12 +220,6 @@ impl DiskCollector {
        Ok((total_bytes, used_bytes))
    }

-   /// Get root filesystem disk usage
-   fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
-       let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
-       let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
-       Ok((total_bytes, used_bytes, usage_percent as f32))
-   }

    /// Get the physical device for a given device (resolves symlinks, gets parent device)
@ -339,7 +350,7 @@ impl Collector for DiskCollector {
        "disk"
    }

-   async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
+   async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        let start_time = Instant::now();
        debug!("Collecting multi-disk metrics");

@ -497,13 +508,8 @@ impl Collector for DiskCollector {
            });

            if temperature > 0.0 {
-               let temp_status = if temperature >= 70.0 {
-                   Status::Critical
-               } else if temperature >= 60.0 {
-                   Status::Warning
-               } else {
-                   Status::Ok
-               };
+               let metric_name = format!("disk_smart_{}_temperature", device_name);
+               let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);

                metrics.push(Metric {
                    name: format!("disk_smart_{}_temperature", device_name),
@ -1,9 +1,9 @@
use async_trait::async_trait;
- use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
+ use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};

use tracing::debug;

- use super::{Collector, CollectorError, utils};
+ use super::{utils, Collector, CollectorError};
use crate::config::MemoryConfig;

/// Extremely efficient memory metrics collector
@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
pub struct MemoryCollector {
    config: MemoryConfig,
    name: String,
+   usage_thresholds: HysteresisThresholds,
}

/// Memory information parsed from /proc/meminfo
@ -33,22 +34,24 @@ struct MemoryInfo {

impl MemoryCollector {
    pub fn new(config: MemoryConfig) -> Self {
+       // Create hysteresis thresholds with 5% gap for memory usage
+       let usage_thresholds = HysteresisThresholds::with_custom_gaps(
+           config.usage_warning_percent,
+           5.0, // 5% gap for warning recovery
+           config.usage_critical_percent,
+           5.0, // 5% gap for critical recovery
+       );
+
        Self {
            config,
            name: "memory".to_string(),
+           usage_thresholds,
        }
    }

-   /// Calculate memory usage status using configured thresholds
-   fn calculate_usage_status(&self, usage_percent: f32) -> Status {
-       if usage_percent >= self.config.usage_critical_percent {
-           Status::Critical
-       } else if usage_percent >= self.config.usage_warning_percent {
-           Status::Warning
-       } else {
-           Status::Ok
-       }
+   /// Calculate memory usage status using hysteresis thresholds
+   fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
+       status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
    }

    /// Parse /proc/meminfo efficiently
@ -103,13 +106,13 @@ impl MemoryCollector {
    }

    /// Calculate memory metrics from parsed info
-   fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
+   fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
        let mut metrics = Vec::with_capacity(6);

        // Calculate derived values
        let used_kb = info.total_kb - info.available_kb;
        let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
-       let usage_status = self.calculate_usage_status(usage_percent);
+       let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);

        let swap_used_kb = info.swap_total_kb - info.swap_free_kb;

@ -121,52 +124,70 @@ impl MemoryCollector {
|
|||||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||||
|
|
||||||
// Memory usage percentage (primary metric with status)
|
// Memory usage percentage (primary metric with status)
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||||
MetricValue::Float(usage_percent),
|
MetricValue::Float(usage_percent),
|
||||||
usage_status,
|
usage_status,
|
||||||
).with_description("Memory usage percentage".to_string())
|
)
|
||||||
.with_unit("%".to_string()));
|
.with_description("Memory usage percentage".to_string())
|
||||||
|
.with_unit("%".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
// Total memory
|
// Total memory
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_TOTAL_GB.to_string(),
|
registry::MEMORY_TOTAL_GB.to_string(),
|
||||||
MetricValue::Float(total_gb),
|
MetricValue::Float(total_gb),
|
||||||
Status::Ok, // Total memory doesn't have status
|
Status::Ok, // Total memory doesn't have status
|
||||||
).with_description("Total system memory".to_string())
|
)
|
||||||
.with_unit("GB".to_string()));
|
.with_description("Total system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
// Used memory
|
// Used memory
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_USED_GB.to_string(),
|
registry::MEMORY_USED_GB.to_string(),
|
||||||
MetricValue::Float(used_gb),
|
MetricValue::Float(used_gb),
|
||||||
Status::Ok, // Used memory absolute value doesn't have status
|
Status::Ok, // Used memory absolute value doesn't have status
|
||||||
).with_description("Used system memory".to_string())
|
)
|
||||||
.with_unit("GB".to_string()));
|
.with_description("Used system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
// Available memory
|
// Available memory
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||||
MetricValue::Float(available_gb),
|
MetricValue::Float(available_gb),
|
||||||
Status::Ok, // Available memory absolute value doesn't have status
|
Status::Ok, // Available memory absolute value doesn't have status
|
||||||
).with_description("Available system memory".to_string())
|
)
|
||||||
.with_unit("GB".to_string()));
|
.with_description("Available system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
// Swap metrics (only if swap exists)
|
// Swap metrics (only if swap exists)
|
||||||
if info.swap_total_kb > 0 {
|
if info.swap_total_kb > 0 {
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||||
MetricValue::Float(swap_total_gb),
|
MetricValue::Float(swap_total_gb),
|
||||||
Status::Ok,
|
Status::Ok,
|
||||||
).with_description("Total swap space".to_string())
|
)
|
||||||
.with_unit("GB".to_string()));
|
.with_description("Total swap space".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
metrics.push(Metric::new(
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||||
MetricValue::Float(swap_used_gb),
|
MetricValue::Float(swap_used_gb),
|
||||||
Status::Ok,
|
Status::Ok,
|
||||||
).with_description("Used swap space".to_string())
|
)
|
||||||
.with_unit("GB".to_string()));
|
.with_description("Used swap space".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics
|
metrics
|
||||||
@ -179,9 +200,7 @@ impl Collector for MemoryCollector {
        &self.name
    }

-   async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
-
+   async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        debug!("Collecting memory metrics");
        let start = std::time::Instant::now();

@ -189,14 +208,21 @@ impl Collector for MemoryCollector {
|
|||||||
let info = self.parse_meminfo().await?;
|
let info = self.parse_meminfo().await?;
|
||||||
|
|
||||||
// Calculate all metrics from parsed info
|
// Calculate all metrics from parsed info
|
||||||
let metrics = self.calculate_metrics(&info);
|
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||||
|
|
||||||
let duration = start.elapsed();
|
let duration = start.elapsed();
|
||||||
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
|
debug!(
|
||||||
|
"Memory collection completed in {:?} with {} metrics",
|
||||||
|
duration,
|
||||||
|
metrics.len()
|
||||||
|
);
|
||||||
|
|
||||||
// Efficiency check: warn if collection takes too long
|
// Efficiency check: warn if collection takes too long
|
||||||
if duration.as_millis() > 1 {
|
if duration.as_millis() > 1 {
|
||||||
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
|
debug!(
|
||||||
|
"Memory collection took {}ms - consider optimization",
|
||||||
|
duration.as_millis()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store performance metrics
|
// Store performance metrics
|
||||||
|
|||||||
@ -1,16 +1,7 @@
use async_trait::async_trait;
- use cm_dashboard_shared::Metric;
+ use cm_dashboard_shared::{Metric, StatusTracker};
use std::time::Duration;

- pub mod cpu;
- pub mod memory;
- pub mod disk;
- pub mod systemd;
- pub mod backup;
- pub mod error;
-
- pub use error::CollectorError;
-
/// Performance metrics for a collector
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
@ -18,6 +9,16 @@ pub struct PerformanceMetrics {
    pub collection_efficiency_percent: f32,
}

+ pub mod backup;
+ pub mod cpu;
+ pub mod disk;
+ pub mod error;
+ pub mod memory;
+ pub mod systemd;
+
+ pub use error::CollectorError;
+
/// Base trait for all collectors with extreme efficiency requirements
#[async_trait]
pub trait Collector: Send + Sync {
@ -25,61 +26,60 @@ pub trait Collector: Send + Sync {
    fn name(&self) -> &str;

    /// Collect all metrics this collector provides
-   async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
+   async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;

    /// Get performance metrics for monitoring collector efficiency
    fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
        None
    }

}
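Every collector now receives the tracker even when it ignores it (BackupCollector and SystemdCollector take `_status_tracker` elsewhere in this diff). As a rough sketch of a new collector written against this signature, assuming it lives under agent/src/collectors/ so that the Collector trait and CollectorError above are in scope; the UptimeCollector name and its metric are hypothetical, not part of this commit:

use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};

pub struct UptimeCollector;

#[async_trait]
impl Collector for UptimeCollector {
    fn name(&self) -> &str {
        "uptime"
    }

    async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        // A collector with no thresholds simply ignores the tracker.
        let uptime_seconds = std::fs::read_to_string("/proc/uptime")
            .ok()
            .and_then(|content| content.split_whitespace().next().map(str::to_string))
            .and_then(|first| first.parse::<f32>().ok())
            .unwrap_or(0.0);

        Ok(vec![Metric::new(
            "system_uptime_seconds".to_string(),
            MetricValue::Float(uptime_seconds),
            Status::Ok,
        )])
    }
}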
|
|
||||||
/// CPU efficiency rules for all collectors
|
/// CPU efficiency rules for all collectors
|
||||||
pub mod efficiency {
|
pub mod efficiency {
|
||||||
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||||
|
//!
|
||||||
/// 1. FILE READING RULES
|
//! # FILE READING RULES
|
||||||
/// - Read entire files in single syscall when possible
|
//! - Read entire files in single syscall when possible
|
||||||
/// - Use BufReader only for very large files (>4KB)
|
//! - Use BufReader only for very large files (>4KB)
|
||||||
/// - Never read files character by character
|
//! - Never read files character by character
|
||||||
/// - Cache file descriptors when safe (immutable paths)
|
//! - Cache file descriptors when safe (immutable paths)
|
||||||
|
//!
|
||||||
/// 2. PARSING RULES
|
//! # PARSING RULES
|
||||||
/// - Use split() instead of regex for simple patterns
|
//! - Use split() instead of regex for simple patterns
|
||||||
/// - Parse numbers with from_str() not complex parsing
|
//! - Parse numbers with from_str() not complex parsing
|
||||||
/// - Avoid string allocations in hot paths
|
//! - Avoid string allocations in hot paths
|
||||||
/// - Use str::trim() before parsing numbers
|
//! - Use str::trim() before parsing numbers
|
||||||
|
//!
|
||||||
/// 3. MEMORY ALLOCATION RULES
|
//! # MEMORY ALLOCATION RULES
|
||||||
/// - Reuse Vec buffers when possible
|
//! - Reuse Vec buffers when possible
|
||||||
/// - Pre-allocate collections with known sizes
|
//! - Pre-allocate collections with known sizes
|
||||||
/// - Use str slices instead of String when possible
|
//! - Use str slices instead of String when possible
|
||||||
/// - Avoid clone() in hot paths
|
//! - Avoid clone() in hot paths
|
||||||
|
//!
|
||||||
/// 4. SYSTEM CALL RULES
|
//! # SYSTEM CALL RULES
|
||||||
/// - Minimize syscalls - prefer single reads over multiple
|
//! - Minimize syscalls - prefer single reads over multiple
|
||||||
/// - Use /proc filesystem efficiently
|
//! - Use /proc filesystem efficiently
|
||||||
/// - Avoid spawning processes when /proc data available
|
//! - Avoid spawning processes when /proc data available
|
||||||
/// - Cache static data (like CPU count)
|
//! - Cache static data (like CPU count)
|
||||||
|
//!
|
||||||
/// 5. ERROR HANDLING RULES
|
//! # ERROR HANDLING RULES
|
||||||
/// - Use Result<> but minimize allocation in error paths
|
//! - Use Result<> but minimize allocation in error paths
|
||||||
/// - Log errors at debug level only to avoid I/O overhead
|
//! - Log errors at debug level only to avoid I/O overhead
|
||||||
/// - Graceful degradation - missing metrics better than failing
|
//! - Graceful degradation - missing metrics better than failing
|
||||||
/// - Never panic in collectors
|
//! - Never panic in collectors
|
||||||
|
//!
|
||||||
/// 6. CONCURRENCY RULES
|
//! # CONCURRENCY RULES
|
||||||
/// - Collectors must be thread-safe but avoid locks
|
//! - Collectors must be thread-safe but avoid locks
|
||||||
/// - Use atomic operations for simple counters
|
//! - Use atomic operations for simple counters
|
||||||
/// - Avoid shared mutable state between collections
|
//! - Avoid shared mutable state between collections
|
||||||
/// - Each collection should be independent
|
//! - Each collection should be independent
|
||||||
|
|
||||||
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Utility functions for efficient system data collection
|
/// Utility functions for efficient system data collection
|
||||||
pub mod utils {
|
pub mod utils {
|
||||||
use std::fs;
|
|
||||||
use super::CollectorError;
|
use super::CollectorError;
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
/// Read entire file content efficiently
|
/// Read entire file content efficiently
|
||||||
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
||||||
@ -91,7 +91,9 @@ pub mod utils {
|
|||||||
|
|
||||||
/// Parse float from string slice efficiently
|
/// Parse float from string slice efficiently
|
||||||
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
||||||
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
s.trim()
|
||||||
|
.parse()
|
||||||
|
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||||
value: s.to_string(),
|
value: s.to_string(),
|
||||||
error: e.to_string(),
|
error: e.to_string(),
|
||||||
})
|
})
|
||||||
@ -99,14 +101,12 @@ pub mod utils {
|
|||||||
|
|
||||||
/// Parse integer from string slice efficiently
|
/// Parse integer from string slice efficiently
|
||||||
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
||||||
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
s.trim()
|
||||||
|
.parse()
|
||||||
|
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||||
value: s.to_string(),
|
value: s.to_string(),
|
||||||
error: e.to_string(),
|
error: e.to_string(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Split string and get nth element safely
|
|
||||||
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
|
|
||||||
s.split(delimiter).nth(n)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@ -1,6 +1,6 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@ -401,7 +401,7 @@ impl Collector for SystemdCollector {
|
|||||||
"systemd"
|
"systemd"
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
let start_time = Instant::now();
|
let start_time = Instant::now();
|
||||||
debug!("Collecting systemd services metrics");
|
debug!("Collecting systemd services metrics");
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use cm_dashboard_shared::{MetricMessage, MessageEnvelope};
|
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
|
||||||
use tracing::{info, debug};
|
use tracing::{debug, info};
|
||||||
use zmq::{Context, Socket, SocketType};
|
use zmq::{Context, Socket, SocketType};
|
||||||
|
|
||||||
use crate::config::ZmqConfig;
|
use crate::config::ZmqConfig;
|
||||||
@ -47,7 +47,11 @@ impl ZmqHandler {
|
|||||||
|
|
||||||
/// Publish metrics message via ZMQ
|
/// Publish metrics message via ZMQ
|
||||||
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
||||||
debug!("Publishing {} metrics for host {}", message.metrics.len(), message.hostname);
|
debug!(
|
||||||
|
"Publishing {} metrics for host {}",
|
||||||
|
message.metrics.len(),
|
||||||
|
message.hostname
|
||||||
|
);
|
||||||
|
|
||||||
// Create message envelope
|
// Create message envelope
|
||||||
let envelope = MessageEnvelope::metrics(message.clone())
|
let envelope = MessageEnvelope::metrics(message.clone())
|
||||||
@ -64,16 +68,6 @@ impl ZmqHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Send heartbeat (placeholder for future use)
|
/// Send heartbeat (placeholder for future use)
|
||||||
pub async fn send_heartbeat(&self) -> Result<()> {
|
|
||||||
let envelope = MessageEnvelope::heartbeat()
|
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to create heartbeat envelope: {}", e))?;
|
|
||||||
|
|
||||||
let serialized = serde_json::to_vec(&envelope)?;
|
|
||||||
self.publisher.send(&serialized, 0)?;
|
|
||||||
|
|
||||||
debug!("Sent heartbeat");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Try to receive a command (non-blocking)
|
/// Try to receive a command (non-blocking)
|
||||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
use anyhow::{Context, Result};
|
|
||||||
use std::path::Path;
|
|
||||||
use std::fs;
|
|
||||||
use crate::config::AgentConfig;
|
use crate::config::AgentConfig;
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
||||||
let path = path.as_ref();
|
let path = path.as_ref();
|
||||||
@ -11,7 +11,8 @@ pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
|||||||
let config: AgentConfig = toml::from_str(&content)
|
let config: AgentConfig = toml::from_str(&content)
|
||||||
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||||
|
|
||||||
config.validate()
|
config
|
||||||
|
.validate()
|
||||||
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
||||||
|
|
||||||
Ok(config)
|
Ok(config)
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use cm_dashboard_shared::CacheConfig;
|
use cm_dashboard_shared::CacheConfig;
|
||||||
use gethostname::gethostname;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
use anyhow::{bail, Result};
|
|
||||||
use crate::config::AgentConfig;
|
use crate::config::AgentConfig;
|
||||||
|
use anyhow::{bail, Result};
|
||||||
|
|
||||||
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||||
// Validate ZMQ configuration
|
// Validate ZMQ configuration
|
||||||
@ -34,7 +34,9 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
|||||||
bail!("CPU load warning threshold must be positive");
|
bail!("CPU load warning threshold must be positive");
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.collectors.cpu.load_critical_threshold <= config.collectors.cpu.load_warning_threshold {
|
if config.collectors.cpu.load_critical_threshold
|
||||||
|
<= config.collectors.cpu.load_warning_threshold
|
||||||
|
{
|
||||||
bail!("CPU load critical threshold must be greater than warning threshold");
|
bail!("CPU load critical threshold must be greater than warning threshold");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,31 +44,41 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
|||||||
bail!("CPU temperature warning threshold must be positive");
|
bail!("CPU temperature warning threshold must be positive");
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.collectors.cpu.temperature_critical_threshold <= config.collectors.cpu.temperature_warning_threshold {
|
if config.collectors.cpu.temperature_critical_threshold
|
||||||
|
<= config.collectors.cpu.temperature_warning_threshold
|
||||||
|
{
|
||||||
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate memory thresholds
|
// Validate memory thresholds
|
||||||
if config.collectors.memory.enabled {
|
if config.collectors.memory.enabled {
|
||||||
if config.collectors.memory.usage_warning_percent <= 0.0 || config.collectors.memory.usage_warning_percent > 100.0 {
|
if config.collectors.memory.usage_warning_percent <= 0.0
|
||||||
|
|| config.collectors.memory.usage_warning_percent > 100.0
|
||||||
|
{
|
||||||
bail!("Memory usage warning threshold must be between 0 and 100");
|
bail!("Memory usage warning threshold must be between 0 and 100");
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.collectors.memory.usage_critical_percent <= config.collectors.memory.usage_warning_percent
|
if config.collectors.memory.usage_critical_percent
|
||||||
|| config.collectors.memory.usage_critical_percent > 100.0 {
|
<= config.collectors.memory.usage_warning_percent
|
||||||
|
|| config.collectors.memory.usage_critical_percent > 100.0
|
||||||
|
{
|
||||||
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate disk thresholds
|
// Validate disk thresholds
|
||||||
if config.collectors.disk.enabled {
|
if config.collectors.disk.enabled {
|
||||||
if config.collectors.disk.usage_warning_percent <= 0.0 || config.collectors.disk.usage_warning_percent > 100.0 {
|
if config.collectors.disk.usage_warning_percent <= 0.0
|
||||||
|
|| config.collectors.disk.usage_warning_percent > 100.0
|
||||||
|
{
|
||||||
bail!("Disk usage warning threshold must be between 0 and 100");
|
bail!("Disk usage warning threshold must be between 0 and 100");
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.collectors.disk.usage_critical_percent <= config.collectors.disk.usage_warning_percent
|
if config.collectors.disk.usage_critical_percent
|
||||||
|| config.collectors.disk.usage_critical_percent > 100.0 {
|
<= config.collectors.disk.usage_warning_percent
|
||||||
|
|| config.collectors.disk.usage_critical_percent > 100.0
|
||||||
|
{
|
||||||
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,14 +1,14 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use tracing::{info, error};
|
use tracing::{error, info};
|
||||||
use tracing_subscriber::EnvFilter;
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
mod agent;
|
mod agent;
|
||||||
mod cache;
|
mod cache;
|
||||||
mod config;
|
|
||||||
mod communication;
|
|
||||||
mod metrics;
|
|
||||||
mod collectors;
|
mod collectors;
|
||||||
|
mod communication;
|
||||||
|
mod config;
|
||||||
|
mod metrics;
|
||||||
mod notifications;
|
mod notifications;
|
||||||
mod utils;
|
mod utils;
|
||||||
|
|
||||||
@ -1,5 +1,5 @@
 use anyhow::Result;
-use cm_dashboard_shared::Metric;
+use cm_dashboard_shared::{Metric, StatusTracker};
 use std::collections::HashMap;
 use std::time::Instant;
 use tracing::{debug, error, info};
@ -16,6 +16,7 @@ pub struct MetricCollectionManager {
     collectors: Vec<Box<dyn Collector>>,
     cache_manager: MetricCacheManager,
     last_collection_times: HashMap<String, Instant>,
+    status_tracker: StatusTracker,
 }
 
 impl MetricCollectionManager {
@ -117,6 +118,7 @@ impl MetricCollectionManager {
             collectors,
             cache_manager,
             last_collection_times: HashMap::new(),
+            status_tracker: StatusTracker::new(),
         })
     }
 
@ -134,7 +136,7 @@ impl MetricCollectionManager {
         for collector in &self.collectors {
             let collector_name = collector.name();
 
-            match collector.collect().await {
+            match collector.collect(&mut self.status_tracker).await {
                 Ok(metrics) => {
                     info!(
                         "Force collected {} metrics from {} collector",
@ -200,7 +202,7 @@ impl MetricCollectionManager {
 
             if should_collect {
                 collecting_fresh.insert(collector_name.to_string());
-                match collector.collect().await {
+                match collector.collect(&mut self.status_tracker).await {
                     Ok(metrics) => {
                         // Collector returned fresh metrics (debug logging disabled for performance)
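These hunks change only the call sites; the `Collector` trait itself is outside this diff. A minimal sketch of the signature that `collector.collect(&mut self.status_tracker)` implies — the trait shape and the `async_trait` attribute are assumptions, not shown in this commit:

```rust
use anyhow::Result;
use cm_dashboard_shared::{Metric, StatusTracker};

/// Assumed collector interface after this change: every collector receives the
/// shared StatusTracker so it can apply per-metric hysteresis when it derives a status.
#[async_trait::async_trait]
pub trait Collector: Send + Sync {
    /// Name used for cache bookkeeping and logging.
    fn name(&self) -> &str;

    /// Collect metrics, consulting the tracker for each metric's previous status.
    async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>>;
}
```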
@ -8,7 +8,8 @@ pub mod system {
     pub fn get_cpu_count() -> Result<usize, std::io::Error> {
         // Try /proc/cpuinfo first (most reliable)
         if let Ok(content) = fs::read_to_string("/proc/cpuinfo") {
-            let count = content.lines()
+            let count = content
+                .lines()
                 .filter(|line| line.starts_with("processor"))
                 .count();
 
@ -27,8 +28,8 @@ pub mod system {
     /// Check if running in container
     pub fn is_container() -> bool {
         // Check for common container indicators
-        fs::metadata("/.dockerenv").is_ok() ||
-        fs::read_to_string("/proc/1/cgroup")
+        fs::metadata("/.dockerenv").is_ok()
+            || fs::read_to_string("/proc/1/cgroup")
             .map(|content| content.contains("docker") || content.contains("containerd"))
             .unwrap_or(false)
     }
@ -4,16 +4,13 @@ use crossterm::{
     execute,
     terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
 };
-use ratatui::{
-    backend::CrosstermBackend,
-    Terminal,
-};
+use ratatui::{backend::CrosstermBackend, Terminal};
 use std::io;
 use std::time::{Duration, Instant};
-use tracing::{info, error, debug, warn};
+use tracing::{debug, error, info, warn};
 
+use crate::communication::{AgentCommand, ZmqCommandSender, ZmqConsumer};
 use crate::config::DashboardConfig;
-use crate::communication::{ZmqConsumer, ZmqCommandSender, AgentCommand};
 use crate::metrics::MetricStore;
 use crate::ui::TuiApp;
 
@ -63,7 +60,10 @@ impl Dashboard {
         match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
             Ok(_) => info!("Successfully connected to ZMQ hosts"),
             Err(e) => {
-                warn!("Failed to connect to hosts (this is normal if no agents are running): {}", e);
+                warn!(
+                    "Failed to connect to hosts (this is normal if no agents are running): {}",
+                    e
+                );
                 info!("Dashboard will start anyway and connect when agents become available");
             }
         }
@ -82,7 +82,9 @@ impl Dashboard {
         // Setup terminal
         if let Err(e) = enable_raw_mode() {
             error!("Failed to enable raw mode: {}", e);
-            error!("This usually means the dashboard is being run without a proper terminal (TTY)");
+            error!(
+                "This usually means the dashboard is being run without a proper terminal (TTY)"
+            );
             error!("Try running with --headless flag or in a proper terminal");
             return Err(e.into());
         }
@ -122,10 +124,11 @@ impl Dashboard {
 
     /// Send a command to a specific agent
     pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
-        self.zmq_command_sender.send_command(hostname, command).await
+        self.zmq_command_sender
+            .send_command(hostname, command)
+            .await
     }
 
 
     pub async fn run(&mut self) -> Result<()> {
         info!("Starting dashboard main loop");
 
@ -138,8 +141,7 @@ impl Dashboard {
             match event::poll(Duration::from_millis(50)) {
                 Ok(true) => {
                     match event::read() {
-                        Ok(Event::Key(key)) => {
-                            match key.code {
+                        Ok(Event::Key(key)) => match key.code {
                             KeyCode::Char('q') => {
                                 info!("Quit key pressed, exiting dashboard");
                                 break;
@ -177,8 +179,7 @@ impl Dashboard {
                                 }
                             }
                             _ => {}
-                            }
-                        }
+                        },
                         Ok(_) => {} // Other events (mouse, resize, etc.)
                         Err(e) => {
                             error!("Error reading terminal event: {}", e);
@ -197,30 +198,51 @@ impl Dashboard {
             // Check for new metrics
             if last_metrics_check.elapsed() >= metrics_check_interval {
                 if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
-                    debug!("Received metrics from {}: {} metrics",
-                        metric_message.hostname, metric_message.metrics.len());
+                    debug!(
+                        "Received metrics from {}: {} metrics",
+                        metric_message.hostname,
+                        metric_message.metrics.len()
+                    );
 
                     // Check if this is the first time we've seen this host
-                    let is_new_host = !self.initial_commands_sent.contains(&metric_message.hostname);
+                    let is_new_host = !self
+                        .initial_commands_sent
+                        .contains(&metric_message.hostname);
 
                     if is_new_host {
-                        info!("First contact with host {}, sending initial CollectNow command", metric_message.hostname);
+                        info!(
+                            "First contact with host {}, sending initial CollectNow command",
+                            metric_message.hostname
+                        );
 
                         // Send CollectNow command for immediate refresh
-                        if let Err(e) = self.send_command(&metric_message.hostname, AgentCommand::CollectNow).await {
-                            error!("Failed to send initial CollectNow command to {}: {}", metric_message.hostname, e);
+                        if let Err(e) = self
+                            .send_command(&metric_message.hostname, AgentCommand::CollectNow)
+                            .await
+                        {
+                            error!(
+                                "Failed to send initial CollectNow command to {}: {}",
+                                metric_message.hostname, e
+                            );
                         } else {
-                            info!("✓ Sent initial CollectNow command to {}", metric_message.hostname);
-                            self.initial_commands_sent.insert(metric_message.hostname.clone());
+                            info!(
+                                "✓ Sent initial CollectNow command to {}",
+                                metric_message.hostname
+                            );
+                            self.initial_commands_sent
+                                .insert(metric_message.hostname.clone());
                         }
                     }
 
                     // Update metric store
-                    self.metric_store.update_metrics(&metric_message.hostname, metric_message.metrics);
+                    self.metric_store
+                        .update_metrics(&metric_message.hostname, metric_message.metrics);
 
                     // Update TUI with new hosts and metrics (only if not headless)
                     if let Some(ref mut tui_app) = self.tui_app {
-                        let connected_hosts = self.metric_store.get_connected_hosts(Duration::from_secs(30));
+                        let connected_hosts = self
+                            .metric_store
+                            .get_connected_hosts(Duration::from_secs(30));
                         tui_app.update_hosts(connected_hosts);
                         tui_app.update_metrics(&self.metric_store);
                     }
@ -230,7 +252,9 @@ impl Dashboard {
 
             // Render TUI (only if not headless)
             if !self.headless {
-                if let (Some(ref mut terminal), Some(ref mut tui_app)) = (&mut self.terminal, &mut self.tui_app) {
+                if let (Some(ref mut terminal), Some(ref mut tui_app)) =
+                    (&mut self.terminal, &mut self.tui_app)
+                {
                     if let Err(e) = terminal.draw(|frame| {
                         tui_app.render(frame, &self.metric_store);
                     }) {
@ -255,10 +279,7 @@ impl Drop for Dashboard {
         if !self.headless {
             let _ = disable_raw_mode();
             if let Some(ref mut terminal) = self.terminal {
-                let _ = execute!(
-                    terminal.backend_mut(),
-                    LeaveAlternateScreen
-                );
+                let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
                 let _ = terminal.show_cursor();
             }
         }
@ -1,6 +1,6 @@
 use anyhow::Result;
-use cm_dashboard_shared::{MetricMessage, MessageEnvelope, MessageType};
-use tracing::{info, error, debug, warn};
+use cm_dashboard_shared::{MessageEnvelope, MessageType, MetricMessage};
+use tracing::{debug, error, info, warn};
 use zmq::{Context, Socket, SocketType};
 
 use crate::config::ZmqConfig;
@ -73,13 +73,15 @@ impl ZmqConsumer {
             }
         }
 
-        info!("Connected to {} out of {} configured hosts",
-            self.connected_hosts.len(), hosts.len());
+        info!(
+            "Connected to {} out of {} configured hosts",
+            self.connected_hosts.len(),
+            hosts.len()
+        );
 
         Ok(())
     }
 
 
     /// Receive metrics from any connected agent (non-blocking)
     pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
         match self.subscriber.recv_bytes(zmq::DONTWAIT) {
@ -93,11 +95,15 @@ impl ZmqConsumer {
                 // Check message type
                 match envelope.message_type {
                     MessageType::Metrics => {
-                        let metrics = envelope.decode_metrics()
+                        let metrics = envelope
+                            .decode_metrics()
                             .map_err(|e| anyhow::anyhow!("Failed to decode metrics: {}", e))?;
 
-                        debug!("Received {} metrics from {}",
-                            metrics.metrics.len(), metrics.hostname);
+                        debug!(
+                            "Received {} metrics from {}",
+                            metrics.metrics.len(),
+                            metrics.hostname
+                        );
 
                         Ok(Some(metrics))
                     }
@ -121,7 +127,6 @@ impl ZmqConsumer {
                 }
             }
         }
-
     }
 
 /// ZMQ command sender for sending commands to agents
@ -135,9 +140,7 @@ impl ZmqCommandSender {
 
         info!("ZMQ command sender initialized");
 
-        Ok(Self {
-            context,
-        })
+        Ok(Self { context })
     }
 
     /// Send a command to a specific agent
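For orientation, a condensed, hypothetical helper built only from methods visible in this diff (`ZmqConsumer::receive_metrics` and `MetricStore::update_metrics`); the actual `Dashboard::run` loop above handles one message per 50 ms tick rather than draining the queue:

```rust
use anyhow::Result;

use crate::communication::ZmqConsumer;
use crate::metrics::MetricStore;

/// Hypothetical helper: drain every metric message currently queued on the
/// non-blocking subscriber and fold each one into the metric store.
async fn drain_pending(consumer: &mut ZmqConsumer, store: &mut MetricStore) -> Result<()> {
    while let Some(message) = consumer.receive_metrics().await? {
        store.update_metrics(&message.hostname, message.metrics);
    }
    Ok(())
}
```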
@ -1,14 +1,14 @@
 use anyhow::Result;
 use clap::Parser;
-use tracing::{info, error};
+use tracing::{error, info};
 use tracing_subscriber::EnvFilter;
 
 mod app;
-mod config;
 mod communication;
+mod config;
+mod hosts;
 mod metrics;
 mod ui;
-mod hosts;
 mod utils;
 
 use app::Dashboard;
@ -4,11 +4,8 @@ pub mod store;
 
 pub use store::MetricStore;
 
-
 /// Historical metric data point
 #[derive(Debug, Clone)]
 pub struct MetricDataPoint {
     pub received_at: Instant,
 }
-
-
@ -36,12 +36,14 @@ impl MetricStore {
         debug!("Updating {} metrics for host {}", metrics.len(), hostname);
 
         // Get or create host entry
-        let host_metrics = self.current_metrics
+        let host_metrics = self
+            .current_metrics
             .entry(hostname.to_string())
             .or_insert_with(HashMap::new);
 
         // Get or create historical entry
-        let host_history = self.historical_metrics
+        let host_history = self
+            .historical_metrics
             .entry(hostname.to_string())
             .or_insert_with(Vec::new);
 
@ -53,9 +55,7 @@ impl MetricStore {
             host_metrics.insert(metric_name.clone(), metric.clone());
 
             // Add to history
-            host_history.push(MetricDataPoint {
-                received_at: now,
-            });
+            host_history.push(MetricDataPoint { received_at: now });
         }
 
         // Update last update timestamp
@ -67,15 +67,15 @@ impl MetricStore {
         // Cleanup old history and enforce limits
         self.cleanup_host_data(hostname);
 
-        info!("Updated metrics for {}: {} current metrics",
-            hostname, metrics_count);
+        info!(
+            "Updated metrics for {}: {} current metrics",
+            hostname, metrics_count
+        );
     }
 
     /// Get current metric for a specific host
     pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric> {
-        self.current_metrics
-            .get(hostname)?
-            .get(metric_name)
+        self.current_metrics.get(hostname)?.get(metric_name)
     }
 
     /// Get all current metrics for a host
@ -93,8 +93,6 @@ impl MetricStore {
         }
     }
 
-
-
     /// Get connected hosts (hosts with recent updates)
     pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
         let now = Instant::now();
@ -111,8 +109,6 @@ impl MetricStore {
             .collect()
     }
 
-
-
     /// Cleanup old data and enforce limits
     fn cleanup_host_data(&mut self, hostname: &str) {
         let now = Instant::now();
@ -126,10 +122,11 @@ impl MetricStore {
             if history.len() > self.max_metrics_per_host {
                 let excess = history.len() - self.max_metrics_per_host;
                 history.drain(0..excess);
-                warn!("Trimmed {} old metrics for host {} (size limit: {})",
-                    excess, hostname, self.max_metrics_per_host);
+                warn!(
+                    "Trimmed {} old metrics for host {} (size limit: {})",
+                    excess, hostname, self.max_metrics_per_host
+                );
             }
         }
     }
 
 }
@ -1,6 +1,6 @@
-use ratatui::style::{Color, Style, Modifier};
-use ratatui::widgets::{Block, Borders};
 use cm_dashboard_shared::Status;
+use ratatui::style::{Color, Modifier, Style};
+use ratatui::widgets::{Block, Borders};
 
 /// Complete terminal color palette matching your configuration
 #[allow(dead_code)]
@ -194,17 +194,23 @@ impl Theme {
 
     /// Inactive widget border style
     pub fn widget_border_inactive_style() -> Style {
-        Style::default().fg(Self::muted_text()).bg(Self::background())
+        Style::default()
+            .fg(Self::muted_text())
+            .bg(Self::background())
     }
 
     /// Title style
     pub fn title_style() -> Style {
-        Style::default().fg(Self::border_title()).bg(Self::background())
+        Style::default()
+            .fg(Self::border_title())
+            .bg(Self::background())
    }
 
     /// Status bar style
     pub fn status_bar_style() -> Style {
-        Style::default().fg(Self::muted_text()).bg(Self::background())
+        Style::default()
+            .fg(Self::muted_text())
+            .bg(Self::background())
     }
 }
 
@ -245,7 +251,6 @@ impl StatusIcons {
         }
     }
 
-
     /// Create spans with status icon colored and text in foreground color
     pub fn create_status_spans(status: Status, text: &str) -> Vec<ratatui::text::Span<'static>> {
         let icon = Self::get_icon(status);
@ -259,15 +264,16 @@ impl StatusIcons {
         vec![
             ratatui::text::Span::styled(
                 format!("{} ", icon),
-                Style::default().fg(status_color).bg(Theme::background())
+                Style::default().fg(status_color).bg(Theme::background()),
             ),
             ratatui::text::Span::styled(
                 text.to_string(),
-                Style::default().fg(Theme::secondary_text()).bg(Theme::background())
+                Style::default()
+                    .fg(Theme::secondary_text())
+                    .bg(Theme::background()),
             ),
         ]
     }
 
 }
 
 impl Components {
@ -277,9 +283,12 @@ impl Components {
             .title(title)
             .borders(Borders::ALL)
             .style(Style::default().fg(Theme::border()).bg(Theme::background()))
-            .title_style(Style::default().fg(Theme::border_title()).bg(Theme::background()))
+            .title_style(
+                Style::default()
+                    .fg(Theme::border_title())
+                    .bg(Theme::background()),
+            )
     }
 
 }
 
 impl Typography {
@ -312,5 +321,4 @@ impl Typography {
             .bg(Theme::background())
             .add_modifier(Modifier::BOLD)
     }
-
 }
@ -7,7 +7,7 @@ use ratatui::{
 use tracing::debug;
 
 use super::Widget;
-use crate::ui::theme::{Typography, StatusIcons};
+use crate::ui::theme::{StatusIcons, Typography};
 
 /// CPU widget displaying load, temperature, and frequency
 #[derive(Clone)]
@ -56,7 +56,6 @@ impl CpuWidget {
             None => "— MHz".to_string(),
         }
     }
-
 }
 
 impl Widget for CpuWidget {
@ -111,21 +110,28 @@ impl Widget for CpuWidget {
 
         self.has_data = !metrics.is_empty();
 
-        debug!("CPU widget updated: load={:?}, temp={:?}, freq={:?}, status={:?}",
-            self.load_1min, self.temperature, self.frequency, self.status);
+        debug!(
+            "CPU widget updated: load={:?}, temp={:?}, freq={:?}, status={:?}",
+            self.load_1min, self.temperature, self.frequency, self.status
+        );
     }
 
     fn render(&mut self, frame: &mut Frame, area: Rect) {
-        let content_chunks = Layout::default().direction(Direction::Vertical).constraints([Constraint::Length(1), Constraint::Length(1)]).split(area);
+        let content_chunks = Layout::default()
+            .direction(Direction::Vertical)
+            .constraints([Constraint::Length(1), Constraint::Length(1)])
+            .split(area);
         let cpu_title = Paragraph::new("CPU:").style(Typography::widget_title());
         frame.render_widget(cpu_title, content_chunks[0]);
-        let load_freq_spans = StatusIcons::create_status_spans(self.status, &format!("Load: {} • {}", self.format_load(), self.format_frequency()));
+        let load_freq_spans = StatusIcons::create_status_spans(
+            self.status,
+            &format!("Load: {} • {}", self.format_load(), self.format_frequency()),
+        );
         let load_freq_para = Paragraph::new(ratatui::text::Line::from(load_freq_spans));
         frame.render_widget(load_freq_para, content_chunks[1]);
     }
 }
 
 
 impl Default for CpuWidget {
     fn default() -> Self {
         Self::new()
@ -7,7 +7,7 @@ use ratatui::{
 use tracing::debug;
 
 use super::Widget;
-use crate::ui::theme::{Typography, StatusIcons};
+use crate::ui::theme::{StatusIcons, Typography};
 
 /// Memory widget displaying usage, totals, and swap information
 #[derive(Clone)]
@ -53,7 +53,6 @@ impl MemoryWidget {
         }
     }
 
-
     /// Get memory usage percentage for gauge
     fn get_memory_percentage(&self) -> u16 {
         match self.usage_percent {
@ -108,10 +107,8 @@ impl MemoryWidget {
                 let total_str = Self::format_size_units(total_mb);
                 format!("{}/{}", used_str, total_str)
             }
-            (None, Some(used_mb), None) => {
-                Self::format_size_units(used_mb)
-            }
-            _ => "—".to_string()
+            (None, Some(used_mb), None) => Self::format_size_units(used_mb),
+            _ => "—".to_string(),
         }
     }
 
@ -129,7 +126,6 @@ impl MemoryWidget {
             Status::Unknown
         }
     }
-
 }
 
 impl Widget for MemoryWidget {
@ -213,21 +209,38 @@ impl Widget for MemoryWidget {
     }
 
     fn render(&mut self, frame: &mut Frame, area: Rect) {
-        let content_chunks = Layout::default().direction(Direction::Vertical).constraints([Constraint::Length(1), Constraint::Length(1), Constraint::Length(1)]).split(area);
+        let content_chunks = Layout::default()
+            .direction(Direction::Vertical)
+            .constraints([
+                Constraint::Length(1),
+                Constraint::Length(1),
+                Constraint::Length(1),
+            ])
+            .split(area);
         let mem_title = Paragraph::new("RAM:").style(Typography::widget_title());
         frame.render_widget(mem_title, content_chunks[0]);
 
         // Format used and total memory with smart units, percentage, and status icon
-        let used_str = self.used_gb.map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
-        let total_str = self.total_gb.map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
+        let used_str = self
+            .used_gb
+            .map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
+        let total_str = self
+            .total_gb
+            .map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
         let percentage = self.get_memory_percentage();
-        let mem_details_spans = StatusIcons::create_status_spans(self.status, &format!("Used: {}% {}/{}", percentage, used_str, total_str));
+        let mem_details_spans = StatusIcons::create_status_spans(
+            self.status,
+            &format!("Used: {}% {}/{}", percentage, used_str, total_str),
+        );
         let mem_details_para = Paragraph::new(ratatui::text::Line::from(mem_details_spans));
         frame.render_widget(mem_details_para, content_chunks[1]);
 
         // /tmp usage line with status icon
         let tmp_status = self.get_tmp_status();
-        let tmp_spans = StatusIcons::create_status_spans(tmp_status, &format!("tmp: {}", self.format_tmp_usage()));
+        let tmp_spans = StatusIcons::create_status_spans(
+            tmp_status,
+            &format!("tmp: {}", self.format_tmp_usage()),
+        );
         let tmp_para = Paragraph::new(ratatui::text::Line::from(tmp_spans));
         frame.render_widget(tmp_para, content_chunks[2]);
     }
@ -1,15 +1,15 @@
 use cm_dashboard_shared::Metric;
 use ratatui::{layout::Rect, Frame};
 
+pub mod backup;
 pub mod cpu;
 pub mod memory;
 pub mod services;
-pub mod backup;
 
+pub use backup::BackupWidget;
 pub use cpu::CpuWidget;
 pub use memory::MemoryWidget;
 pub use services::ServicesWidget;
-pub use backup::BackupWidget;
 
 /// Widget trait for UI components that display metrics
 pub trait Widget {
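The `Widget` trait body is truncated in this hunk; judging from the imports above and the `CpuWidget`/`MemoryWidget` hunks earlier in this commit, it presumably looks roughly like the following sketch (the `update` method name and signature are assumptions):

```rust
use cm_dashboard_shared::Metric;
use ratatui::{layout::Rect, Frame};

/// Assumed trait shape: widgets filter the metrics they care about on update,
/// then draw themselves into the area they are given.
pub trait Widget {
    /// Update internal state from the latest metrics for the selected host.
    fn update(&mut self, metrics: &[Metric]);

    /// Render the widget into `area`.
    fn render(&mut self, frame: &mut Frame, area: Rect);
}
```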
@ -24,26 +24,44 @@ pub struct CacheConfig {
 impl Default for CacheConfig {
     fn default() -> Self {
         let mut tiers = HashMap::new();
-        tiers.insert("realtime".to_string(), CacheTier {
-            interval_seconds: 2,
-            description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
-        });
-        tiers.insert("disk_light".to_string(), CacheTier {
-            interval_seconds: 60,
-            description: "Light disk operations - 1 minute (service status checks)".to_string(),
-        });
-        tiers.insert("disk_medium".to_string(), CacheTier {
-            interval_seconds: 300,
-            description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
-        });
-        tiers.insert("disk_heavy".to_string(), CacheTier {
-            interval_seconds: 900,
-            description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
-        });
-        tiers.insert("static".to_string(), CacheTier {
-            interval_seconds: 3600,
-            description: "Hardware info that rarely changes - 1 hour".to_string(),
-        });
+        tiers.insert(
+            "realtime".to_string(),
+            CacheTier {
+                interval_seconds: 2,
+                description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)"
+                    .to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_light".to_string(),
+            CacheTier {
+                interval_seconds: 10,
+                description: "Light disk operations - 10 seconds (service status checks)".to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_medium".to_string(),
+            CacheTier {
+                interval_seconds: 60,
+                description: "Medium disk operations - 1 minute (disk usage, service disk)"
+                    .to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_heavy".to_string(),
+            CacheTier {
+                interval_seconds: 60,
+                description: "Heavy disk operations - 1 minute (backup status)".to_string(),
+            },
+        );
+        tiers.insert(
+            "static".to_string(),
+            CacheTier {
+                interval_seconds: 600,
+                description: "SMART data operations - 10 minutes".to_string(),
+            },
+        );
 
         let mut metric_assignments = HashMap::new();
 
@ -65,12 +83,14 @@ impl Default for CacheConfig {
         metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
         metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
 
-        // DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
-        metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
-        metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
-        metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
+        // DISK_HEAVY (1min) - Heavy disk operations: backup status
         metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
 
+        // STATIC (10min) - SMART data operations
+        metric_assignments.insert("disk_*_temperature".to_string(), "static".to_string());
+        metric_assignments.insert("disk_*_wear_percent".to_string(), "static".to_string());
+        metric_assignments.insert("smart_*".to_string(), "static".to_string());
+
         Self {
             enabled: true,
             default_ttl_seconds: 30,
@ -118,9 +138,9 @@ impl CacheConfig {
             }
         } else {
            // More complex patterns - for now, just check if all parts are present
-            pattern_parts.iter().all(|part| {
-                part.is_empty() || metric_name.contains(part)
-            })
+            pattern_parts
+                .iter()
+                .all(|part| part.is_empty() || metric_name.contains(part))
         }
     } else {
         metric_name == pattern
@ -157,15 +177,15 @@ mod tests {
         assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
         assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
 
-        // Disk light (60s) - Service status
-        assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
+        // Disk light (10s) - Service status
+        assert_eq!(config.get_cache_interval("service_nginx_status"), 10);
 
-        // Disk medium (300s) - Disk usage
-        assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
-        assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
+        // Disk medium (60s) - Disk usage
+        assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 60);
+        assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 60);
 
-        // Disk heavy (900s) - SMART data
-        assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
-        assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
+        // Static (600s) - SMART data
+        assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 600);
+        assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 600);
     }
 }
@ -1,5 +1,6 @@
-use serde::{Deserialize, Serialize};
 use chrono::Utc;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 
 /// Individual metric with value, status, and metadata
 #[derive(Debug, Clone, Serialize, Deserialize)]
@ -100,6 +101,118 @@ impl Default for Status {
     }
 }
 
+/// Hysteresis thresholds for preventing status flapping
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HysteresisThresholds {
+    /// Warning threshold - trigger warning when value >= this
+    pub warning_high: f32,
+    /// Warning recovery - return to ok when value < this
+    pub warning_low: f32,
+    /// Critical threshold - trigger critical when value >= this
+    pub critical_high: f32,
+    /// Critical recovery - return to warning when value < this
+    pub critical_low: f32,
+}
+
+impl HysteresisThresholds {
+    pub fn new(warning_high: f32, critical_high: f32) -> Self {
+        // Default hysteresis: 10% gap for recovery
+        let warning_gap = warning_high * 0.1;
+        let critical_gap = critical_high * 0.1;
+
+        Self {
+            warning_high,
+            warning_low: warning_high - warning_gap,
+            critical_high,
+            critical_low: critical_high - critical_gap,
+        }
+    }
+
+    pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
+        Self {
+            warning_high,
+            warning_low: warning_high - warning_gap,
+            critical_high,
+            critical_low: critical_high - critical_gap,
+        }
+    }
+
+    /// Calculate status with hysteresis based on current value and previous status
+    pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
+        match previous_status {
+            Status::Ok => {
+                if value >= self.critical_high {
+                    Status::Critical
+                } else if value >= self.warning_high {
+                    Status::Warning
+                } else {
+                    Status::Ok
+                }
+            }
+            Status::Warning => {
+                if value >= self.critical_high {
+                    Status::Critical
+                } else if value < self.warning_low {
+                    Status::Ok
+                } else {
+                    Status::Warning
+                }
+            }
+            Status::Critical => {
+                if value < self.critical_low {
+                    if value < self.warning_low {
+                        Status::Ok
+                    } else {
+                        Status::Warning
+                    }
+                } else {
+                    Status::Critical
+                }
+            }
+            Status::Unknown => {
+                // First measurement, use normal thresholds
+                if value >= self.critical_high {
+                    Status::Critical
+                } else if value >= self.warning_high {
+                    Status::Warning
+                } else {
+                    Status::Ok
+                }
+            }
+        }
+    }
+}
+
+/// Status tracker for hysteresis - tracks previous status per metric
+#[derive(Debug, Default)]
+pub struct StatusTracker {
+    previous_statuses: HashMap<String, Status>,
+}
+
+impl StatusTracker {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Get previous status for a metric
+    pub fn get_previous_status(&self, metric_name: &str) -> Status {
+        self.previous_statuses.get(metric_name).copied().unwrap_or(Status::Unknown)
+    }
+
+    /// Update status for a metric
+    pub fn update_status(&mut self, metric_name: String, status: Status) {
+        self.previous_statuses.insert(metric_name, status);
+    }
+
+    /// Calculate status with hysteresis
+    pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
+        let previous = self.get_previous_status(metric_name);
+        let new_status = thresholds.calculate_status(value, previous);
+        self.update_status(metric_name.to_string(), new_status);
+        new_status
+    }
+}
+
 /// Metric name registry - constants for all metric names
 pub mod registry {
     // CPU metrics
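Taken together, `HysteresisThresholds` and `StatusTracker` give a collector a single call that yields a non-flapping status. A minimal usage sketch — the metric name and the 9.0/10.0 thresholds are illustrative, not taken from this diff:

```rust
use cm_dashboard_shared::{HysteresisThresholds, Status, StatusTracker};

fn main() {
    // Warning at >= 9.0, critical at >= 10.0, with the default 10% recovery gap:
    // warning_low becomes 8.1 and critical_low becomes 9.0.
    let thresholds = HysteresisThresholds::new(9.0, 10.0);
    let mut tracker = StatusTracker::new();

    // A load value oscillating just around the warning threshold no longer flaps.
    let s1 = tracker.calculate_with_hysteresis("cpu_load_5min", 9.2, &thresholds);
    let s2 = tracker.calculate_with_hysteresis("cpu_load_5min", 8.8, &thresholds);
    let s3 = tracker.calculate_with_hysteresis("cpu_load_5min", 8.0, &thresholds);

    assert!(matches!(s1, Status::Warning)); // crossed warning_high
    assert!(matches!(s2, Status::Warning)); // still above warning_low (8.1), stays Warning
    assert!(matches!(s3, Status::Ok)); // dropped below warning_low, recovers
}
```

Collectors that need an absolute recovery gap rather than the 10% default (for example a fixed degree gap on temperatures) would go through `with_custom_gaps` instead.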
@ -1,5 +1,5 @@
-use serde::{Deserialize, Serialize};
 use crate::metrics::Metric;
+use serde::{Deserialize, Serialize};
 
 /// Message sent from agent to dashboard via ZMQ
 #[derive(Debug, Clone, Serialize, Deserialize)]