Testing
This commit is contained in:
parent
9e344fb66d
commit
57b676ad25
32
CLAUDE.md
32
CLAUDE.md
@ -232,12 +232,12 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta
|
|||||||
- Rate limiting: configurable (set to 0 for testing, 30 minutes for production)
|
- Rate limiting: configurable (set to 0 for testing, 30 minutes for production)
|
||||||
|
|
||||||
**Monitored Components:**
|
**Monitored Components:**
|
||||||
- system.cpu (load status)
|
- system.cpu (load status) - SystemCollector
|
||||||
- system.cpu_temp (temperature status)
|
- system.memory (usage status) - SystemCollector
|
||||||
- system.memory (usage status)
|
- system.cpu_temp (temperature status) - SystemCollector (disabled)
|
||||||
- system.services (service health status)
|
- system.services (service health status) - ServiceCollector
|
||||||
- storage.smart (drive health)
|
- storage.smart (drive health) - SmartCollector
|
||||||
- backup.overall (backup status)
|
- backup.overall (backup status) - BackupCollector
|
||||||
|
|
||||||
### Pure Auto-Discovery Implementation
|
### Pure Auto-Discovery Implementation
|
||||||
|
|
||||||
@ -262,10 +262,24 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta
|
|||||||
- [x] CPU temperature monitoring and notifications
|
- [x] CPU temperature monitoring and notifications
|
||||||
- [x] ZMQ message format standardization
|
- [x] ZMQ message format standardization
|
||||||
- [x] Removed all hardcoded dashboard thresholds
|
- [x] Removed all hardcoded dashboard thresholds
|
||||||
|
- [x] CPU thresholds restored to production values (5.0/8.0)
|
||||||
|
- [x] All collectors output standardized status strings (ok/warning/critical/unknown)
|
||||||
|
- [x] Dashboard connection loss detection with 5-second keep-alive
|
||||||
|
- [x] Removed excessive logging from agent
|
||||||
|
- [x] Fixed all compiler warnings in both agent and dashboard
|
||||||
|
- [x] **SystemCollector architecture refactoring completed (2025-10-12)**
|
||||||
|
- [x] Created SystemCollector for CPU load, memory, temperature, C-states
|
||||||
|
- [x] Moved system metrics from ServiceCollector to SystemCollector
|
||||||
|
- [x] Updated dashboard to parse and display SystemCollector data
|
||||||
|
- [x] Enhanced service notifications to include specific failure details
|
||||||
|
- [x] CPU temperature thresholds set to 100°C (effectively disabled)
|
||||||
|
|
||||||
**Testing Configuration (REVERT FOR PRODUCTION):**
|
**Production Configuration:**
|
||||||
- CPU thresholds lowered to 2.0/4.0 for easy testing
|
- CPU load thresholds: Warning ≥ 5.0, Critical ≥ 8.0
|
||||||
- Email rate limiting disabled (0 minutes)
|
- CPU temperature thresholds: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
|
||||||
|
- Memory usage thresholds: Warning ≥ 80%, Critical ≥ 95%
|
||||||
|
- Connection timeout: 15 seconds (agents send data every 5 seconds)
|
||||||
|
- Email rate limiting: 30 minutes (set to 0 for testing)
|
||||||
|
|
||||||
### Development Guidelines
|
### Development Guidelines
|
||||||
|
|
||||||
|
|||||||
@ -6,6 +6,7 @@ pub mod backup;
|
|||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod service;
|
pub mod service;
|
||||||
pub mod smart;
|
pub mod smart;
|
||||||
|
pub mod system;
|
||||||
|
|
||||||
pub use error::CollectorError;
|
pub use error::CollectorError;
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,6 @@ use async_trait::async_trait;
|
|||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::process::Stdio;
|
use std::process::Stdio;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
@ -284,33 +283,6 @@ impl ServiceCollector {
|
|||||||
Ok(0.0) // No limit or couldn't parse
|
Ok(0.0) // No limit or couldn't parse
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_system_memory_info(&self) -> Result<SystemMemoryInfo, CollectorError> {
|
|
||||||
let meminfo =
|
|
||||||
fs::read_to_string("/proc/meminfo")
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::IoError {
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let mut memory_info = HashMap::new();
|
|
||||||
for line in meminfo.lines() {
|
|
||||||
if let Some((key, value)) = line.split_once(':') {
|
|
||||||
let value = value.trim().trim_end_matches(" kB");
|
|
||||||
if let Ok(kb) = value.parse::<u64>() {
|
|
||||||
memory_info.insert(key.to_string(), kb);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let total_kb = memory_info.get("MemTotal").copied().unwrap_or(0);
|
|
||||||
let available_kb = memory_info.get("MemAvailable").copied().unwrap_or(0);
|
|
||||||
let used_kb = total_kb.saturating_sub(available_kb);
|
|
||||||
|
|
||||||
Ok(SystemMemoryInfo {
|
|
||||||
total_mb: total_kb as f32 / 1024.0,
|
|
||||||
used_mb: used_kb as f32 / 1024.0,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
|
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
|
||||||
let output = Command::new("df")
|
let output = Command::new("df")
|
||||||
@ -363,59 +335,9 @@ impl ServiceCollector {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
|
|
||||||
let loadavg =
|
|
||||||
fs::read_to_string("/proc/loadavg")
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::IoError {
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let parts: Vec<&str> = loadavg.split_whitespace().collect();
|
|
||||||
if parts.len() < 3 {
|
|
||||||
return Err(CollectorError::ParseError {
|
|
||||||
message: "Unexpected /proc/loadavg format".to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let parse = |s: &str| -> Result<f32, CollectorError> {
|
|
||||||
s.parse::<f32>().map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse load average '{}': {}", s, e),
|
|
||||||
})
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok((parse(parts[0])?, parse(parts[1])?, parse(parts[2])?))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_cpu_status(&self, cpu_load_5: f32) -> String {
|
|
||||||
if cpu_load_5 >= 8.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if cpu_load_5 >= 5.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_memory_status(&self, usage_percent: f32) -> String {
|
|
||||||
if usage_percent >= 95.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if usage_percent >= 80.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_cpu_temp_status(&self, temp_c: f32) -> String {
|
|
||||||
if temp_c >= 80.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if temp_c >= 70.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_services_status(&self, healthy: usize, degraded: usize, failed: usize) -> String {
|
fn determine_services_status(&self, healthy: usize, degraded: usize, failed: usize) -> String {
|
||||||
if failed > 0 {
|
if failed > 0 {
|
||||||
@ -429,84 +351,6 @@ impl ServiceCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_cpu_cstate_info(&self) -> Option<Vec<String>> {
|
|
||||||
// Read C-state information to show all sleep state distributions
|
|
||||||
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
|
||||||
let mut total_time = 0u64;
|
|
||||||
|
|
||||||
// Check if C-state information is available
|
|
||||||
if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await {
|
|
||||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
|
||||||
let state_path = entry.path();
|
|
||||||
let name_path = state_path.join("name");
|
|
||||||
let time_path = state_path.join("time");
|
|
||||||
|
|
||||||
if let (Ok(name), Ok(time_str)) = (
|
|
||||||
fs::read_to_string(&name_path).await,
|
|
||||||
fs::read_to_string(&time_path).await
|
|
||||||
) {
|
|
||||||
let name = name.trim().to_string();
|
|
||||||
if let Ok(time) = time_str.trim().parse::<u64>() {
|
|
||||||
total_time += time;
|
|
||||||
cstate_times.push((name, time));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if total_time > 0 && !cstate_times.is_empty() {
|
|
||||||
// Sort by time spent (highest first)
|
|
||||||
cstate_times.sort_by(|a, b| b.1.cmp(&a.1));
|
|
||||||
|
|
||||||
// Format all C-states with percentages
|
|
||||||
let mut result = Vec::new();
|
|
||||||
for (name, time) in cstate_times {
|
|
||||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
|
||||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
|
||||||
result.push(format!("{}: {:.1}%", name, percent));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Some(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_cpu_temperature_c(&self) -> Option<f32> {
|
|
||||||
let mut entries = fs::read_dir("/sys/class/thermal").await.ok()?;
|
|
||||||
let mut fallback: Option<f32> = None;
|
|
||||||
|
|
||||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
|
||||||
let path = entry.path();
|
|
||||||
let type_path = path.join("type");
|
|
||||||
let temp_path = path.join("temp");
|
|
||||||
|
|
||||||
let label = fs::read_to_string(&type_path).await.ok()?.to_lowercase();
|
|
||||||
let raw = match fs::read_to_string(&temp_path).await {
|
|
||||||
Ok(value) => value,
|
|
||||||
Err(_) => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let milli: f32 = match raw.trim().parse() {
|
|
||||||
Ok(value) => value,
|
|
||||||
Err(_) => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let temp_c = milli / 1000.0;
|
|
||||||
if label.contains("cpu") || label.contains("pkg") {
|
|
||||||
if temp_c > 0.0 {
|
|
||||||
return Some(temp_c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if fallback.is_none() && temp_c > 0.0 {
|
|
||||||
fallback = Some(temp_c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fallback
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_gpu_metrics(&self) -> (Option<f32>, Option<f32>) {
|
async fn get_gpu_metrics(&self) -> (Option<f32>, Option<f32>) {
|
||||||
let output = Command::new("nvidia-smi")
|
let output = Command::new("nvidia-smi")
|
||||||
@ -983,43 +827,21 @@ impl Collector for ServiceCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get system memory info for quota calculation
|
|
||||||
let system_memory = self
|
|
||||||
.get_system_memory_info()
|
|
||||||
.await
|
|
||||||
.unwrap_or(SystemMemoryInfo {
|
|
||||||
total_mb: 0.0,
|
|
||||||
used_mb: 0.0,
|
|
||||||
});
|
|
||||||
|
|
||||||
let _disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage {
|
let _disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage {
|
||||||
total_gb: 0.0,
|
total_gb: 0.0,
|
||||||
used_gb: 0.0,
|
used_gb: 0.0,
|
||||||
});
|
});
|
||||||
|
|
||||||
let (cpu_load_1, cpu_load_5, cpu_load_15) =
|
|
||||||
self.get_cpu_load().await.unwrap_or((0.0, 0.0, 0.0));
|
|
||||||
let cpu_status = self.determine_cpu_status(cpu_load_5);
|
|
||||||
|
|
||||||
// Calculate memory usage percentage and status
|
|
||||||
let memory_usage_percent = if system_memory.total_mb > 0.0 {
|
|
||||||
(system_memory.used_mb / system_memory.total_mb) * 100.0
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
let memory_status = self.determine_memory_status(memory_usage_percent);
|
|
||||||
|
|
||||||
// Calculate overall services status
|
// Calculate overall services status
|
||||||
let services_status = self.determine_services_status(healthy, degraded, failed);
|
let services_status = self.determine_services_status(healthy, degraded, failed);
|
||||||
|
|
||||||
let cpu_cstate_info = self.get_cpu_cstate_info().await;
|
|
||||||
let cpu_temp_c = self.get_cpu_temperature_c().await;
|
|
||||||
let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp));
|
|
||||||
let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await;
|
let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await;
|
||||||
|
|
||||||
// If no specific quotas are set, use system memory as reference
|
// If no specific quotas are set, use a default value
|
||||||
if total_memory_quota == 0.0 {
|
if total_memory_quota == 0.0 {
|
||||||
total_memory_quota = system_memory.total_mb;
|
total_memory_quota = 8192.0; // Default 8GB for quota calculation
|
||||||
}
|
}
|
||||||
|
|
||||||
let service_metrics = json!({
|
let service_metrics = json!({
|
||||||
@ -1030,18 +852,8 @@ impl Collector for ServiceCollector {
|
|||||||
"services_status": services_status,
|
"services_status": services_status,
|
||||||
"memory_used_mb": total_memory_used,
|
"memory_used_mb": total_memory_used,
|
||||||
"memory_quota_mb": total_memory_quota,
|
"memory_quota_mb": total_memory_quota,
|
||||||
"system_memory_used_mb": system_memory.used_mb,
|
|
||||||
"system_memory_total_mb": system_memory.total_mb,
|
|
||||||
"memory_status": memory_status,
|
|
||||||
"disk_used_gb": total_disk_used,
|
"disk_used_gb": total_disk_used,
|
||||||
"disk_total_gb": total_disk_used, // For services, total = used (no quota concept)
|
"disk_total_gb": total_disk_used, // For services, total = used (no quota concept)
|
||||||
"cpu_load_1": cpu_load_1,
|
|
||||||
"cpu_load_5": cpu_load_5,
|
|
||||||
"cpu_load_15": cpu_load_15,
|
|
||||||
"cpu_status": cpu_status,
|
|
||||||
"cpu_cstate": cpu_cstate_info,
|
|
||||||
"cpu_temp_c": cpu_temp_c,
|
|
||||||
"cpu_temp_status": cpu_temp_status,
|
|
||||||
"gpu_load_percent": gpu_load_percent,
|
"gpu_load_percent": gpu_load_percent,
|
||||||
"gpu_temp_c": gpu_temp_c,
|
"gpu_temp_c": gpu_temp_c,
|
||||||
},
|
},
|
||||||
@ -1077,10 +889,6 @@ enum ServiceStatus {
|
|||||||
Stopped,
|
Stopped,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct SystemMemoryInfo {
|
|
||||||
total_mb: f32,
|
|
||||||
used_mb: f32,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
struct DiskUsage {
|
struct DiskUsage {
|
||||||
|
|||||||
271
agent/src/collectors/system.rs
Normal file
271
agent/src/collectors/system.rs
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use serde_json::json;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::fs;
|
||||||
|
use tokio::process::Command;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError, CollectorOutput, AgentType};
|
||||||
|
|
||||||
|
/// Collector for host-level metrics: CPU load, CPU temperature, memory usage and C-states.
#[derive(Debug, Clone)]
pub struct SystemCollector {
    /// When `false`, `collect` refuses to run and returns a configuration error.
    enabled: bool,
    /// How often the collector should be polled, as reported by `collect_interval`.
    interval: Duration,
}
|
||||||
|
|
||||||
|
impl SystemCollector {
|
||||||
|
pub fn new(enabled: bool, interval_ms: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
enabled,
|
||||||
|
interval: Duration::from_millis(interval_ms),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
|
||||||
|
let output = Command::new("uptime")
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.map_err(|e| CollectorError::CommandFailed {
|
||||||
|
command: "uptime".to_string(),
|
||||||
|
message: e.to_string()
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let uptime_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
// Parse load averages from uptime output
|
||||||
|
// Format with comma decimals: "... load average: 3,30, 3,17, 2,84"
|
||||||
|
if let Some(load_part) = uptime_str.split("load average:").nth(1) {
|
||||||
|
// Use regex or careful parsing for comma decimal separator locale
|
||||||
|
let load_str = load_part.trim();
|
||||||
|
// Split on ", " to separate the three load values
|
||||||
|
let loads: Vec<&str> = load_str.split(", ").collect();
|
||||||
|
if loads.len() >= 3 {
|
||||||
|
let load_1 = loads[0].trim().replace(',', ".").parse::<f32>()
|
||||||
|
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 1min load".to_string() })?;
|
||||||
|
let load_5 = loads[1].trim().replace(',', ".").parse::<f32>()
|
||||||
|
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 5min load".to_string() })?;
|
||||||
|
let load_15 = loads[2].trim().replace(',', ".").parse::<f32>()
|
||||||
|
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 15min load".to_string() })?;
|
||||||
|
|
||||||
|
return Ok((load_1, load_5, load_15));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(CollectorError::ParseError { message: "Failed to parse load averages".to_string() })
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_cpu_temperature(&self) -> Option<f32> {
|
||||||
|
// Try to find CPU-specific thermal zones first (x86_pkg_temp, coretemp, etc.)
|
||||||
|
for i in 0..10 {
|
||||||
|
let type_path = format!("/sys/class/thermal/thermal_zone{}/type", i);
|
||||||
|
let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);
|
||||||
|
|
||||||
|
if let (Ok(zone_type), Ok(temp_str)) = (
|
||||||
|
fs::read_to_string(&type_path).await,
|
||||||
|
fs::read_to_string(&temp_path).await,
|
||||||
|
) {
|
||||||
|
let zone_type = zone_type.trim();
|
||||||
|
if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
|
||||||
|
let temp_c = temp_millic / 1000.0;
|
||||||
|
// Look for reasonable temperatures first
|
||||||
|
if temp_c > 20.0 && temp_c < 150.0 {
|
||||||
|
// Prefer CPU package temperature zones
|
||||||
|
if zone_type == "x86_pkg_temp" || zone_type.contains("coretemp") {
|
||||||
|
debug!("Found CPU temperature: {}°C from {} ({})", temp_c, temp_path, zone_type);
|
||||||
|
return Some(temp_c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: try any reasonable temperature if no CPU-specific zone found
|
||||||
|
for i in 0..10 {
|
||||||
|
let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);
|
||||||
|
if let Ok(temp_str) = fs::read_to_string(&temp_path).await {
|
||||||
|
if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
|
||||||
|
let temp_c = temp_millic / 1000.0;
|
||||||
|
if temp_c > 20.0 && temp_c < 150.0 {
|
||||||
|
debug!("Found fallback temperature: {}°C from {}", temp_c, temp_path);
|
||||||
|
return Some(temp_c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_memory_info(&self) -> Result<(f32, f32), CollectorError> {
|
||||||
|
let meminfo = fs::read_to_string("/proc/meminfo")
|
||||||
|
.await
|
||||||
|
.map_err(|e| CollectorError::IoError { message: format!("Failed to read /proc/meminfo: {}", e) })?;
|
||||||
|
|
||||||
|
let mut total_kb = 0;
|
||||||
|
let mut available_kb = 0;
|
||||||
|
|
||||||
|
for line in meminfo.lines() {
|
||||||
|
if line.starts_with("MemTotal:") {
|
||||||
|
if let Some(value) = line.split_whitespace().nth(1) {
|
||||||
|
total_kb = value.parse::<u64>().unwrap_or(0);
|
||||||
|
}
|
||||||
|
} else if line.starts_with("MemAvailable:") {
|
||||||
|
if let Some(value) = line.split_whitespace().nth(1) {
|
||||||
|
available_kb = value.parse::<u64>().unwrap_or(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if total_kb == 0 {
|
||||||
|
return Err(CollectorError::ParseError { message: "Could not parse total memory".to_string() });
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_mb = total_kb as f32 / 1024.0;
|
||||||
|
let used_mb = total_mb - (available_kb as f32 / 1024.0);
|
||||||
|
|
||||||
|
Ok((used_mb, total_mb))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_cpu_cstate_info(&self) -> Option<Vec<String>> {
|
||||||
|
// Read C-state information to show all sleep state distributions
|
||||||
|
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
||||||
|
let mut total_time = 0u64;
|
||||||
|
|
||||||
|
// Check if C-state information is available
|
||||||
|
if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await {
|
||||||
|
while let Ok(Some(entry)) = entries.next_entry().await {
|
||||||
|
let state_path = entry.path();
|
||||||
|
let name_path = state_path.join("name");
|
||||||
|
let time_path = state_path.join("time");
|
||||||
|
|
||||||
|
if let (Ok(name), Ok(time_str)) = (
|
||||||
|
fs::read_to_string(&name_path).await,
|
||||||
|
fs::read_to_string(&time_path).await
|
||||||
|
) {
|
||||||
|
let name = name.trim().to_string();
|
||||||
|
if let Ok(time) = time_str.trim().parse::<u64>() {
|
||||||
|
total_time += time;
|
||||||
|
cstate_times.push((name, time));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if total_time > 0 && !cstate_times.is_empty() {
|
||||||
|
// Sort by time spent (highest first)
|
||||||
|
cstate_times.sort_by(|a, b| b.1.cmp(&a.1));
|
||||||
|
|
||||||
|
// Format all C-states with percentages
|
||||||
|
let mut result = Vec::new();
|
||||||
|
for (name, time) in cstate_times {
|
||||||
|
let percent = (time as f32 / total_time as f32) * 100.0;
|
||||||
|
if percent >= 0.1 { // Only show states with at least 0.1% time
|
||||||
|
result.push(format!("{}: {:.1}%", name, percent));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Some(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn determine_cpu_status(&self, cpu_load_5: f32) -> String {
|
||||||
|
if cpu_load_5 >= 8.0 {
|
||||||
|
"critical".to_string()
|
||||||
|
} else if cpu_load_5 >= 5.0 {
|
||||||
|
"warning".to_string()
|
||||||
|
} else {
|
||||||
|
"ok".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn determine_cpu_temp_status(&self, temp_c: f32) -> String {
|
||||||
|
if temp_c >= 100.0 {
|
||||||
|
"critical".to_string()
|
||||||
|
} else if temp_c >= 100.0 {
|
||||||
|
"warning".to_string()
|
||||||
|
} else {
|
||||||
|
"ok".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn determine_memory_status(&self, usage_percent: f32) -> String {
|
||||||
|
if usage_percent >= 95.0 {
|
||||||
|
"critical".to_string()
|
||||||
|
} else if usage_percent >= 80.0 {
|
||||||
|
"warning".to_string()
|
||||||
|
} else {
|
||||||
|
"ok".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for SystemCollector {
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"system"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn agent_type(&self) -> AgentType {
|
||||||
|
AgentType::System
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_interval(&self) -> Duration {
|
||||||
|
self.interval
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
|
||||||
|
if !self.enabled {
|
||||||
|
return Err(CollectorError::ConfigError { message: "SystemCollector disabled".to_string() });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get CPU load averages
|
||||||
|
let (cpu_load_1, cpu_load_5, cpu_load_15) = self.get_cpu_load().await?;
|
||||||
|
let cpu_status = self.determine_cpu_status(cpu_load_5);
|
||||||
|
|
||||||
|
// Get CPU temperature (optional)
|
||||||
|
let cpu_temp_c = self.get_cpu_temperature().await;
|
||||||
|
let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp));
|
||||||
|
|
||||||
|
// Get memory information
|
||||||
|
let (memory_used_mb, memory_total_mb) = self.get_memory_info().await?;
|
||||||
|
let memory_usage_percent = (memory_used_mb / memory_total_mb) * 100.0;
|
||||||
|
let memory_status = self.determine_memory_status(memory_usage_percent);
|
||||||
|
|
||||||
|
// Get C-state information (optional)
|
||||||
|
let cpu_cstate_info = self.get_cpu_cstate_info().await;
|
||||||
|
|
||||||
|
let mut system_metrics = json!({
|
||||||
|
"summary": {
|
||||||
|
"cpu_load_1": cpu_load_1,
|
||||||
|
"cpu_load_5": cpu_load_5,
|
||||||
|
"cpu_load_15": cpu_load_15,
|
||||||
|
"cpu_status": cpu_status,
|
||||||
|
"memory_used_mb": memory_used_mb,
|
||||||
|
"memory_total_mb": memory_total_mb,
|
||||||
|
"memory_usage_percent": memory_usage_percent,
|
||||||
|
"memory_status": memory_status,
|
||||||
|
},
|
||||||
|
"timestamp": chrono::Utc::now().timestamp() as u64,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add optional metrics if available
|
||||||
|
if let Some(temp) = cpu_temp_c {
|
||||||
|
system_metrics["summary"]["cpu_temp_c"] = json!(temp);
|
||||||
|
if let Some(status) = cpu_temp_status {
|
||||||
|
system_metrics["summary"]["cpu_temp_status"] = json!(status);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(cstates) = cpu_cstate_info {
|
||||||
|
system_metrics["summary"]["cpu_cstate"] = json!(cstates);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("System metrics collected: CPU load {:.2}, Memory {:.1}%",
|
||||||
|
cpu_load_5, memory_usage_percent);
|
||||||
|
|
||||||
|
Ok(CollectorOutput {
|
||||||
|
agent_type: AgentType::System,
|
||||||
|
data: system_metrics,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -35,6 +35,7 @@ pub struct StatusChange {
|
|||||||
pub old_status: String,
|
pub old_status: String,
|
||||||
pub new_status: String,
|
pub new_status: String,
|
||||||
pub timestamp: DateTime<Utc>,
|
pub timestamp: DateTime<Utc>,
|
||||||
|
pub details: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct NotificationManager {
|
pub struct NotificationManager {
|
||||||
@ -53,6 +54,10 @@ impl NotificationManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option<StatusChange> {
|
pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option<StatusChange> {
|
||||||
|
self.update_status_with_details(component, metric, status, None)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn update_status_with_details(&mut self, component: &str, metric: &str, status: &str, details: Option<String>) -> Option<StatusChange> {
|
||||||
let key = format!("{}.{}", component, metric);
|
let key = format!("{}.{}", component, metric);
|
||||||
let old_status = self.last_status.get(&key).cloned();
|
let old_status = self.last_status.get(&key).cloned();
|
||||||
|
|
||||||
@ -64,6 +69,7 @@ impl NotificationManager {
|
|||||||
old_status: old.clone(),
|
old_status: old.clone(),
|
||||||
new_status: status.to_string(),
|
new_status: status.to_string(),
|
||||||
timestamp: Utc::now(),
|
timestamp: Utc::now(),
|
||||||
|
details,
|
||||||
};
|
};
|
||||||
|
|
||||||
self.last_status.insert(key, status.to_string());
|
self.last_status.insert(key, status.to_string());
|
||||||
@ -154,26 +160,34 @@ impl NotificationManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn format_body(&self, change: &StatusChange) -> String {
|
fn format_body(&self, change: &StatusChange) -> String {
|
||||||
format!(
|
let mut body = format!(
|
||||||
"Status Change Alert\n\
|
"Status Change Alert\n\
|
||||||
\n\
|
\n\
|
||||||
Host: {}\n\
|
Host: {}\n\
|
||||||
Component: {}\n\
|
Component: {}\n\
|
||||||
Metric: {}\n\
|
Metric: {}\n\
|
||||||
Status Change: {} → {}\n\
|
Status Change: {} → {}\n\
|
||||||
Time: {}\n\
|
Time: {}",
|
||||||
\n\
|
|
||||||
--\n\
|
|
||||||
CM Dashboard Agent\n\
|
|
||||||
Generated at {}",
|
|
||||||
gethostname::gethostname().to_string_lossy(),
|
gethostname::gethostname().to_string_lossy(),
|
||||||
change.component,
|
change.component,
|
||||||
change.metric,
|
change.metric,
|
||||||
change.old_status,
|
change.old_status,
|
||||||
change.new_status,
|
change.new_status,
|
||||||
change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST"),
|
change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Some(details) = &change.details {
|
||||||
|
body.push_str(&format!("\n\nDetails:\n{}", details));
|
||||||
|
}
|
||||||
|
|
||||||
|
body.push_str(&format!(
|
||||||
|
"\n\n--\n\
|
||||||
|
CM Dashboard Agent\n\
|
||||||
|
Generated at {}",
|
||||||
Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
|
Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
|
||||||
)
|
));
|
||||||
|
|
||||||
|
body
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn send_email(&self, subject: &str, body: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
async fn send_email(&self, subject: &str, body: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
|
|||||||
@ -9,6 +9,7 @@ use crate::collectors::{
|
|||||||
backup::BackupCollector,
|
backup::BackupCollector,
|
||||||
service::ServiceCollector,
|
service::ServiceCollector,
|
||||||
smart::SmartCollector,
|
smart::SmartCollector,
|
||||||
|
system::SystemCollector,
|
||||||
Collector
|
Collector
|
||||||
};
|
};
|
||||||
use cm_dashboard_shared::envelope::AgentType;
|
use cm_dashboard_shared::envelope::AgentType;
|
||||||
@ -60,6 +61,11 @@ impl SimpleAgent {
|
|||||||
warn!("No storage devices found - SMART monitoring disabled");
|
warn!("No storage devices found - SMART monitoring disabled");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// System collector
|
||||||
|
let system_collector = SystemCollector::new(true, 5000);
|
||||||
|
collectors.push(Box::new(system_collector));
|
||||||
|
info!("System monitoring: CPU, memory, temperature, C-states");
|
||||||
|
|
||||||
// Service collector
|
// Service collector
|
||||||
let services = AutoDiscovery::discover_services().await;
|
let services = AutoDiscovery::discover_services().await;
|
||||||
let service_list = if !services.is_empty() {
|
let service_list = if !services.is_empty() {
|
||||||
@ -161,32 +167,10 @@ impl SimpleAgent {
|
|||||||
match output.agent_type {
|
match output.agent_type {
|
||||||
AgentType::Service => {
|
AgentType::Service => {
|
||||||
if let Some(summary) = output.data.get("summary") {
|
if let Some(summary) = output.data.get("summary") {
|
||||||
// Check CPU status
|
|
||||||
if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) {
|
|
||||||
if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) {
|
|
||||||
info!("CPU status change detected: {} -> {}", change.old_status, change.new_status);
|
|
||||||
self.notification_manager.send_notification(change).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check memory status
|
|
||||||
if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) {
|
|
||||||
if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) {
|
|
||||||
self.notification_manager.send_notification(change).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check CPU temperature status
|
|
||||||
if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) {
|
|
||||||
if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) {
|
|
||||||
info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status);
|
|
||||||
self.notification_manager.send_notification(change).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check services status
|
// Check services status
|
||||||
if let Some(services_status) = summary.get("services_status").and_then(|v| v.as_str()) {
|
if let Some(services_status) = summary.get("services_status").and_then(|v| v.as_str()) {
|
||||||
if let Some(change) = self.notification_manager.update_status("system", "services", services_status) {
|
let details = self.build_service_failure_details(output);
|
||||||
|
if let Some(change) = self.notification_manager.update_status_with_details("system", "services", services_status, details) {
|
||||||
self.notification_manager.send_notification(change).await;
|
self.notification_manager.send_notification(change).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -205,6 +189,33 @@ impl SimpleAgent {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
AgentType::System => {
|
||||||
|
if let Some(summary) = output.data.get("summary") {
|
||||||
|
// Check CPU status
|
||||||
|
if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) {
|
||||||
|
if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) {
|
||||||
|
info!("CPU status change detected: {} -> {}", change.old_status, change.new_status);
|
||||||
|
self.notification_manager.send_notification(change).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check memory status
|
||||||
|
if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) {
|
||||||
|
if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) {
|
||||||
|
info!("Memory status change detected: {} -> {}", change.old_status, change.new_status);
|
||||||
|
self.notification_manager.send_notification(change).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check CPU temp status (optional)
|
||||||
|
if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) {
|
||||||
|
if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) {
|
||||||
|
info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status);
|
||||||
|
self.notification_manager.send_notification(change).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
AgentType::Backup => {
|
AgentType::Backup => {
|
||||||
if let Some(status) = output.data.get("overall_status") {
|
if let Some(status) = output.data.get("overall_status") {
|
||||||
let status_str = match status.as_str() {
|
let status_str = match status.as_str() {
|
||||||
@ -220,4 +231,69 @@ impl SimpleAgent {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn build_service_failure_details(&self, output: &crate::collectors::CollectorOutput) -> Option<String> {
|
||||||
|
if let Some(services) = output.data.get("services").and_then(|v| v.as_array()) {
|
||||||
|
let mut failed_services = Vec::new();
|
||||||
|
let mut degraded_services = Vec::new();
|
||||||
|
|
||||||
|
for service in services {
|
||||||
|
if let (Some(name), Some(status)) = (
|
||||||
|
service.get("name").and_then(|v| v.as_str()),
|
||||||
|
service.get("status").and_then(|v| v.as_str())
|
||||||
|
) {
|
||||||
|
match status {
|
||||||
|
"Stopped" => {
|
||||||
|
let memory = service.get("memory_used_mb")
|
||||||
|
.and_then(|v| v.as_f64())
|
||||||
|
.unwrap_or(0.0);
|
||||||
|
let disk = service.get("disk_used_gb")
|
||||||
|
.and_then(|v| v.as_f64())
|
||||||
|
.unwrap_or(0.0);
|
||||||
|
failed_services.push(format!("{} (stopped, was using {:.1}MB RAM, {:.1}GB disk)",
|
||||||
|
name, memory, disk));
|
||||||
|
},
|
||||||
|
"Degraded" | "Restarting" => {
|
||||||
|
let memory = service.get("memory_used_mb")
|
||||||
|
.and_then(|v| v.as_f64())
|
||||||
|
.unwrap_or(0.0);
|
||||||
|
let disk = service.get("disk_used_gb")
|
||||||
|
.and_then(|v| v.as_f64())
|
||||||
|
.unwrap_or(0.0);
|
||||||
|
degraded_services.push(format!("{} ({}, using {:.1}MB RAM, {:.1}GB disk)",
|
||||||
|
name, status.to_lowercase(), memory, disk));
|
||||||
|
},
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !failed_services.is_empty() || !degraded_services.is_empty() {
|
||||||
|
let mut details = String::new();
|
||||||
|
|
||||||
|
if !failed_services.is_empty() {
|
||||||
|
details.push_str("Failed services:\n");
|
||||||
|
for service in &failed_services {
|
||||||
|
details.push_str(&format!("- {}\n", service));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !degraded_services.is_empty() {
|
||||||
|
if !details.is_empty() {
|
||||||
|
details.push('\n');
|
||||||
|
}
|
||||||
|
details.push_str("Degraded services:\n");
|
||||||
|
for service in &degraded_services {
|
||||||
|
details.push_str(&format!("- {}\n", service));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(details.trim_end().to_string())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@ -10,7 +10,11 @@ use gethostname::gethostname;
|
|||||||
use crate::config;
|
use crate::config;
|
||||||
use crate::data::config::{AppConfig, DataSourceKind, HostTarget, ZmqConfig};
|
use crate::data::config::{AppConfig, DataSourceKind, HostTarget, ZmqConfig};
|
||||||
use crate::data::history::MetricsHistory;
|
use crate::data::history::MetricsHistory;
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics};
|
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
||||||
|
|
||||||
|
// Host connection timeout - if no data received for this duration, mark as timeout
|
||||||
|
// Keep-alive mechanism: agents send data every 5 seconds, timeout after 15 seconds
|
||||||
|
const HOST_CONNECTION_TIMEOUT: Duration = Duration::from_secs(15);
|
||||||
|
|
||||||
/// Shared application settings derived from the CLI arguments.
|
/// Shared application settings derived from the CLI arguments.
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@ -32,11 +36,22 @@ impl AppOptions {
|
|||||||
struct HostRuntimeState {
|
struct HostRuntimeState {
|
||||||
last_success: Option<DateTime<Utc>>,
|
last_success: Option<DateTime<Utc>>,
|
||||||
last_error: Option<String>,
|
last_error: Option<String>,
|
||||||
|
connection_status: ConnectionStatus,
|
||||||
smart: Option<SmartMetrics>,
|
smart: Option<SmartMetrics>,
|
||||||
services: Option<ServiceMetrics>,
|
services: Option<ServiceMetrics>,
|
||||||
|
system: Option<SystemMetrics>,
|
||||||
backup: Option<BackupMetrics>,
|
backup: Option<BackupMetrics>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Default)]
|
||||||
|
pub enum ConnectionStatus {
|
||||||
|
#[default]
|
||||||
|
Unknown,
|
||||||
|
Connected,
|
||||||
|
Timeout,
|
||||||
|
Error,
|
||||||
|
}
|
||||||
|
|
||||||
/// Top-level application state container.
|
/// Top-level application state container.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct App {
|
pub struct App {
|
||||||
@ -100,6 +115,10 @@ impl App {
|
|||||||
pub fn on_tick(&mut self) {
|
pub fn on_tick(&mut self) {
|
||||||
self.tick_count = self.tick_count.saturating_add(1);
|
self.tick_count = self.tick_count.saturating_add(1);
|
||||||
self.last_tick = Instant::now();
|
self.last_tick = Instant::now();
|
||||||
|
|
||||||
|
// Check for host connection timeouts
|
||||||
|
self.check_host_timeouts();
|
||||||
|
|
||||||
let host_count = self.hosts.len();
|
let host_count = self.hosts.len();
|
||||||
let retention = self.history.retention();
|
let retention = self.history.retention();
|
||||||
self.status = format!(
|
self.status = format!(
|
||||||
@ -193,8 +212,10 @@ impl App {
|
|||||||
name: host.name.clone(),
|
name: host.name.clone(),
|
||||||
last_success: state.last_success.clone(),
|
last_success: state.last_success.clone(),
|
||||||
last_error: state.last_error.clone(),
|
last_error: state.last_error.clone(),
|
||||||
|
connection_status: state.connection_status.clone(),
|
||||||
smart: state.smart.clone(),
|
smart: state.smart.clone(),
|
||||||
services: state.services.clone(),
|
services: state.services.clone(),
|
||||||
|
system: state.system.clone(),
|
||||||
backup: state.backup.clone(),
|
backup: state.backup.clone(),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@ -209,8 +230,10 @@ impl App {
|
|||||||
name: host.name.clone(),
|
name: host.name.clone(),
|
||||||
last_success: state.last_success.clone(),
|
last_success: state.last_success.clone(),
|
||||||
last_error: state.last_error.clone(),
|
last_error: state.last_error.clone(),
|
||||||
|
connection_status: state.connection_status.clone(),
|
||||||
smart: state.smart.clone(),
|
smart: state.smart.clone(),
|
||||||
services: state.services.clone(),
|
services: state.services.clone(),
|
||||||
|
system: state.system.clone(),
|
||||||
backup: state.backup.clone(),
|
backup: state.backup.clone(),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@ -237,6 +260,7 @@ impl App {
|
|||||||
host,
|
host,
|
||||||
smart,
|
smart,
|
||||||
services,
|
services,
|
||||||
|
system,
|
||||||
backup,
|
backup,
|
||||||
timestamp,
|
timestamp,
|
||||||
} => {
|
} => {
|
||||||
@ -245,6 +269,7 @@ impl App {
|
|||||||
let state = self.host_states.entry(host.clone()).or_default();
|
let state = self.host_states.entry(host.clone()).or_default();
|
||||||
state.last_success = Some(timestamp);
|
state.last_success = Some(timestamp);
|
||||||
state.last_error = None;
|
state.last_error = None;
|
||||||
|
state.connection_status = ConnectionStatus::Connected;
|
||||||
|
|
||||||
if let Some(mut smart_metrics) = smart {
|
if let Some(mut smart_metrics) = smart {
|
||||||
if smart_metrics.timestamp != timestamp {
|
if smart_metrics.timestamp != timestamp {
|
||||||
@ -267,6 +292,16 @@ impl App {
|
|||||||
state.services = Some(snapshot);
|
state.services = Some(snapshot);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(system_metrics) = system {
|
||||||
|
// Convert timestamp format (u64 to DateTime<Utc>)
|
||||||
|
let system_snapshot = SystemMetrics {
|
||||||
|
summary: system_metrics.summary,
|
||||||
|
timestamp: system_metrics.timestamp,
|
||||||
|
};
|
||||||
|
self.history.record_system(system_snapshot.clone());
|
||||||
|
state.system = Some(system_snapshot);
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(mut backup_metrics) = backup {
|
if let Some(mut backup_metrics) = backup {
|
||||||
if backup_metrics.timestamp != timestamp {
|
if backup_metrics.timestamp != timestamp {
|
||||||
backup_metrics.timestamp = timestamp;
|
backup_metrics.timestamp = timestamp;
|
||||||
@ -291,12 +326,37 @@ impl App {
|
|||||||
self.ensure_host_entry(&host);
|
self.ensure_host_entry(&host);
|
||||||
let state = self.host_states.entry(host.clone()).or_default();
|
let state = self.host_states.entry(host.clone()).or_default();
|
||||||
state.last_error = Some(format!("{} at {}", error, timestamp.format("%H:%M:%S")));
|
state.last_error = Some(format!("{} at {}", error, timestamp.format("%H:%M:%S")));
|
||||||
|
state.connection_status = ConnectionStatus::Error;
|
||||||
|
|
||||||
self.status = format!("Fetch failed • host: {} • {}", host, error);
|
self.status = format!("Fetch failed • host: {} • {}", host, error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn check_host_timeouts(&mut self) {
|
||||||
|
let now = Utc::now();
|
||||||
|
|
||||||
|
for (host_name, state) in self.host_states.iter_mut() {
|
||||||
|
if let Some(last_success) = state.last_success {
|
||||||
|
let duration_since_last = now.signed_duration_since(last_success);
|
||||||
|
|
||||||
|
if duration_since_last > chrono::Duration::from_std(HOST_CONNECTION_TIMEOUT).unwrap() {
|
||||||
|
// Host has timed out (missed keep-alive)
|
||||||
|
if !matches!(state.connection_status, ConnectionStatus::Timeout) {
|
||||||
|
state.connection_status = ConnectionStatus::Timeout;
|
||||||
|
state.last_error = Some(format!("Keep-alive timeout (no data for {}s)", duration_since_last.num_seconds()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Host is connected
|
||||||
|
state.connection_status = ConnectionStatus::Connected;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No data ever received from this host
|
||||||
|
state.connection_status = ConnectionStatus::Unknown;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn help_visible(&self) -> bool {
|
pub fn help_visible(&self) -> bool {
|
||||||
self.show_help
|
self.show_help
|
||||||
}
|
}
|
||||||
@ -511,8 +571,10 @@ pub struct HostDisplayData {
|
|||||||
pub name: String,
|
pub name: String,
|
||||||
pub last_success: Option<DateTime<Utc>>,
|
pub last_success: Option<DateTime<Utc>>,
|
||||||
pub last_error: Option<String>,
|
pub last_error: Option<String>,
|
||||||
|
pub connection_status: ConnectionStatus,
|
||||||
pub smart: Option<SmartMetrics>,
|
pub smart: Option<SmartMetrics>,
|
||||||
pub services: Option<ServiceMetrics>,
|
pub services: Option<ServiceMetrics>,
|
||||||
|
pub system: Option<SystemMetrics>,
|
||||||
pub backup: Option<BackupMetrics>,
|
pub backup: Option<BackupMetrics>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -545,6 +607,7 @@ pub enum AppEvent {
|
|||||||
host: String,
|
host: String,
|
||||||
smart: Option<SmartMetrics>,
|
smart: Option<SmartMetrics>,
|
||||||
services: Option<ServiceMetrics>,
|
services: Option<ServiceMetrics>,
|
||||||
|
system: Option<SystemMetrics>,
|
||||||
backup: Option<BackupMetrics>,
|
backup: Option<BackupMetrics>,
|
||||||
timestamp: DateTime<Utc>,
|
timestamp: DateTime<Utc>,
|
||||||
},
|
},
|
||||||
|
|||||||
@ -5,7 +5,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
|
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics};
|
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
||||||
|
|
||||||
/// Ring buffer for retaining recent samples for trend analysis.
|
/// Ring buffer for retaining recent samples for trend analysis.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@ -13,6 +13,7 @@ pub struct MetricsHistory {
|
|||||||
capacity: usize,
|
capacity: usize,
|
||||||
smart: VecDeque<(DateTime<Utc>, SmartMetrics)>,
|
smart: VecDeque<(DateTime<Utc>, SmartMetrics)>,
|
||||||
services: VecDeque<(DateTime<Utc>, ServiceMetrics)>,
|
services: VecDeque<(DateTime<Utc>, ServiceMetrics)>,
|
||||||
|
system: VecDeque<(DateTime<Utc>, SystemMetrics)>,
|
||||||
backups: VecDeque<(DateTime<Utc>, BackupMetrics)>,
|
backups: VecDeque<(DateTime<Utc>, BackupMetrics)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -22,6 +23,7 @@ impl MetricsHistory {
|
|||||||
capacity,
|
capacity,
|
||||||
smart: VecDeque::with_capacity(capacity),
|
smart: VecDeque::with_capacity(capacity),
|
||||||
services: VecDeque::with_capacity(capacity),
|
services: VecDeque::with_capacity(capacity),
|
||||||
|
system: VecDeque::with_capacity(capacity),
|
||||||
backups: VecDeque::with_capacity(capacity),
|
backups: VecDeque::with_capacity(capacity),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -36,6 +38,11 @@ impl MetricsHistory {
|
|||||||
Self::push_with_limit(&mut self.services, entry, self.capacity);
|
Self::push_with_limit(&mut self.services, entry, self.capacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn record_system(&mut self, metrics: SystemMetrics) {
|
||||||
|
let entry = (Utc::now(), metrics);
|
||||||
|
Self::push_with_limit(&mut self.system, entry, self.capacity);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn record_backup(&mut self, metrics: BackupMetrics) {
|
pub fn record_backup(&mut self, metrics: BackupMetrics) {
|
||||||
let entry = (Utc::now(), metrics);
|
let entry = (Utc::now(), metrics);
|
||||||
Self::push_with_limit(&mut self.backups, entry, self.capacity);
|
Self::push_with_limit(&mut self.backups, entry, self.capacity);
|
||||||
|
|||||||
@ -32,6 +32,32 @@ pub struct DriveSummary {
|
|||||||
pub capacity_used_gb: f32,
|
pub capacity_used_gb: f32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SystemMetrics {
|
||||||
|
pub summary: SystemSummary,
|
||||||
|
pub timestamp: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SystemSummary {
|
||||||
|
pub cpu_load_1: f32,
|
||||||
|
pub cpu_load_5: f32,
|
||||||
|
pub cpu_load_15: f32,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cpu_status: Option<String>,
|
||||||
|
pub memory_used_mb: f32,
|
||||||
|
pub memory_total_mb: f32,
|
||||||
|
pub memory_usage_percent: f32,
|
||||||
|
#[serde(default)]
|
||||||
|
pub memory_status: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cpu_temp_c: Option<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cpu_temp_status: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cpu_cstate: Option<Vec<String>>,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct ServiceMetrics {
|
pub struct ServiceMetrics {
|
||||||
pub summary: ServiceSummary,
|
pub summary: ServiceSummary,
|
||||||
|
|||||||
@ -12,7 +12,7 @@ use std::sync::{
|
|||||||
};
|
};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics};
|
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use chrono::{TimeZone, Utc};
|
use chrono::{TimeZone, Utc};
|
||||||
use clap::{ArgAction, Parser, Subcommand};
|
use clap::{ArgAction, Parser, Subcommand};
|
||||||
@ -316,6 +316,7 @@ fn handle_zmq_message(
|
|||||||
host,
|
host,
|
||||||
smart: Some(metrics),
|
smart: Some(metrics),
|
||||||
services: None,
|
services: None,
|
||||||
|
system: None,
|
||||||
backup: None,
|
backup: None,
|
||||||
timestamp,
|
timestamp,
|
||||||
});
|
});
|
||||||
@ -335,6 +336,7 @@ fn handle_zmq_message(
|
|||||||
host,
|
host,
|
||||||
smart: None,
|
smart: None,
|
||||||
services: Some(metrics),
|
services: Some(metrics),
|
||||||
|
system: None,
|
||||||
backup: None,
|
backup: None,
|
||||||
timestamp,
|
timestamp,
|
||||||
});
|
});
|
||||||
@ -348,12 +350,33 @@ fn handle_zmq_message(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
AgentType::System => match serde_json::from_value::<SystemMetrics>(payload.clone()) {
|
||||||
|
Ok(metrics) => {
|
||||||
|
let _ = sender.send(AppEvent::MetricsUpdated {
|
||||||
|
host,
|
||||||
|
smart: None,
|
||||||
|
services: None,
|
||||||
|
system: Some(metrics),
|
||||||
|
backup: None,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
warn!(%error, "failed to parse system metrics");
|
||||||
|
let _ = sender.send(AppEvent::MetricsFailed {
|
||||||
|
host,
|
||||||
|
error: format!("system metrics parse error: {error:#}"),
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
AgentType::Backup => match serde_json::from_value::<BackupMetrics>(payload.clone()) {
|
AgentType::Backup => match serde_json::from_value::<BackupMetrics>(payload.clone()) {
|
||||||
Ok(metrics) => {
|
Ok(metrics) => {
|
||||||
let _ = sender.send(AppEvent::MetricsUpdated {
|
let _ = sender.send(AppEvent::MetricsUpdated {
|
||||||
host,
|
host,
|
||||||
smart: None,
|
smart: None,
|
||||||
services: None,
|
services: None,
|
||||||
|
system: None,
|
||||||
backup: Some(metrics),
|
backup: Some(metrics),
|
||||||
timestamp,
|
timestamp,
|
||||||
});
|
});
|
||||||
|
|||||||
@ -2,8 +2,8 @@ use chrono::{DateTime, Utc};
|
|||||||
use ratatui::layout::Rect;
|
use ratatui::layout::Rect;
|
||||||
use ratatui::Frame;
|
use ratatui::Frame;
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
use crate::app::{HostDisplayData, ConnectionStatus};
|
||||||
use crate::ui::system::{evaluate_performance, PerfSeverity};
|
// Removed: evaluate_performance and PerfSeverity no longer needed
|
||||||
use crate::ui::widget::{render_widget_data, WidgetData, WidgetStatus, StatusLevel};
|
use crate::ui::widget::{render_widget_data, WidgetData, WidgetStatus, StatusLevel};
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) {
|
pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) {
|
||||||
@ -99,6 +99,14 @@ fn classify_hosts(hosts: &[HostDisplayData]) -> (AlertSeverity, usize, usize, us
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn host_severity(host: &HostDisplayData) -> AlertSeverity {
|
fn host_severity(host: &HostDisplayData) -> AlertSeverity {
|
||||||
|
// Check connection status first
|
||||||
|
match host.connection_status {
|
||||||
|
ConnectionStatus::Error => return AlertSeverity::Critical,
|
||||||
|
ConnectionStatus::Timeout => return AlertSeverity::Warning,
|
||||||
|
ConnectionStatus::Unknown => return AlertSeverity::Unknown,
|
||||||
|
ConnectionStatus::Connected => {}, // Continue with other checks
|
||||||
|
}
|
||||||
|
|
||||||
if host.last_error.is_some() {
|
if host.last_error.is_some() {
|
||||||
return AlertSeverity::Critical;
|
return AlertSeverity::Critical;
|
||||||
}
|
}
|
||||||
@ -120,12 +128,13 @@ fn host_severity(host: &HostDisplayData) -> AlertSeverity {
|
|||||||
return AlertSeverity::Warning;
|
return AlertSeverity::Warning;
|
||||||
}
|
}
|
||||||
|
|
||||||
let (perf_severity, _) = evaluate_performance(&services.summary);
|
// TODO: Update to use agent-provided system statuses instead of evaluate_performance
|
||||||
match perf_severity {
|
// let (perf_severity, _) = evaluate_performance(&services.summary);
|
||||||
PerfSeverity::Critical => return AlertSeverity::Critical,
|
// match perf_severity {
|
||||||
PerfSeverity::Warning => return AlertSeverity::Warning,
|
// PerfSeverity::Critical => return AlertSeverity::Critical,
|
||||||
PerfSeverity::Ok => {}
|
// PerfSeverity::Warning => return AlertSeverity::Warning,
|
||||||
}
|
// PerfSeverity::Ok => {}
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(backup) = host.backup.as_ref() {
|
if let Some(backup) = host.backup.as_ref() {
|
||||||
@ -144,6 +153,30 @@ fn host_severity(host: &HostDisplayData) -> AlertSeverity {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) {
|
fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) {
|
||||||
|
// Check connection status first
|
||||||
|
match host.connection_status {
|
||||||
|
ConnectionStatus::Error => {
|
||||||
|
let msg = if let Some(error) = &host.last_error {
|
||||||
|
format!("Connection error: {}", error)
|
||||||
|
} else {
|
||||||
|
"Connection error".to_string()
|
||||||
|
};
|
||||||
|
return (msg, AlertSeverity::Critical, true);
|
||||||
|
},
|
||||||
|
ConnectionStatus::Timeout => {
|
||||||
|
let msg = if let Some(error) = &host.last_error {
|
||||||
|
format!("Keep-alive timeout: {}", error)
|
||||||
|
} else {
|
||||||
|
"Keep-alive timeout".to_string()
|
||||||
|
};
|
||||||
|
return (msg, AlertSeverity::Warning, true);
|
||||||
|
},
|
||||||
|
ConnectionStatus::Unknown => {
|
||||||
|
return ("No data received".to_string(), AlertSeverity::Unknown, true);
|
||||||
|
},
|
||||||
|
ConnectionStatus::Connected => {}, // Continue with other checks
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(error) = &host.last_error {
|
if let Some(error) = &host.last_error {
|
||||||
return (format!("error: {}", error), AlertSeverity::Critical, true);
|
return (format!("error: {}", error), AlertSeverity::Critical, true);
|
||||||
}
|
}
|
||||||
@ -177,26 +210,27 @@ fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let (perf_severity, reason) = evaluate_performance(&services.summary);
|
// TODO: Update to use agent-provided system statuses instead of evaluate_performance
|
||||||
if let Some(reason_text) = reason {
|
// let (perf_severity, reason) = evaluate_performance(&services.summary);
|
||||||
match perf_severity {
|
// if let Some(reason_text) = reason {
|
||||||
PerfSeverity::Critical => {
|
// match perf_severity {
|
||||||
return (
|
// PerfSeverity::Critical => {
|
||||||
format!("critical: {}", reason_text),
|
// return (
|
||||||
AlertSeverity::Critical,
|
// format!("critical: {}", reason_text),
|
||||||
true,
|
// AlertSeverity::Critical,
|
||||||
);
|
// true,
|
||||||
}
|
// );
|
||||||
PerfSeverity::Warning => {
|
// }
|
||||||
return (
|
// PerfSeverity::Warning => {
|
||||||
format!("warning: {}", reason_text),
|
// return (
|
||||||
AlertSeverity::Warning,
|
// format!("warning: {}", reason_text),
|
||||||
true,
|
// AlertSeverity::Warning,
|
||||||
);
|
// true,
|
||||||
}
|
// );
|
||||||
PerfSeverity::Ok => {}
|
// }
|
||||||
}
|
// PerfSeverity::Ok => {}
|
||||||
}
|
// }
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(backup) = host.backup.as_ref() {
|
if let Some(backup) = host.backup.as_ref() {
|
||||||
|
|||||||
@ -3,20 +3,32 @@ use ratatui::Frame;
|
|||||||
|
|
||||||
use crate::app::HostDisplayData;
|
use crate::app::HostDisplayData;
|
||||||
use crate::data::metrics::BackupMetrics;
|
use crate::data::metrics::BackupMetrics;
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel};
|
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
||||||
|
use crate::app::ConnectionStatus;
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
||||||
match host {
|
match host {
|
||||||
Some(data) => {
|
Some(data) => {
|
||||||
if let Some(metrics) = data.backup.as_ref() {
|
match (&data.connection_status, data.backup.as_ref()) {
|
||||||
render_metrics(frame, data, metrics, area);
|
(ConnectionStatus::Connected, Some(metrics)) => {
|
||||||
} else {
|
render_metrics(frame, data, metrics, area);
|
||||||
render_placeholder(
|
}
|
||||||
frame,
|
(ConnectionStatus::Connected, None) => {
|
||||||
area,
|
render_placeholder(
|
||||||
"Backups",
|
frame,
|
||||||
&format!("Host {} awaiting backup metrics", data.name),
|
area,
|
||||||
);
|
"Backups",
|
||||||
|
&format!("Host {} awaiting backup metrics", data.name),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
(status, _) => {
|
||||||
|
render_placeholder(
|
||||||
|
frame,
|
||||||
|
area,
|
||||||
|
"Backups",
|
||||||
|
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => render_placeholder(frame, area, "Backups", "No hosts configured"),
|
None => render_placeholder(frame, area, "Backups", "No hosts configured"),
|
||||||
|
|||||||
@ -3,20 +3,32 @@ use ratatui::Frame;
|
|||||||
|
|
||||||
use crate::app::HostDisplayData;
|
use crate::app::HostDisplayData;
|
||||||
use crate::data::metrics::ServiceStatus;
|
use crate::data::metrics::ServiceStatus;
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel};
|
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
||||||
|
use crate::app::ConnectionStatus;
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
||||||
match host {
|
match host {
|
||||||
Some(data) => {
|
Some(data) => {
|
||||||
if let Some(metrics) = data.services.as_ref() {
|
match (&data.connection_status, data.services.as_ref()) {
|
||||||
render_metrics(frame, data, metrics, area);
|
(ConnectionStatus::Connected, Some(metrics)) => {
|
||||||
} else {
|
render_metrics(frame, data, metrics, area);
|
||||||
render_placeholder(
|
}
|
||||||
frame,
|
(ConnectionStatus::Connected, None) => {
|
||||||
area,
|
render_placeholder(
|
||||||
"Services",
|
frame,
|
||||||
&format!("Host {} has no service metrics yet", data.name),
|
area,
|
||||||
);
|
"Services",
|
||||||
|
&format!("Host {} has no service metrics yet", data.name),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
(status, _) => {
|
||||||
|
render_placeholder(
|
||||||
|
frame,
|
||||||
|
area,
|
||||||
|
"Services",
|
||||||
|
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => render_placeholder(frame, area, "Services", "No hosts configured"),
|
None => render_placeholder(frame, area, "Services", "No hosts configured"),
|
||||||
|
|||||||
@ -3,20 +3,32 @@ use ratatui::Frame;
|
|||||||
|
|
||||||
use crate::app::HostDisplayData;
|
use crate::app::HostDisplayData;
|
||||||
use crate::data::metrics::SmartMetrics;
|
use crate::data::metrics::SmartMetrics;
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel};
|
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
||||||
|
use crate::app::ConnectionStatus;
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
||||||
match host {
|
match host {
|
||||||
Some(data) => {
|
Some(data) => {
|
||||||
if let Some(metrics) = data.smart.as_ref() {
|
match (&data.connection_status, data.smart.as_ref()) {
|
||||||
render_metrics(frame, data, metrics, area);
|
(ConnectionStatus::Connected, Some(metrics)) => {
|
||||||
} else {
|
render_metrics(frame, data, metrics, area);
|
||||||
render_placeholder(
|
}
|
||||||
frame,
|
(ConnectionStatus::Connected, None) => {
|
||||||
area,
|
render_placeholder(
|
||||||
"Storage",
|
frame,
|
||||||
&format!("Host {} has no SMART data yet", data.name),
|
area,
|
||||||
);
|
"Storage",
|
||||||
|
&format!("Host {} has no SMART data yet", data.name),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
(status, _) => {
|
||||||
|
render_placeholder(
|
||||||
|
frame,
|
||||||
|
area,
|
||||||
|
"Storage",
|
||||||
|
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => render_placeholder(frame, area, "Storage", "No hosts configured"),
|
None => render_placeholder(frame, area, "Storage", "No hosts configured"),
|
||||||
|
|||||||
@ -2,24 +2,36 @@ use ratatui::layout::Rect;
|
|||||||
use ratatui::Frame;
|
use ratatui::Frame;
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
use crate::app::HostDisplayData;
|
||||||
use crate::data::metrics::{ServiceMetrics, ServiceSummary};
|
use crate::data::metrics::SystemMetrics;
|
||||||
use crate::ui::widget::{
|
use crate::ui::widget::{
|
||||||
render_placeholder, render_combined_widget_data,
|
render_placeholder, render_combined_widget_data,
|
||||||
status_level_from_agent_status, WidgetDataSet, WidgetStatus, StatusLevel,
|
status_level_from_agent_status, connection_status_message, WidgetDataSet, WidgetStatus, StatusLevel,
|
||||||
};
|
};
|
||||||
|
use crate::app::ConnectionStatus;
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
||||||
match host {
|
match host {
|
||||||
Some(data) => {
|
Some(data) => {
|
||||||
if let Some(metrics) = data.services.as_ref() {
|
match (&data.connection_status, data.system.as_ref()) {
|
||||||
render_metrics(frame, data, metrics, area);
|
(ConnectionStatus::Connected, Some(metrics)) => {
|
||||||
} else {
|
render_metrics(frame, data, metrics, area);
|
||||||
render_placeholder(
|
}
|
||||||
frame,
|
(ConnectionStatus::Connected, None) => {
|
||||||
area,
|
render_placeholder(
|
||||||
"System",
|
frame,
|
||||||
&format!("Host {} awaiting service metrics", data.name),
|
area,
|
||||||
);
|
"System",
|
||||||
|
&format!("Host {} awaiting system metrics", data.name),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
(status, _) => {
|
||||||
|
render_placeholder(
|
||||||
|
frame,
|
||||||
|
area,
|
||||||
|
"System",
|
||||||
|
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => render_placeholder(frame, area, "System", "No hosts configured"),
|
None => render_placeholder(frame, area, "System", "No hosts configured"),
|
||||||
@ -29,30 +41,12 @@ pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
|||||||
fn render_metrics(
|
fn render_metrics(
|
||||||
frame: &mut Frame,
|
frame: &mut Frame,
|
||||||
_host: &HostDisplayData,
|
_host: &HostDisplayData,
|
||||||
metrics: &ServiceMetrics,
|
metrics: &SystemMetrics,
|
||||||
area: Rect,
|
area: Rect,
|
||||||
) {
|
) {
|
||||||
let summary = &metrics.summary;
|
let summary = &metrics.summary;
|
||||||
let system_total = if summary.system_memory_total_mb > 0.0 {
|
|
||||||
summary.system_memory_total_mb
|
// Use agent-calculated statuses
|
||||||
} else {
|
|
||||||
summary.memory_quota_mb
|
|
||||||
};
|
|
||||||
let system_used = if summary.system_memory_used_mb > 0.0 {
|
|
||||||
summary.system_memory_used_mb
|
|
||||||
} else {
|
|
||||||
summary.memory_used_mb
|
|
||||||
};
|
|
||||||
let _usage_ratio = if system_total > 0.0 {
|
|
||||||
(system_used / system_total) * 100.0
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
let (perf_severity, _reason) = evaluate_performance(summary);
|
|
||||||
// Dashboard should NOT calculate border colors - agent is the source of truth
|
|
||||||
|
|
||||||
// Use agent-calculated statuses instead of dashboard calculations
|
|
||||||
let memory_status = status_level_from_agent_status(summary.memory_status.as_ref());
|
let memory_status = status_level_from_agent_status(summary.memory_status.as_ref());
|
||||||
let cpu_status = status_level_from_agent_status(summary.cpu_status.as_ref());
|
let cpu_status = status_level_from_agent_status(summary.cpu_status.as_ref());
|
||||||
// Dashboard should NOT calculate colors - agent is the source of truth
|
// Dashboard should NOT calculate colors - agent is the source of truth
|
||||||
@ -62,7 +56,7 @@ fn render_metrics(
|
|||||||
memory_dataset.add_row(
|
memory_dataset.add_row(
|
||||||
Some(WidgetStatus::new(memory_status)),
|
Some(WidgetStatus::new(memory_status)),
|
||||||
vec![],
|
vec![],
|
||||||
vec![format!("{:.1} / {:.1} GB", system_used / 1000.0, system_total / 1000.0)],
|
vec![format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0)],
|
||||||
);
|
);
|
||||||
|
|
||||||
// CPU dataset - use agent-calculated status
|
// CPU dataset - use agent-calculated status
|
||||||
@ -140,30 +134,24 @@ fn render_metrics(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU dataset
|
// GPU dataset - GPU data remains in ServiceMetrics, not SystemMetrics
|
||||||
// GPU status should come from agent when available
|
let gpu_status = StatusLevel::Unknown; // GPU not available in SystemMetrics
|
||||||
let gpu_status = StatusLevel::Unknown; // Default until agent provides gpu_status
|
|
||||||
let mut gpu_dataset = WidgetDataSet::new(vec!["GPU load".to_string(), "GPU temp".to_string()], Some(WidgetStatus::new(gpu_status)));
|
let mut gpu_dataset = WidgetDataSet::new(vec!["GPU load".to_string(), "GPU temp".to_string()], Some(WidgetStatus::new(gpu_status)));
|
||||||
gpu_dataset.add_row(
|
gpu_dataset.add_row(
|
||||||
Some(WidgetStatus::new(gpu_status)),
|
Some(WidgetStatus::new(gpu_status)),
|
||||||
vec![],
|
vec![],
|
||||||
vec![
|
vec![
|
||||||
summary
|
"—".to_string(), // GPU data not in SystemMetrics
|
||||||
.gpu_load_percent
|
"—".to_string(), // GPU data not in SystemMetrics
|
||||||
.map(|value| format_optional_percent(Some(value)))
|
|
||||||
.unwrap_or_else(|| "—".to_string()),
|
|
||||||
summary
|
|
||||||
.gpu_temp_c
|
|
||||||
.map(|value| format_optional_metric(Some(value), "°C"))
|
|
||||||
.unwrap_or_else(|| "—".to_string()),
|
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
|
|
||||||
// Determine overall widget status based on worst case
|
// Determine overall widget status based on worst case from agent statuses
|
||||||
let overall_status_level = match perf_severity {
|
let overall_status_level = match (memory_status, cpu_status) {
|
||||||
PerfSeverity::Critical => StatusLevel::Error,
|
(StatusLevel::Error, _) | (_, StatusLevel::Error) => StatusLevel::Error,
|
||||||
PerfSeverity::Warning => StatusLevel::Warning,
|
(StatusLevel::Warning, _) | (_, StatusLevel::Warning) => StatusLevel::Warning,
|
||||||
PerfSeverity::Ok => StatusLevel::Ok,
|
(StatusLevel::Ok, StatusLevel::Ok) => StatusLevel::Ok,
|
||||||
|
_ => StatusLevel::Unknown,
|
||||||
};
|
};
|
||||||
let overall_status = Some(WidgetStatus::new(overall_status_level));
|
let overall_status = Some(WidgetStatus::new(overall_status_level));
|
||||||
|
|
||||||
@ -171,13 +159,6 @@ fn render_metrics(
|
|||||||
render_combined_widget_data(frame, area, "System".to_string(), overall_status, vec![memory_dataset, cpu_dataset, cstate_dataset, gpu_dataset]);
|
render_combined_widget_data(frame, area, "System".to_string(), overall_status, vec![memory_dataset, cpu_dataset, cstate_dataset, gpu_dataset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
|
||||||
pub(crate) enum PerfSeverity {
|
|
||||||
Ok,
|
|
||||||
Warning,
|
|
||||||
Critical,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
|
fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
|
||||||
match value {
|
match value {
|
||||||
Some(number) => format!("{:.1}{}", number, unit),
|
Some(number) => format!("{:.1}{}", number, unit),
|
||||||
@ -191,62 +172,3 @@ fn format_optional_percent(value: Option<f32>) -> String {
|
|||||||
None => "—".to_string(),
|
None => "—".to_string(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
pub(crate) fn evaluate_performance(summary: &ServiceSummary) -> (PerfSeverity, Option<String>) {
|
|
||||||
let mem_percent = if summary.system_memory_total_mb > 0.0 {
|
|
||||||
(summary.system_memory_used_mb / summary.system_memory_total_mb) * 100.0
|
|
||||||
} else if summary.memory_quota_mb > 0.0 {
|
|
||||||
(summary.memory_used_mb / summary.memory_quota_mb) * 100.0
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut severity = PerfSeverity::Ok;
|
|
||||||
let mut reason: Option<String> = None;
|
|
||||||
|
|
||||||
let mut consider = |level: PerfSeverity, message: String| {
|
|
||||||
if level > severity {
|
|
||||||
severity = level;
|
|
||||||
reason = Some(message);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Use agent's memory status instead of hardcoded thresholds
|
|
||||||
if let Some(memory_status) = &summary.memory_status {
|
|
||||||
match memory_status.as_str() {
|
|
||||||
"critical" => consider(PerfSeverity::Critical, format!("RAM {:.0}%", mem_percent)),
|
|
||||||
"warning" => consider(PerfSeverity::Warning, format!("RAM {:.0}%", mem_percent)),
|
|
||||||
_ => {} // "ok" - no alert needed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use agent's CPU status instead of hardcoded thresholds
|
|
||||||
if let Some(cpu_status) = &summary.cpu_status {
|
|
||||||
match cpu_status.as_str() {
|
|
||||||
"critical" => consider(PerfSeverity::Critical, format!("CPU load {:.2}", summary.cpu_load_5)),
|
|
||||||
"warning" => consider(PerfSeverity::Warning, format!("CPU load {:.2}", summary.cpu_load_5)),
|
|
||||||
_ => {} // "ok" - no alert needed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use agent's CPU temperature status instead of hardcoded thresholds
|
|
||||||
if let Some(cpu_temp_status) = &summary.cpu_temp_status {
|
|
||||||
if let Some(temp) = summary.cpu_temp_c {
|
|
||||||
match cpu_temp_status.as_str() {
|
|
||||||
"critical" => consider(PerfSeverity::Critical, format!("CPU temp {:.0}°C", temp)),
|
|
||||||
"warning" => consider(PerfSeverity::Warning, format!("CPU temp {:.0}°C", temp)),
|
|
||||||
_ => {} // "ok" - no alert needed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: GPU status should come from agent, not calculated here with hardcoded thresholds
|
|
||||||
// For now, remove hardcoded GPU thresholds until agent provides gpu_status
|
|
||||||
|
|
||||||
if severity == PerfSeverity::Ok {
|
|
||||||
(PerfSeverity::Ok, None)
|
|
||||||
} else {
|
|
||||||
(severity, reason)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@ -37,6 +37,28 @@ pub fn status_level_from_agent_status(agent_status: Option<&String>) -> StatusLe
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn connection_status_message(connection_status: &crate::app::ConnectionStatus, last_error: &Option<String>) -> String {
|
||||||
|
use crate::app::ConnectionStatus;
|
||||||
|
match connection_status {
|
||||||
|
ConnectionStatus::Connected => "Connected".to_string(),
|
||||||
|
ConnectionStatus::Timeout => {
|
||||||
|
if let Some(error) = last_error {
|
||||||
|
format!("Timeout: {}", error)
|
||||||
|
} else {
|
||||||
|
"Keep-alive timeout".to_string()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
ConnectionStatus::Error => {
|
||||||
|
if let Some(error) = last_error {
|
||||||
|
format!("Error: {}", error)
|
||||||
|
} else {
|
||||||
|
"Connection error".to_string()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
ConnectionStatus::Unknown => "No data received".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pub fn render_placeholder(frame: &mut Frame, area: Rect, title: &str, message: &str) {
|
pub fn render_placeholder(frame: &mut Frame, area: Rect, title: &str, message: &str) {
|
||||||
|
|||||||
@ -6,6 +6,7 @@ use serde_json::Value;
|
|||||||
pub enum AgentType {
|
pub enum AgentType {
|
||||||
Smart,
|
Smart,
|
||||||
Service,
|
Service,
|
||||||
|
System,
|
||||||
Backup,
|
Backup,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user