Christoffer Martinsson 66ab7a492d
All checks were successful
Build and Release / build-and-release (push) Successful in 2m39s
Complete monitoring system restoration
Fully restored CM Dashboard as a complete monitoring system with working
status evaluation and email notifications.

COMPLETED PHASES:
 Phase 1: Fixed storage display issues
  - Use lsblk instead of findmnt (eliminates /nix/store bind mount)
  - Fixed NVMe SMART parsing (Temperature: and Percentage Used:)
  - Added sudo to smartctl for permissions
  - Consistent filesystem and tmpfs sorting

 Phase 2a: Fixed missing NixOS build information
  - Added build_version field to AgentData
  - NixOS collector now populates build info
  - Dashboard shows actual build instead of "unknown"

 Phase 2b: Restored status evaluation system
  - Added status fields to all structured data types
  - CPU: load and temperature status evaluation
  - Memory: usage status evaluation
  - Storage: temperature, health, and filesystem usage status
  - All collectors now use their threshold configurations

 Phase 3: Restored notification system
  - Status change detection between collection cycles
  - Email alerts on status degradation (OK→Warning/Critical)
  - Detailed notification content with metric values
  - Full NotificationManager integration

CORE FUNCTIONALITY RESTORED:
- Real-time monitoring with proper status evaluation
- Email notifications on threshold violations
- Correct storage display (nvme0n1 T: 28°C W: 1%)
- Complete status-aware infrastructure monitoring
- Dashboard is now a monitoring system, not just a data viewer

The CM Dashboard monitoring system is fully operational.
2025-11-24 19:58:26 +01:00

193 lines
6.7 KiB
Rust

use async_trait::async_trait;
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};
use tracing::debug;
use super::{utils, Collector, CollectorError};
use crate::config::CpuConfig;
/// Lightweight collector for CPU load, temperature, and frequency metrics.
///
/// EFFICIENCY OPTIMIZATIONS:
/// - Single /proc/loadavg read for all load metrics
/// - Temperature and frequency are read from small sysfs/procfs text files
/// - Minimal string allocations
/// - No process spawning
/// - <0.1ms collection time target
///
/// The collector holds only its threshold configuration; it is built once
/// from a `CpuConfig` via `CpuCollector::new`.
pub struct CpuCollector {
/// Thresholds used to classify the CPU load average.
load_thresholds: HysteresisThresholds,
/// Thresholds used to classify the CPU temperature (degrees Celsius).
temperature_thresholds: HysteresisThresholds,
}
impl CpuCollector {
pub fn new(config: CpuConfig) -> Self {
// Create hysteresis thresholds with 10% gap for recovery
let load_thresholds = HysteresisThresholds::new(
config.load_warning_threshold,
config.load_critical_threshold,
);
let temperature_thresholds = HysteresisThresholds::new(
config.temperature_warning_threshold,
config.temperature_critical_threshold,
);
Self {
load_thresholds,
temperature_thresholds,
}
}
/// Calculate CPU load status using thresholds
fn calculate_load_status(&self, load: f32) -> Status {
if load >= self.load_thresholds.critical_high {
Status::Critical
} else if load >= self.load_thresholds.warning_high {
Status::Warning
} else {
Status::Ok
}
}
/// Calculate CPU temperature status using thresholds
fn calculate_temperature_status(&self, temp: f32) -> Status {
if temp >= self.temperature_thresholds.critical_high {
Status::Critical
} else if temp >= self.temperature_thresholds.warning_high {
Status::Warning
} else {
Status::Ok
}
}
/// Collect CPU load averages and populate AgentData
/// Format: "0.52 0.58 0.59 1/257 12345"
async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
let content = utils::read_proc_file("/proc/loadavg")?;
let parts: Vec<&str> = content.trim().split_whitespace().collect();
if parts.len() < 3 {
return Err(CollectorError::Parse {
value: content,
error: "Expected at least 3 values in /proc/loadavg".to_string(),
});
}
let load_1min = utils::parse_f32(parts[0])?;
let load_5min = utils::parse_f32(parts[1])?;
let load_15min = utils::parse_f32(parts[2])?;
// Populate CPU data directly
agent_data.system.cpu.load_1min = load_1min;
agent_data.system.cpu.load_5min = load_5min;
agent_data.system.cpu.load_15min = load_15min;
Ok(())
}
/// Collect CPU temperature and populate AgentData
/// Prioritizes x86_pkg_temp over generic thermal zones
async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
// Try x86_pkg_temp first (Intel CPU package temperature)
if let Ok(temp) = self
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
.await
{
let temp_celsius = temp as f32 / 1000.0;
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
return Ok(());
}
// Fallback: try other thermal zones
for zone_id in 0..10 {
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
if let Ok(temp) = self.read_thermal_zone(&path).await {
let temp_celsius = temp as f32 / 1000.0;
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
return Ok(());
}
}
debug!("No CPU temperature sensors found");
// Leave temperature as None if not available
Ok(())
}
/// Read temperature from thermal zone efficiently
async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
let content = utils::read_proc_file(path)?;
utils::parse_u64(content.trim())
}
/// Collect CPU frequency and populate AgentData
async fn collect_frequency(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
// Try scaling frequency first (more accurate for current frequency)
if let Ok(freq) =
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
{
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
let freq_mhz = freq_khz as f32 / 1000.0;
agent_data.system.cpu.frequency_mhz = freq_mhz;
return Ok(());
}
}
// Fallback: parse /proc/cpuinfo for base frequency
if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") {
for line in content.lines() {
if line.starts_with("cpu MHz") {
if let Some(freq_str) = line.split(':').nth(1) {
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
agent_data.system.cpu.frequency_mhz = freq_mhz;
return Ok(());
}
}
break; // Only need first CPU entry
}
}
}
debug!("CPU frequency not available");
// Leave frequency as 0.0 if not available
Ok(())
}
}
#[async_trait]
impl Collector for CpuCollector {
    /// Run one CPU collection cycle and write the results into `agent_data`.
    ///
    /// Steps, in order: load averages, temperature, frequency, then status
    /// evaluation. The load status is derived from the 1-minute load average;
    /// the temperature status is `Status::Unknown` when no temperature value
    /// is present.
    ///
    /// # Errors
    /// Returns the first error produced by any of the three collection steps;
    /// in that case the status fields are not updated.
    async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        debug!("Collecting CPU metrics");
        let started_at = std::time::Instant::now();

        // Load averages are mandatory; temperature and frequency are optional
        // sources, but any error they return is still propagated.
        self.collect_load_averages(agent_data).await?;
        self.collect_temperature(agent_data).await?;
        self.collect_frequency(agent_data).await?;

        let elapsed = started_at.elapsed();
        debug!("CPU collection completed in {:?}", elapsed);

        // Efficiency check: logged at debug level once the collection has
        // taken more than one whole millisecond.
        let elapsed_ms = elapsed.as_millis();
        if elapsed_ms > 1 {
            debug!("CPU collection took {}ms - consider optimization", elapsed_ms);
        }

        // Evaluate status from the freshly collected values.
        let cpu = &mut agent_data.system.cpu;
        cpu.load_status = self.calculate_load_status(cpu.load_1min);
        cpu.temperature_status = match cpu.temperature_celsius {
            Some(celsius) => self.calculate_temperature_status(celsius),
            None => Status::Unknown,
        };

        Ok(())
    }
}