All checks were successful
Build and Release / build-and-release (push) Successful in 2m39s
Fully restored CM Dashboard as a complete monitoring system with working status evaluation and email notifications. COMPLETED PHASES: ✅ Phase 1: Fixed storage display issues - Use lsblk instead of findmnt (eliminates /nix/store bind mount) - Fixed NVMe SMART parsing (Temperature: and Percentage Used:) - Added sudo to smartctl for permissions - Consistent filesystem and tmpfs sorting ✅ Phase 2a: Fixed missing NixOS build information - Added build_version field to AgentData - NixOS collector now populates build info - Dashboard shows actual build instead of "unknown" ✅ Phase 2b: Restored status evaluation system - Added status fields to all structured data types - CPU: load and temperature status evaluation - Memory: usage status evaluation - Storage: temperature, health, and filesystem usage status - All collectors now use their threshold configurations ✅ Phase 3: Restored notification system - Status change detection between collection cycles - Email alerts on status degradation (OK→Warning/Critical) - Detailed notification content with metric values - Full NotificationManager integration CORE FUNCTIONALITY RESTORED: - Real-time monitoring with proper status evaluation - Email notifications on threshold violations - Correct storage display (nvme0n1 T: 28°C W: 1%) - Complete status-aware infrastructure monitoring - Dashboard is now a monitoring system, not just data viewer The CM Dashboard monitoring system is fully operational.
319 lines
11 KiB
Rust
319 lines
11 KiB
Rust
use chrono::Utc;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
|
|
/// Individual metric with value, status, and metadata
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct Metric {
|
|
pub name: String,
|
|
pub value: MetricValue,
|
|
pub status: Status,
|
|
pub timestamp: u64,
|
|
pub description: Option<String>,
|
|
pub unit: Option<String>,
|
|
}
|
|
|
|
impl Metric {
|
|
pub fn new(name: String, value: MetricValue, status: Status) -> Self {
|
|
Self {
|
|
name,
|
|
value,
|
|
status,
|
|
timestamp: Utc::now().timestamp() as u64,
|
|
description: None,
|
|
unit: None,
|
|
}
|
|
}
|
|
|
|
pub fn with_description(mut self, description: String) -> Self {
|
|
self.description = Some(description);
|
|
self
|
|
}
|
|
|
|
pub fn with_unit(mut self, unit: String) -> Self {
|
|
self.unit = Some(unit);
|
|
self
|
|
}
|
|
}
|
|
|
|
/// Typed metric values
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub enum MetricValue {
|
|
Float(f32),
|
|
Integer(i64),
|
|
String(String),
|
|
Boolean(bool),
|
|
}
|
|
|
|
impl MetricValue {
|
|
pub fn as_f32(&self) -> Option<f32> {
|
|
match self {
|
|
MetricValue::Float(f) => Some(*f),
|
|
MetricValue::Integer(i) => Some(*i as f32),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
pub fn as_i64(&self) -> Option<i64> {
|
|
match self {
|
|
MetricValue::Integer(i) => Some(*i),
|
|
MetricValue::Float(f) => Some(*f as i64),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
pub fn as_string(&self) -> String {
|
|
match self {
|
|
MetricValue::String(s) => s.clone(),
|
|
MetricValue::Float(f) => f.to_string(),
|
|
MetricValue::Integer(i) => i.to_string(),
|
|
MetricValue::Boolean(b) => b.to_string(),
|
|
}
|
|
}
|
|
|
|
pub fn as_bool(&self) -> Option<bool> {
|
|
match self {
|
|
MetricValue::Boolean(b) => Some(*b),
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Health status for metrics
|
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
|
pub enum Status {
|
|
Inactive, // Lowest priority
|
|
Unknown, //
|
|
Offline, //
|
|
Pending, //
|
|
Ok, // 5th place - good status has higher priority than unknown states
|
|
Warning, //
|
|
Critical, // Highest priority
|
|
}
|
|
|
|
impl Status {
|
|
/// Aggregate multiple statuses - returns the worst status
|
|
pub fn aggregate(statuses: &[Status]) -> Status {
|
|
statuses.iter().max().copied().unwrap_or(Status::Unknown)
|
|
}
|
|
}
|
|
|
|
impl Default for Status {
|
|
fn default() -> Self {
|
|
Status::Unknown
|
|
}
|
|
}
|
|
|
|
/// Hysteresis thresholds for preventing status flapping
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct HysteresisThresholds {
|
|
/// Warning threshold - trigger warning when value >= this
|
|
pub warning_high: f32,
|
|
/// Warning recovery - return to ok when value < this
|
|
pub warning_low: f32,
|
|
/// Critical threshold - trigger critical when value >= this
|
|
pub critical_high: f32,
|
|
/// Critical recovery - return to warning when value < this
|
|
pub critical_low: f32,
|
|
}
|
|
|
|
impl HysteresisThresholds {
|
|
pub fn new(warning_high: f32, critical_high: f32) -> Self {
|
|
// Default hysteresis: 10% gap for recovery
|
|
let warning_gap = warning_high * 0.1;
|
|
let critical_gap = critical_high * 0.1;
|
|
|
|
Self {
|
|
warning_high,
|
|
warning_low: warning_high - warning_gap,
|
|
critical_high,
|
|
critical_low: critical_high - critical_gap,
|
|
}
|
|
}
|
|
|
|
/// Evaluate value against thresholds to determine status
|
|
pub fn evaluate(&self, value: f32) -> Status {
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
|
|
pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
|
|
Self {
|
|
warning_high,
|
|
warning_low: warning_high - warning_gap,
|
|
critical_high,
|
|
critical_low: critical_high - critical_gap,
|
|
}
|
|
}
|
|
|
|
/// Calculate status with hysteresis based on current value and previous status
|
|
pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
|
|
match previous_status {
|
|
Status::Ok => {
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
Status::Warning => {
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value < self.warning_low {
|
|
Status::Ok
|
|
} else {
|
|
Status::Warning
|
|
}
|
|
}
|
|
Status::Critical => {
|
|
if value < self.critical_low {
|
|
if value < self.warning_low {
|
|
Status::Ok
|
|
} else {
|
|
Status::Warning
|
|
}
|
|
} else {
|
|
Status::Critical
|
|
}
|
|
}
|
|
Status::Unknown => {
|
|
// First measurement, use normal thresholds
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
Status::Inactive => {
|
|
// Inactive services use normal thresholds like first measurement
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
Status::Pending => {
|
|
// Service transitioning, use normal thresholds like first measurement
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
Status::Offline => {
|
|
// Host coming back online, use normal thresholds like first measurement
|
|
if value >= self.critical_high {
|
|
Status::Critical
|
|
} else if value >= self.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Status tracker for hysteresis - tracks previous status per metric
|
|
#[derive(Debug, Default)]
|
|
pub struct StatusTracker {
|
|
previous_statuses: HashMap<String, Status>,
|
|
}
|
|
|
|
impl StatusTracker {
|
|
pub fn new() -> Self {
|
|
Self::default()
|
|
}
|
|
|
|
/// Get previous status for a metric
|
|
pub fn get_previous_status(&self, metric_name: &str) -> Status {
|
|
self.previous_statuses.get(metric_name).copied().unwrap_or(Status::Unknown)
|
|
}
|
|
|
|
/// Update status for a metric
|
|
pub fn update_status(&mut self, metric_name: String, status: Status) {
|
|
self.previous_statuses.insert(metric_name, status);
|
|
}
|
|
|
|
/// Calculate status with hysteresis
|
|
pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
|
|
let previous = self.get_previous_status(metric_name);
|
|
let new_status = thresholds.calculate_status(value, previous);
|
|
self.update_status(metric_name.to_string(), new_status);
|
|
new_status
|
|
}
|
|
}
|
|
|
|
/// Central registry of metric names.
///
/// Fixed names are plain constants. Names that vary per device, service or
/// interface are `*_TEMPLATE` constants containing a `{...}` placeholder that
/// the helper functions at the bottom of the module fill in.
pub mod registry {
    // CPU metrics
    pub const CPU_LOAD_1MIN: &str = "cpu_load_1min";
    pub const CPU_LOAD_5MIN: &str = "cpu_load_5min";
    pub const CPU_LOAD_15MIN: &str = "cpu_load_15min";
    pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
    pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
    pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";

    // Memory metrics
    pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
    pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
    pub const MEMORY_USED_GB: &str = "memory_used_gb";
    pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
    pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
    pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";

    // Disk metrics (templates: the real name embeds the device)
    pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
    pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
    pub const DISK_WEAR_PERCENT_TEMPLATE: &str = "disk_{device}_wear_percent";
    pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
    pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
    pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";

    // Service metrics (templates: the real name embeds the service)
    pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
    pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
    pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";

    // Backup metrics
    pub const BACKUP_STATUS: &str = "backup_status";
    pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
    pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
    pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
    pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";

    // Network metrics (templates: the real name embeds the interface)
    pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
    pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
    pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
    pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";

    /// Replaces every occurrence of `placeholder` in `template` with `value`.
    fn substitute(template: &str, placeholder: &str, value: &str) -> String {
        template.replace(placeholder, value)
    }

    /// Builds a disk metric name by filling `{device}` in `template`.
    pub fn disk_metric(template: &str, device: &str) -> String {
        substitute(template, "{device}", device)
    }

    /// Builds a service metric name by filling `{name}` in `template`.
    pub fn service_metric(template: &str, name: &str) -> String {
        substitute(template, "{name}", name)
    }

    /// Builds a network metric name by filling `{interface}` in `template`.
    pub fn network_metric(template: &str, interface: &str) -> String {
        substitute(template, "{interface}", interface)
    }
}
|