// cm-dashboard/shared/src/metrics.rs

use chrono::Utc;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Individual metric with value, status, and metadata.
///
/// Construct one with `Metric::new`, which stamps the creation time, then
/// optionally chain `with_description` / `with_unit` to fill the optional fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metric {
    /// Name identifying the metric.
    pub name: String,
    /// The measured value.
    pub value: MetricValue,
    /// Health status attached to this measurement.
    pub status: Status,
    /// Creation time taken from `Utc::now().timestamp()` in `Metric::new`
    /// (seconds since the Unix epoch).
    pub timestamp: u64,
    /// Optional free-form description; `None` unless set via `with_description`.
    pub description: Option<String>,
    /// Optional unit label; `None` unless set via `with_unit`.
    pub unit: Option<String>,
}

impl Metric {
    /// Creates a metric stamped with the current UTC time.
    ///
    /// `description` and `unit` start out as `None`; chain
    /// [`Metric::with_description`] and [`Metric::with_unit`] to set them.
    pub fn new(name: String, value: MetricValue, status: Status) -> Self {
        // `timestamp()` is a *signed* number of seconds since the Unix epoch.
        // Clamp at zero before converting so that a clock set before 1970
        // produces 0 instead of sign-wrapping into an enormous `u64`.
        let timestamp = Utc::now().timestamp().max(0) as u64;

        Self {
            name,
            value,
            status,
            timestamp,
            description: None,
            unit: None,
        }
    }

    /// Builder-style setter: returns the metric with `description` filled in.
    pub fn with_description(mut self, description: String) -> Self {
        self.description = Some(description);
        self
    }

    /// Builder-style setter: returns the metric with `unit` filled in.
    pub fn with_unit(mut self, unit: String) -> Self {
        self.unit = Some(unit);
        self
    }
}

/// Typed metric values.
///
/// The `as_*` accessors on this type convert between the numeric variants and
/// render any variant as text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MetricValue {
    /// 32-bit floating point reading.
    Float(f32),
    /// Signed 64-bit integer reading.
    Integer(i64),
    /// Free-form textual value.
    String(String),
    /// Boolean flag.
    Boolean(bool),
}

impl MetricValue {
    pub fn as_f32(&self) -> Option<f32> {
        match self {
            MetricValue::Float(f) => Some(*f),
            MetricValue::Integer(i) => Some(*i as f32),
            _ => None,
        }
    }

    pub fn as_i64(&self) -> Option<i64> {
        match self {
            MetricValue::Integer(i) => Some(*i),
            MetricValue::Float(f) => Some(*f as i64),
            _ => None,
        }
    }

    pub fn as_string(&self) -> String {
        match self {
            MetricValue::String(s) => s.clone(),
            MetricValue::Float(f) => f.to_string(),
            MetricValue::Integer(i) => i.to_string(),
            MetricValue::Boolean(b) => b.to_string(),
        }
    }

    pub fn as_bool(&self) -> Option<bool> {
        match self {
            MetricValue::Boolean(b) => Some(*b),
            _ => None,
        }
    }
}

/// Health status for metrics.
///
/// Variants are declared from lowest to highest priority. The derived
/// `PartialOrd`/`Ord` follow declaration order, and `Status::aggregate` picks
/// the maximum, so reordering variants changes aggregation results.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum Status {
    Inactive, // Lowest priority
    Unknown,  // 2nd - also the `Default` and the fallback for an empty aggregate
    Offline,  // 3rd
    Pending,  // 4th
    Ok,       // 5th place - a good status outranks the four "no data" states above
    Warning,  // 6th - outranks Ok
    Critical, // Highest priority
}

impl Status {
    /// Collapse several statuses into the single highest-priority one.
    ///
    /// Priority is the derived ordering of `Status` (declaration order), so
    /// `Critical` wins over everything and `Ok` wins over
    /// `Inactive`/`Unknown`/`Offline`/`Pending`. An empty slice yields
    /// `Status::Unknown`.
    pub fn aggregate(statuses: &[Status]) -> Status {
        match statuses.iter().copied().max() {
            Some(highest) => highest,
            None => Status::Unknown,
        }
    }
}

impl Default for Status {
    fn default() -> Self {
        Status::Unknown
    }
}

/// Hysteresis thresholds for preventing status flapping.
///
/// Each severity has a trigger value (`*_high`) and a separate recovery value
/// (`*_low`). `calculate_status` raises the status at the trigger but only
/// lowers it again once the value has dropped below the recovery value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HysteresisThresholds {
    /// Warning threshold - trigger warning when value >= this
    pub warning_high: f32,
    /// Warning recovery - return to ok when value < this
    pub warning_low: f32,
    /// Critical threshold - trigger critical when value >= this
    pub critical_high: f32,
    /// Critical recovery - leave critical when value < this (drops to warning,
    /// or straight to ok if the value is also below `warning_low`)
    pub critical_low: f32,
}

impl HysteresisThresholds {
    /// Builds thresholds with the default recovery gap of 10% of each trigger.
    ///
    /// The gap is taken as a magnitude, so the recovery point always lies
    /// below its trigger even when the trigger itself is negative. (A signed
    /// gap would place `*_low` above `*_high` for negative thresholds and make
    /// the status flap between two readings of the same value.)
    pub fn new(warning_high: f32, critical_high: f32) -> Self {
        // Default hysteresis: 10% gap for recovery
        let warning_gap = (warning_high * 0.1).abs();
        let critical_gap = (critical_high * 0.1).abs();

        Self::with_custom_gaps(warning_high, warning_gap, critical_high, critical_gap)
    }

    /// Evaluate value against the trigger thresholds only (no hysteresis).
    ///
    /// Returns `Critical` at or above `critical_high`, `Warning` at or above
    /// `warning_high`, and `Ok` otherwise. A NaN value compares false against
    /// both thresholds and therefore evaluates to `Ok`.
    pub fn evaluate(&self, value: f32) -> Status {
        if value >= self.critical_high {
            Status::Critical
        } else if value >= self.warning_high {
            Status::Warning
        } else {
            Status::Ok
        }
    }

    /// Builds thresholds with explicit recovery gaps.
    ///
    /// Each `*_low` field is its `*_high` trigger minus the given gap; the gaps
    /// are used exactly as supplied, so pass non-negative values to keep the
    /// recovery point below the trigger.
    pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
        Self {
            warning_high,
            warning_low: warning_high - warning_gap,
            critical_high,
            critical_low: critical_high - critical_gap,
        }
    }

    /// Calculate status with hysteresis based on current value and previous status.
    ///
    /// * From `Warning`: escalates at `critical_high`, recovers to `Ok` only
    ///   below `warning_low`, otherwise stays `Warning`.
    /// * From `Critical`: stays `Critical` until the value is below
    ///   `critical_low`, then lands on `Warning` or, if also below
    ///   `warning_low`, on `Ok`.
    /// * From any other status (`Ok`, or the states that carry no measurement
    ///   history: `Unknown`, `Inactive`, `Pending`, `Offline`): the plain
    ///   trigger thresholds of [`HysteresisThresholds::evaluate`] apply.
    pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
        match previous_status {
            Status::Warning => {
                if value >= self.critical_high {
                    Status::Critical
                } else if value < self.warning_low {
                    Status::Ok
                } else {
                    Status::Warning
                }
            }
            Status::Critical => {
                // Comparison direction is deliberate: a NaN reading fails the
                // `<` test and so keeps the existing Critical status.
                if value < self.critical_low {
                    if value < self.warning_low {
                        Status::Ok
                    } else {
                        Status::Warning
                    }
                } else {
                    Status::Critical
                }
            }
            // Nothing to hold on to: first measurement, inactive service,
            // service transitioning, host coming back online, or already Ok.
            Status::Ok
            | Status::Unknown
            | Status::Inactive
            | Status::Pending
            | Status::Offline => self.evaluate(value),
        }
    }
}

/// Status tracker for hysteresis - tracks previous status per metric.
///
/// Starts empty (via `Default`); a metric that has never been recorded is
/// reported as `Status::Unknown` by `get_previous_status`.
#[derive(Debug, Default)]
pub struct StatusTracker {
    /// Last recorded status, keyed by metric name.
    previous_statuses: HashMap<String, Status>,
}

impl StatusTracker {
    /// Creates an empty tracker.
    pub fn new() -> Self {
        Self::default()
    }

    /// Get previous status for a metric.
    ///
    /// Metrics that have never been recorded report `Status::Unknown`.
    pub fn get_previous_status(&self, metric_name: &str) -> Status {
        self.previous_statuses
            .get(metric_name)
            .copied()
            .unwrap_or(Status::Unknown)
    }

    /// Update status for a metric, replacing any previously stored value.
    pub fn update_status(&mut self, metric_name: String, status: Status) {
        self.previous_statuses.insert(metric_name, status);
    }

    /// Calculate status with hysteresis.
    ///
    /// Feeds `value` and the stored previous status (or `Status::Unknown` for a
    /// metric seen for the first time) into `thresholds.calculate_status`,
    /// stores the result as the new previous status, and returns it.
    pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
        match self.previous_statuses.get_mut(metric_name) {
            // Already tracked: update the slot in place. This avoids building a
            // fresh `String` key and hashing it a second time on every sample.
            Some(tracked) => {
                *tracked = thresholds.calculate_status(value, *tracked);
                *tracked
            }
            // First sighting: only now is an owned key needed.
            None => {
                let first = thresholds.calculate_status(value, Status::Unknown);
                self.previous_statuses.insert(metric_name.to_owned(), first);
                first
            }
        }
    }
}

/// Metric name registry - constants for all metric names.
///
/// Fixed names are plain constants. Names that vary per disk, service or
/// network interface are `*_TEMPLATE` constants containing a `{...}`
/// placeholder; turn them into concrete names with `disk_metric`,
/// `service_metric` or `network_metric`.
pub mod registry {
    // CPU metrics
    pub const CPU_LOAD_1MIN: &str = "cpu_load_1min";
    pub const CPU_LOAD_5MIN: &str = "cpu_load_5min";
    pub const CPU_LOAD_15MIN: &str = "cpu_load_15min";
    pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
    pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
    pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";

    // Memory metrics
    pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
    pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
    pub const MEMORY_USED_GB: &str = "memory_used_gb";
    pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
    pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
    pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";

    // Disk metrics (templates - the `{device}` placeholder is filled by `disk_metric`)
    pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
    pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
    pub const DISK_WEAR_PERCENT_TEMPLATE: &str = "disk_{device}_wear_percent";
    pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
    pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
    pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";

    // Service metrics (templates - the `{name}` placeholder is filled by `service_metric`)
    pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
    pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
    pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";

    // Backup metrics
    pub const BACKUP_STATUS: &str = "backup_status";
    pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
    pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
    pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
    pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";

    // Network metrics (templates - the `{interface}` placeholder is filled by `network_metric`)
    pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
    pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
    pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
    pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";

    /// Replaces every occurrence of `placeholder` in `template` with `value`.
    fn fill(template: &str, placeholder: &str, value: &str) -> String {
        template.replace(placeholder, value)
    }

    /// Concrete disk metric name: substitutes `device` for `{device}`.
    ///
    /// A template without the placeholder is returned unchanged.
    pub fn disk_metric(template: &str, device: &str) -> String {
        fill(template, "{device}", device)
    }

    /// Concrete service metric name: substitutes `name` for `{name}`.
    ///
    /// A template without the placeholder is returned unchanged.
    pub fn service_metric(template: &str, name: &str) -> String {
        fill(template, "{name}", name)
    }

    /// Concrete network metric name: substitutes `interface` for `{interface}`.
    ///
    /// A template without the placeholder is returned unchanged.
    pub fn network_metric(template: &str, interface: &str) -> String {
        fill(template, "{interface}", interface)
    }
}