// cm-dashboard/agent/src/collectors/cpu.rs

use async_trait::async_trait;
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};

use tracing::debug;

use super::{utils, Collector, CollectorError};
use crate::config::CpuConfig;

/// CPU metrics collector backed by procfs/sysfs reads
///
/// What a collection pass touches:
/// - `/proc/loadavg`, read once for the 1/5/15 minute load averages
/// - `/sys/class/thermal/thermal_zone*/temp` for the CPU temperature (optional)
/// - `/sys/devices/system/cpu/cpu0/cpuidle/state*/{time,name}` for C-state usage
///
/// The collector holds only the configured thresholds; its methods take `&self`
/// and keep no state between collections. A collection pass that exceeds 1ms is
/// reported through a debug log.
pub struct CpuCollector {
    /// Thresholds compared against the 1-minute load average
    load_thresholds: HysteresisThresholds,
    /// Thresholds compared against the CPU temperature reading
    temperature_thresholds: HysteresisThresholds,
}

impl CpuCollector {
    /// Build a collector from the CPU section of the agent configuration.
    ///
    /// The warning/critical pairs for load and temperature are each handed to
    /// `HysteresisThresholds::new`; any recovery gap is derived there, not here.
    pub fn new(config: CpuConfig) -> Self {
        Self {
            load_thresholds: HysteresisThresholds::new(
                config.load_warning_threshold,
                config.load_critical_threshold,
            ),
            temperature_thresholds: HysteresisThresholds::new(
                config.temperature_warning_threshold,
                config.temperature_critical_threshold,
            ),
        }
    }

    /// Map a load value onto a `Status` using the configured load thresholds.
    ///
    /// Returns `Critical` at or above `critical_high`, `Warning` at or above
    /// `warning_high`, and `Ok` otherwise. Only these two bounds are consulted;
    /// no previous status is taken into account.
    fn calculate_load_status(&self, load: f32) -> Status {
        let limits = &self.load_thresholds;
        match load {
            value if value >= limits.critical_high => Status::Critical,
            value if value >= limits.warning_high => Status::Warning,
            _ => Status::Ok,
        }
    }

    /// Map a temperature reading onto a `Status` using the configured temperature thresholds.
    ///
    /// Returns `Critical` at or above `critical_high`, `Warning` at or above
    /// `warning_high`, and `Ok` otherwise. Only these two bounds are consulted;
    /// no previous status is taken into account.
    fn calculate_temperature_status(&self, temp: f32) -> Status {
        let limits = &self.temperature_thresholds;
        match temp {
            value if value >= limits.critical_high => Status::Critical,
            value if value >= limits.warning_high => Status::Warning,
            _ => Status::Ok,
        }
    }

    /// Collect CPU load averages and populate AgentData
    ///
    /// Reads `/proc/loadavg`, whose content looks like `"0.52 0.58 0.59 1/257 12345"`,
    /// and stores the first three whitespace-separated fields as the 1, 5 and 15 minute
    /// load averages. Any further fields are ignored.
    ///
    /// # Errors
    /// - propagates the failure when `/proc/loadavg` cannot be read
    /// - `CollectorError::Parse` when fewer than three fields are present
    /// - propagates the `utils::parse_f32` failure when a field is not a valid number
    ///
    /// `agent_data` is written only after all three values have parsed, so a failure
    /// never leaves a partially updated set of load averages behind.
    async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        let content = utils::read_proc_file("/proc/loadavg")?;

        // `split_whitespace` already skips leading/trailing whitespace, so no `trim` is
        // needed; pulling the three fields straight off the iterator avoids building a Vec.
        let mut fields = content.split_whitespace();
        let (raw_1min, raw_5min, raw_15min) = match (fields.next(), fields.next(), fields.next()) {
            (Some(one), Some(five), Some(fifteen)) => (one, five, fifteen),
            _ => {
                return Err(CollectorError::Parse {
                    value: content,
                    error: "Expected at least 3 values in /proc/loadavg".to_string(),
                });
            }
        };

        let load_1min = utils::parse_f32(raw_1min)?;
        let load_5min = utils::parse_f32(raw_5min)?;
        let load_15min = utils::parse_f32(raw_15min)?;

        // Populate CPU data directly
        agent_data.system.cpu.load_1min = load_1min;
        agent_data.system.cpu.load_5min = load_5min;
        agent_data.system.cpu.load_15min = load_15min;

        Ok(())
    }

    /// Collect CPU temperature and populate AgentData
    /// Prioritizes x86_pkg_temp over generic thermal zones
    async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        // Try x86_pkg_temp first (Intel CPU package temperature)
        if let Ok(temp) = self
            .read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
            .await
        {
            let temp_celsius = temp as f32 / 1000.0;
            agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
            return Ok(());
        }

        // Fallback: try other thermal zones
        for zone_id in 0..10 {
            let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
            if let Ok(temp) = self.read_thermal_zone(&path).await {
                let temp_celsius = temp as f32 / 1000.0;
                agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
                return Ok(());
            }
        }

        debug!("No CPU temperature sensors found");
        // Leave temperature as None if not available
        Ok(())
    }

    /// Read one thermal zone `temp` file and return its raw integer value.
    ///
    /// The value is returned exactly as parsed; callers divide by 1000 themselves.
    ///
    /// # Errors
    /// Propagates the read failure from `utils::read_proc_file` or the parse failure
    /// from `utils::parse_u64`.
    async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
        let raw = utils::read_proc_file(path)?;
        let reading = utils::parse_u64(raw.trim())?;
        Ok(reading)
    }

    /// Collect CPU C-state (idle depth) usage and store the three most-used states
    ///
    /// Walks `state0` through `state10` under cpu0's cpuidle directory and stops at the
    /// first state whose `time` file cannot be read. States named `POLL` and states
    /// with a zero `time` are ignored. Each remaining state's share is its `time`
    /// divided by the summed `time` of all remaining states, in percent.
    ///
    /// Always returns `Ok(())`; entries that cannot be parsed or named are skipped.
    async fn collect_cstate(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        /// Shorten names such as "C3_ACPI" or "C10_MWAIT" to "C3" / "C10" to reduce JSON size.
        /// Names without a 'C' followed by at least one digit are returned unchanged.
        ///
        /// NOTE(review): distinct names sharing a prefix (e.g. "C1" and "C1E") shorten to
        /// the same label — confirm whether such entries should be merged.
        fn short_name(full: &str) -> String {
            let shortened = match full.find('C') {
                Some(start) => {
                    let digits = full[start + 1..]
                        .chars()
                        .take_while(|ch| ch.is_ascii_digit())
                        .count();
                    if digits == 0 {
                        full
                    } else {
                        &full[start..start + 1 + digits]
                    }
                }
                None => full,
            };
            shortened.to_string()
        }

        // Only cpu0 is sampled; it is treated as representative of the whole system.
        let mut usage: Vec<(String, u64)> = Vec::new();
        let mut idle_total: u64 = 0;

        for index in 0..=10 {
            let time_path = format!("/sys/devices/system/cpu/cpu0/cpuidle/state{}/time", index);
            let raw_time = match utils::read_proc_file(&time_path) {
                Ok(text) => text,
                // No more states available
                Err(_) => break,
            };

            let residency = match utils::parse_u64(raw_time.trim()) {
                Ok(value) => value,
                Err(_) => continue,
            };

            let name_path = format!("/sys/devices/system/cpu/cpu0/cpuidle/state{}/name", index);
            let raw_name = match utils::read_proc_file(&name_path) {
                Ok(text) => text,
                Err(_) => continue,
            };

            let label = raw_name.trim();
            // POLL is not a real idle state, and unused states carry no information.
            if label == "POLL" || residency == 0 {
                continue;
            }

            usage.push((short_name(label), residency));
            idle_total += residency;
        }

        // Largest residency first; the stable sort keeps sysfs order between equal entries.
        usage.sort_by_key(|entry| std::cmp::Reverse(entry.1));

        // Convert the top three entries into percentages of the summed idle time.
        agent_data.system.cpu.cstates = usage
            .into_iter()
            .take(3)
            .map(|(name, residency)| {
                let percent = if idle_total > 0 {
                    (residency as f32 / idle_total as f32) * 100.0
                } else {
                    0.0
                };
                cm_dashboard_shared::CStateInfo { name, percent }
            })
            .collect();

        Ok(())
    }
}

#[async_trait]
impl Collector for CpuCollector {
    /// Run one CPU collection pass and write the results into `agent_data`.
    ///
    /// Steps, in order: load averages, temperature, C-state usage, then the derived
    /// load and temperature statuses. The first failing step aborts the pass and its
    /// error is returned. The temperature status is `Status::Unknown` when no
    /// temperature reading is present.
    async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        debug!("Collecting CPU metrics");
        let started_at = std::time::Instant::now();

        // Load averages come first; a failure here aborts the whole pass.
        self.collect_load_averages(agent_data).await?;
        // Temperature is optional on the sensor side.
        self.collect_temperature(agent_data).await?;
        // C-state usage (CPU idle depth).
        self.collect_cstate(agent_data).await?;

        let duration = started_at.elapsed();
        debug!("CPU collection completed in {:?}", duration);

        // Efficiency check: log at debug level when the pass took more than 1ms.
        let elapsed_ms = duration.as_millis();
        if elapsed_ms > 1 {
            debug!(
                "CPU collection took {}ms - consider optimization",
                elapsed_ms
            );
        }

        // Derive statuses from the freshly collected values.
        let load_status = self.calculate_load_status(agent_data.system.cpu.load_1min);
        agent_data.system.cpu.load_status = load_status;

        let temperature_status = match agent_data.system.cpu.temperature_celsius {
            Some(celsius) => self.calculate_temperature_status(celsius),
            None => Status::Unknown,
        };
        agent_data.system.cpu.temperature_status = temperature_status;

        Ok(())
    }
}