Implement real-time process monitoring and fix UI hardcoded data

This commit addresses several key issues identified during development:

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using ps command
- Fix disk metrics permission issues in systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering to prevent self-monitoring artifacts (ps commands)
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with btop-inspired multi-panel design
- Updated system panel to include real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
2025-10-16 23:55:05 +02:00
parent 7a664ef0fb
commit 8a36472a3d
81 changed files with 7702 additions and 9608 deletions
--- a/agent/src/collectors/cpu.rs
+++ b/agent/src/collectors/cpu.rs
@@ -0,0 +1,377 @@
+use async_trait::async_trait;
+use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
+use std::time::Duration;
+use tracing::debug;
+
+use super::{Collector, CollectorError, utils};
+use crate::config::CpuConfig;
+
+/// CPU metrics collector: load averages, temperature, frequency and top processes
+///
+/// COLLECTION SOURCES:
+/// - Single /proc/loadavg read for all load metrics
+/// - sysfs thermal zone and cpufreq reads, with a /proc/cpuinfo fallback
+/// - Two `ps aux` invocations for the top CPU and top RAM processes
+/// - NOTE: `ps` is run synchronously, so a collection pass waits for both
+///   child processes and is likely no longer sub-millisecond
+pub struct CpuCollector {
+    config: CpuConfig, // load and temperature thresholds used for status
+    name: String, // collector name, set to "cpu" by `new`
+}
+
+impl CpuCollector {
+    /// Create a collector named "cpu" that evaluates metrics against `config`.
+    pub fn new(config: CpuConfig) -> Self {
+        let name = String::from("cpu");
+        Self { config, name }
+    }
+    
+    /// Map a load average onto a status level.
+    ///
+    /// Returns `Status::Critical` at or above `load_critical_threshold`,
+    /// `Status::Warning` at or above `load_warning_threshold`, and
+    /// `Status::Ok` otherwise.
+    fn calculate_load_status(&self, load: f32) -> Status {
+        let thresholds = &self.config;
+
+        // Check the most severe level first so it wins when both are exceeded
+        if load >= thresholds.load_critical_threshold {
+            return Status::Critical;
+        }
+        if load >= thresholds.load_warning_threshold {
+            return Status::Warning;
+        }
+        Status::Ok
+    }
+    
+    /// Map a temperature (same unit as the configured thresholds) onto a status level.
+    ///
+    /// Returns `Status::Critical` at or above `temperature_critical_threshold`,
+    /// `Status::Warning` at or above `temperature_warning_threshold`, and
+    /// `Status::Ok` otherwise.
+    fn calculate_temperature_status(&self, temp: f32) -> Status {
+        let thresholds = &self.config;
+
+        // Check the most severe level first so it wins when both are exceeded
+        if temp >= thresholds.temperature_critical_threshold {
+            return Status::Critical;
+        }
+        if temp >= thresholds.temperature_warning_threshold {
+            return Status::Warning;
+        }
+        Status::Ok
+    }
+    
+    /// Read the 1, 5 and 15 minute load averages from /proc/loadavg.
+    ///
+    /// The file looks like `0.52 0.58 0.59 1/257 12345`; only the first three
+    /// fields are used. Each value becomes its own metric whose status is
+    /// derived from that value via the configured load thresholds.
+    ///
+    /// Returns an error when the file cannot be read, holds fewer than three
+    /// fields, or one of the first three fields fails to parse.
+    async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
+        let content = utils::read_proc_file("/proc/loadavg")?;
+        // split_whitespace already ignores leading and trailing whitespace
+        let fields: Vec<&str> = content.split_whitespace().collect();
+
+        if fields.len() < 3 {
+            return Err(CollectorError::Parse {
+                value: content,
+                error: "Expected at least 3 values in /proc/loadavg".to_string(),
+            });
+        }
+
+        // Parse everything up front so a bad field fails before any metric is built
+        let one_minute = utils::parse_f32(fields[0])?;
+        let five_minutes = utils::parse_f32(fields[1])?;
+        let fifteen_minutes = utils::parse_f32(fields[2])?;
+
+        let load_metric = |key: String, load: f32, description: &str| {
+            Metric::new(key, MetricValue::Float(load), self.calculate_load_status(load))
+                .with_description(description.to_string())
+        };
+
+        Ok(vec![
+            load_metric(
+                registry::CPU_LOAD_1MIN.to_string(),
+                one_minute,
+                "CPU load average over 1 minute",
+            ),
+            load_metric(
+                registry::CPU_LOAD_5MIN.to_string(),
+                five_minutes,
+                "CPU load average over 5 minutes",
+            ),
+            load_metric(
+                registry::CPU_LOAD_15MIN.to_string(),
+                fifteen_minutes,
+                "CPU load average over 15 minutes",
+            ),
+        ])
+    }
+    
+    /// Collect the CPU temperature from the sysfs thermal zones.
+    ///
+    /// Zones 0..10 are scanned twice:
+    /// 1. A zone whose `type` file reads `x86_pkg_temp` is preferred and is
+    ///    reported as "CPU package temperature".
+    /// 2. Otherwise the first zone with a readable `temp` file is reported,
+    ///    with the zone number in the description.
+    ///
+    /// Raw values are divided by 1000 to obtain °C. Returns `Ok(None)` when no
+    /// zone can be read; this method itself never returns an error.
+    async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
+        const MAX_THERMAL_ZONES: u32 = 10;
+        const PACKAGE_ZONE_TYPE: &str = "x86_pkg_temp";
+
+        let temperature_metric = |millidegrees: u64, description: String| {
+            let celsius = millidegrees as f32 / 1000.0;
+            Metric::new(
+                registry::CPU_TEMPERATURE_CELSIUS.to_string(),
+                MetricValue::Float(celsius),
+                self.calculate_temperature_status(celsius),
+            )
+            .with_description(description)
+            .with_unit("°C".to_string())
+        };
+
+        // First pass: only accept a zone that is labelled as the package sensor.
+        // Zone numbering is not fixed, so zone 0 cannot be assumed to be it.
+        for zone_id in 0..MAX_THERMAL_ZONES {
+            let type_path = format!("/sys/class/thermal/thermal_zone{}/type", zone_id);
+            let is_package_zone = utils::read_proc_file(&type_path)
+                .map(|zone_type| zone_type.trim() == PACKAGE_ZONE_TYPE)
+                .unwrap_or(false);
+            if !is_package_zone {
+                continue;
+            }
+
+            let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
+            if let Ok(millidegrees) = self.read_thermal_zone(&temp_path).await {
+                return Ok(Some(temperature_metric(
+                    millidegrees,
+                    "CPU package temperature".to_string(),
+                )));
+            }
+        }
+
+        // Fallback: first thermal zone of any type that yields a reading
+        for zone_id in 0..MAX_THERMAL_ZONES {
+            let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
+            if let Ok(millidegrees) = self.read_thermal_zone(&temp_path).await {
+                return Ok(Some(temperature_metric(
+                    millidegrees,
+                    format!("CPU temperature from thermal_zone{}", zone_id),
+                )));
+            }
+        }
+
+        debug!("No CPU temperature sensors found");
+        Ok(None)
+    }
+    
+    /// Read the file at `path` and parse its trimmed contents as an unsigned integer.
+    ///
+    /// Callers treat the result as a temperature in thousandths of a degree.
+    /// Read and parse failures are returned as `CollectorError`.
+    async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
+        let raw = utils::read_proc_file(path)?;
+        let reading = utils::parse_u64(raw.trim())?;
+        Ok(reading)
+    }
+    
+    /// Collect the CPU frequency in MHz.
+    ///
+    /// Prefers cpu0's cpufreq `scaling_cur_freq` value (divided by 1000 to get
+    /// MHz) and falls back to the first `cpu MHz` entry in /proc/cpuinfo.
+    /// Returns `Ok(None)` when neither source yields a parseable value; this
+    /// method itself never returns an error.
+    async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
+        let frequency_metric = |mhz: f32, description: &str| {
+            Metric::new(
+                registry::CPU_FREQUENCY_MHZ.to_string(),
+                MetricValue::Float(mhz),
+                Status::Ok, // Frequency doesn't have status thresholds
+            )
+            .with_description(description.to_string())
+            .with_unit("MHz".to_string())
+        };
+
+        // Try scaling frequency first (more accurate for current frequency)
+        if let Ok(raw) = utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") {
+            if let Ok(khz) = utils::parse_u64(raw.trim()) {
+                return Ok(Some(frequency_metric(
+                    khz as f32 / 1000.0,
+                    "Current CPU frequency",
+                )));
+            }
+        }
+
+        // Fallback: first "cpu MHz" line of /proc/cpuinfo (only the first CPU entry)
+        if let Ok(cpuinfo) = utils::read_proc_file("/proc/cpuinfo") {
+            let parsed_mhz = cpuinfo
+                .lines()
+                .find(|line| line.starts_with("cpu MHz"))
+                .and_then(|line| line.split(':').nth(1))
+                // The value after ':' carries leading whitespace; trim it before
+                // parsing, as every other caller of the parse helpers does
+                .and_then(|value| utils::parse_f32(value.trim()).ok());
+
+            if let Some(mhz) = parsed_mhz {
+                return Ok(Some(frequency_metric(
+                    mhz,
+                    "CPU base frequency from /proc/cpuinfo",
+                )));
+            }
+        }
+
+        debug!("CPU frequency not available");
+        Ok(None)
+    }
+    
+    /// Run `ps aux --sort=-<sort_key> --no-headers` and return its stdout.
+    ///
+    /// Returns `Ok(None)` when ps exits with a failure status, and
+    /// `CollectorError::SystemRead` when ps cannot be started at all.
+    ///
+    /// NOTE: `Command::output` waits synchronously for ps to exit.
+    fn run_ps_sorted(sort_key: &str) -> Result<Option<String>, CollectorError> {
+        let output = std::process::Command::new("ps")
+            .arg("aux")
+            .arg(format!("--sort=-{}", sort_key))
+            .arg("--no-headers")
+            .output()
+            .map_err(|e| CollectorError::SystemRead {
+                path: "ps command".to_string(),
+                error: e.to_string(),
+            })?;
+
+        if !output.status.success() {
+            return Ok(None);
+        }
+
+        Ok(Some(String::from_utf8_lossy(&output.stdout).into_owned()))
+    }
+
+    /// Split one `ps aux` row into whitespace-separated columns.
+    ///
+    /// Expected layout: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND...
+    /// Returns `None` for rows with fewer than 11 columns and for ps rows, so
+    /// the ps process spawned by this collector is never reported.
+    fn ps_columns(line: &str) -> Option<Vec<&str>> {
+        let columns: Vec<&str> = line.split_whitespace().collect();
+        if columns.len() < 11 {
+            return None;
+        }
+
+        let full_command = columns[10..].join(" ");
+        if full_command.contains("ps aux") || full_command.starts_with("ps ") {
+            return None;
+        }
+
+        Some(columns)
+    }
+
+    /// Reduce the first COMMAND token to a short display name.
+    ///
+    /// Paths are reduced to their final component. Bracketed names (as ps
+    /// prints for kernel threads, e.g. `[kworker/0:1]`) are kept whole because
+    /// the slash is part of the name, not a path separator.
+    fn process_display_name(executable: &str) -> &str {
+        if executable.starts_with('[') {
+            return executable;
+        }
+
+        executable
+            .rsplit('/')
+            .next()
+            .filter(|basename| !basename.is_empty())
+            .unwrap_or(executable)
+    }
+
+    /// Collect the process at the top of ps's %CPU ordering.
+    ///
+    /// The metric value is a string such as `name (PID 123) 11.0%`, using the
+    /// %CPU column exactly as ps printed it. Rows whose %CPU does not parse or
+    /// lies outside 0..=1000 are skipped. When no row qualifies the value is
+    /// "No processes found"; `Ok(None)` is returned if ps exits with a failure.
+    ///
+    /// NOTE(review): procps documents %CPU as CPU time divided by process
+    /// run time, i.e. a lifetime average rather than an instantaneous value.
+    async fn collect_top_cpu_process(&self) -> Result<Option<Metric>, CollectorError> {
+        // Multi-threaded processes may exceed 100%; anything above this is treated as bogus
+        const MAX_PLAUSIBLE_CPU_PERCENT: f32 = 1000.0;
+
+        let listing = match Self::run_ps_sorted("%cpu")? {
+            Some(listing) => listing,
+            None => return Ok(None),
+        };
+
+        let mut process_info = None;
+        for line in listing.lines() {
+            let columns = match Self::ps_columns(line) {
+                Some(columns) => columns,
+                None => continue,
+            };
+            let (pid, cpu_percent) = (columns[1], columns[2]);
+
+            // Only report rows whose CPU figure is a sane number
+            let plausible = cpu_percent
+                .parse::<f32>()
+                .map(|value| (0.0..=MAX_PLAUSIBLE_CPU_PERCENT).contains(&value))
+                .unwrap_or(false);
+            if !plausible {
+                continue;
+            }
+
+            let name = Self::process_display_name(columns[10]);
+            process_info = Some(format!("{} (PID {}) {}%", name, pid, cpu_percent));
+            break;
+        }
+
+        Ok(Some(Metric::new(
+            "top_cpu_process".to_string(),
+            MetricValue::String(process_info.unwrap_or_else(|| "No processes found".to_string())),
+            Status::Ok,
+        ).with_description("Process consuming the most CPU".to_string())))
+    }
+
+    /// Collect the process at the top of ps's %MEM ordering.
+    ///
+    /// The metric value is a string such as `name (PID 123) 512.3MB`, where
+    /// the size is the RSS column divided by 1024. Rows whose RSS does not
+    /// parse or is below 1 MB are skipped. When no row qualifies the value is
+    /// "No processes found"; `Ok(None)` is returned if ps exits with a failure.
+    async fn collect_top_ram_process(&self) -> Result<Option<Metric>, CollectorError> {
+        let listing = match Self::run_ps_sorted("%mem")? {
+            Some(listing) => listing,
+            None => return Ok(None),
+        };
+
+        let mut process_info = None;
+        for line in listing.lines() {
+            let columns = match Self::ps_columns(line) {
+                Some(columns) => columns,
+                None => continue,
+            };
+            let (pid, rss_kb) = (columns[1], columns[5]);
+
+            // Convert RSS from KB to MB
+            let rss_mb = match rss_kb.parse::<u64>() {
+                Ok(kb) => kb as f32 / 1024.0,
+                Err(_) => continue,
+            };
+
+            // Skip processes with very little memory (likely temporary commands)
+            if rss_mb < 1.0 {
+                continue;
+            }
+
+            let name = Self::process_display_name(columns[10]);
+            process_info = Some(format!("{} (PID {}) {:.1}MB", name, pid, rss_mb));
+            break;
+        }
+
+        Ok(Some(Metric::new(
+            "top_ram_process".to_string(),
+            MetricValue::String(process_info.unwrap_or_else(|| "No processes found".to_string())),
+            Status::Ok,
+        ).with_description("Process consuming the most RAM".to_string())))
+    }
+}
+
+#[async_trait]
+impl Collector for CpuCollector {
+    /// Name of this collector.
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Collect all CPU metrics in one pass.
+    ///
+    /// Load averages are mandatory: a failure there is returned as an error.
+    /// Temperature, frequency and the top CPU/RAM processes are optional: a
+    /// failure in any of them is logged at debug level and that metric is
+    /// left out, so one missing source (for example no `ps` binary) does not
+    /// discard the metrics that were collected successfully.
+    async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
+        /// Append an optional metric, logging instead of failing when its collection errored.
+        fn push_optional(
+            metrics: &mut Vec<Metric>,
+            what: &str,
+            outcome: Result<Option<Metric>, CollectorError>,
+        ) {
+            match outcome {
+                Ok(Some(metric)) => metrics.push(metric),
+                Ok(None) => {}
+                Err(err) => debug!("Skipping {} metric: {:?}", what, err),
+            }
+        }
+
+        // 3 load averages + temperature + frequency + top CPU + top RAM
+        const MAX_METRICS: usize = 7;
+
+        debug!("Collecting CPU metrics");
+        let start = std::time::Instant::now();
+
+        let mut metrics = Vec::with_capacity(MAX_METRICS);
+
+        // Collect load averages (always available)
+        metrics.extend(self.collect_load_averages().await?);
+
+        push_optional(&mut metrics, "temperature", self.collect_temperature().await);
+        push_optional(&mut metrics, "frequency", self.collect_frequency().await);
+        push_optional(&mut metrics, "top CPU process", self.collect_top_cpu_process().await);
+        push_optional(&mut metrics, "top RAM process", self.collect_top_ram_process().await);
+
+        let duration = start.elapsed();
+        debug!("CPU collection completed in {:?} with {} metrics", duration, metrics.len());
+
+        // Efficiency check: note at debug level when collection exceeds 1ms
+        if duration.as_millis() > 1 {
+            debug!("CPU collection took {}ms - consider optimization", duration.as_millis());
+        }
+
+        Ok(metrics)
+    }
+
+    /// Always `None`: this collector keeps no performance metrics of its own.
+    fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
+        None // Performance tracking handled by cache system
+    }
+}