Improve dashboard display and fix service issues

- Remove unreachable descriptions from failed nginx sites
- Show complete site URLs instead of truncating at first dot
- Implement service-specific disk quotas (docker: 4GB, immich: 4GB, others: 1-2GB)
- Truncate process names to show only executable name without full path
- Display only highest C-state instead of all C-states for cleaner output
- Format system RAM as xxxMB/GB (totalGB) to match services format
2025-10-15 09:36:03 +02:00 · 2025-10-15 09:36:03 +02:00 · efdd713f62
commit efdd713f62
parent 672c8bebc9
5 changed files with 107 additions and 63 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -332,11 +332,30 @@ rm /tmp/cm-maintenance
 4. Dashboard uses `status_level_from_agent_status()` for display
 5. Agent adds notification monitoring for status changes

+**Testing & Building:**
+- ALWAYS use `cargo build --workspace` to match NixOS build configuration
+- Test with OpenSSL environment variables when building locally:
+  ```bash
+  OPENSSL_DIR=/nix/store/.../openssl-dev \
+  OPENSSL_LIB_DIR=/nix/store/.../openssl/lib \
+  OPENSSL_INCLUDE_DIR=/nix/store/.../openssl-dev/include \
+  PKG_CONFIG_PATH=/nix/store/.../openssl-dev/lib/pkgconfig \
+  OPENSSL_NO_VENDOR=1 cargo build --workspace
+  ```
+- This prevents build failures that only appear in NixOS deployment
+
+**Notification System:**
+- Universal automatic detection of all `_status` fields across all collectors
+- Sends emails from `hostname@cmtec.se` to `cm@cmtec.se` for any status changes
+- Status stored in-memory: `HashMap<"component.metric", status>`
+- Recovery emails sent when status changes from warning/critical → ok
+
 **NEVER:**
 - Add hardcoded thresholds to dashboard widgets
 - Calculate status in dashboard with different thresholds than agent
 - Use "ok" as default when agent status is missing (use "unknown")
 - Calculate colors in widgets (TableBuilder's responsibility)
+- Use `cargo build` without `--workspace` for final testing

 # Important Communication Guidelines

--- a/agent/src/collectors/service.rs
+++ b/agent/src/collectors/service.rs
@ -303,6 +303,9 @@ impl ServiceCollector {

    async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
        // Check systemd service properties for NixOS hardening-related disk restrictions
+        let mut private_tmp = false;
+        let mut protect_system = false;
+        
        let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
            .args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
            .stdout(Stdio::piped())
@ -315,8 +318,6 @@ impl ServiceCollector {
                let stdout = String::from_utf8_lossy(&output.stdout);
                
                // Parse systemd properties that might indicate disk restrictions
-                let mut private_tmp = false;
-                let mut protect_system = false;
                let mut readonly_paths = Vec::new();
                
                for line in stdout.lines() {
@ -328,39 +329,33 @@ impl ServiceCollector {
                        readonly_paths.push(paths.to_string());
                    }
                }
-                
-                // If service has significant restrictions, it might have implicit disk limits
-                // This is heuristic-based since systemd doesn't have direct disk quotas
-                if private_tmp && protect_system {
-                    // Heavily sandboxed services might have practical disk limits
-                    // Return a conservative estimate based on typical service needs
-                    return Ok(1.0); // 1 GB as reasonable limit for sandboxed services
-                }
            }
        }
        
-        // Check for service-specific disk configurations in NixOS
-        match service {
-            "docker" => {
-                // Docker might have storage driver limits in NixOS config
-                if let Ok(limit) = self.get_docker_storage_quota().await {
-                    return Ok(limit);
+        // Check for service-specific disk configurations - use service-appropriate defaults
+        let service_quota = match service {
+            "docker" => 4.0, // Docker containers need more space
+            "gitea" => 1.0,  // Gitea repositories, but database is external
+            "postgresql" | "postgres" => 1.0, // Database storage
+            "mysql" | "mariadb" => 1.0, // Database storage
+            "immich-server" => 4.0, // Photo storage app needs more space
+            "unifi" => 2.0, // Network management with logs and configs
+            "vaultwarden" => 1.0, // Password manager
+            "gitea-runner-default" => 1.0, // CI/CD runner
+            "nginx" => 1.0, // Web server
+            "mosquitto" => 1.0, // MQTT broker
+            "redis-immich" => 1.0, // Redis cache
+            _ => {
+                // Default based on sandboxing - sandboxed services get smaller quotas
+                if private_tmp && protect_system {
+                    1.0 // 1 GB for sandboxed services
+                } else {
+                    2.0 // 2 GB for non-sandboxed services
                }
-            },
-            "postgresql" | "postgres" => {
-                // PostgreSQL might have tablespace or data directory limits
-                // Check for database-specific storage configuration
-            },
-            "mysql" | "mariadb" => {
-                // MySQL might have data directory size limits
-            },
-            _ => {}
-        }
+            }
+        };
        
-        // No quota found
-        Err(CollectorError::ParseError {
-            message: format!("No disk quota found for service {}", service),
-        })
+        Ok(service_quota)
    }
    
    async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
@ -1282,7 +1277,7 @@ impl Collector for ServiceCollector {
                                let (site_status, site_description) = match (latency, is_healthy) {
                                    (Some(_ms), true) => (ServiceStatus::Running, None),
                                    (Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
-                                    (None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
+                                    (None, _) => (ServiceStatus::Stopped, None), // No description for unreachable sites
                                };
                                
                                // Update counters based on site status
--- a/agent/src/collectors/system.rs
+++ b/agent/src/collectors/system.rs
@ -204,29 +204,36 @@ impl SystemCollector {
                    order_a.cmp(&order_b)
                });
                
-                // Format C-states as description lines (2 C-states per row)
-                let mut result = Vec::new();
-                let mut current_line = Vec::new();
+                // Find the highest C-state with significant usage (>= 0.1%)
+                let mut highest_cstate = None;
+                let mut highest_order = -1;
                
-                for (name, time) in cstate_times {
-                    let percent = (time as f32 / total_time as f32) * 100.0;
-                    if percent >= 0.1 { // Only show states with at least 0.1% time
-                        current_line.push(format!("{}: {:.1}%", name, percent));
+                for (name, time) in &cstate_times {
+                    let percent = (*time as f32 / total_time as f32) * 100.0;
+                    if percent >= 0.1 { // Only consider states with at least 0.1% time
+                        let order = match name.as_str() {
+                            "POLL" => 0,
+                            "C1" => 1,
+                            "C1E" => 2,
+                            "C3" => 3,
+                            "C6" => 4,
+                            "C7s" => 5,
+                            "C8" => 6,
+                            "C9" => 7,
+                            "C10" => 8,
+                            _ => -1,
+                        };
                        
-                        // Split into rows when we have 2 items
-                        if current_line.len() == 2 {
-                            result.push(current_line.join(", "));
-                            current_line.clear();
+                        if order > highest_order {
+                            highest_order = order;
+                            highest_cstate = Some(format!("{}: {:.1}%", name, percent));
                        }
                    }
                }
                
-                // Add remaining items as final line
-                if !current_line.is_empty() {
-                    result.push(current_line.join(", "));
+                if let Some(cstate) = highest_cstate {
+                    return Some(vec![format!("C-State: {}", cstate)]);
                }
-                
-                return Some(result);
            }
        }
        
@ -281,7 +288,13 @@ impl SystemCollector {
                    let command = fields[10];
                    // Skip kernel threads (in brackets) and low CPU processes
                    if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
-                        return Some(format!("{} {:.1}%", command, cpu_percent.parse::<f32>().unwrap_or(0.0)));
+                        // Extract just the process name from the full path
+                        let process_name = if let Some(last_slash) = command.rfind('/') {
+                            &command[last_slash + 1..]
+                        } else {
+                            command
+                        };
+                        return Some(format!("{} {:.1}%", process_name, cpu_percent.parse::<f32>().unwrap_or(0.0)));
                    }
                }
            }
@ -308,7 +321,13 @@ impl SystemCollector {
                    let command = fields[10];
                    // Skip kernel threads (in brackets) and low memory processes
                    if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
-                        return Some(format!("{} {:.1}%", command, mem_percent.parse::<f32>().unwrap_or(0.0)));
+                        // Extract just the process name from the full path
+                        let process_name = if let Some(last_slash) = command.rfind('/') {
+                            &command[last_slash + 1..]
+                        } else {
+                            command
+                        };
+                        return Some(format!("{} {:.1}%", process_name, mem_percent.parse::<f32>().unwrap_or(0.0)));
                    }
                }
            }
--- a/dashboard/src/ui/services.rs
+++ b/dashboard/src/ui/services.rs
@ -109,12 +109,8 @@ fn render_metrics(
            // Add latency information for nginx sites if available
            let service_name_with_latency = if let Some(parent) = &svc.sub_service {
                if parent == "nginx" {
-                    // Extract subdomain part for shorter display
-                    let short_name = if let Some(dot_pos) = svc.name.find('.') {
-                        &svc.name[..dot_pos]
-                    } else {
-                        &svc.name
-                    };
+                    // Use full site name instead of truncating at first dot
+                    let short_name = &svc.name;
                    
                    match &svc.latency_ms {
                        Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)
--- a/dashboard/src/ui/system.rs
+++ b/dashboard/src/ui/system.rs
@ -68,14 +68,10 @@ fn render_metrics(
    // Use agent-provided C-states and logged-in users as description
    let mut description_lines = Vec::new();
    
-    // Add C-states with prefix on first line, indent subsequent lines
+    // Add C-state (now only highest C-state from agent)
    if let Some(cstates) = &summary.cpu_cstate {
-        for (i, cstate_line) in cstates.iter().enumerate() {
-            if i == 0 {
-                description_lines.push(format!("C-State: {}", cstate_line));
-            } else {
-                description_lines.push(format!("         {}", cstate_line));
-            }
+        for cstate_line in cstates.iter() {
+            description_lines.push(cstate_line.clone()); // Agent already includes "C-State:" prefix
        }
    }
    
@ -105,7 +101,7 @@ fn render_metrics(
        overall_status.clone(),
        description_lines,
        vec![
-            format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0),
+            format_system_memory_value(summary.memory_used_mb, summary.memory_total_mb),
            format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
            format_optional_metric(summary.cpu_temp_c, "°C"),
        ],
@ -122,3 +118,22 @@ fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
    }
 }

+fn format_bytes(mb: f32) -> String {
+    if mb < 0.1 {
+        "<1MB".to_string()
+    } else if mb < 1.0 {
+        format!("{:.0}kB", mb * 1000.0)
+    } else if mb < 1000.0 {
+        format!("{:.0}MB", mb)
+    } else {
+        format!("{:.1}GB", mb / 1000.0)
+    }
+}
+
+fn format_system_memory_value(used_mb: f32, total_mb: f32) -> String {
+    let used_value = format_bytes(used_mb);
+    let total_gb = total_mb / 1000.0;
+    // Format total as GB without decimals
+    format!("{} ({}GB)", used_value, total_gb as u32)
+}
+