From efdd713f6232fcc74037dc5b5321fe672a4a99f1 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Wed, 15 Oct 2025 09:36:03 +0200 Subject: [PATCH] Improve dashboard display and fix service issues - Remove unreachable descriptions from failed nginx sites - Show complete site URLs instead of truncating at first dot - Implement service-specific disk quotas (docker: 4GB, immich: 4GB, others: 1-2GB) - Truncate process names to show only executable name without full path - Display only highest C-state instead of all C-states for cleaner output - Format system RAM as xxxMB/GB (totalGB) to match services format --- CLAUDE.md | 19 +++++++++++ agent/src/collectors/service.rs | 57 +++++++++++++++------------------ agent/src/collectors/system.rs | 55 ++++++++++++++++++++----------- dashboard/src/ui/services.rs | 8 ++--- dashboard/src/ui/system.rs | 31 +++++++++++++----- 5 files changed, 107 insertions(+), 63 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 9ae7473..0b02627 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -332,11 +332,30 @@ rm /tmp/cm-maintenance 4. Dashboard uses `status_level_from_agent_status()` for display 5. Agent adds notification monitoring for status changes +**Testing & Building:** +- ALWAYS use `cargo build --workspace` to match NixOS build configuration +- Test with OpenSSL environment variables when building locally: + ```bash + OPENSSL_DIR=/nix/store/.../openssl-dev \ + OPENSSL_LIB_DIR=/nix/store/.../openssl/lib \ + OPENSSL_INCLUDE_DIR=/nix/store/.../openssl-dev/include \ + PKG_CONFIG_PATH=/nix/store/.../openssl-dev/lib/pkgconfig \ + OPENSSL_NO_VENDOR=1 cargo build --workspace + ``` +- This prevents build failures that only appear in NixOS deployment + +**Notification System:** +- Universal automatic detection of all `_status` fields across all collectors +- Sends emails from `hostname@cmtec.se` to `cm@cmtec.se` for any status changes +- Status stored in-memory: `HashMap<"component.metric", status>` +- Recovery emails sent when status changes from warning/critical → ok + **NEVER:** - Add hardcoded thresholds to dashboard widgets - Calculate status in dashboard with different thresholds than agent - Use "ok" as default when agent status is missing (use "unknown") - Calculate colors in widgets (TableBuilder's responsibility) +- Use `cargo build` without `--workspace` for final testing # Important Communication Guidelines diff --git a/agent/src/collectors/service.rs b/agent/src/collectors/service.rs index cfb4350..5a2a2c7 100644 --- a/agent/src/collectors/service.rs +++ b/agent/src/collectors/service.rs @@ -303,6 +303,9 @@ impl ServiceCollector { async fn get_service_disk_quota(&self, service: &str) -> Result { // Check systemd service properties for NixOS hardening-related disk restrictions + let mut private_tmp = false; + let mut protect_system = false; + let systemd_output = Command::new("/run/current-system/sw/bin/systemctl") .args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"]) .stdout(Stdio::piped()) @@ -315,8 +318,6 @@ impl ServiceCollector { let stdout = String::from_utf8_lossy(&output.stdout); // Parse systemd properties that might indicate disk restrictions - let mut private_tmp = false; - let mut protect_system = false; let mut readonly_paths = Vec::new(); for line in stdout.lines() { @@ -328,39 +329,33 @@ impl ServiceCollector { readonly_paths.push(paths.to_string()); } } - - // If service has significant restrictions, it might have implicit disk limits - // This is heuristic-based since systemd doesn't have direct disk quotas - if private_tmp && protect_system { - // Heavily sandboxed services might have practical disk limits - // Return a conservative estimate based on typical service needs - return Ok(1.0); // 1 GB as reasonable limit for sandboxed services - } } } - // Check for service-specific disk configurations in NixOS - match service { - "docker" => { - // Docker might have storage driver limits in NixOS config - if let Ok(limit) = self.get_docker_storage_quota().await { - return Ok(limit); + // Check for service-specific disk configurations - use service-appropriate defaults + let service_quota = match service { + "docker" => 4.0, // Docker containers need more space + "gitea" => 1.0, // Gitea repositories, but database is external + "postgresql" | "postgres" => 1.0, // Database storage + "mysql" | "mariadb" => 1.0, // Database storage + "immich-server" => 4.0, // Photo storage app needs more space + "unifi" => 2.0, // Network management with logs and configs + "vaultwarden" => 1.0, // Password manager + "gitea-runner-default" => 1.0, // CI/CD runner + "nginx" => 1.0, // Web server + "mosquitto" => 1.0, // MQTT broker + "redis-immich" => 1.0, // Redis cache + _ => { + // Default based on sandboxing - sandboxed services get smaller quotas + if private_tmp && protect_system { + 1.0 // 1 GB for sandboxed services + } else { + 2.0 // 2 GB for non-sandboxed services } - }, - "postgresql" | "postgres" => { - // PostgreSQL might have tablespace or data directory limits - // Check for database-specific storage configuration - }, - "mysql" | "mariadb" => { - // MySQL might have data directory size limits - }, - _ => {} - } + } + }; - // No quota found - Err(CollectorError::ParseError { - message: format!("No disk quota found for service {}", service), - }) + Ok(service_quota) } async fn check_filesystem_quota(&self, path: &str) -> Result { @@ -1282,7 +1277,7 @@ impl Collector for ServiceCollector { let (site_status, site_description) = match (latency, is_healthy) { (Some(_ms), true) => (ServiceStatus::Running, None), (Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description - (None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])), + (None, _) => (ServiceStatus::Stopped, None), // No description for unreachable sites }; // Update counters based on site status diff --git a/agent/src/collectors/system.rs b/agent/src/collectors/system.rs index 06e4170..6db07d3 100644 --- a/agent/src/collectors/system.rs +++ b/agent/src/collectors/system.rs @@ -204,29 +204,36 @@ impl SystemCollector { order_a.cmp(&order_b) }); - // Format C-states as description lines (2 C-states per row) - let mut result = Vec::new(); - let mut current_line = Vec::new(); + // Find the highest C-state with significant usage (>= 0.1%) + let mut highest_cstate = None; + let mut highest_order = -1; - for (name, time) in cstate_times { - let percent = (time as f32 / total_time as f32) * 100.0; - if percent >= 0.1 { // Only show states with at least 0.1% time - current_line.push(format!("{}: {:.1}%", name, percent)); + for (name, time) in &cstate_times { + let percent = (*time as f32 / total_time as f32) * 100.0; + if percent >= 0.1 { // Only consider states with at least 0.1% time + let order = match name.as_str() { + "POLL" => 0, + "C1" => 1, + "C1E" => 2, + "C3" => 3, + "C6" => 4, + "C7s" => 5, + "C8" => 6, + "C9" => 7, + "C10" => 8, + _ => -1, + }; - // Split into rows when we have 2 items - if current_line.len() == 2 { - result.push(current_line.join(", ")); - current_line.clear(); + if order > highest_order { + highest_order = order; + highest_cstate = Some(format!("{}: {:.1}%", name, percent)); } } } - // Add remaining items as final line - if !current_line.is_empty() { - result.push(current_line.join(", ")); + if let Some(cstate) = highest_cstate { + return Some(vec![format!("C-State: {}", cstate)]); } - - return Some(result); } } @@ -281,7 +288,13 @@ impl SystemCollector { let command = fields[10]; // Skip kernel threads (in brackets) and low CPU processes if !command.starts_with('[') && cpu_percent.parse::().unwrap_or(0.0) > 0.1 { - return Some(format!("{} {:.1}%", command, cpu_percent.parse::().unwrap_or(0.0))); + // Extract just the process name from the full path + let process_name = if let Some(last_slash) = command.rfind('/') { + &command[last_slash + 1..] + } else { + command + }; + return Some(format!("{} {:.1}%", process_name, cpu_percent.parse::().unwrap_or(0.0))); } } } @@ -308,7 +321,13 @@ impl SystemCollector { let command = fields[10]; // Skip kernel threads (in brackets) and low memory processes if !command.starts_with('[') && mem_percent.parse::().unwrap_or(0.0) > 0.1 { - return Some(format!("{} {:.1}%", command, mem_percent.parse::().unwrap_or(0.0))); + // Extract just the process name from the full path + let process_name = if let Some(last_slash) = command.rfind('/') { + &command[last_slash + 1..] + } else { + command + }; + return Some(format!("{} {:.1}%", process_name, mem_percent.parse::().unwrap_or(0.0))); } } } diff --git a/dashboard/src/ui/services.rs b/dashboard/src/ui/services.rs index f1bb278..f062254 100644 --- a/dashboard/src/ui/services.rs +++ b/dashboard/src/ui/services.rs @@ -109,12 +109,8 @@ fn render_metrics( // Add latency information for nginx sites if available let service_name_with_latency = if let Some(parent) = &svc.sub_service { if parent == "nginx" { - // Extract subdomain part for shorter display - let short_name = if let Some(dot_pos) = svc.name.find('.') { - &svc.name[..dot_pos] - } else { - &svc.name - }; + // Use full site name instead of truncating at first dot + let short_name = &svc.name; match &svc.latency_ms { Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+) diff --git a/dashboard/src/ui/system.rs b/dashboard/src/ui/system.rs index f71d04c..98b10ac 100644 --- a/dashboard/src/ui/system.rs +++ b/dashboard/src/ui/system.rs @@ -68,14 +68,10 @@ fn render_metrics( // Use agent-provided C-states and logged-in users as description let mut description_lines = Vec::new(); - // Add C-states with prefix on first line, indent subsequent lines + // Add C-state (now only highest C-state from agent) if let Some(cstates) = &summary.cpu_cstate { - for (i, cstate_line) in cstates.iter().enumerate() { - if i == 0 { - description_lines.push(format!("C-State: {}", cstate_line)); - } else { - description_lines.push(format!(" {}", cstate_line)); - } + for cstate_line in cstates.iter() { + description_lines.push(cstate_line.clone()); // Agent already includes "C-State:" prefix } } @@ -105,7 +101,7 @@ fn render_metrics( overall_status.clone(), description_lines, vec![ - format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0), + format_system_memory_value(summary.memory_used_mb, summary.memory_total_mb), format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15), format_optional_metric(summary.cpu_temp_c, "°C"), ], @@ -122,3 +118,22 @@ fn format_optional_metric(value: Option, unit: &str) -> String { } } +fn format_bytes(mb: f32) -> String { + if mb < 0.1 { + "<1MB".to_string() + } else if mb < 1.0 { + format!("{:.0}kB", mb * 1000.0) + } else if mb < 1000.0 { + format!("{:.0}MB", mb) + } else { + format!("{:.1}GB", mb / 1000.0) + } +} + +fn format_system_memory_value(used_mb: f32, total_mb: f32) -> String { + let used_value = format_bytes(used_mb); + let total_gb = total_mb / 1000.0; + // Format total as GB without decimals + format!("{} ({}GB)", used_value, total_gb as u32) +} +