Improve dashboard display and fix service issues

- Remove the "unreachable" description from failed nginx sites
- Show complete site URLs instead of truncating at first dot
- Implement service-specific disk quotas (docker: 4GB, immich: 4GB, others: 1-2GB)
- Truncate process names to show only executable name without full path
- Display only highest C-state instead of all C-states for cleaner output
- Format system RAM as xxxMB/GB (totalGB) to match services format
This commit is contained in:
Christoffer Martinsson 2025-10-15 09:36:03 +02:00
parent 672c8bebc9
commit efdd713f62
5 changed files with 107 additions and 63 deletions

View File

@ -332,11 +332,30 @@ rm /tmp/cm-maintenance
4. Dashboard uses `status_level_from_agent_status()` for display
5. Agent adds notification monitoring for status changes
**Testing & Building:**
- ALWAYS use `cargo build --workspace` to match NixOS build configuration
- Test with OpenSSL environment variables when building locally:
```bash
OPENSSL_DIR=/nix/store/.../openssl-dev \
OPENSSL_LIB_DIR=/nix/store/.../openssl/lib \
OPENSSL_INCLUDE_DIR=/nix/store/.../openssl-dev/include \
PKG_CONFIG_PATH=/nix/store/.../openssl-dev/lib/pkgconfig \
OPENSSL_NO_VENDOR=1 cargo build --workspace
```
- This prevents build failures that only appear in NixOS deployment
**Notification System:**
- Universal automatic detection of all `_status` fields across all collectors
- Sends emails from `hostname@cmtec.se` to `cm@cmtec.se` for any status changes
- Status stored in-memory: `HashMap<"component.metric", status>`
- Recovery emails sent when status changes from warning/critical → ok
**NEVER:**
- Add hardcoded thresholds to dashboard widgets
- Calculate status in dashboard with different thresholds than agent
- Use "ok" as default when agent status is missing (use "unknown")
- Calculate colors in widgets (TableBuilder's responsibility)
- Use `cargo build` without `--workspace` for final testing
# Important Communication Guidelines

View File

@ -303,6 +303,9 @@ impl ServiceCollector {
async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
// Check systemd service properties for NixOS hardening-related disk restrictions
let mut private_tmp = false;
let mut protect_system = false;
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
.stdout(Stdio::piped())
@ -315,8 +318,6 @@ impl ServiceCollector {
let stdout = String::from_utf8_lossy(&output.stdout);
// Parse systemd properties that might indicate disk restrictions
let mut private_tmp = false;
let mut protect_system = false;
let mut readonly_paths = Vec::new();
for line in stdout.lines() {
@ -328,39 +329,33 @@ impl ServiceCollector {
readonly_paths.push(paths.to_string());
}
}
// If service has significant restrictions, it might have implicit disk limits
// This is heuristic-based since systemd doesn't have direct disk quotas
if private_tmp && protect_system {
// Heavily sandboxed services might have practical disk limits
// Return a conservative estimate based on typical service needs
return Ok(1.0); // 1 GB as reasonable limit for sandboxed services
}
}
}
// Check for service-specific disk configurations in NixOS
match service {
"docker" => {
// Docker might have storage driver limits in NixOS config
if let Ok(limit) = self.get_docker_storage_quota().await {
return Ok(limit);
// Check for service-specific disk configurations - use service-appropriate defaults
let service_quota = match service {
"docker" => 4.0, // Docker containers need more space
"gitea" => 1.0, // Gitea repositories, but database is external
"postgresql" | "postgres" => 1.0, // Database storage
"mysql" | "mariadb" => 1.0, // Database storage
"immich-server" => 4.0, // Photo storage app needs more space
"unifi" => 2.0, // Network management with logs and configs
"vaultwarden" => 1.0, // Password manager
"gitea-runner-default" => 1.0, // CI/CD runner
"nginx" => 1.0, // Web server
"mosquitto" => 1.0, // MQTT broker
"redis-immich" => 1.0, // Redis cache
_ => {
// Default based on sandboxing - sandboxed services get smaller quotas
if private_tmp && protect_system {
1.0 // 1 GB for sandboxed services
} else {
2.0 // 2 GB for non-sandboxed services
}
},
"postgresql" | "postgres" => {
// PostgreSQL might have tablespace or data directory limits
// Check for database-specific storage configuration
},
"mysql" | "mariadb" => {
// MySQL might have data directory size limits
},
_ => {}
}
}
};
// No quota found
Err(CollectorError::ParseError {
message: format!("No disk quota found for service {}", service),
})
Ok(service_quota)
}
async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
@ -1282,7 +1277,7 @@ impl Collector for ServiceCollector {
let (site_status, site_description) = match (latency, is_healthy) {
(Some(_ms), true) => (ServiceStatus::Running, None),
(Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
(None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
(None, _) => (ServiceStatus::Stopped, None), // No description for unreachable sites
};
// Update counters based on site status

View File

@ -204,29 +204,36 @@ impl SystemCollector {
order_a.cmp(&order_b)
});
// Format C-states as description lines (2 C-states per row)
let mut result = Vec::new();
let mut current_line = Vec::new();
// Find the highest C-state with significant usage (>= 0.1%)
let mut highest_cstate = None;
let mut highest_order = -1;
for (name, time) in cstate_times {
let percent = (time as f32 / total_time as f32) * 100.0;
if percent >= 0.1 { // Only show states with at least 0.1% time
current_line.push(format!("{}: {:.1}%", name, percent));
for (name, time) in &cstate_times {
let percent = (*time as f32 / total_time as f32) * 100.0;
if percent >= 0.1 { // Only consider states with at least 0.1% time
let order = match name.as_str() {
"POLL" => 0,
"C1" => 1,
"C1E" => 2,
"C3" => 3,
"C6" => 4,
"C7s" => 5,
"C8" => 6,
"C9" => 7,
"C10" => 8,
_ => -1,
};
// Split into rows when we have 2 items
if current_line.len() == 2 {
result.push(current_line.join(", "));
current_line.clear();
if order > highest_order {
highest_order = order;
highest_cstate = Some(format!("{}: {:.1}%", name, percent));
}
}
}
// Add remaining items as final line
if !current_line.is_empty() {
result.push(current_line.join(", "));
if let Some(cstate) = highest_cstate {
return Some(vec![format!("C-State: {}", cstate)]);
}
return Some(result);
}
}
@ -281,7 +288,13 @@ impl SystemCollector {
let command = fields[10];
// Skip kernel threads (in brackets) and low CPU processes
if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
return Some(format!("{} {:.1}%", command, cpu_percent.parse::<f32>().unwrap_or(0.0)));
// Extract just the process name from the full path
let process_name = if let Some(last_slash) = command.rfind('/') {
&command[last_slash + 1..]
} else {
command
};
return Some(format!("{} {:.1}%", process_name, cpu_percent.parse::<f32>().unwrap_or(0.0)));
}
}
}
@ -308,7 +321,13 @@ impl SystemCollector {
let command = fields[10];
// Skip kernel threads (in brackets) and low memory processes
if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
return Some(format!("{} {:.1}%", command, mem_percent.parse::<f32>().unwrap_or(0.0)));
// Extract just the process name from the full path
let process_name = if let Some(last_slash) = command.rfind('/') {
&command[last_slash + 1..]
} else {
command
};
return Some(format!("{} {:.1}%", process_name, mem_percent.parse::<f32>().unwrap_or(0.0)));
}
}
}

View File

@ -109,12 +109,8 @@ fn render_metrics(
// Add latency information for nginx sites if available
let service_name_with_latency = if let Some(parent) = &svc.sub_service {
if parent == "nginx" {
// Extract subdomain part for shorter display
let short_name = if let Some(dot_pos) = svc.name.find('.') {
&svc.name[..dot_pos]
} else {
&svc.name
};
// Use full site name instead of truncating at first dot
let short_name = &svc.name;
match &svc.latency_ms {
Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)

View File

@ -68,14 +68,10 @@ fn render_metrics(
// Use agent-provided C-states and logged-in users as description
let mut description_lines = Vec::new();
// Add C-states with prefix on first line, indent subsequent lines
// Add C-state (now only highest C-state from agent)
if let Some(cstates) = &summary.cpu_cstate {
for (i, cstate_line) in cstates.iter().enumerate() {
if i == 0 {
description_lines.push(format!("C-State: {}", cstate_line));
} else {
description_lines.push(format!(" {}", cstate_line));
}
for cstate_line in cstates.iter() {
description_lines.push(cstate_line.clone()); // Agent already includes "C-State:" prefix
}
}
@ -105,7 +101,7 @@ fn render_metrics(
overall_status.clone(),
description_lines,
vec![
format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0),
format_system_memory_value(summary.memory_used_mb, summary.memory_total_mb),
format!("{:.2}{:.2}{:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
format_optional_metric(summary.cpu_temp_c, "°C"),
],
@ -122,3 +118,22 @@ fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
}
}
/// Renders a size given in megabytes as a short human-readable string.
///
/// Tiers (decimal, factor 1000):
/// - below 0.1 MB  -> the fixed label `<1MB`
/// - below 1 MB    -> whole kilobytes, e.g. `500kB`
/// - below 1000 MB -> whole megabytes, e.g. `512MB`
/// - otherwise     -> gigabytes with one decimal, e.g. `1.5GB`
fn format_bytes(mb: f32) -> String {
    match mb {
        tiny if tiny < 0.1 => String::from("<1MB"),
        sub_mb if sub_mb < 1.0 => format!("{:.0}kB", sub_mb * 1000.0),
        sub_gb if sub_gb < 1000.0 => format!("{:.0}MB", sub_gb),
        // Everything else (including values that fail all comparisons) is shown in GB.
        large => format!("{:.1}GB", large / 1000.0),
    }
}
/// Formats system memory usage as `<used> (<total>GB)`.
///
/// `used_mb` is rendered through `format_bytes`. `total_mb` is divided by 1000
/// and shown as a whole number of GB.
///
/// The total is rounded to the nearest whole GB instead of being truncated, so
/// a total of 15900 MB is displayed as `16GB` rather than `15GB`.
fn format_system_memory_value(used_mb: f32, total_mb: f32) -> String {
    let used_value = format_bytes(used_mb);
    // Round before casting: a bare `as u32` drops the fractional part, which
    // under-reports the total by up to almost 1 GB.
    let total_gb = (total_mb / 1000.0).round() as u32;
    format!("{} ({}GB)", used_value, total_gb)
}