Improve dashboard display and fix service issues
- Remove the "unreachable" description from failed nginx sites
- Show complete site URLs instead of truncating at the first dot
- Implement service-specific disk quotas (docker: 4GB, immich: 4GB, others: 1-2GB)
- Truncate process names to show only the executable name without the full path
- Display only the highest C-state instead of all C-states for cleaner output
- Format system RAM as xxxMB/GB (totalGB) to match the services format
This commit is contained in:
parent
672c8bebc9
commit
efdd713f62
19
CLAUDE.md
19
CLAUDE.md
@ -332,11 +332,30 @@ rm /tmp/cm-maintenance
|
||||
4. Dashboard uses `status_level_from_agent_status()` for display
|
||||
5. Agent adds notification monitoring for status changes
|
||||
|
||||
**Testing & Building:**
|
||||
- ALWAYS use `cargo build --workspace` to match NixOS build configuration
|
||||
- Test with OpenSSL environment variables when building locally:
|
||||
```bash
|
||||
OPENSSL_DIR=/nix/store/.../openssl-dev \
|
||||
OPENSSL_LIB_DIR=/nix/store/.../openssl/lib \
|
||||
OPENSSL_INCLUDE_DIR=/nix/store/.../openssl-dev/include \
|
||||
PKG_CONFIG_PATH=/nix/store/.../openssl-dev/lib/pkgconfig \
|
||||
OPENSSL_NO_VENDOR=1 cargo build --workspace
|
||||
```
|
||||
- This prevents build failures that only appear in NixOS deployment
|
||||
|
||||
**Notification System:**
|
||||
- Universal automatic detection of all `_status` fields across all collectors
|
||||
- Sends emails from `hostname@cmtec.se` to `cm@cmtec.se` for any status changes
|
||||
- Status stored in-memory: `HashMap<"component.metric", status>`
|
||||
- Recovery emails sent when status changes from warning/critical → ok
|
||||
|
||||
**NEVER:**
|
||||
- Add hardcoded thresholds to dashboard widgets
|
||||
- Calculate status in dashboard with different thresholds than agent
|
||||
- Use "ok" as default when agent status is missing (use "unknown")
|
||||
- Calculate colors in widgets (TableBuilder's responsibility)
|
||||
- Use `cargo build` without `--workspace` for final testing
|
||||
|
||||
# Important Communication Guidelines
|
||||
|
||||
|
||||
@ -303,6 +303,9 @@ impl ServiceCollector {
|
||||
|
||||
async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
|
||||
// Check systemd service properties for NixOS hardening-related disk restrictions
|
||||
let mut private_tmp = false;
|
||||
let mut protect_system = false;
|
||||
|
||||
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
|
||||
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
|
||||
.stdout(Stdio::piped())
|
||||
@ -315,8 +318,6 @@ impl ServiceCollector {
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
// Parse systemd properties that might indicate disk restrictions
|
||||
let mut private_tmp = false;
|
||||
let mut protect_system = false;
|
||||
let mut readonly_paths = Vec::new();
|
||||
|
||||
for line in stdout.lines() {
|
||||
@ -328,39 +329,33 @@ impl ServiceCollector {
|
||||
readonly_paths.push(paths.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// If service has significant restrictions, it might have implicit disk limits
|
||||
// This is heuristic-based since systemd doesn't have direct disk quotas
|
||||
if private_tmp && protect_system {
|
||||
// Heavily sandboxed services might have practical disk limits
|
||||
// Return a conservative estimate based on typical service needs
|
||||
return Ok(1.0); // 1 GB as reasonable limit for sandboxed services
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for service-specific disk configurations in NixOS
|
||||
match service {
|
||||
"docker" => {
|
||||
// Docker might have storage driver limits in NixOS config
|
||||
if let Ok(limit) = self.get_docker_storage_quota().await {
|
||||
return Ok(limit);
|
||||
// Check for service-specific disk configurations - use service-appropriate defaults
|
||||
let service_quota = match service {
|
||||
"docker" => 4.0, // Docker containers need more space
|
||||
"gitea" => 1.0, // Gitea repositories, but database is external
|
||||
"postgresql" | "postgres" => 1.0, // Database storage
|
||||
"mysql" | "mariadb" => 1.0, // Database storage
|
||||
"immich-server" => 4.0, // Photo storage app needs more space
|
||||
"unifi" => 2.0, // Network management with logs and configs
|
||||
"vaultwarden" => 1.0, // Password manager
|
||||
"gitea-runner-default" => 1.0, // CI/CD runner
|
||||
"nginx" => 1.0, // Web server
|
||||
"mosquitto" => 1.0, // MQTT broker
|
||||
"redis-immich" => 1.0, // Redis cache
|
||||
_ => {
|
||||
// Default based on sandboxing - sandboxed services get smaller quotas
|
||||
if private_tmp && protect_system {
|
||||
1.0 // 1 GB for sandboxed services
|
||||
} else {
|
||||
2.0 // 2 GB for non-sandboxed services
|
||||
}
|
||||
},
|
||||
"postgresql" | "postgres" => {
|
||||
// PostgreSQL might have tablespace or data directory limits
|
||||
// Check for database-specific storage configuration
|
||||
},
|
||||
"mysql" | "mariadb" => {
|
||||
// MySQL might have data directory size limits
|
||||
},
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// No quota found
|
||||
Err(CollectorError::ParseError {
|
||||
message: format!("No disk quota found for service {}", service),
|
||||
})
|
||||
Ok(service_quota)
|
||||
}
|
||||
|
||||
async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
|
||||
@ -1282,7 +1277,7 @@ impl Collector for ServiceCollector {
|
||||
let (site_status, site_description) = match (latency, is_healthy) {
|
||||
(Some(_ms), true) => (ServiceStatus::Running, None),
|
||||
(Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
|
||||
(None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
|
||||
(None, _) => (ServiceStatus::Stopped, None), // No description for unreachable sites
|
||||
};
|
||||
|
||||
// Update counters based on site status
|
||||
|
||||
@ -204,29 +204,36 @@ impl SystemCollector {
|
||||
order_a.cmp(&order_b)
|
||||
});
|
||||
|
||||
// Format C-states as description lines (2 C-states per row)
|
||||
let mut result = Vec::new();
|
||||
let mut current_line = Vec::new();
|
||||
// Find the highest C-state with significant usage (>= 0.1%)
|
||||
let mut highest_cstate = None;
|
||||
let mut highest_order = -1;
|
||||
|
||||
for (name, time) in cstate_times {
|
||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
||||
current_line.push(format!("{}: {:.1}%", name, percent));
|
||||
for (name, time) in &cstate_times {
|
||||
let percent = (*time as f32 / total_time as f32) * 100.0;
|
||||
if percent >= 0.1 { // Only consider states with at least 0.1% time
|
||||
let order = match name.as_str() {
|
||||
"POLL" => 0,
|
||||
"C1" => 1,
|
||||
"C1E" => 2,
|
||||
"C3" => 3,
|
||||
"C6" => 4,
|
||||
"C7s" => 5,
|
||||
"C8" => 6,
|
||||
"C9" => 7,
|
||||
"C10" => 8,
|
||||
_ => -1,
|
||||
};
|
||||
|
||||
// Split into rows when we have 2 items
|
||||
if current_line.len() == 2 {
|
||||
result.push(current_line.join(", "));
|
||||
current_line.clear();
|
||||
if order > highest_order {
|
||||
highest_order = order;
|
||||
highest_cstate = Some(format!("{}: {:.1}%", name, percent));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add remaining items as final line
|
||||
if !current_line.is_empty() {
|
||||
result.push(current_line.join(", "));
|
||||
if let Some(cstate) = highest_cstate {
|
||||
return Some(vec![format!("C-State: {}", cstate)]);
|
||||
}
|
||||
|
||||
return Some(result);
|
||||
}
|
||||
}
|
||||
|
||||
@ -281,7 +288,13 @@ impl SystemCollector {
|
||||
let command = fields[10];
|
||||
// Skip kernel threads (in brackets) and low CPU processes
|
||||
if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
||||
return Some(format!("{} {:.1}%", command, cpu_percent.parse::<f32>().unwrap_or(0.0)));
|
||||
// Extract just the process name from the full path
|
||||
let process_name = if let Some(last_slash) = command.rfind('/') {
|
||||
&command[last_slash + 1..]
|
||||
} else {
|
||||
command
|
||||
};
|
||||
return Some(format!("{} {:.1}%", process_name, cpu_percent.parse::<f32>().unwrap_or(0.0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -308,7 +321,13 @@ impl SystemCollector {
|
||||
let command = fields[10];
|
||||
// Skip kernel threads (in brackets) and low memory processes
|
||||
if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
||||
return Some(format!("{} {:.1}%", command, mem_percent.parse::<f32>().unwrap_or(0.0)));
|
||||
// Extract just the process name from the full path
|
||||
let process_name = if let Some(last_slash) = command.rfind('/') {
|
||||
&command[last_slash + 1..]
|
||||
} else {
|
||||
command
|
||||
};
|
||||
return Some(format!("{} {:.1}%", process_name, mem_percent.parse::<f32>().unwrap_or(0.0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -109,12 +109,8 @@ fn render_metrics(
|
||||
// Add latency information for nginx sites if available
|
||||
let service_name_with_latency = if let Some(parent) = &svc.sub_service {
|
||||
if parent == "nginx" {
|
||||
// Extract subdomain part for shorter display
|
||||
let short_name = if let Some(dot_pos) = svc.name.find('.') {
|
||||
&svc.name[..dot_pos]
|
||||
} else {
|
||||
&svc.name
|
||||
};
|
||||
// Use full site name instead of truncating at first dot
|
||||
let short_name = &svc.name;
|
||||
|
||||
match &svc.latency_ms {
|
||||
Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)
|
||||
|
||||
@ -68,14 +68,10 @@ fn render_metrics(
|
||||
// Use agent-provided C-states and logged-in users as description
|
||||
let mut description_lines = Vec::new();
|
||||
|
||||
// Add C-states with prefix on first line, indent subsequent lines
|
||||
// Add C-state (now only highest C-state from agent)
|
||||
if let Some(cstates) = &summary.cpu_cstate {
|
||||
for (i, cstate_line) in cstates.iter().enumerate() {
|
||||
if i == 0 {
|
||||
description_lines.push(format!("C-State: {}", cstate_line));
|
||||
} else {
|
||||
description_lines.push(format!(" {}", cstate_line));
|
||||
}
|
||||
for cstate_line in cstates.iter() {
|
||||
description_lines.push(cstate_line.clone()); // Agent already includes "C-State:" prefix
|
||||
}
|
||||
}
|
||||
|
||||
@ -105,7 +101,7 @@ fn render_metrics(
|
||||
overall_status.clone(),
|
||||
description_lines,
|
||||
vec![
|
||||
format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0),
|
||||
format_system_memory_value(summary.memory_used_mb, summary.memory_total_mb),
|
||||
format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
|
||||
format_optional_metric(summary.cpu_temp_c, "°C"),
|
||||
],
|
||||
@ -122,3 +118,22 @@ fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
/// Formats a size given in megabytes as a short human-readable string.
///
/// Decimal units are used (1 MB = 1000 kB, 1 GB = 1000 MB):
/// - below 0.1 MB (including negative input) the result is the literal `"<1MB"`;
/// - below 1 MB the value is shown in whole kilobytes, e.g. `"500kB"`;
/// - below 1000 MB the value is shown in whole megabytes, e.g. `"512MB"`;
/// - otherwise it is shown in gigabytes with one decimal, e.g. `"1.5GB"`.
///
/// The unit is chosen on the value as it will be *displayed*, so a number that
/// would round up to 1000 of a unit moves to the next unit instead of
/// printing `"1000kB"` or `"1000MB"`.
fn format_bytes(mb: f32) -> String {
    // Anything at or above this rounds to 1000 with `{:.0}` and must roll over
    // to the next larger unit.
    const ROLLOVER: f32 = 999.5;

    if mb < 0.1 {
        return "<1MB".to_string();
    }

    let kb = mb * 1000.0;
    if kb < ROLLOVER {
        return format!("{:.0}kB", kb);
    }

    if mb < ROLLOVER {
        return format!("{:.0}MB", mb);
    }

    // NaN fails every comparison above and ends up here, as in the original.
    format!("{:.1}GB", mb / 1000.0)
}
|
||||
|
||||
fn format_system_memory_value(used_mb: f32, total_mb: f32) -> String {
|
||||
let used_value = format_bytes(used_mb);
|
||||
let total_gb = total_mb / 1000.0;
|
||||
// Format total as GB without decimals
|
||||
format!("{} ({}GB)", used_value, total_gb as u32)
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user