Improve dashboard display and fix service issues
- Remove unreachable descriptions from failed nginx sites
- Show complete site URLs instead of truncating at first dot
- Implement service-specific disk quotas (docker: 4GB, immich: 4GB, others: 1-2GB)
- Truncate process names to show only executable name without full path
- Display only highest C-state instead of all C-states for cleaner output
- Format system RAM as xxxMB/GB (totalGB) to match services format
This commit is contained in:
parent
672c8bebc9
commit
efdd713f62
19
CLAUDE.md
19
CLAUDE.md
@ -332,11 +332,30 @@ rm /tmp/cm-maintenance
|
|||||||
4. Dashboard uses `status_level_from_agent_status()` for display
|
4. Dashboard uses `status_level_from_agent_status()` for display
|
||||||
5. Agent adds notification monitoring for status changes
|
5. Agent adds notification monitoring for status changes
|
||||||
|
|
||||||
|
**Testing & Building:**
|
||||||
|
- ALWAYS use `cargo build --workspace` to match NixOS build configuration
|
||||||
|
- Test with OpenSSL environment variables when building locally:
|
||||||
|
```bash
|
||||||
|
OPENSSL_DIR=/nix/store/.../openssl-dev \
|
||||||
|
OPENSSL_LIB_DIR=/nix/store/.../openssl/lib \
|
||||||
|
OPENSSL_INCLUDE_DIR=/nix/store/.../openssl-dev/include \
|
||||||
|
PKG_CONFIG_PATH=/nix/store/.../openssl-dev/lib/pkgconfig \
|
||||||
|
OPENSSL_NO_VENDOR=1 cargo build --workspace
|
||||||
|
```
|
||||||
|
- This prevents build failures that only appear in NixOS deployment
|
||||||
|
|
||||||
|
**Notification System:**
|
||||||
|
- Universal automatic detection of all `_status` fields across all collectors
|
||||||
|
- Sends emails from `hostname@cmtec.se` to `cm@cmtec.se` for any status changes
|
||||||
|
- Status stored in-memory: `HashMap<"component.metric", status>`
|
||||||
|
- Recovery emails sent when status changes from warning/critical → ok
|
||||||
|
|
||||||
**NEVER:**
|
**NEVER:**
|
||||||
- Add hardcoded thresholds to dashboard widgets
|
- Add hardcoded thresholds to dashboard widgets
|
||||||
- Calculate status in dashboard with different thresholds than agent
|
- Calculate status in dashboard with different thresholds than agent
|
||||||
- Use "ok" as default when agent status is missing (use "unknown")
|
- Use "ok" as default when agent status is missing (use "unknown")
|
||||||
- Calculate colors in widgets (TableBuilder's responsibility)
|
- Calculate colors in widgets (TableBuilder's responsibility)
|
||||||
|
- Use `cargo build` without `--workspace` for final testing
|
||||||
|
|
||||||
# Important Communication Guidelines
|
# Important Communication Guidelines
|
||||||
|
|
||||||
|
|||||||
@ -303,6 +303,9 @@ impl ServiceCollector {
|
|||||||
|
|
||||||
async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
|
async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
|
||||||
// Check systemd service properties for NixOS hardening-related disk restrictions
|
// Check systemd service properties for NixOS hardening-related disk restrictions
|
||||||
|
let mut private_tmp = false;
|
||||||
|
let mut protect_system = false;
|
||||||
|
|
||||||
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
|
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
|
||||||
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
|
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
|
||||||
.stdout(Stdio::piped())
|
.stdout(Stdio::piped())
|
||||||
@ -315,8 +318,6 @@ impl ServiceCollector {
|
|||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
// Parse systemd properties that might indicate disk restrictions
|
// Parse systemd properties that might indicate disk restrictions
|
||||||
let mut private_tmp = false;
|
|
||||||
let mut protect_system = false;
|
|
||||||
let mut readonly_paths = Vec::new();
|
let mut readonly_paths = Vec::new();
|
||||||
|
|
||||||
for line in stdout.lines() {
|
for line in stdout.lines() {
|
||||||
@ -328,39 +329,33 @@ impl ServiceCollector {
|
|||||||
readonly_paths.push(paths.to_string());
|
readonly_paths.push(paths.to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If service has significant restrictions, it might have implicit disk limits
|
|
||||||
// This is heuristic-based since systemd doesn't have direct disk quotas
|
|
||||||
if private_tmp && protect_system {
|
|
||||||
// Heavily sandboxed services might have practical disk limits
|
|
||||||
// Return a conservative estimate based on typical service needs
|
|
||||||
return Ok(1.0); // 1 GB as reasonable limit for sandboxed services
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for service-specific disk configurations in NixOS
|
// Check for service-specific disk configurations - use service-appropriate defaults
|
||||||
match service {
|
let service_quota = match service {
|
||||||
"docker" => {
|
"docker" => 4.0, // Docker containers need more space
|
||||||
// Docker might have storage driver limits in NixOS config
|
"gitea" => 1.0, // Gitea repositories, but database is external
|
||||||
if let Ok(limit) = self.get_docker_storage_quota().await {
|
"postgresql" | "postgres" => 1.0, // Database storage
|
||||||
return Ok(limit);
|
"mysql" | "mariadb" => 1.0, // Database storage
|
||||||
|
"immich-server" => 4.0, // Photo storage app needs more space
|
||||||
|
"unifi" => 2.0, // Network management with logs and configs
|
||||||
|
"vaultwarden" => 1.0, // Password manager
|
||||||
|
"gitea-runner-default" => 1.0, // CI/CD runner
|
||||||
|
"nginx" => 1.0, // Web server
|
||||||
|
"mosquitto" => 1.0, // MQTT broker
|
||||||
|
"redis-immich" => 1.0, // Redis cache
|
||||||
|
_ => {
|
||||||
|
// Default based on sandboxing - sandboxed services get smaller quotas
|
||||||
|
if private_tmp && protect_system {
|
||||||
|
1.0 // 1 GB for sandboxed services
|
||||||
|
} else {
|
||||||
|
2.0 // 2 GB for non-sandboxed services
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"postgresql" | "postgres" => {
|
};
|
||||||
// PostgreSQL might have tablespace or data directory limits
|
|
||||||
// Check for database-specific storage configuration
|
|
||||||
},
|
|
||||||
"mysql" | "mariadb" => {
|
|
||||||
// MySQL might have data directory size limits
|
|
||||||
},
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
// No quota found
|
Ok(service_quota)
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: format!("No disk quota found for service {}", service),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
|
async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
|
||||||
@ -1282,7 +1277,7 @@ impl Collector for ServiceCollector {
|
|||||||
let (site_status, site_description) = match (latency, is_healthy) {
|
let (site_status, site_description) = match (latency, is_healthy) {
|
||||||
(Some(_ms), true) => (ServiceStatus::Running, None),
|
(Some(_ms), true) => (ServiceStatus::Running, None),
|
||||||
(Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
|
(Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
|
||||||
(None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
|
(None, _) => (ServiceStatus::Stopped, None), // No description for unreachable sites
|
||||||
};
|
};
|
||||||
|
|
||||||
// Update counters based on site status
|
// Update counters based on site status
|
||||||
|
|||||||
@ -204,29 +204,36 @@ impl SystemCollector {
|
|||||||
order_a.cmp(&order_b)
|
order_a.cmp(&order_b)
|
||||||
});
|
});
|
||||||
|
|
||||||
// Format C-states as description lines (2 C-states per row)
|
// Find the highest C-state with significant usage (>= 0.1%)
|
||||||
let mut result = Vec::new();
|
let mut highest_cstate = None;
|
||||||
let mut current_line = Vec::new();
|
let mut highest_order = -1;
|
||||||
|
|
||||||
for (name, time) in cstate_times {
|
for (name, time) in &cstate_times {
|
||||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
let percent = (*time as f32 / total_time as f32) * 100.0;
|
||||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
if percent >= 0.1 { // Only consider states with at least 0.1% time
|
||||||
current_line.push(format!("{}: {:.1}%", name, percent));
|
let order = match name.as_str() {
|
||||||
|
"POLL" => 0,
|
||||||
|
"C1" => 1,
|
||||||
|
"C1E" => 2,
|
||||||
|
"C3" => 3,
|
||||||
|
"C6" => 4,
|
||||||
|
"C7s" => 5,
|
||||||
|
"C8" => 6,
|
||||||
|
"C9" => 7,
|
||||||
|
"C10" => 8,
|
||||||
|
_ => -1,
|
||||||
|
};
|
||||||
|
|
||||||
// Split into rows when we have 2 items
|
if order > highest_order {
|
||||||
if current_line.len() == 2 {
|
highest_order = order;
|
||||||
result.push(current_line.join(", "));
|
highest_cstate = Some(format!("{}: {:.1}%", name, percent));
|
||||||
current_line.clear();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add remaining items as final line
|
if let Some(cstate) = highest_cstate {
|
||||||
if !current_line.is_empty() {
|
return Some(vec![format!("C-State: {}", cstate)]);
|
||||||
result.push(current_line.join(", "));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return Some(result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,7 +288,13 @@ impl SystemCollector {
|
|||||||
let command = fields[10];
|
let command = fields[10];
|
||||||
// Skip kernel threads (in brackets) and low CPU processes
|
// Skip kernel threads (in brackets) and low CPU processes
|
||||||
if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
||||||
return Some(format!("{} {:.1}%", command, cpu_percent.parse::<f32>().unwrap_or(0.0)));
|
// Extract just the process name from the full path
|
||||||
|
let process_name = if let Some(last_slash) = command.rfind('/') {
|
||||||
|
&command[last_slash + 1..]
|
||||||
|
} else {
|
||||||
|
command
|
||||||
|
};
|
||||||
|
return Some(format!("{} {:.1}%", process_name, cpu_percent.parse::<f32>().unwrap_or(0.0)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -308,7 +321,13 @@ impl SystemCollector {
|
|||||||
let command = fields[10];
|
let command = fields[10];
|
||||||
// Skip kernel threads (in brackets) and low memory processes
|
// Skip kernel threads (in brackets) and low memory processes
|
||||||
if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
||||||
return Some(format!("{} {:.1}%", command, mem_percent.parse::<f32>().unwrap_or(0.0)));
|
// Extract just the process name from the full path
|
||||||
|
let process_name = if let Some(last_slash) = command.rfind('/') {
|
||||||
|
&command[last_slash + 1..]
|
||||||
|
} else {
|
||||||
|
command
|
||||||
|
};
|
||||||
|
return Some(format!("{} {:.1}%", process_name, mem_percent.parse::<f32>().unwrap_or(0.0)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -109,12 +109,8 @@ fn render_metrics(
|
|||||||
// Add latency information for nginx sites if available
|
// Add latency information for nginx sites if available
|
||||||
let service_name_with_latency = if let Some(parent) = &svc.sub_service {
|
let service_name_with_latency = if let Some(parent) = &svc.sub_service {
|
||||||
if parent == "nginx" {
|
if parent == "nginx" {
|
||||||
// Extract subdomain part for shorter display
|
// Use full site name instead of truncating at first dot
|
||||||
let short_name = if let Some(dot_pos) = svc.name.find('.') {
|
let short_name = &svc.name;
|
||||||
&svc.name[..dot_pos]
|
|
||||||
} else {
|
|
||||||
&svc.name
|
|
||||||
};
|
|
||||||
|
|
||||||
match &svc.latency_ms {
|
match &svc.latency_ms {
|
||||||
Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)
|
Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)
|
||||||
|
|||||||
@ -68,14 +68,10 @@ fn render_metrics(
|
|||||||
// Use agent-provided C-states and logged-in users as description
|
// Use agent-provided C-states and logged-in users as description
|
||||||
let mut description_lines = Vec::new();
|
let mut description_lines = Vec::new();
|
||||||
|
|
||||||
// Add C-states with prefix on first line, indent subsequent lines
|
// Add C-state (now only highest C-state from agent)
|
||||||
if let Some(cstates) = &summary.cpu_cstate {
|
if let Some(cstates) = &summary.cpu_cstate {
|
||||||
for (i, cstate_line) in cstates.iter().enumerate() {
|
for cstate_line in cstates.iter() {
|
||||||
if i == 0 {
|
description_lines.push(cstate_line.clone()); // Agent already includes "C-State:" prefix
|
||||||
description_lines.push(format!("C-State: {}", cstate_line));
|
|
||||||
} else {
|
|
||||||
description_lines.push(format!(" {}", cstate_line));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,7 +101,7 @@ fn render_metrics(
|
|||||||
overall_status.clone(),
|
overall_status.clone(),
|
||||||
description_lines,
|
description_lines,
|
||||||
vec![
|
vec![
|
||||||
format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0),
|
format_system_memory_value(summary.memory_used_mb, summary.memory_total_mb),
|
||||||
format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
|
format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
|
||||||
format_optional_metric(summary.cpu_temp_c, "°C"),
|
format_optional_metric(summary.cpu_temp_c, "°C"),
|
||||||
],
|
],
|
||||||
@ -122,3 +118,22 @@ fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Formats a size given in megabytes as a short human-readable string.
///
/// Units are decimal (1 MB = 1000 kB, 1 GB = 1000 MB):
///
/// * below 0.1 MB        -> `"<1MB"`
/// * 0.1 MB up to 1 MB   -> whole kilobytes, e.g. `"500kB"`
/// * 1 MB up to 1000 MB  -> whole megabytes, e.g. `"512MB"`
/// * 1000 MB and above   -> gigabytes with one decimal, e.g. `"1.5GB"`
///
/// The unit is chosen from the *rounded* value, so an input just under a
/// boundary is promoted to the next unit (`999.7` -> `"1.0GB"`) instead of
/// being rendered as `"1000MB"` / `"1000kB"`.
fn format_bytes(mb: f32) -> String {
    if mb < 0.1 {
        return "<1MB".to_string();
    }

    if mb < 1.0 {
        let kb = mb * 1000.0;
        // Anything that would print as "1000kB" is really one megabyte.
        return if kb.round() >= 1000.0 {
            "1MB".to_string()
        } else {
            format!("{:.0}kB", kb)
        };
    }

    // Compare the rounded value so 999.5..1000.0 falls through to the GB arm.
    if mb.round() < 1000.0 {
        format!("{:.0}MB", mb)
    } else {
        format!("{:.1}GB", mb / 1000.0)
    }
}
|
||||||
|
|
||||||
|
fn format_system_memory_value(used_mb: f32, total_mb: f32) -> String {
|
||||||
|
let used_value = format_bytes(used_mb);
|
||||||
|
let total_gb = total_mb / 1000.0;
|
||||||
|
// Format total as GB without decimals
|
||||||
|
format!("{} ({}GB)", used_value, total_gb as u32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user