Fix agent hang by reverting service discovery to systemctl
All checks were successful
Build and Release / build-and-release (push) Successful in 1m31s

The D-Bus ListUnits call in discover_services_internal() was causing
the agent to hang on startup.

**Root cause:**
- D-Bus ListUnits call, whose reply was deserialized into a 10-field tuple, hung indefinitely
- Agent never completed first collection cycle
- No collector output in logs

**Fix:**
- Revert discover_services_internal() to use systemctl list-units/list-unit-files
- Keep D-Bus-based property queries (WorkingDirectory, MemoryCurrent, ExecStart)
- Hybrid approach: systemctl for discovery, D-Bus for individual queries

**External commands still used:**
- systemctl list-units and systemctl list-unit-files (service discovery, each run under a 3-second `timeout`)
- smartctl (SMART data)
- sudo du (directory sizes)
- nginx -T (config fallback)

Version bump: 0.1.198 → 0.1.199
This commit is contained in:
Christoffer Martinsson 2025-11-28 11:57:31 +01:00
parent 7ad149bbe4
commit eab3f17428
5 changed files with 61 additions and 35 deletions

6
Cargo.lock generated
View File

@ -493,7 +493,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]]
name = "cm-dashboard"
version = "0.1.198"
version = "0.1.199"
dependencies = [
"anyhow",
"chrono",
@ -515,7 +515,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-agent"
version = "0.1.198"
version = "0.1.199"
dependencies = [
"anyhow",
"async-trait",
@ -545,7 +545,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-shared"
version = "0.1.198"
version = "0.1.199"
dependencies = [
"chrono",
"serde",

View File

@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-agent"
version = "0.1.198"
version = "0.1.199"
edition = "2021"
[dependencies]

View File

@ -263,45 +263,71 @@ impl SystemdCollector {
state.nginx_site_metrics.clone()
}
/// Auto-discover interesting services to monitor using D-Bus
/// Auto-discover interesting services to monitor using systemctl
async fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
// Connect to system D-Bus
let connection = Connection::system().await?;
// First: Get all service unit files (with 3 second timeout)
let unit_files_output = Command::new("timeout")
.args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"])
.output()?;
// Get systemd manager proxy
let proxy = zbus::Proxy::new(
&connection,
"org.freedesktop.systemd1",
"/org/freedesktop/systemd1",
"org.freedesktop.systemd1.Manager",
).await?;
if !unit_files_output.status.success() {
return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
}
// List all units via D-Bus
let units: Vec<(String, String, String, String, String, String, zbus::zvariant::OwnedObjectPath, u32, String, zbus::zvariant::OwnedObjectPath)> =
proxy.call("ListUnits", &()).await?;
// Second: Get runtime status of all units (with 3 second timeout)
let units_status_output = Command::new("timeout")
.args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"])
.output()?;
if !units_status_output.status.success() {
return Err(anyhow::anyhow!("systemctl list-units command failed"));
}
let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
let units_status_str = String::from_utf8(units_status_output.stdout)?;
let mut services = Vec::new();
let excluded_services = &self.config.excluded_services;
let service_name_filters = &self.config.service_name_filters;
// Parse all service unit files
let mut all_service_names = std::collections::HashSet::new();
let mut service_status_cache = std::collections::HashMap::new();
// Parse D-Bus response for services only
for unit in units {
let (unit_name, _description, load_state, active_state, sub_state, _followed, _unit_path, _job_id, _job_type, _job_path) = unit;
if unit_name.ends_with(".service") {
let service_name = unit_name.trim_end_matches(".service");
for line in unit_files_str.lines() {
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.len() >= 2 && fields[0].ends_with(".service") {
let service_name = fields[0].trim_end_matches(".service");
all_service_names.insert(service_name.to_string());
}
}
service_status_cache.insert(service_name.to_string(), ServiceStatusInfo {
load_state: load_state.clone(),
active_state: active_state.clone(),
sub_state: sub_state.clone(),
// Parse runtime status for all units
let mut status_cache = std::collections::HashMap::new();
for line in units_status_str.lines() {
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.len() >= 4 && fields[0].ends_with(".service") {
let service_name = fields[0].trim_end_matches(".service");
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
load_state,
active_state,
sub_state,
});
}
}
let mut services = Vec::new();
let excluded_services = &self.config.excluded_services;
let service_name_filters = &self.config.service_name_filters;
// For services found in unit files but not in runtime status, set default inactive status
for service_name in &all_service_names {
if !status_cache.contains_key(service_name) {
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
load_state: "not-loaded".to_string(),
active_state: "inactive".to_string(),
sub_state: "dead".to_string(),
});
}
}
// Process all discovered services and apply filters
for service_name in &all_service_names {
@ -327,7 +353,7 @@ impl SystemdCollector {
}
}
Ok((services, service_status_cache))
Ok((services, status_cache))
}
/// Get service status from D-Bus cache

View File

@ -1,6 +1,6 @@
[package]
name = "cm-dashboard"
version = "0.1.198"
version = "0.1.199"
edition = "2021"
[dependencies]

View File

@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-shared"
version = "0.1.198"
version = "0.1.199"
edition = "2021"
[dependencies]