diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index ee17116..4ff4ac0 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -20,25 +20,30 @@ pub struct SystemdCollector { /// Internal state for service caching #[derive(Debug)] struct ServiceCacheState { - /// Interesting services to monitor (cached after discovery) monitored_services: Vec, - /// Last time services were discovered + service_status_cache: std::collections::HashMap, last_discovery_time: Option, - /// How often to rediscover services (5 minutes) discovery_interval_seconds: u64, - /// Cached nginx site latency metrics nginx_site_metrics: Vec, - /// Last time nginx sites were checked last_nginx_check_time: Option, - /// How often to check nginx site latency (30 seconds) nginx_check_interval_seconds: u64, } +/// Cached service status information from systemctl list-units +#[derive(Debug, Clone)] +struct ServiceStatusInfo { + load_state: String, + active_state: String, + sub_state: String, + description: String, +} + impl SystemdCollector { pub fn new(config: SystemdConfig) -> Self { Self { state: RwLock::new(ServiceCacheState { monitored_services: Vec::new(), + service_status_cache: std::collections::HashMap::new(), last_discovery_time: None, discovery_interval_seconds: 300, // 5 minutes nginx_site_metrics: Vec::new(), @@ -113,17 +118,8 @@ impl SystemdCollector { state.nginx_site_metrics.clone() } - /// Auto-discover interesting services to monitor + /// Auto-discover interesting services to monitor and cache their status fn discover_services(&self) -> Result> { - // First get all unit files (includes inactive services) - let unit_files_output = Command::new("systemctl") - .arg("list-unit-files") - .arg("--type=service") - .arg("--no-pager") - .arg("--plain") - .output()?; - - // Then get all loaded units (includes running/failed services) let units_output = Command::new("systemctl") .arg("list-units") .arg("--type=service") @@ -132,106 +128,41 @@ impl SystemdCollector { .arg("--plain") .output()?; - // Use configured user mapping instead of hardcoded hostname logic - let target_user = &self.config.host_user_mapping; - - // Also get user unit files (user-level services) for target user - let user_unit_files_output = Command::new("sudo") - .arg("-u") - .arg(target_user) - .arg("systemctl") - .arg("--user") - .arg("list-unit-files") - .arg("--type=service") - .arg("--no-pager") - .arg("--plain") - .output()?; - - // And user loaded units for target user - let user_units_output = Command::new("sudo") - .arg("-u") - .arg(target_user) - .arg("systemctl") - .arg("--user") - .arg("list-units") - .arg("--type=service") - .arg("--all") - .arg("--no-pager") - .arg("--plain") - .output()?; - - if !unit_files_output.status.success() || !units_output.status.success() { + if !units_output.status.success() { return Err(anyhow::anyhow!("systemctl system command failed")); } - // User commands might fail if no user session, so check individually - let user_unit_files_success = user_unit_files_output.status.success(); - let user_units_success = user_units_output.status.success(); - - let unit_files_str = String::from_utf8(unit_files_output.stdout)?; let units_str = String::from_utf8(units_output.stdout)?; - let user_unit_files_str = if user_unit_files_success { - String::from_utf8(user_unit_files_output.stdout).ok() - } else { - None - }; - let user_units_str = if user_units_success { - String::from_utf8(user_units_output.stdout).ok() - } else { - None - }; let mut services = Vec::new(); + let mut status_cache = std::collections::HashMap::new(); // Use configuration instead of hardcoded values let excluded_services = &self.config.excluded_services; let service_name_filters = &self.config.service_name_filters; - // Parse both unit files and loaded units - let mut all_service_names = std::collections::HashSet::new(); - - // Parse unit files (includes inactive services) - for line in unit_files_str.lines() { - let fields: Vec<&str> = line.split_whitespace().collect(); - if fields.len() >= 2 && fields[0].ends_with(".service") { - let service_name = fields[0].trim_end_matches(".service"); - all_service_names.insert(service_name.to_string()); - } - } - - // Parse loaded units (includes running/failed services) + // Parse loaded units and extract status information for line in units_str.lines() { let fields: Vec<&str> = line.split_whitespace().collect(); if fields.len() >= 4 && fields[0].ends_with(".service") { let service_name = fields[0].trim_end_matches(".service"); - all_service_names.insert(service_name.to_string()); - } - } + + // Extract status information from systemctl list-units output + let load_state = fields.get(1).unwrap_or(&"unknown").to_string(); + let active_state = fields.get(2).unwrap_or(&"unknown").to_string(); + let sub_state = fields.get(3).unwrap_or(&"unknown").to_string(); + let description = if fields.len() > 4 { + fields[4..].join(" ") + } else { + "".to_string() + }; - // Parse user unit files if available - if let Some(user_unit_files_str) = &user_unit_files_str { - for line in user_unit_files_str.lines() { - let fields: Vec<&str> = line.split_whitespace().collect(); - if fields.len() >= 2 && fields[0].ends_with(".service") { - let service_name = fields[0].trim_end_matches(".service"); - all_service_names.insert(service_name.to_string()); - } - } - } - - // Parse user loaded units if available - if let Some(user_units_str) = &user_units_str { - for line in user_units_str.lines() { - let fields: Vec<&str> = line.split_whitespace().collect(); - if fields.len() >= 4 && fields[0].ends_with(".service") { - let service_name = fields[0].trim_end_matches(".service"); - all_service_names.insert(service_name.to_string()); - } - } - } - - // Now process all discovered services - for service_name in &all_service_names { - debug!("Processing service: '{}'", service_name); + // Cache the status information + status_cache.insert(service_name.to_string(), ServiceStatusInfo { + load_state, + active_state, + sub_state, + description, + }); // Skip excluded services first let mut is_excluded = false; @@ -253,7 +184,7 @@ impl SystemdCollector { // Check if this service matches our filter patterns for pattern in service_name_filters { - if service_name.contains(pattern) || pattern.contains(service_name) { + if service_name == pattern { debug!( "INCLUDING service '{}' because it matches pattern '{}'", service_name, pattern @@ -262,14 +193,35 @@ impl SystemdCollector { break; } } + } } + // Update the cache with all discovered service status information + if let Ok(mut state) = self.state.write() { + state.service_status_cache = status_cache; + } Ok(services) } - /// Get service status using systemctl + /// Get service status from cache (if available) or fallback to systemctl fn get_service_status(&self, service: &str) -> Result<(String, String)> { + // Try to get status from cache first + if let Ok(state) = self.state.read() { + if let Some(cached_info) = state.service_status_cache.get(service) { + let active_status = cached_info.active_state.clone(); + let detailed_info = format!( + "LoadState={}\nActiveState={}\nSubState={}", + cached_info.load_state, + cached_info.active_state, + cached_info.sub_state + ); + return Ok((active_status, detailed_info)); + } + } + + // Fallback to systemctl if not in cache (shouldn't happen during normal operation) + debug!("Service '{}' not found in cache, falling back to systemctl", service); let output = Command::new("systemctl") .arg("is-active") .arg(format!("{}.service", service)) @@ -294,7 +246,9 @@ impl SystemdCollector { "active" => Status::Ok, "inactive" | "dead" => Status::Warning, "failed" | "error" => Status::Critical, - "activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending, + "activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => { + Status::Pending + } _ => Status::Unknown, } } @@ -320,10 +274,14 @@ impl SystemdCollector { None } - /// Get directory size in GB with permission-aware logging fn get_directory_size(&self, dir: &str) -> Option { - let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?; + let output = Command::new("sudo") + .arg("du") + .arg("-sb") + .arg(dir) + .output() + .ok()?; if !output.status.success() { // Log permission errors for debugging but don't spam logs @@ -385,12 +343,6 @@ impl SystemdCollector { None } - - - - - - } #[async_trait] @@ -399,7 +351,10 @@ impl Collector for SystemdCollector { "systemd" } - async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result, CollectorError> { + async fn collect( + &self, + _status_tracker: &mut StatusTracker, + ) -> Result, CollectorError> { let start_time = Instant::now(); debug!("Collecting systemd services metrics"); @@ -478,7 +433,6 @@ impl Collector for SystemdCollector { Ok(metrics) } - } impl SystemdCollector { @@ -779,7 +733,6 @@ impl SystemdCollector { } } - // Check for redirects (skip redirect-only servers) if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) { has_redirect = true;