Optimize systemd service status collection
Reduce redundant systemctl calls by caching service status data during discovery phase. Previously made 1+2N systemctl calls (discovery + 2 per service), now makes only 1 systemctl call with cached lookups. - Cache service status info (load/active/sub states) from list-units output - Eliminate separate is-active and show calls per service - Add fallback to systemctl for cache misses - Remove duplicate user service discovery logic Performance improvement: 21 calls → 1 call for 10 monitored services.
This commit is contained in:
parent
14aae90954
commit
5babf9a04e
@ -20,25 +20,30 @@ pub struct SystemdCollector {
|
|||||||
/// Internal state for service caching
|
/// Internal state for service caching
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct ServiceCacheState {
|
struct ServiceCacheState {
|
||||||
/// Interesting services to monitor (cached after discovery)
|
|
||||||
monitored_services: Vec<String>,
|
monitored_services: Vec<String>,
|
||||||
/// Last time services were discovered
|
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
||||||
last_discovery_time: Option<Instant>,
|
last_discovery_time: Option<Instant>,
|
||||||
/// How often to rediscover services (5 minutes)
|
|
||||||
discovery_interval_seconds: u64,
|
discovery_interval_seconds: u64,
|
||||||
/// Cached nginx site latency metrics
|
|
||||||
nginx_site_metrics: Vec<Metric>,
|
nginx_site_metrics: Vec<Metric>,
|
||||||
/// Last time nginx sites were checked
|
|
||||||
last_nginx_check_time: Option<Instant>,
|
last_nginx_check_time: Option<Instant>,
|
||||||
/// How often to check nginx site latency (30 seconds)
|
|
||||||
nginx_check_interval_seconds: u64,
|
nginx_check_interval_seconds: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cached service status information from systemctl list-units
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct ServiceStatusInfo {
|
||||||
|
load_state: String,
|
||||||
|
active_state: String,
|
||||||
|
sub_state: String,
|
||||||
|
description: String,
|
||||||
|
}
|
||||||
|
|
||||||
impl SystemdCollector {
|
impl SystemdCollector {
|
||||||
pub fn new(config: SystemdConfig) -> Self {
|
pub fn new(config: SystemdConfig) -> Self {
|
||||||
Self {
|
Self {
|
||||||
state: RwLock::new(ServiceCacheState {
|
state: RwLock::new(ServiceCacheState {
|
||||||
monitored_services: Vec::new(),
|
monitored_services: Vec::new(),
|
||||||
|
service_status_cache: std::collections::HashMap::new(),
|
||||||
last_discovery_time: None,
|
last_discovery_time: None,
|
||||||
discovery_interval_seconds: 300, // 5 minutes
|
discovery_interval_seconds: 300, // 5 minutes
|
||||||
nginx_site_metrics: Vec::new(),
|
nginx_site_metrics: Vec::new(),
|
||||||
@ -113,17 +118,8 @@ impl SystemdCollector {
|
|||||||
state.nginx_site_metrics.clone()
|
state.nginx_site_metrics.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Auto-discover interesting services to monitor
|
/// Auto-discover interesting services to monitor and cache their status
|
||||||
fn discover_services(&self) -> Result<Vec<String>> {
|
fn discover_services(&self) -> Result<Vec<String>> {
|
||||||
// First get all unit files (includes inactive services)
|
|
||||||
let unit_files_output = Command::new("systemctl")
|
|
||||||
.arg("list-unit-files")
|
|
||||||
.arg("--type=service")
|
|
||||||
.arg("--no-pager")
|
|
||||||
.arg("--plain")
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
// Then get all loaded units (includes running/failed services)
|
|
||||||
let units_output = Command::new("systemctl")
|
let units_output = Command::new("systemctl")
|
||||||
.arg("list-units")
|
.arg("list-units")
|
||||||
.arg("--type=service")
|
.arg("--type=service")
|
||||||
@ -132,106 +128,41 @@ impl SystemdCollector {
|
|||||||
.arg("--plain")
|
.arg("--plain")
|
||||||
.output()?;
|
.output()?;
|
||||||
|
|
||||||
// Use configured user mapping instead of hardcoded hostname logic
|
if !units_output.status.success() {
|
||||||
let target_user = &self.config.host_user_mapping;
|
|
||||||
|
|
||||||
// Also get user unit files (user-level services) for target user
|
|
||||||
let user_unit_files_output = Command::new("sudo")
|
|
||||||
.arg("-u")
|
|
||||||
.arg(target_user)
|
|
||||||
.arg("systemctl")
|
|
||||||
.arg("--user")
|
|
||||||
.arg("list-unit-files")
|
|
||||||
.arg("--type=service")
|
|
||||||
.arg("--no-pager")
|
|
||||||
.arg("--plain")
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
// And user loaded units for target user
|
|
||||||
let user_units_output = Command::new("sudo")
|
|
||||||
.arg("-u")
|
|
||||||
.arg(target_user)
|
|
||||||
.arg("systemctl")
|
|
||||||
.arg("--user")
|
|
||||||
.arg("list-units")
|
|
||||||
.arg("--type=service")
|
|
||||||
.arg("--all")
|
|
||||||
.arg("--no-pager")
|
|
||||||
.arg("--plain")
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
if !unit_files_output.status.success() || !units_output.status.success() {
|
|
||||||
return Err(anyhow::anyhow!("systemctl system command failed"));
|
return Err(anyhow::anyhow!("systemctl system command failed"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// User commands might fail if no user session, so check individually
|
|
||||||
let user_unit_files_success = user_unit_files_output.status.success();
|
|
||||||
let user_units_success = user_units_output.status.success();
|
|
||||||
|
|
||||||
let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
|
|
||||||
let units_str = String::from_utf8(units_output.stdout)?;
|
let units_str = String::from_utf8(units_output.stdout)?;
|
||||||
let user_unit_files_str = if user_unit_files_success {
|
|
||||||
String::from_utf8(user_unit_files_output.stdout).ok()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
let user_units_str = if user_units_success {
|
|
||||||
String::from_utf8(user_units_output.stdout).ok()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
let mut services = Vec::new();
|
let mut services = Vec::new();
|
||||||
|
let mut status_cache = std::collections::HashMap::new();
|
||||||
|
|
||||||
// Use configuration instead of hardcoded values
|
// Use configuration instead of hardcoded values
|
||||||
let excluded_services = &self.config.excluded_services;
|
let excluded_services = &self.config.excluded_services;
|
||||||
let service_name_filters = &self.config.service_name_filters;
|
let service_name_filters = &self.config.service_name_filters;
|
||||||
|
|
||||||
// Parse both unit files and loaded units
|
// Parse loaded units and extract status information
|
||||||
let mut all_service_names = std::collections::HashSet::new();
|
|
||||||
|
|
||||||
// Parse unit files (includes inactive services)
|
|
||||||
for line in unit_files_str.lines() {
|
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
|
||||||
all_service_names.insert(service_name.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse loaded units (includes running/failed services)
|
|
||||||
for line in units_str.lines() {
|
for line in units_str.lines() {
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
let service_name = fields[0].trim_end_matches(".service");
|
||||||
all_service_names.insert(service_name.to_string());
|
|
||||||
}
|
// Extract status information from systemctl list-units output
|
||||||
}
|
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
||||||
|
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
||||||
|
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
||||||
|
let description = if fields.len() > 4 {
|
||||||
|
fields[4..].join(" ")
|
||||||
|
} else {
|
||||||
|
"".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
// Parse user unit files if available
|
// Cache the status information
|
||||||
if let Some(user_unit_files_str) = &user_unit_files_str {
|
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||||
for line in user_unit_files_str.lines() {
|
load_state,
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
active_state,
|
||||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
sub_state,
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
description,
|
||||||
all_service_names.insert(service_name.to_string());
|
});
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse user loaded units if available
|
|
||||||
if let Some(user_units_str) = &user_units_str {
|
|
||||||
for line in user_units_str.lines() {
|
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
|
||||||
all_service_names.insert(service_name.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now process all discovered services
|
|
||||||
for service_name in &all_service_names {
|
|
||||||
debug!("Processing service: '{}'", service_name);
|
|
||||||
|
|
||||||
// Skip excluded services first
|
// Skip excluded services first
|
||||||
let mut is_excluded = false;
|
let mut is_excluded = false;
|
||||||
@ -253,7 +184,7 @@ impl SystemdCollector {
|
|||||||
|
|
||||||
// Check if this service matches our filter patterns
|
// Check if this service matches our filter patterns
|
||||||
for pattern in service_name_filters {
|
for pattern in service_name_filters {
|
||||||
if service_name.contains(pattern) || pattern.contains(service_name) {
|
if service_name == pattern {
|
||||||
debug!(
|
debug!(
|
||||||
"INCLUDING service '{}' because it matches pattern '{}'",
|
"INCLUDING service '{}' because it matches pattern '{}'",
|
||||||
service_name, pattern
|
service_name, pattern
|
||||||
@ -262,14 +193,35 @@ impl SystemdCollector {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update the cache with all discovered service status information
|
||||||
|
if let Ok(mut state) = self.state.write() {
|
||||||
|
state.service_status_cache = status_cache;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(services)
|
Ok(services)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get service status using systemctl
|
/// Get service status from cache (if available) or fallback to systemctl
|
||||||
fn get_service_status(&self, service: &str) -> Result<(String, String)> {
|
fn get_service_status(&self, service: &str) -> Result<(String, String)> {
|
||||||
|
// Try to get status from cache first
|
||||||
|
if let Ok(state) = self.state.read() {
|
||||||
|
if let Some(cached_info) = state.service_status_cache.get(service) {
|
||||||
|
let active_status = cached_info.active_state.clone();
|
||||||
|
let detailed_info = format!(
|
||||||
|
"LoadState={}\nActiveState={}\nSubState={}",
|
||||||
|
cached_info.load_state,
|
||||||
|
cached_info.active_state,
|
||||||
|
cached_info.sub_state
|
||||||
|
);
|
||||||
|
return Ok((active_status, detailed_info));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to systemctl if not in cache (shouldn't happen during normal operation)
|
||||||
|
debug!("Service '{}' not found in cache, falling back to systemctl", service);
|
||||||
let output = Command::new("systemctl")
|
let output = Command::new("systemctl")
|
||||||
.arg("is-active")
|
.arg("is-active")
|
||||||
.arg(format!("{}.service", service))
|
.arg(format!("{}.service", service))
|
||||||
@ -294,7 +246,9 @@ impl SystemdCollector {
|
|||||||
"active" => Status::Ok,
|
"active" => Status::Ok,
|
||||||
"inactive" | "dead" => Status::Warning,
|
"inactive" | "dead" => Status::Warning,
|
||||||
"failed" | "error" => Status::Critical,
|
"failed" | "error" => Status::Critical,
|
||||||
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending,
|
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => {
|
||||||
|
Status::Pending
|
||||||
|
}
|
||||||
_ => Status::Unknown,
|
_ => Status::Unknown,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -320,10 +274,14 @@ impl SystemdCollector {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Get directory size in GB with permission-aware logging
|
/// Get directory size in GB with permission-aware logging
|
||||||
fn get_directory_size(&self, dir: &str) -> Option<f32> {
|
fn get_directory_size(&self, dir: &str) -> Option<f32> {
|
||||||
let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?;
|
let output = Command::new("sudo")
|
||||||
|
.arg("du")
|
||||||
|
.arg("-sb")
|
||||||
|
.arg(dir)
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
if !output.status.success() {
|
if !output.status.success() {
|
||||||
// Log permission errors for debugging but don't spam logs
|
// Log permission errors for debugging but don't spam logs
|
||||||
@ -385,12 +343,6 @@ impl SystemdCollector {
|
|||||||
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
@ -399,7 +351,10 @@ impl Collector for SystemdCollector {
|
|||||||
"systemd"
|
"systemd"
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect(
|
||||||
|
&self,
|
||||||
|
_status_tracker: &mut StatusTracker,
|
||||||
|
) -> Result<Vec<Metric>, CollectorError> {
|
||||||
let start_time = Instant::now();
|
let start_time = Instant::now();
|
||||||
debug!("Collecting systemd services metrics");
|
debug!("Collecting systemd services metrics");
|
||||||
|
|
||||||
@ -478,7 +433,6 @@ impl Collector for SystemdCollector {
|
|||||||
|
|
||||||
Ok(metrics)
|
Ok(metrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SystemdCollector {
|
impl SystemdCollector {
|
||||||
@ -779,7 +733,6 @@ impl SystemdCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Check for redirects (skip redirect-only servers)
|
// Check for redirects (skip redirect-only servers)
|
||||||
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
|
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
|
||||||
has_redirect = true;
|
has_redirect = true;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user