All checks were successful
Build and Release / build-and-release (push) Successful in 1m16s
403 lines
15 KiB
Rust
403 lines
15 KiB
Rust
use anyhow::Result;
|
|
use async_trait::async_trait;
|
|
use cm_dashboard_shared::{AgentData, ServiceData, Status};
|
|
use std::process::Command;
|
|
use std::sync::RwLock;
|
|
use std::time::Instant;
|
|
use tracing::debug;
|
|
|
|
use super::{Collector, CollectorError};
|
|
use crate::config::SystemdConfig;
|
|
|
|
/// Systemd collector for monitoring systemd services with structured data output
|
|
pub struct SystemdCollector {
|
|
/// Cached state with thread-safe interior mutability
|
|
state: RwLock<ServiceCacheState>,
|
|
/// Configuration for service monitoring
|
|
config: SystemdConfig,
|
|
}
|
|
|
|
/// Internal state for service caching
|
|
#[derive(Debug, Clone)]
|
|
struct ServiceCacheState {
|
|
/// Last collection time for performance tracking
|
|
last_collection: Option<Instant>,
|
|
/// Cached service data
|
|
services: Vec<ServiceInfo>,
|
|
/// Interesting services to monitor (cached after discovery)
|
|
monitored_services: Vec<String>,
|
|
/// Cached service status information from discovery
|
|
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
|
/// Last time services were discovered
|
|
last_discovery_time: Option<Instant>,
|
|
/// How often to rediscover services (from config)
|
|
discovery_interval_seconds: u64,
|
|
}
|
|
|
|
/// Status details for one service, as cached from `systemctl list-units` output.
#[derive(Debug, Clone)]
struct ServiceStatusInfo {
    /// Load state reported for the unit (e.g. `not-loaded`).
    load_state: String,
    /// Active state reported for the unit (e.g. `inactive`).
    active_state: String,
    /// Sub state reported for the unit (e.g. `dead`).
    sub_state: String,
}
|
|
|
|
/// Snapshot of a single monitored service, used internally by the collector.
#[derive(Debug, Clone)]
struct ServiceInfo {
    /// Service name.
    name: String,
    /// Status string: "active", "inactive", "failed", etc.
    status: String,
    /// Memory usage in MB.
    memory_mb: f32,
    /// Disk usage in GB (usually 0 for services).
    disk_gb: f32,
}
|
|
|
|
impl SystemdCollector {
|
|
pub fn new(config: SystemdConfig) -> Self {
|
|
let state = ServiceCacheState {
|
|
last_collection: None,
|
|
services: Vec::new(),
|
|
monitored_services: Vec::new(),
|
|
service_status_cache: std::collections::HashMap::new(),
|
|
last_discovery_time: None,
|
|
discovery_interval_seconds: config.interval_seconds,
|
|
};
|
|
|
|
Self {
|
|
state: RwLock::new(state),
|
|
config,
|
|
}
|
|
}
|
|
|
|
/// Collect service data and populate AgentData
|
|
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
let start_time = Instant::now();
|
|
debug!("Collecting systemd services metrics");
|
|
|
|
// Get cached services (discovery only happens when needed)
|
|
let monitored_services = match self.get_monitored_services() {
|
|
Ok(services) => services,
|
|
Err(e) => {
|
|
debug!("Failed to get monitored services: {}", e);
|
|
return Ok(());
|
|
}
|
|
};
|
|
|
|
// Collect service data for each monitored service
|
|
let mut services = Vec::new();
|
|
for service_name in &monitored_services {
|
|
match self.get_service_status(service_name) {
|
|
Ok((active_status, _detailed_info)) => {
|
|
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
|
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
|
|
|
let service_info = ServiceInfo {
|
|
name: service_name.clone(),
|
|
status: active_status,
|
|
memory_mb,
|
|
disk_gb,
|
|
};
|
|
services.push(service_info);
|
|
}
|
|
Err(e) => {
|
|
debug!("Failed to get status for service {}: {}", service_name, e);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update cached state
|
|
{
|
|
let mut state = self.state.write().unwrap();
|
|
state.last_collection = Some(start_time);
|
|
state.services = services.clone();
|
|
}
|
|
|
|
// Populate AgentData with service information
|
|
for service in services {
|
|
agent_data.services.push(ServiceData {
|
|
name: service.name.clone(),
|
|
status: service.status.clone(),
|
|
memory_mb: service.memory_mb,
|
|
disk_gb: service.disk_gb,
|
|
user_stopped: false, // TODO: Integrate with service tracker
|
|
service_status: self.calculate_service_status(&service.name, &service.status),
|
|
});
|
|
}
|
|
|
|
let elapsed = start_time.elapsed();
|
|
debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get systemd services information
|
|
async fn get_systemd_services(&self) -> Result<Vec<ServiceInfo>, CollectorError> {
|
|
let mut services = Vec::new();
|
|
|
|
// Get ALL service unit files (includes inactive services)
|
|
let unit_files_output = Command::new("systemctl")
|
|
.args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
|
|
.output()
|
|
.map_err(|e| CollectorError::SystemRead {
|
|
path: "systemctl list-unit-files".to_string(),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
// Get runtime status of ALL units (including inactive)
|
|
let status_output = Command::new("systemctl")
|
|
.args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
|
.output()
|
|
.map_err(|e| CollectorError::SystemRead {
|
|
path: "systemctl list-units --all".to_string(),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
let unit_files_str = String::from_utf8_lossy(&unit_files_output.stdout);
|
|
let status_str = String::from_utf8_lossy(&status_output.stdout);
|
|
|
|
// Parse all service unit files to get complete service list
|
|
let mut all_service_names = std::collections::HashSet::new();
|
|
for line in unit_files_str.lines() {
|
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
|
let service_name = fields[0].trim_end_matches(".service");
|
|
all_service_names.insert(service_name.to_string());
|
|
}
|
|
}
|
|
|
|
// Parse runtime status for all units
|
|
let mut status_cache = std::collections::HashMap::new();
|
|
for line in status_str.lines() {
|
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
|
let service_name = fields[0].trim_end_matches(".service");
|
|
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
|
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
|
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
|
status_cache.insert(service_name.to_string(), (load_state, active_state, sub_state));
|
|
}
|
|
}
|
|
|
|
// For services found in unit files but not in runtime status, set default inactive status
|
|
for service_name in &all_service_names {
|
|
if !status_cache.contains_key(service_name) {
|
|
status_cache.insert(service_name.to_string(), (
|
|
"not-loaded".to_string(),
|
|
"inactive".to_string(),
|
|
"dead".to_string()
|
|
));
|
|
}
|
|
}
|
|
|
|
// Process all discovered services and apply filters
|
|
for service_name in &all_service_names {
|
|
if self.should_monitor_service(service_name) {
|
|
if let Some((load_state, active_state, sub_state)) = status_cache.get(service_name) {
|
|
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
|
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
|
|
|
let normalized_status = self.normalize_service_status(active_state, sub_state);
|
|
let service_info = ServiceInfo {
|
|
name: service_name.to_string(),
|
|
status: normalized_status,
|
|
memory_mb,
|
|
disk_gb,
|
|
};
|
|
|
|
services.push(service_info);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(services)
|
|
}
|
|
|
|
/// Check if a service should be monitored based on configuration filters with wildcard support
|
|
fn should_monitor_service(&self, service_name: &str) -> bool {
|
|
// If no filters configured, monitor nothing (to prevent noise)
|
|
if self.config.service_name_filters.is_empty() {
|
|
return false;
|
|
}
|
|
|
|
// Check if service matches any of the configured patterns
|
|
for pattern in &self.config.service_name_filters {
|
|
if self.matches_pattern(service_name, pattern) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
/// Check if service name matches pattern (supports wildcards like nginx*)
|
|
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
|
if pattern.ends_with('*') {
|
|
let prefix = &pattern[..pattern.len() - 1];
|
|
service_name.starts_with(prefix)
|
|
} else {
|
|
service_name == pattern
|
|
}
|
|
}
|
|
|
|
/// Get disk usage for a specific service
|
|
async fn get_service_disk_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
|
// Check if this service has configured directory paths
|
|
if let Some(dirs) = self.config.service_directories.get(service_name) {
|
|
// Service has configured paths - use the first accessible one
|
|
for dir in dirs {
|
|
if let Some(size) = self.get_directory_size(dir) {
|
|
return Ok(size);
|
|
}
|
|
}
|
|
// If configured paths failed, return 0
|
|
return Ok(0.0);
|
|
}
|
|
|
|
// No configured path - try to get WorkingDirectory from systemctl
|
|
let output = Command::new("systemctl")
|
|
.args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
|
.output()
|
|
.map_err(|e| CollectorError::SystemRead {
|
|
path: format!("WorkingDirectory for {}", service_name),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
for line in output_str.lines() {
|
|
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
|
let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
|
|
if !dir.is_empty() {
|
|
return Ok(self.get_directory_size(dir).unwrap_or(0.0));
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(0.0)
|
|
}
|
|
|
|
/// Get size of a directory in GB
|
|
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
|
let output = Command::new("du")
|
|
.args(&["-sb", path])
|
|
.output()
|
|
.ok()?;
|
|
|
|
if !output.status.success() {
|
|
return None;
|
|
}
|
|
|
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
let parts: Vec<&str> = output_str.split_whitespace().collect();
|
|
if let Some(size_str) = parts.first() {
|
|
if let Ok(size_bytes) = size_str.parse::<u64>() {
|
|
return Some(size_bytes as f32 / (1024.0 * 1024.0 * 1024.0));
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Calculate service status, taking user-stopped services into account
|
|
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
|
match active_status.to_lowercase().as_str() {
|
|
"active" => Status::Ok,
|
|
"inactive" | "dead" => {
|
|
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
|
Status::Inactive
|
|
},
|
|
"failed" | "error" => Status::Critical,
|
|
"activating" | "deactivating" | "reloading" | "starting" | "stopping" => {
|
|
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
|
Status::Pending
|
|
},
|
|
_ => Status::Unknown,
|
|
}
|
|
}
|
|
|
|
/// Get memory usage for a specific service
|
|
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
|
let output = Command::new("systemctl")
|
|
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
|
|
.output()
|
|
.map_err(|e| CollectorError::SystemRead {
|
|
path: format!("memory usage for {}", service_name),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
|
|
for line in output_str.lines() {
|
|
if line.starts_with("MemoryCurrent=") {
|
|
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
|
|
if mem_str != "[not set]" {
|
|
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
|
|
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(0.0)
|
|
}
|
|
|
|
/// Normalize service status to standard values
|
|
fn normalize_service_status(&self, active_state: &str, sub_state: &str) -> String {
|
|
match (active_state, sub_state) {
|
|
("active", "running") => "active".to_string(),
|
|
("active", _) => "active".to_string(),
|
|
("inactive", "dead") => "inactive".to_string(),
|
|
("inactive", _) => "inactive".to_string(),
|
|
("failed", _) => "failed".to_string(),
|
|
("activating", _) => "starting".to_string(),
|
|
("deactivating", _) => "stopping".to_string(),
|
|
_ => format!("{}:{}", active_state, sub_state),
|
|
}
|
|
}
|
|
|
|
/// Check if service collection cache should be updated
|
|
fn should_update_cache(&self) -> bool {
|
|
let state = self.state.read().unwrap();
|
|
|
|
match state.last_collection {
|
|
None => true,
|
|
Some(last) => {
|
|
let cache_duration = std::time::Duration::from_secs(30);
|
|
last.elapsed() > cache_duration
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Get cached service data if available and fresh
|
|
fn get_cached_services(&self) -> Option<Vec<ServiceInfo>> {
|
|
if !self.should_update_cache() {
|
|
let state = self.state.read().unwrap();
|
|
Some(state.services.clone())
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for SystemdCollector {
|
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
// Use cached data if available and fresh
|
|
if let Some(cached_services) = self.get_cached_services() {
|
|
debug!("Using cached systemd services data");
|
|
for service in cached_services {
|
|
agent_data.services.push(ServiceData {
|
|
name: service.name.clone(),
|
|
status: service.status.clone(),
|
|
memory_mb: service.memory_mb,
|
|
disk_gb: service.disk_gb,
|
|
user_stopped: false, // TODO: Integrate with service tracker
|
|
service_status: self.calculate_service_status(&service.name, &service.status),
|
|
});
|
|
}
|
|
Ok(())
|
|
} else {
|
|
// Collect fresh data
|
|
self.collect_service_data(agent_data).await
|
|
}
|
|
}
|
|
} |