Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -1,11 +1,11 @@
use anyhow::Result;
use gethostname::gethostname;
use std::time::Duration;
use tokio::time::interval;
use tracing::{info, error, debug};
use gethostname::gethostname;
use tracing::{debug, error, info};
use crate::communication::{AgentCommand, ZmqHandler};
use crate::config::AgentConfig;
use crate::communication::{ZmqHandler, AgentCommand};
use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager;
use cm_dashboard_shared::{Metric, MetricMessage};
@@ -22,28 +22,31 @@ impl Agent {
pub async fn new(config_path: Option<String>) -> Result<Self> {
let hostname = gethostname().to_string_lossy().to_string();
info!("Initializing agent for host: {}", hostname);
// Load configuration
let config = if let Some(path) = config_path {
AgentConfig::load_from_file(&path)?
} else {
AgentConfig::default()
};
info!("Agent configuration loaded");
// Initialize ZMQ communication
let zmq_handler = ZmqHandler::new(&config.zmq).await?;
info!("ZMQ communication initialized on port {}", config.zmq.publisher_port);
info!(
"ZMQ communication initialized on port {}",
config.zmq.publisher_port
);
// Initialize metric collection manager with cache config
let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
info!("Metric collection manager initialized");
// Initialize notification manager
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
info!("Notification manager initialized");
Ok(Self {
hostname,
config,
@@ -52,10 +55,10 @@ impl Agent {
notification_manager,
})
}
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
info!("Starting agent main loop with separated collection and transmission");
// CRITICAL: Collect ALL data immediately at startup before entering the loop
info!("Performing initial FORCE collection of all metrics at startup");
if let Err(e) = self.collect_all_metrics_force().await {
@@ -63,12 +66,13 @@ impl Agent {
} else {
info!("Initial metric collection completed - all data cached and ready");
}
// Separate intervals for collection and transmission
let mut collection_interval = interval(Duration::from_secs(self.config.collection_interval_seconds));
let mut collection_interval =
interval(Duration::from_secs(self.config.collection_interval_seconds));
let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
let mut notification_check_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
loop {
tokio::select! {
_ = collection_interval.tick() => {
@@ -99,84 +103,93 @@ impl Agent {
}
}
}
info!("Agent main loop stopped");
Ok(())
}
async fn collect_all_metrics_force(&mut self) -> Result<()> {
info!("Starting FORCE metric collection for startup");
// Force collect all metrics from all collectors immediately
let metrics = self.metric_manager.collect_all_metrics_force().await?;
if metrics.is_empty() {
error!("No metrics collected during force collection!");
return Ok(());
}
info!("Force collected and cached {} metrics", metrics.len());
// Check for status changes and send notifications
self.check_status_changes(&metrics).await;
Ok(())
}
async fn collect_metrics_only(&mut self) -> Result<()> {
debug!("Starting metric collection cycle (cache only)");
// Collect all metrics from all collectors and cache them
let metrics = self.metric_manager.collect_all_metrics().await?;
if metrics.is_empty() {
debug!("No metrics collected this cycle");
return Ok(());
}
debug!("Collected and cached {} metrics", metrics.len());
// Check for status changes and send notifications
self.check_status_changes(&metrics).await;
Ok(())
}
async fn broadcast_all_cached_metrics(&mut self) -> Result<()> {
debug!("Broadcasting all cached metrics via ZMQ");
// Get all cached metrics from the metric manager
let cached_metrics = self.metric_manager.get_all_cached_metrics().await?;
if cached_metrics.is_empty() {
debug!("No cached metrics to broadcast");
return Ok(());
}
debug!("Broadcasting {} cached metrics", cached_metrics.len());
// Create and send message with all cached data
let message = MetricMessage::new(self.hostname.clone(), cached_metrics);
self.zmq_handler.publish_metrics(&message).await?;
debug!("Cached metrics broadcasted successfully");
Ok(())
}
async fn check_status_changes(&mut self, metrics: &[Metric]) {
for metric in metrics {
if let Some(status_change) = self.notification_manager.update_metric_status(&metric.name, metric.status) {
info!("Status change detected for {}: {:?} -> {:?}",
metric.name, status_change.old_status, status_change.new_status);
if let Some(status_change) = self
.notification_manager
.update_metric_status(&metric.name, metric.status)
{
info!(
"Status change detected for {}: {:?} -> {:?}",
metric.name, status_change.old_status, status_change.new_status
);
// Send notification for status change
if let Err(e) = self.notification_manager.send_status_change_notification(status_change, metric).await {
if let Err(e) = self
.notification_manager
.send_status_change_notification(status_change, metric)
.await
{
error!("Failed to send notification: {}", e);
}
}
}
}
async fn handle_commands(&mut self) -> Result<()> {
// Try to receive commands (non-blocking)
match self.zmq_handler.try_receive_command() {
@@ -193,7 +206,7 @@ impl Agent {
}
Ok(())
}
async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
match command {
AgentCommand::CollectNow => {
@@ -209,7 +222,10 @@ impl Agent {
info!("Interval change requested but not implemented yet");
}
AgentCommand::ToggleCollector { name, enabled } => {
info!("Processing ToggleCollector command: {} -> {}", name, enabled);
info!(
"Processing ToggleCollector command: {} -> {}",
name, enabled
);
// Note: This would require dynamic collector management
info!("Collector toggle requested but not implemented yet");
}
@@ -220,4 +236,4 @@ impl Agent {
}
Ok(())
}
}
}

View File

@@ -8,4 +8,4 @@ pub struct CachedMetric {
pub collected_at: Instant,
pub access_count: u64,
pub tier: Option<CacheTier>,
}
}

View File

@@ -11,10 +11,8 @@ pub struct MetricCacheManager {
impl MetricCacheManager {
pub fn new(config: CacheConfig) -> Self {
let cache = Arc::new(ConfigurableCache::new(config.clone()));
Self {
cache,
}
Self { cache }
}
/// Start background cache management tasks
@@ -32,5 +30,4 @@ impl MetricCacheManager {
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
self.cache.get_all_cached_metrics().await
}
}
}

View File

@@ -4,11 +4,11 @@ use std::time::Instant;
use tokio::sync::RwLock;
use tracing::warn;
mod manager;
mod cached_metric;
mod manager;
pub use manager::MetricCacheManager;
pub use cached_metric::CachedMetric;
pub use manager::MetricCacheManager;
/// Central cache for individual metrics with configurable tiers
pub struct ConfigurableCache {
@@ -31,7 +31,7 @@ impl ConfigurableCache {
}
let mut cache = self.cache.write().await;
// Enforce max entries limit
if cache.len() >= self.config.max_entries {
self.cleanup_old_entries(&mut cache).await;
@@ -45,11 +45,10 @@ impl ConfigurableCache {
};
cache.insert(metric.name.clone(), cached_metric);
// Cached metric (debug logging disabled for performance)
}
/// Get all cached metrics (including expired ones) for broadcasting
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
if !self.config.enabled {
@@ -58,44 +57,46 @@ impl ConfigurableCache {
let cache = self.cache.read().await;
let mut all_metrics = Vec::new();
for cached_metric in cache.values() {
all_metrics.push(cached_metric.metric.clone());
}
all_metrics
}
/// Background cleanup of old entries
async fn cleanup_old_entries(&self, cache: &mut HashMap<String, CachedMetric>) {
let mut to_remove = Vec::new();
for (metric_name, cached_metric) in cache.iter() {
let cache_interval = self.config.get_cache_interval(metric_name);
let elapsed = cached_metric.collected_at.elapsed().as_secs();
// Remove entries that are way past their expiration (2x interval)
if elapsed > cache_interval * 2 {
to_remove.push(metric_name.clone());
}
}
for metric_name in to_remove {
cache.remove(&metric_name);
}
// If still too many entries, remove least recently accessed
if cache.len() >= self.config.max_entries {
let mut entries: Vec<_> = cache.iter().map(|(k, v)| (k.clone(), v.access_count)).collect();
let mut entries: Vec<_> = cache
.iter()
.map(|(k, v)| (k.clone(), v.access_count))
.collect();
entries.sort_by_key(|(_, access_count)| *access_count);
let excess = cache.len() - (self.config.max_entries * 3 / 4); // Remove 25%
for (metric_name, _) in entries.iter().take(excess) {
cache.remove(metric_name);
}
warn!("Cache cleanup removed {} entries due to size limit", excess);
}
}
}
}

View File

@@ -1,6 +1,6 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use chrono::Utc;
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
@@ -18,7 +18,8 @@ pub struct BackupCollector {
impl BackupCollector {
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
Self {
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
backup_status_file: backup_status_file
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
max_age_hours,
}
}
@@ -43,10 +44,16 @@ impl BackupCollector {
Ok(dt) => dt.with_timezone(&Utc),
Err(_) => {
// Try parsing as naive datetime and assume UTC
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
match chrono::NaiveDateTime::parse_from_str(
&backup_status.start_time,
"%Y-%m-%dT%H:%M:%S%.f",
) {
Ok(naive_dt) => naive_dt.and_utc(),
Err(_) => {
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
error!(
"Failed to parse backup timestamp: {}",
backup_status.start_time
);
return Status::Unknown;
}
}
@@ -63,7 +70,7 @@ impl BackupCollector {
} else {
Status::Ok
}
},
}
"failed" => Status::Critical,
"running" => Status::Ok, // Currently running is OK
_ => Status::Unknown,
@@ -78,7 +85,7 @@ impl BackupCollector {
} else {
Status::Critical
}
},
}
"failed" => Status::Critical,
"disabled" => Status::Warning, // Service intentionally disabled
"running" => Status::Ok,
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
"backup"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let backup_status = self.read_backup_status().await?;
let mut metrics = Vec::new();
let timestamp = chrono::Utc::now().timestamp() as u64;
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
}),
status: overall_status,
timestamp,
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
description: Some(format!(
"Backup: {} at {}",
backup_status.status, backup_status.start_time
)),
unit: None,
});
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
});
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
.map(|dt| dt.with_timezone(&Utc))
.or_else(|_| {
// Try parsing as naive datetime and assume UTC
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
let last_updated_dt_result =
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
.map(|dt| dt.with_timezone(&Utc))
.or_else(|_| {
// Try parsing as naive datetime and assume UTC
chrono::NaiveDateTime::parse_from_str(
&backup_status.last_updated,
"%Y-%m-%dT%H:%M:%S%.f",
)
.map(|naive_dt| naive_dt.and_utc())
});
});
if let Ok(last_updated_dt) = last_updated_dt_result {
metrics.push(Metric {
name: "backup_last_run_timestamp".to_string(),
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
unit: Some("unix_timestamp".to_string()),
});
} else {
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
error!(
"Failed to parse backup timestamp for last_run_timestamp: {}",
backup_status.last_updated
);
}
// Individual service metrics
for (service_name, service) in &backup_status.services {
let service_status = self.calculate_service_status(service);
// Service status
metrics.push(Metric {
name: format!("backup_service_{}_status", service_name),
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
}),
status: service_status,
timestamp,
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
description: Some(format!(
"Backup service {} status: {}",
service_name, service.status
)),
unit: None,
});
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
metrics.push(Metric {
name: format!("backup_service_{}_exit_code", service_name),
value: MetricValue::Integer(service.exit_code),
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
status: if service.exit_code == 0 {
Status::Ok
} else {
Status::Critical
},
timestamp,
description: Some(format!("Exit code for backup service {}", service_name)),
unit: None,
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
});
// Calculate total repository size
let total_size_bytes: u64 = backup_status.services.values()
let total_size_bytes: u64 = backup_status
.services
.values()
.map(|s| s.repo_size_bytes)
.sum();
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
unit: None,
});
}
}
// Add standalone disk identification metrics from TOML fields
@@ -372,7 +397,7 @@ pub struct DiskSpace {
pub used_gb: f64,
pub available_gb: f64,
pub usage_percent: f64,
// Optional disk identification fields
// Optional disk identification fields
pub product_name: Option<String>,
pub serial_number: Option<String>,
}
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
pub repo_path: String,
pub archive_count: i64,
pub repo_size_bytes: u64,
}
}

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use tracing::debug;
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
pub struct CpuCollector {
config: CpuConfig,
name: String,
load_thresholds: HysteresisThresholds,
temperature_thresholds: HysteresisThresholds,
}
impl CpuCollector {
pub fn new(config: CpuConfig) -> Self {
// Create hysteresis thresholds with 10% gap for recovery
let load_thresholds = HysteresisThresholds::new(
config.load_warning_threshold,
config.load_critical_threshold,
);
let temperature_thresholds = HysteresisThresholds::new(
config.temperature_warning_threshold,
config.temperature_critical_threshold,
);
Self {
config,
name: "cpu".to_string(),
load_thresholds,
temperature_thresholds,
}
}
/// Calculate CPU load status using configured thresholds
fn calculate_load_status(&self, load: f32) -> Status {
if load >= self.config.load_critical_threshold {
Status::Critical
} else if load >= self.config.load_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Calculate CPU load status using hysteresis thresholds
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
}
/// Calculate CPU temperature status using configured thresholds
fn calculate_temperature_status(&self, temp: f32) -> Status {
if temp >= self.config.temperature_critical_threshold {
Status::Critical
} else if temp >= self.config.temperature_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Calculate CPU temperature status using hysteresis thresholds
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
}
/// Collect CPU load averages from /proc/loadavg
/// Format: "0.52 0.58 0.59 1/257 12345"
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let content = utils::read_proc_file("/proc/loadavg")?;
let parts: Vec<&str> = content.trim().split_whitespace().collect();
@@ -68,7 +71,7 @@ impl CpuCollector {
// Only apply thresholds to 5-minute load average
let load_1min_status = Status::Ok; // No alerting on 1min
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
let load_15min_status = Status::Ok; // No alerting on 15min
Ok(vec![
@@ -95,14 +98,14 @@ impl CpuCollector {
/// Collect CPU temperature from thermal zones
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
// Try x86_pkg_temp first (Intel CPU package temperature)
if let Ok(temp) = self
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
.await
{
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -120,7 +123,7 @@ impl CpuCollector {
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
if let Ok(temp) = self.read_thermal_zone(&path).await {
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
&self.name
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting CPU metrics");
let start = std::time::Instant::now();
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
// Collect load averages (always available)
metrics.extend(self.collect_load_averages().await?);
metrics.extend(self.collect_load_averages(status_tracker).await?);
// Collect temperature (optional)
if let Some(temp_metric) = self.collect_temperature().await? {
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
metrics.push(temp_metric);
}

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use crate::config::DiskConfig;
use std::fs;
@@ -28,11 +28,28 @@ struct MountedDisk {
/// Disk usage collector for monitoring filesystem sizes
pub struct DiskCollector {
config: DiskConfig,
temperature_thresholds: HysteresisThresholds,
}
impl DiskCollector {
pub fn new(config: DiskConfig) -> Self {
Self { config }
// Create hysteresis thresholds for disk temperature
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
60.0, // warning at 60°C
5.0, // 5°C gap for recovery
70.0, // critical at 70°C
5.0, // 5°C gap for recovery
);
Self {
config,
temperature_thresholds,
}
}
/// Calculate disk temperature status using hysteresis thresholds
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
}
/// Resolve UUID to actual device path
@@ -203,12 +220,6 @@ impl DiskCollector {
Ok((total_bytes, used_bytes))
}
/// Get root filesystem disk usage
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
Ok((total_bytes, used_bytes, usage_percent as f32))
}
/// Get the physical device for a given device (resolves symlinks, gets parent device)
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
"disk"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let start_time = Instant::now();
debug!("Collecting multi-disk metrics");
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
});
if temperature > 0.0 {
let temp_status = if temperature >= 70.0 {
Status::Critical
} else if temperature >= 60.0 {
Status::Warning
} else {
Status::Ok
};
let metric_name = format!("disk_smart_{}_temperature", device_name);
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
metrics.push(Metric {
name: format!("disk_smart_{}_temperature", device_name),

View File

@@ -4,7 +4,7 @@ use thiserror::Error;
pub enum CollectorError {
#[error("Failed to read system file {path}: {error}")]
SystemRead { path: String, error: String },
#[error("Failed to parse value '{value}': {error}")]
Parse { value: String, error: String },
}
}

View File

@@ -1,13 +1,13 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use tracing::debug;
use super::{Collector, CollectorError, utils};
use super::{utils, Collector, CollectorError};
use crate::config::MemoryConfig;
/// Extremely efficient memory metrics collector
///
///
/// EFFICIENCY OPTIMIZATIONS:
/// - Single /proc/meminfo read for all memory metrics
/// - Minimal string parsing with split operations
@@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
pub struct MemoryCollector {
config: MemoryConfig,
name: String,
usage_thresholds: HysteresisThresholds,
}
/// Memory information parsed from /proc/meminfo
@@ -33,36 +34,38 @@ struct MemoryInfo {
impl MemoryCollector {
pub fn new(config: MemoryConfig) -> Self {
// Create hysteresis thresholds with 5% gap for memory usage
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
config.usage_warning_percent,
5.0, // 5% gap for warning recovery
config.usage_critical_percent,
5.0, // 5% gap for critical recovery
);
Self {
config,
name: "memory".to_string(),
usage_thresholds,
}
}
/// Calculate memory usage status using configured thresholds
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
if usage_percent >= self.config.usage_critical_percent {
Status::Critical
} else if usage_percent >= self.config.usage_warning_percent {
Status::Warning
} else {
Status::Ok
}
/// Calculate memory usage status using hysteresis thresholds
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
}
/// Parse /proc/meminfo efficiently
/// Format: "MemTotal: 16384000 kB"
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
let content = utils::read_proc_file("/proc/meminfo")?;
let mut info = MemoryInfo::default();
// Parse each line efficiently - only extract what we need
for line in content.lines() {
if let Some(colon_pos) = line.find(':') {
let key = &line[..colon_pos];
let value_part = &line[colon_pos + 1..];
// Extract number from value part (format: " 12345 kB")
if let Some(number_str) = value_part.split_whitespace().next() {
if let Ok(value_kb) = utils::parse_u64(number_str) {
@@ -80,7 +83,7 @@ impl MemoryCollector {
}
}
}
// Validate that we got essential fields
if info.total_kb == 0 {
return Err(CollectorError::Parse {
@@ -88,87 +91,105 @@ impl MemoryCollector {
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
});
}
// If MemAvailable is not available (older kernels), calculate it
if info.available_kb == 0 {
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
}
Ok(info)
}
/// Convert KB to GB efficiently (avoiding floating point in hot path)
fn kb_to_gb(kb: u64) -> f32 {
kb as f32 / 1_048_576.0 // 1024 * 1024
}
/// Calculate memory metrics from parsed info
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
let mut metrics = Vec::with_capacity(6);
// Calculate derived values
let used_kb = info.total_kb - info.available_kb;
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
let usage_status = self.calculate_usage_status(usage_percent);
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
// Convert to GB for metrics
let total_gb = Self::kb_to_gb(info.total_kb);
let used_gb = Self::kb_to_gb(used_kb);
let available_gb = Self::kb_to_gb(info.available_kb);
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
// Memory usage percentage (primary metric with status)
metrics.push(Metric::new(
registry::MEMORY_USAGE_PERCENT.to_string(),
MetricValue::Float(usage_percent),
usage_status,
).with_description("Memory usage percentage".to_string())
.with_unit("%".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_USAGE_PERCENT.to_string(),
MetricValue::Float(usage_percent),
usage_status,
)
.with_description("Memory usage percentage".to_string())
.with_unit("%".to_string()),
);
// Total memory
metrics.push(Metric::new(
registry::MEMORY_TOTAL_GB.to_string(),
MetricValue::Float(total_gb),
Status::Ok, // Total memory doesn't have status
).with_description("Total system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_TOTAL_GB.to_string(),
MetricValue::Float(total_gb),
Status::Ok, // Total memory doesn't have status
)
.with_description("Total system memory".to_string())
.with_unit("GB".to_string()),
);
// Used memory
metrics.push(Metric::new(
registry::MEMORY_USED_GB.to_string(),
MetricValue::Float(used_gb),
Status::Ok, // Used memory absolute value doesn't have status
).with_description("Used system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_USED_GB.to_string(),
MetricValue::Float(used_gb),
Status::Ok, // Used memory absolute value doesn't have status
)
.with_description("Used system memory".to_string())
.with_unit("GB".to_string()),
);
// Available memory
metrics.push(Metric::new(
registry::MEMORY_AVAILABLE_GB.to_string(),
MetricValue::Float(available_gb),
Status::Ok, // Available memory absolute value doesn't have status
).with_description("Available system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_AVAILABLE_GB.to_string(),
MetricValue::Float(available_gb),
Status::Ok, // Available memory absolute value doesn't have status
)
.with_description("Available system memory".to_string())
.with_unit("GB".to_string()),
);
// Swap metrics (only if swap exists)
if info.swap_total_kb > 0 {
metrics.push(Metric::new(
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
MetricValue::Float(swap_total_gb),
Status::Ok,
).with_description("Total swap space".to_string())
.with_unit("GB".to_string()));
metrics.push(Metric::new(
registry::MEMORY_SWAP_USED_GB.to_string(),
MetricValue::Float(swap_used_gb),
Status::Ok,
).with_description("Used swap space".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
MetricValue::Float(swap_total_gb),
Status::Ok,
)
.with_description("Total swap space".to_string())
.with_unit("GB".to_string()),
);
metrics.push(
Metric::new(
registry::MEMORY_SWAP_USED_GB.to_string(),
MetricValue::Float(swap_used_gb),
Status::Ok,
)
.with_description("Used swap space".to_string())
.with_unit("GB".to_string()),
);
}
metrics
}
}
@@ -178,34 +199,39 @@ impl Collector for MemoryCollector {
fn name(&self) -> &str {
&self.name
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting memory metrics");
let start = std::time::Instant::now();
// Parse memory info from /proc/meminfo
let info = self.parse_meminfo().await?;
// Calculate all metrics from parsed info
let metrics = self.calculate_metrics(&info);
let metrics = self.calculate_metrics(&info, status_tracker);
let duration = start.elapsed();
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
debug!(
"Memory collection completed in {:?} with {} metrics",
duration,
metrics.len()
);
// Efficiency check: warn if collection takes too long
if duration.as_millis() > 1 {
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
debug!(
"Memory collection took {}ms - consider optimization",
duration.as_millis()
);
}
// Store performance metrics
// Performance tracking handled by cache system
Ok(metrics)
}
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
None // Performance tracking handled by cache system
}
}
}

View File

@@ -1,16 +1,7 @@
use async_trait::async_trait;
use cm_dashboard_shared::Metric;
use cm_dashboard_shared::{Metric, StatusTracker};
use std::time::Duration;
pub mod cpu;
pub mod memory;
pub mod disk;
pub mod systemd;
pub mod backup;
pub mod error;
pub use error::CollectorError;
/// Performance metrics for a collector
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
@@ -18,69 +9,78 @@ pub struct PerformanceMetrics {
pub collection_efficiency_percent: f32,
}
pub mod backup;
pub mod cpu;
pub mod disk;
pub mod error;
pub mod memory;
pub mod systemd;
pub use error::CollectorError;
/// Base trait for all collectors with extreme efficiency requirements
#[async_trait]
pub trait Collector: Send + Sync {
/// Name of this collector
fn name(&self) -> &str;
/// Collect all metrics this collector provides
async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
/// Get performance metrics for monitoring collector efficiency
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
None
}
}
/// CPU efficiency rules for all collectors
pub mod efficiency {
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
/// 1. FILE READING RULES
/// - Read entire files in single syscall when possible
/// - Use BufReader only for very large files (>4KB)
/// - Never read files character by character
/// - Cache file descriptors when safe (immutable paths)
/// 2. PARSING RULES
/// - Use split() instead of regex for simple patterns
/// - Parse numbers with from_str() not complex parsing
/// - Avoid string allocations in hot paths
/// - Use str::trim() before parsing numbers
/// 3. MEMORY ALLOCATION RULES
/// - Reuse Vec buffers when possible
/// - Pre-allocate collections with known sizes
/// - Use str slices instead of String when possible
/// - Avoid clone() in hot paths
/// 4. SYSTEM CALL RULES
/// - Minimize syscalls - prefer single reads over multiple
/// - Use /proc filesystem efficiently
/// - Avoid spawning processes when /proc data available
/// - Cache static data (like CPU count)
/// 5. ERROR HANDLING RULES
/// - Use Result<> but minimize allocation in error paths
/// - Log errors at debug level only to avoid I/O overhead
/// - Graceful degradation - missing metrics better than failing
/// - Never panic in collectors
/// 6. CONCURRENCY RULES
/// - Collectors must be thread-safe but avoid locks
/// - Use atomic operations for simple counters
/// - Avoid shared mutable state between collections
/// - Each collection should be independent
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
//!
//! # FILE READING RULES
//! - Read entire files in single syscall when possible
//! - Use BufReader only for very large files (>4KB)
//! - Never read files character by character
//! - Cache file descriptors when safe (immutable paths)
//!
//! # PARSING RULES
//! - Use split() instead of regex for simple patterns
//! - Parse numbers with from_str() not complex parsing
//! - Avoid string allocations in hot paths
//! - Use str::trim() before parsing numbers
//!
//! # MEMORY ALLOCATION RULES
//! - Reuse Vec buffers when possible
//! - Pre-allocate collections with known sizes
//! - Use str slices instead of String when possible
//! - Avoid clone() in hot paths
//!
//! # SYSTEM CALL RULES
//! - Minimize syscalls - prefer single reads over multiple
//! - Use /proc filesystem efficiently
//! - Avoid spawning processes when /proc data available
//! - Cache static data (like CPU count)
//!
//! # ERROR HANDLING RULES
//! - Use Result<> but minimize allocation in error paths
//! - Log errors at debug level only to avoid I/O overhead
//! - Graceful degradation - missing metrics better than failing
//! - Never panic in collectors
//!
//! # CONCURRENCY RULES
//! - Collectors must be thread-safe but avoid locks
//! - Use atomic operations for simple counters
//! - Avoid shared mutable state between collections
//! - Each collection should be independent
}
/// Utility functions for efficient system data collection
pub mod utils {
use std::fs;
use super::CollectorError;
use std::fs;
/// Read entire file content efficiently
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
@@ -88,25 +88,25 @@ pub mod utils {
error: e.to_string(),
})
}
/// Parse float from string slice efficiently
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
s.trim()
.parse()
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
}
/// Parse integer from string slice efficiently
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
s.trim()
.parse()
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
}
/// Split string and get nth element safely
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
s.split(delimiter).nth(n)
}
}
}

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use std::process::Command;
use std::sync::RwLock;
use std::time::Instant;
@@ -401,7 +401,7 @@ impl Collector for SystemdCollector {
"systemd"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let start_time = Instant::now();
debug!("Collecting systemd services metrics");

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use cm_dashboard_shared::{MetricMessage, MessageEnvelope};
use tracing::{info, debug};
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
use tracing::{debug, info};
use zmq::{Context, Socket, SocketType};
use crate::config::ZmqConfig;
@@ -15,75 +15,69 @@ pub struct ZmqHandler {
impl ZmqHandler {
pub async fn new(config: &ZmqConfig) -> Result<Self> {
let context = Context::new();
// Create publisher socket for metrics
let publisher = context.socket(SocketType::PUB)?;
let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port);
publisher.bind(&pub_bind_address)?;
info!("ZMQ publisher bound to {}", pub_bind_address);
// Set socket options for efficiency
publisher.set_sndhwm(1000)?; // High water mark for outbound messages
publisher.set_linger(1000)?; // Linger time on close
// Create command receiver socket (PULL socket to receive commands from dashboard)
let command_receiver = context.socket(SocketType::PULL)?;
let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port);
command_receiver.bind(&cmd_bind_address)?;
info!("ZMQ command receiver bound to {}", cmd_bind_address);
// Set non-blocking mode for command receiver
command_receiver.set_rcvtimeo(0)?; // Non-blocking receive
command_receiver.set_linger(1000)?;
Ok(Self {
publisher,
command_receiver,
config: config.clone(),
})
}
/// Publish metrics message via ZMQ
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
debug!("Publishing {} metrics for host {}", message.metrics.len(), message.hostname);
debug!(
"Publishing {} metrics for host {}",
message.metrics.len(),
message.hostname
);
// Create message envelope
let envelope = MessageEnvelope::metrics(message.clone())
.map_err(|e| anyhow::anyhow!("Failed to create message envelope: {}", e))?;
// Serialize envelope
let serialized = serde_json::to_vec(&envelope)?;
// Send via ZMQ
self.publisher.send(&serialized, 0)?;
debug!("Published metrics message ({} bytes)", serialized.len());
Ok(())
}
/// Send heartbeat (placeholder for future use)
pub async fn send_heartbeat(&self) -> Result<()> {
let envelope = MessageEnvelope::heartbeat()
.map_err(|e| anyhow::anyhow!("Failed to create heartbeat envelope: {}", e))?;
let serialized = serde_json::to_vec(&envelope)?;
self.publisher.send(&serialized, 0)?;
debug!("Sent heartbeat");
Ok(())
}
/// Try to receive a command (non-blocking)
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
Ok(bytes) => {
debug!("Received command message ({} bytes)", bytes.len());
let command: AgentCommand = serde_json::from_slice(&bytes)
.map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?;
debug!("Parsed command: {:?}", command);
Ok(Some(command))
}
@@ -107,4 +101,4 @@ pub enum AgentCommand {
ToggleCollector { name: String, enabled: bool },
/// Request status/health check
Ping,
}
}

View File

@@ -1,18 +1,19 @@
use anyhow::{Context, Result};
use std::path::Path;
use std::fs;
use crate::config::AgentConfig;
use anyhow::{Context, Result};
use std::fs;
use std::path::Path;
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
let path = path.as_ref();
let content = fs::read_to_string(path)
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
let config: AgentConfig = toml::from_str(&content)
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
config.validate()
config
.validate()
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
Ok(config)
}
}

View File

@@ -1,6 +1,5 @@
use anyhow::Result;
use cm_dashboard_shared::CacheConfig;
use gethostname::gethostname;
use serde::{Deserialize, Serialize};
use std::path::Path;

View File

@@ -1,114 +1,126 @@
use anyhow::{bail, Result};
use crate::config::AgentConfig;
use anyhow::{bail, Result};
pub fn validate_config(config: &AgentConfig) -> Result<()> {
// Validate ZMQ configuration
if config.zmq.publisher_port == 0 {
bail!("ZMQ publisher port cannot be 0");
}
if config.zmq.command_port == 0 {
bail!("ZMQ command port cannot be 0");
}
if config.zmq.publisher_port == config.zmq.command_port {
bail!("ZMQ publisher and command ports cannot be the same");
}
if config.zmq.bind_address.is_empty() {
bail!("ZMQ bind address cannot be empty");
}
if config.zmq.timeout_ms == 0 {
bail!("ZMQ timeout cannot be 0");
}
// Validate collection interval
if config.collection_interval_seconds == 0 {
bail!("Collection interval cannot be 0");
}
// Validate CPU thresholds
if config.collectors.cpu.enabled {
if config.collectors.cpu.load_warning_threshold <= 0.0 {
bail!("CPU load warning threshold must be positive");
}
if config.collectors.cpu.load_critical_threshold <= config.collectors.cpu.load_warning_threshold {
if config.collectors.cpu.load_critical_threshold
<= config.collectors.cpu.load_warning_threshold
{
bail!("CPU load critical threshold must be greater than warning threshold");
}
if config.collectors.cpu.temperature_warning_threshold <= 0.0 {
bail!("CPU temperature warning threshold must be positive");
}
if config.collectors.cpu.temperature_critical_threshold <= config.collectors.cpu.temperature_warning_threshold {
if config.collectors.cpu.temperature_critical_threshold
<= config.collectors.cpu.temperature_warning_threshold
{
bail!("CPU temperature critical threshold must be greater than warning threshold");
}
}
// Validate memory thresholds
if config.collectors.memory.enabled {
if config.collectors.memory.usage_warning_percent <= 0.0 || config.collectors.memory.usage_warning_percent > 100.0 {
if config.collectors.memory.usage_warning_percent <= 0.0
|| config.collectors.memory.usage_warning_percent > 100.0
{
bail!("Memory usage warning threshold must be between 0 and 100");
}
if config.collectors.memory.usage_critical_percent <= config.collectors.memory.usage_warning_percent
|| config.collectors.memory.usage_critical_percent > 100.0 {
if config.collectors.memory.usage_critical_percent
<= config.collectors.memory.usage_warning_percent
|| config.collectors.memory.usage_critical_percent > 100.0
{
bail!("Memory usage critical threshold must be between warning threshold and 100");
}
}
// Validate disk thresholds
if config.collectors.disk.enabled {
if config.collectors.disk.usage_warning_percent <= 0.0 || config.collectors.disk.usage_warning_percent > 100.0 {
if config.collectors.disk.usage_warning_percent <= 0.0
|| config.collectors.disk.usage_warning_percent > 100.0
{
bail!("Disk usage warning threshold must be between 0 and 100");
}
if config.collectors.disk.usage_critical_percent <= config.collectors.disk.usage_warning_percent
|| config.collectors.disk.usage_critical_percent > 100.0 {
if config.collectors.disk.usage_critical_percent
<= config.collectors.disk.usage_warning_percent
|| config.collectors.disk.usage_critical_percent > 100.0
{
bail!("Disk usage critical threshold must be between warning threshold and 100");
}
}
// Validate SMTP configuration
if config.notifications.enabled {
if config.notifications.smtp_host.is_empty() {
bail!("SMTP host cannot be empty when notifications are enabled");
}
if config.notifications.smtp_port == 0 {
bail!("SMTP port cannot be 0");
}
if config.notifications.from_email.is_empty() {
bail!("From email cannot be empty when notifications are enabled");
}
if config.notifications.to_email.is_empty() {
bail!("To email cannot be empty when notifications are enabled");
}
// Basic email validation
if !config.notifications.from_email.contains('@') {
bail!("From email must contain @ symbol");
}
if !config.notifications.to_email.contains('@') {
bail!("To email must contain @ symbol");
}
}
// Validate cache configuration
if config.cache.enabled {
if config.cache.default_ttl_seconds == 0 {
bail!("Cache TTL cannot be 0");
}
if config.cache.max_entries == 0 {
bail!("Cache max entries cannot be 0");
}
}
Ok(())
}
}

View File

@@ -1,14 +1,14 @@
use anyhow::Result;
use clap::Parser;
use tracing::{info, error};
use tracing::{error, info};
use tracing_subscriber::EnvFilter;
mod agent;
mod cache;
mod config;
mod communication;
mod metrics;
mod collectors;
mod communication;
mod config;
mod metrics;
mod notifications;
mod utils;
@@ -22,7 +22,7 @@ struct Cli {
/// Increase logging verbosity (-v, -vv)
#[arg(short, long, action = clap::ArgAction::Count)]
verbose: u8,
/// Configuration file path
#[arg(short, long)]
config: Option<String>,
@@ -31,32 +31,32 @@ struct Cli {
#[tokio::main]
async fn main() -> Result<()> {
let cli = Cli::parse();
// Setup logging
let log_level = match cli.verbose {
0 => "info",
1 => "debug",
1 => "debug",
_ => "trace",
};
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
.init();
info!("CM Dashboard Agent starting with individual metrics architecture...");
// Create and run agent
let mut agent = Agent::new(cli.config).await?;
// Setup graceful shutdown channel
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
let ctrl_c = async {
tokio::signal::ctrl_c()
.await
.expect("failed to install Ctrl+C handler");
};
// Run agent with graceful shutdown
tokio::select! {
result = agent.run(shutdown_rx) => {
@@ -72,7 +72,7 @@ async fn main() -> Result<()> {
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
}
info!("Agent shutdown complete");
Ok(())
}
}

View File

@@ -1,5 +1,5 @@
use anyhow::Result;
use cm_dashboard_shared::Metric;
use cm_dashboard_shared::{Metric, StatusTracker};
use std::collections::HashMap;
use std::time::Instant;
use tracing::{debug, error, info};
@@ -16,6 +16,7 @@ pub struct MetricCollectionManager {
collectors: Vec<Box<dyn Collector>>,
cache_manager: MetricCacheManager,
last_collection_times: HashMap<String, Instant>,
status_tracker: StatusTracker,
}
impl MetricCollectionManager {
@@ -117,6 +118,7 @@ impl MetricCollectionManager {
collectors,
cache_manager,
last_collection_times: HashMap::new(),
status_tracker: StatusTracker::new(),
})
}
@@ -134,7 +136,7 @@ impl MetricCollectionManager {
for collector in &self.collectors {
let collector_name = collector.name();
match collector.collect().await {
match collector.collect(&mut self.status_tracker).await {
Ok(metrics) => {
info!(
"Force collected {} metrics from {} collector",
@@ -200,7 +202,7 @@ impl MetricCollectionManager {
if should_collect {
collecting_fresh.insert(collector_name.to_string());
match collector.collect().await {
match collector.collect(&mut self.status_tracker).await {
Ok(metrics) => {
// Collector returned fresh metrics (debug logging disabled for performance)

View File

@@ -3,41 +3,42 @@
/// System information utilities
pub mod system {
use std::fs;
/// Get number of CPU cores efficiently
pub fn get_cpu_count() -> Result<usize, std::io::Error> {
// Try /proc/cpuinfo first (most reliable)
if let Ok(content) = fs::read_to_string("/proc/cpuinfo") {
let count = content.lines()
let count = content
.lines()
.filter(|line| line.starts_with("processor"))
.count();
if count > 0 {
return Ok(count);
}
}
// Fallback to nproc equivalent
match std::thread::available_parallelism() {
Ok(count) => Ok(count.get()),
Err(_) => Ok(1), // Default to 1 core if all else fails
}
}
/// Check if running in container
pub fn is_container() -> bool {
// Check for common container indicators
fs::metadata("/.dockerenv").is_ok() ||
fs::read_to_string("/proc/1/cgroup")
.map(|content| content.contains("docker") || content.contains("containerd"))
.unwrap_or(false)
fs::metadata("/.dockerenv").is_ok()
|| fs::read_to_string("/proc/1/cgroup")
.map(|content| content.contains("docker") || content.contains("containerd"))
.unwrap_or(false)
}
}
/// Time utilities
pub mod time {
use std::time::{Duration, Instant};
/// Measure execution time of a closure
pub fn measure_time<F, R>(f: F) -> (R, Duration)
where
@@ -54,14 +55,14 @@ pub mod time {
pub mod perf {
use std::time::{Duration, Instant};
use tracing::warn;
/// Performance monitor for critical operations
pub struct PerfMonitor {
operation: String,
start: Instant,
warning_threshold: Duration,
}
impl PerfMonitor {
pub fn new(operation: &str, warning_threshold: Duration) -> Self {
Self {
@@ -70,12 +71,12 @@ pub mod perf {
warning_threshold,
}
}
pub fn new_ms(operation: &str, warning_threshold_ms: u64) -> Self {
Self::new(operation, Duration::from_millis(warning_threshold_ms))
}
}
impl Drop for PerfMonitor {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
@@ -87,4 +88,4 @@ pub mod perf {
}
}
}
}
}