Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -1,6 +1,6 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use chrono::Utc;
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
@@ -18,7 +18,8 @@ pub struct BackupCollector {
impl BackupCollector {
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
Self {
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
backup_status_file: backup_status_file
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
max_age_hours,
}
}
@@ -43,10 +44,16 @@ impl BackupCollector {
Ok(dt) => dt.with_timezone(&Utc),
Err(_) => {
// Try parsing as naive datetime and assume UTC
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
match chrono::NaiveDateTime::parse_from_str(
&backup_status.start_time,
"%Y-%m-%dT%H:%M:%S%.f",
) {
Ok(naive_dt) => naive_dt.and_utc(),
Err(_) => {
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
error!(
"Failed to parse backup timestamp: {}",
backup_status.start_time
);
return Status::Unknown;
}
}
@@ -63,7 +70,7 @@ impl BackupCollector {
} else {
Status::Ok
}
},
}
"failed" => Status::Critical,
"running" => Status::Ok, // Currently running is OK
_ => Status::Unknown,
@@ -78,7 +85,7 @@ impl BackupCollector {
} else {
Status::Critical
}
},
}
"failed" => Status::Critical,
"disabled" => Status::Warning, // Service intentionally disabled
"running" => Status::Ok,
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
"backup"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let backup_status = self.read_backup_status().await?;
let mut metrics = Vec::new();
let timestamp = chrono::Utc::now().timestamp() as u64;
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
}),
status: overall_status,
timestamp,
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
description: Some(format!(
"Backup: {} at {}",
backup_status.status, backup_status.start_time
)),
unit: None,
});
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
});
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
.map(|dt| dt.with_timezone(&Utc))
.or_else(|_| {
// Try parsing as naive datetime and assume UTC
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
let last_updated_dt_result =
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
.map(|dt| dt.with_timezone(&Utc))
.or_else(|_| {
// Try parsing as naive datetime and assume UTC
chrono::NaiveDateTime::parse_from_str(
&backup_status.last_updated,
"%Y-%m-%dT%H:%M:%S%.f",
)
.map(|naive_dt| naive_dt.and_utc())
});
});
if let Ok(last_updated_dt) = last_updated_dt_result {
metrics.push(Metric {
name: "backup_last_run_timestamp".to_string(),
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
unit: Some("unix_timestamp".to_string()),
});
} else {
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
error!(
"Failed to parse backup timestamp for last_run_timestamp: {}",
backup_status.last_updated
);
}
// Individual service metrics
for (service_name, service) in &backup_status.services {
let service_status = self.calculate_service_status(service);
// Service status
metrics.push(Metric {
name: format!("backup_service_{}_status", service_name),
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
}),
status: service_status,
timestamp,
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
description: Some(format!(
"Backup service {} status: {}",
service_name, service.status
)),
unit: None,
});
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
metrics.push(Metric {
name: format!("backup_service_{}_exit_code", service_name),
value: MetricValue::Integer(service.exit_code),
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
status: if service.exit_code == 0 {
Status::Ok
} else {
Status::Critical
},
timestamp,
description: Some(format!("Exit code for backup service {}", service_name)),
unit: None,
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
});
// Calculate total repository size
let total_size_bytes: u64 = backup_status.services.values()
let total_size_bytes: u64 = backup_status
.services
.values()
.map(|s| s.repo_size_bytes)
.sum();
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
unit: None,
});
}
}
// Add standalone disk identification metrics from TOML fields
@@ -372,7 +397,7 @@ pub struct DiskSpace {
pub used_gb: f64,
pub available_gb: f64,
pub usage_percent: f64,
// Optional disk identification fields
// Optional disk identification fields
pub product_name: Option<String>,
pub serial_number: Option<String>,
}
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
pub repo_path: String,
pub archive_count: i64,
pub repo_size_bytes: u64,
}
}

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use tracing::debug;
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
pub struct CpuCollector {
config: CpuConfig,
name: String,
load_thresholds: HysteresisThresholds,
temperature_thresholds: HysteresisThresholds,
}
impl CpuCollector {
pub fn new(config: CpuConfig) -> Self {
// Create hysteresis thresholds with 10% gap for recovery
let load_thresholds = HysteresisThresholds::new(
config.load_warning_threshold,
config.load_critical_threshold,
);
let temperature_thresholds = HysteresisThresholds::new(
config.temperature_warning_threshold,
config.temperature_critical_threshold,
);
Self {
config,
name: "cpu".to_string(),
load_thresholds,
temperature_thresholds,
}
}
/// Calculate CPU load status using configured thresholds
fn calculate_load_status(&self, load: f32) -> Status {
if load >= self.config.load_critical_threshold {
Status::Critical
} else if load >= self.config.load_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Calculate CPU load status using hysteresis thresholds
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
}
/// Calculate CPU temperature status using configured thresholds
fn calculate_temperature_status(&self, temp: f32) -> Status {
if temp >= self.config.temperature_critical_threshold {
Status::Critical
} else if temp >= self.config.temperature_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Calculate CPU temperature status using hysteresis thresholds
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
}
/// Collect CPU load averages from /proc/loadavg
/// Format: "0.52 0.58 0.59 1/257 12345"
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let content = utils::read_proc_file("/proc/loadavg")?;
let parts: Vec<&str> = content.trim().split_whitespace().collect();
@@ -68,7 +71,7 @@ impl CpuCollector {
// Only apply thresholds to 5-minute load average
let load_1min_status = Status::Ok; // No alerting on 1min
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
let load_15min_status = Status::Ok; // No alerting on 15min
Ok(vec![
@@ -95,14 +98,14 @@ impl CpuCollector {
/// Collect CPU temperature from thermal zones
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
// Try x86_pkg_temp first (Intel CPU package temperature)
if let Ok(temp) = self
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
.await
{
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -120,7 +123,7 @@ impl CpuCollector {
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
if let Ok(temp) = self.read_thermal_zone(&path).await {
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
&self.name
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting CPU metrics");
let start = std::time::Instant::now();
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
// Collect load averages (always available)
metrics.extend(self.collect_load_averages().await?);
metrics.extend(self.collect_load_averages(status_tracker).await?);
// Collect temperature (optional)
if let Some(temp_metric) = self.collect_temperature().await? {
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
metrics.push(temp_metric);
}

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use crate::config::DiskConfig;
use std::fs;
@@ -28,11 +28,28 @@ struct MountedDisk {
/// Disk usage collector for monitoring filesystem sizes
pub struct DiskCollector {
config: DiskConfig,
temperature_thresholds: HysteresisThresholds,
}
impl DiskCollector {
pub fn new(config: DiskConfig) -> Self {
Self { config }
// Create hysteresis thresholds for disk temperature
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
60.0, // warning at 60°C
5.0, // 5°C gap for recovery
70.0, // critical at 70°C
5.0, // 5°C gap for recovery
);
Self {
config,
temperature_thresholds,
}
}
/// Calculate disk temperature status using hysteresis thresholds
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
}
/// Resolve UUID to actual device path
@@ -203,12 +220,6 @@ impl DiskCollector {
Ok((total_bytes, used_bytes))
}
/// Get root filesystem disk usage
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
Ok((total_bytes, used_bytes, usage_percent as f32))
}
/// Get the physical device for a given device (resolves symlinks, gets parent device)
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
"disk"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let start_time = Instant::now();
debug!("Collecting multi-disk metrics");
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
});
if temperature > 0.0 {
let temp_status = if temperature >= 70.0 {
Status::Critical
} else if temperature >= 60.0 {
Status::Warning
} else {
Status::Ok
};
let metric_name = format!("disk_smart_{}_temperature", device_name);
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
metrics.push(Metric {
name: format!("disk_smart_{}_temperature", device_name),

View File

@@ -4,7 +4,7 @@ use thiserror::Error;
pub enum CollectorError {
#[error("Failed to read system file {path}: {error}")]
SystemRead { path: String, error: String },
#[error("Failed to parse value '{value}': {error}")]
Parse { value: String, error: String },
}
}

View File

@@ -1,13 +1,13 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use tracing::debug;
use super::{Collector, CollectorError, utils};
use super::{utils, Collector, CollectorError};
use crate::config::MemoryConfig;
/// Extremely efficient memory metrics collector
///
///
/// EFFICIENCY OPTIMIZATIONS:
/// - Single /proc/meminfo read for all memory metrics
/// - Minimal string parsing with split operations
@@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
pub struct MemoryCollector {
config: MemoryConfig,
name: String,
usage_thresholds: HysteresisThresholds,
}
/// Memory information parsed from /proc/meminfo
@@ -33,36 +34,38 @@ struct MemoryInfo {
impl MemoryCollector {
pub fn new(config: MemoryConfig) -> Self {
// Create hysteresis thresholds with 5% gap for memory usage
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
config.usage_warning_percent,
5.0, // 5% gap for warning recovery
config.usage_critical_percent,
5.0, // 5% gap for critical recovery
);
Self {
config,
name: "memory".to_string(),
usage_thresholds,
}
}
/// Calculate memory usage status using configured thresholds
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
if usage_percent >= self.config.usage_critical_percent {
Status::Critical
} else if usage_percent >= self.config.usage_warning_percent {
Status::Warning
} else {
Status::Ok
}
/// Calculate memory usage status using hysteresis thresholds
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
}
/// Parse /proc/meminfo efficiently
/// Format: "MemTotal: 16384000 kB"
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
let content = utils::read_proc_file("/proc/meminfo")?;
let mut info = MemoryInfo::default();
// Parse each line efficiently - only extract what we need
for line in content.lines() {
if let Some(colon_pos) = line.find(':') {
let key = &line[..colon_pos];
let value_part = &line[colon_pos + 1..];
// Extract number from value part (format: " 12345 kB")
if let Some(number_str) = value_part.split_whitespace().next() {
if let Ok(value_kb) = utils::parse_u64(number_str) {
@@ -80,7 +83,7 @@ impl MemoryCollector {
}
}
}
// Validate that we got essential fields
if info.total_kb == 0 {
return Err(CollectorError::Parse {
@@ -88,87 +91,105 @@ impl MemoryCollector {
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
});
}
// If MemAvailable is not available (older kernels), calculate it
if info.available_kb == 0 {
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
}
Ok(info)
}
/// Convert KB to GB efficiently (avoiding floating point in hot path)
fn kb_to_gb(kb: u64) -> f32 {
kb as f32 / 1_048_576.0 // 1024 * 1024
}
/// Calculate memory metrics from parsed info
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
let mut metrics = Vec::with_capacity(6);
// Calculate derived values
let used_kb = info.total_kb - info.available_kb;
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
let usage_status = self.calculate_usage_status(usage_percent);
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
// Convert to GB for metrics
let total_gb = Self::kb_to_gb(info.total_kb);
let used_gb = Self::kb_to_gb(used_kb);
let available_gb = Self::kb_to_gb(info.available_kb);
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
// Memory usage percentage (primary metric with status)
metrics.push(Metric::new(
registry::MEMORY_USAGE_PERCENT.to_string(),
MetricValue::Float(usage_percent),
usage_status,
).with_description("Memory usage percentage".to_string())
.with_unit("%".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_USAGE_PERCENT.to_string(),
MetricValue::Float(usage_percent),
usage_status,
)
.with_description("Memory usage percentage".to_string())
.with_unit("%".to_string()),
);
// Total memory
metrics.push(Metric::new(
registry::MEMORY_TOTAL_GB.to_string(),
MetricValue::Float(total_gb),
Status::Ok, // Total memory doesn't have status
).with_description("Total system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_TOTAL_GB.to_string(),
MetricValue::Float(total_gb),
Status::Ok, // Total memory doesn't have status
)
.with_description("Total system memory".to_string())
.with_unit("GB".to_string()),
);
// Used memory
metrics.push(Metric::new(
registry::MEMORY_USED_GB.to_string(),
MetricValue::Float(used_gb),
Status::Ok, // Used memory absolute value doesn't have status
).with_description("Used system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_USED_GB.to_string(),
MetricValue::Float(used_gb),
Status::Ok, // Used memory absolute value doesn't have status
)
.with_description("Used system memory".to_string())
.with_unit("GB".to_string()),
);
// Available memory
metrics.push(Metric::new(
registry::MEMORY_AVAILABLE_GB.to_string(),
MetricValue::Float(available_gb),
Status::Ok, // Available memory absolute value doesn't have status
).with_description("Available system memory".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_AVAILABLE_GB.to_string(),
MetricValue::Float(available_gb),
Status::Ok, // Available memory absolute value doesn't have status
)
.with_description("Available system memory".to_string())
.with_unit("GB".to_string()),
);
// Swap metrics (only if swap exists)
if info.swap_total_kb > 0 {
metrics.push(Metric::new(
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
MetricValue::Float(swap_total_gb),
Status::Ok,
).with_description("Total swap space".to_string())
.with_unit("GB".to_string()));
metrics.push(Metric::new(
registry::MEMORY_SWAP_USED_GB.to_string(),
MetricValue::Float(swap_used_gb),
Status::Ok,
).with_description("Used swap space".to_string())
.with_unit("GB".to_string()));
metrics.push(
Metric::new(
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
MetricValue::Float(swap_total_gb),
Status::Ok,
)
.with_description("Total swap space".to_string())
.with_unit("GB".to_string()),
);
metrics.push(
Metric::new(
registry::MEMORY_SWAP_USED_GB.to_string(),
MetricValue::Float(swap_used_gb),
Status::Ok,
)
.with_description("Used swap space".to_string())
.with_unit("GB".to_string()),
);
}
metrics
}
}
@@ -178,34 +199,39 @@ impl Collector for MemoryCollector {
fn name(&self) -> &str {
&self.name
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting memory metrics");
let start = std::time::Instant::now();
// Parse memory info from /proc/meminfo
let info = self.parse_meminfo().await?;
// Calculate all metrics from parsed info
let metrics = self.calculate_metrics(&info);
let metrics = self.calculate_metrics(&info, status_tracker);
let duration = start.elapsed();
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
debug!(
"Memory collection completed in {:?} with {} metrics",
duration,
metrics.len()
);
// Efficiency check: warn if collection takes too long
if duration.as_millis() > 1 {
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
debug!(
"Memory collection took {}ms - consider optimization",
duration.as_millis()
);
}
// Store performance metrics
// Performance tracking handled by cache system
Ok(metrics)
}
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
None // Performance tracking handled by cache system
}
}
}

View File

@@ -1,16 +1,7 @@
use async_trait::async_trait;
use cm_dashboard_shared::Metric;
use cm_dashboard_shared::{Metric, StatusTracker};
use std::time::Duration;
pub mod cpu;
pub mod memory;
pub mod disk;
pub mod systemd;
pub mod backup;
pub mod error;
pub use error::CollectorError;
/// Performance metrics for a collector
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
@@ -18,69 +9,78 @@ pub struct PerformanceMetrics {
pub collection_efficiency_percent: f32,
}
pub mod backup;
pub mod cpu;
pub mod disk;
pub mod error;
pub mod memory;
pub mod systemd;
pub use error::CollectorError;
/// Base trait for all collectors with extreme efficiency requirements
#[async_trait]
pub trait Collector: Send + Sync {
/// Name of this collector
fn name(&self) -> &str;
/// Collect all metrics this collector provides
async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
/// Get performance metrics for monitoring collector efficiency
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
None
}
}
/// CPU efficiency rules for all collectors
pub mod efficiency {
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
/// 1. FILE READING RULES
/// - Read entire files in single syscall when possible
/// - Use BufReader only for very large files (>4KB)
/// - Never read files character by character
/// - Cache file descriptors when safe (immutable paths)
/// 2. PARSING RULES
/// - Use split() instead of regex for simple patterns
/// - Parse numbers with from_str() not complex parsing
/// - Avoid string allocations in hot paths
/// - Use str::trim() before parsing numbers
/// 3. MEMORY ALLOCATION RULES
/// - Reuse Vec buffers when possible
/// - Pre-allocate collections with known sizes
/// - Use str slices instead of String when possible
/// - Avoid clone() in hot paths
/// 4. SYSTEM CALL RULES
/// - Minimize syscalls - prefer single reads over multiple
/// - Use /proc filesystem efficiently
/// - Avoid spawning processes when /proc data available
/// - Cache static data (like CPU count)
/// 5. ERROR HANDLING RULES
/// - Use Result<> but minimize allocation in error paths
/// - Log errors at debug level only to avoid I/O overhead
/// - Graceful degradation - missing metrics better than failing
/// - Never panic in collectors
/// 6. CONCURRENCY RULES
/// - Collectors must be thread-safe but avoid locks
/// - Use atomic operations for simple counters
/// - Avoid shared mutable state between collections
/// - Each collection should be independent
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
//!
//! # FILE READING RULES
//! - Read entire files in single syscall when possible
//! - Use BufReader only for very large files (>4KB)
//! - Never read files character by character
//! - Cache file descriptors when safe (immutable paths)
//!
//! # PARSING RULES
//! - Use split() instead of regex for simple patterns
//! - Parse numbers with from_str() not complex parsing
//! - Avoid string allocations in hot paths
//! - Use str::trim() before parsing numbers
//!
//! # MEMORY ALLOCATION RULES
//! - Reuse Vec buffers when possible
//! - Pre-allocate collections with known sizes
//! - Use str slices instead of String when possible
//! - Avoid clone() in hot paths
//!
//! # SYSTEM CALL RULES
//! - Minimize syscalls - prefer single reads over multiple
//! - Use /proc filesystem efficiently
//! - Avoid spawning processes when /proc data available
//! - Cache static data (like CPU count)
//!
//! # ERROR HANDLING RULES
//! - Use Result<> but minimize allocation in error paths
//! - Log errors at debug level only to avoid I/O overhead
//! - Graceful degradation - missing metrics better than failing
//! - Never panic in collectors
//!
//! # CONCURRENCY RULES
//! - Collectors must be thread-safe but avoid locks
//! - Use atomic operations for simple counters
//! - Avoid shared mutable state between collections
//! - Each collection should be independent
}
/// Utility functions for efficient system data collection
pub mod utils {
use std::fs;
use super::CollectorError;
use std::fs;
/// Read entire file content efficiently
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
@@ -88,25 +88,25 @@ pub mod utils {
error: e.to_string(),
})
}
/// Parse float from string slice efficiently
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
s.trim()
.parse()
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
}
/// Parse integer from string slice efficiently
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
s.trim()
.parse()
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
value: s.to_string(),
error: e.to_string(),
})
}
/// Split string and get nth element safely
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
s.split(delimiter).nth(n)
}
}
}

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use std::process::Command;
use std::sync::RwLock;
use std::time::Instant;
@@ -401,7 +401,7 @@ impl Collector for SystemdCollector {
"systemd"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let start_time = Instant::now();
debug!("Collecting systemd services metrics");