Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -1,13 +1,13 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, utils};
|
||||
use super::{utils, Collector, CollectorError};
|
||||
use crate::config::MemoryConfig;
|
||||
|
||||
/// Extremely efficient memory metrics collector
|
||||
///
|
||||
///
|
||||
/// EFFICIENCY OPTIMIZATIONS:
|
||||
/// - Single /proc/meminfo read for all memory metrics
|
||||
/// - Minimal string parsing with split operations
|
||||
@@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
|
||||
pub struct MemoryCollector {
|
||||
config: MemoryConfig,
|
||||
name: String,
|
||||
usage_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
/// Memory information parsed from /proc/meminfo
|
||||
@@ -33,36 +34,38 @@ struct MemoryInfo {
|
||||
|
||||
impl MemoryCollector {
|
||||
pub fn new(config: MemoryConfig) -> Self {
|
||||
// Create hysteresis thresholds with 5% gap for memory usage
|
||||
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
config.usage_warning_percent,
|
||||
5.0, // 5% gap for warning recovery
|
||||
config.usage_critical_percent,
|
||||
5.0, // 5% gap for critical recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "memory".to_string(),
|
||||
|
||||
usage_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using configured thresholds
|
||||
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
|
||||
if usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using hysteresis thresholds
|
||||
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
||||
}
|
||||
|
||||
|
||||
/// Parse /proc/meminfo efficiently
|
||||
/// Format: "MemTotal: 16384000 kB"
|
||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/meminfo")?;
|
||||
let mut info = MemoryInfo::default();
|
||||
|
||||
|
||||
// Parse each line efficiently - only extract what we need
|
||||
for line in content.lines() {
|
||||
if let Some(colon_pos) = line.find(':') {
|
||||
let key = &line[..colon_pos];
|
||||
let value_part = &line[colon_pos + 1..];
|
||||
|
||||
|
||||
// Extract number from value part (format: " 12345 kB")
|
||||
if let Some(number_str) = value_part.split_whitespace().next() {
|
||||
if let Ok(value_kb) = utils::parse_u64(number_str) {
|
||||
@@ -80,7 +83,7 @@ impl MemoryCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate that we got essential fields
|
||||
if info.total_kb == 0 {
|
||||
return Err(CollectorError::Parse {
|
||||
@@ -88,87 +91,105 @@ impl MemoryCollector {
|
||||
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// If MemAvailable is not available (older kernels), calculate it
|
||||
if info.available_kb == 0 {
|
||||
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
|
||||
}
|
||||
|
||||
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
|
||||
/// Convert KB to GB by dividing by 1024 * 1024 (uses f32 arithmetic)
|
||||
fn kb_to_gb(kb: u64) -> f32 {
|
||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||
}
|
||||
|
||||
|
||||
/// Calculate memory metrics from parsed info
|
||||
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
|
||||
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
||||
let mut metrics = Vec::with_capacity(6);
|
||||
|
||||
|
||||
// Calculate derived values
|
||||
let used_kb = info.total_kb - info.available_kb;
|
||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||
let usage_status = self.calculate_usage_status(usage_percent);
|
||||
|
||||
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
||||
|
||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||
|
||||
|
||||
// Convert to GB for metrics
|
||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||
let used_gb = Self::kb_to_gb(used_kb);
|
||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||
|
||||
|
||||
// Memory usage percentage (primary metric with status)
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
).with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
)
|
||||
.with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()),
|
||||
);
|
||||
|
||||
// Total memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
).with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
)
|
||||
.with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Used memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
).with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Available memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
).with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Swap metrics (only if swap exists)
|
||||
if info.swap_total_kb > 0 {
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
).with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
).with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
metrics
|
||||
}
|
||||
}
|
||||
@@ -178,34 +199,39 @@ impl Collector for MemoryCollector {
|
||||
fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting memory metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
|
||||
// Parse memory info from /proc/meminfo
|
||||
let info = self.parse_meminfo().await?;
|
||||
|
||||
|
||||
// Calculate all metrics from parsed info
|
||||
let metrics = self.calculate_metrics(&info);
|
||||
|
||||
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
|
||||
|
||||
debug!(
|
||||
"Memory collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
// Efficiency check: emit a debug log if collection takes longer than 1ms
|
||||
if duration.as_millis() > 1 {
|
||||
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
|
||||
debug!(
|
||||
"Memory collection took {}ms - consider optimization",
|
||||
duration.as_millis()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user