Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting. Key Features: - HysteresisThresholds with configurable upper/lower limits - StatusTracker for per-metric status history - Default gaps: CPU load 10%, memory 5%, disk temp 5°C Updated Components: - CPU load collector (5-minute average with hysteresis) - Memory usage collector (percentage-based thresholds) - Disk temperature collector (SMART data monitoring) - All collectors updated to support StatusTracker interface Cache Interval Adjustments: - Service status: 60s → 10s (faster response) - Disk usage: 300s → 60s (more frequent checks) - Backup status: 900s → 60s (quicker updates) - SMART data: moved to 600s tier (10 minutes) Architecture: - Individual metric status calculation in collectors - Centralized StatusTracker in MetricCollectionManager - Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use chrono::Utc;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tokio::fs;
|
||||
@@ -18,7 +18,8 @@ pub struct BackupCollector {
|
||||
impl BackupCollector {
|
||||
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
||||
Self {
|
||||
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||
backup_status_file: backup_status_file
|
||||
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||
max_age_hours,
|
||||
}
|
||||
}
|
||||
@@ -43,10 +44,16 @@ impl BackupCollector {
|
||||
Ok(dt) => dt.with_timezone(&Utc),
|
||||
Err(_) => {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
|
||||
match chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.start_time,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
) {
|
||||
Ok(naive_dt) => naive_dt.and_utc(),
|
||||
Err(_) => {
|
||||
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
|
||||
error!(
|
||||
"Failed to parse backup timestamp: {}",
|
||||
backup_status.start_time
|
||||
);
|
||||
return Status::Unknown;
|
||||
}
|
||||
}
|
||||
@@ -63,7 +70,7 @@ impl BackupCollector {
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
},
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"running" => Status::Ok, // Currently running is OK
|
||||
_ => Status::Unknown,
|
||||
@@ -78,7 +85,7 @@ impl BackupCollector {
|
||||
} else {
|
||||
Status::Critical
|
||||
}
|
||||
},
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"disabled" => Status::Warning, // Service intentionally disabled
|
||||
"running" => Status::Ok,
|
||||
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
|
||||
"backup"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let backup_status = self.read_backup_status().await?;
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
|
||||
}),
|
||||
status: overall_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
|
||||
description: Some(format!(
|
||||
"Backup: {} at {}",
|
||||
backup_status.status, backup_status.start_time
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
|
||||
});
|
||||
|
||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
|
||||
let last_updated_dt_result =
|
||||
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.last_updated,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
)
|
||||
.map(|naive_dt| naive_dt.and_utc())
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
if let Ok(last_updated_dt) = last_updated_dt_result {
|
||||
metrics.push(Metric {
|
||||
name: "backup_last_run_timestamp".to_string(),
|
||||
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
|
||||
unit: Some("unix_timestamp".to_string()),
|
||||
});
|
||||
} else {
|
||||
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
|
||||
error!(
|
||||
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
||||
backup_status.last_updated
|
||||
);
|
||||
}
|
||||
|
||||
// Individual service metrics
|
||||
for (service_name, service) in &backup_status.services {
|
||||
let service_status = self.calculate_service_status(service);
|
||||
|
||||
|
||||
// Service status
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_status", service_name),
|
||||
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
|
||||
}),
|
||||
status: service_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
|
||||
description: Some(format!(
|
||||
"Backup service {} status: {}",
|
||||
service_name, service.status
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_exit_code", service_name),
|
||||
value: MetricValue::Integer(service.exit_code),
|
||||
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
|
||||
status: if service.exit_code == 0 {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
},
|
||||
timestamp,
|
||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||
unit: None,
|
||||
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
|
||||
});
|
||||
|
||||
// Calculate total repository size
|
||||
let total_size_bytes: u64 = backup_status.services.values()
|
||||
let total_size_bytes: u64 = backup_status
|
||||
.services
|
||||
.values()
|
||||
.map(|s| s.repo_size_bytes)
|
||||
.sum();
|
||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Add standalone disk identification metrics from TOML fields
|
||||
@@ -372,7 +397,7 @@ pub struct DiskSpace {
|
||||
pub used_gb: f64,
|
||||
pub available_gb: f64,
|
||||
pub usage_percent: f64,
|
||||
// Optional disk identification fields
|
||||
// Optional disk identification fields
|
||||
pub product_name: Option<String>,
|
||||
pub serial_number: Option<String>,
|
||||
}
|
||||
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
|
||||
pub repo_path: String,
|
||||
pub archive_count: i64,
|
||||
pub repo_size_bytes: u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
|
||||
pub struct CpuCollector {
|
||||
config: CpuConfig,
|
||||
name: String,
|
||||
load_thresholds: HysteresisThresholds,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl CpuCollector {
|
||||
pub fn new(config: CpuConfig) -> Self {
|
||||
// Create hysteresis thresholds with 10% gap for recovery
|
||||
let load_thresholds = HysteresisThresholds::new(
|
||||
config.load_warning_threshold,
|
||||
config.load_critical_threshold,
|
||||
);
|
||||
|
||||
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_threshold,
|
||||
config.temperature_critical_threshold,
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "cpu".to_string(),
|
||||
load_thresholds,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU load status using configured thresholds
|
||||
fn calculate_load_status(&self, load: f32) -> Status {
|
||||
if load >= self.config.load_critical_threshold {
|
||||
Status::Critical
|
||||
} else if load >= self.config.load_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU load status using hysteresis thresholds
|
||||
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
||||
}
|
||||
|
||||
/// Calculate CPU temperature status using configured thresholds
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.config.temperature_critical_threshold {
|
||||
Status::Critical
|
||||
} else if temp >= self.config.temperature_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU temperature status using hysteresis thresholds
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Collect CPU load averages from /proc/loadavg
|
||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||
|
||||
@@ -68,7 +71,7 @@ impl CpuCollector {
|
||||
|
||||
// Only apply thresholds to 5-minute load average
|
||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
|
||||
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||
|
||||
Ok(vec![
|
||||
@@ -95,14 +98,14 @@ impl CpuCollector {
|
||||
|
||||
/// Collect CPU temperature from thermal zones
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||
if let Ok(temp) = self
|
||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||
.await
|
||||
{
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -120,7 +123,7 @@ impl CpuCollector {
|
||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
|
||||
&self.name
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting CPU metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||
|
||||
// Collect load averages (always available)
|
||||
metrics.extend(self.collect_load_averages().await?);
|
||||
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
||||
|
||||
// Collect temperature (optional)
|
||||
if let Some(temp_metric) = self.collect_temperature().await? {
|
||||
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
||||
metrics.push(temp_metric);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::fs;
|
||||
@@ -28,11 +28,28 @@ struct MountedDisk {
|
||||
/// Disk usage collector for monitoring filesystem sizes
|
||||
pub struct DiskCollector {
|
||||
config: DiskConfig,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
Self { config }
|
||||
// Create hysteresis thresholds for disk temperature
|
||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
60.0, // warning at 60°C
|
||||
5.0, // 5°C gap for recovery
|
||||
70.0, // critical at 70°C
|
||||
5.0, // 5°C gap for recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate disk temperature status using hysteresis thresholds
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Resolve UUID to actual device path
|
||||
@@ -203,12 +220,6 @@ impl DiskCollector {
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
/// Get root filesystem disk usage
|
||||
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
|
||||
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
|
||||
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
Ok((total_bytes, used_bytes, usage_percent as f32))
|
||||
}
|
||||
|
||||
|
||||
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
||||
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
|
||||
"disk"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting multi-disk metrics");
|
||||
|
||||
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
|
||||
});
|
||||
|
||||
if temperature > 0.0 {
|
||||
let temp_status = if temperature >= 70.0 {
|
||||
Status::Critical
|
||||
} else if temperature >= 60.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
let metric_name = format!("disk_smart_{}_temperature", device_name);
|
||||
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_temperature", device_name),
|
||||
|
||||
@@ -4,7 +4,7 @@ use thiserror::Error;
|
||||
pub enum CollectorError {
|
||||
#[error("Failed to read system file {path}: {error}")]
|
||||
SystemRead { path: String, error: String },
|
||||
|
||||
|
||||
#[error("Failed to parse value '{value}': {error}")]
|
||||
Parse { value: String, error: String },
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, utils};
|
||||
use super::{utils, Collector, CollectorError};
|
||||
use crate::config::MemoryConfig;
|
||||
|
||||
/// Extremely efficient memory metrics collector
|
||||
///
|
||||
///
|
||||
/// EFFICIENCY OPTIMIZATIONS:
|
||||
/// - Single /proc/meminfo read for all memory metrics
|
||||
/// - Minimal string parsing with split operations
|
||||
@@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
|
||||
pub struct MemoryCollector {
|
||||
config: MemoryConfig,
|
||||
name: String,
|
||||
usage_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
/// Memory information parsed from /proc/meminfo
|
||||
@@ -33,36 +34,38 @@ struct MemoryInfo {
|
||||
|
||||
impl MemoryCollector {
|
||||
pub fn new(config: MemoryConfig) -> Self {
|
||||
// Create hysteresis thresholds with 5% gap for memory usage
|
||||
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
config.usage_warning_percent,
|
||||
5.0, // 5% gap for warning recovery
|
||||
config.usage_critical_percent,
|
||||
5.0, // 5% gap for critical recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "memory".to_string(),
|
||||
|
||||
usage_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using configured thresholds
|
||||
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
|
||||
if usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using hysteresis thresholds
|
||||
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
||||
}
|
||||
|
||||
|
||||
/// Parse /proc/meminfo efficiently
|
||||
/// Format: "MemTotal: 16384000 kB"
|
||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/meminfo")?;
|
||||
let mut info = MemoryInfo::default();
|
||||
|
||||
|
||||
// Parse each line efficiently - only extract what we need
|
||||
for line in content.lines() {
|
||||
if let Some(colon_pos) = line.find(':') {
|
||||
let key = &line[..colon_pos];
|
||||
let value_part = &line[colon_pos + 1..];
|
||||
|
||||
|
||||
// Extract number from value part (format: " 12345 kB")
|
||||
if let Some(number_str) = value_part.split_whitespace().next() {
|
||||
if let Ok(value_kb) = utils::parse_u64(number_str) {
|
||||
@@ -80,7 +83,7 @@ impl MemoryCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate that we got essential fields
|
||||
if info.total_kb == 0 {
|
||||
return Err(CollectorError::Parse {
|
||||
@@ -88,87 +91,105 @@ impl MemoryCollector {
|
||||
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// If MemAvailable is not available (older kernels), calculate it
|
||||
if info.available_kb == 0 {
|
||||
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
|
||||
}
|
||||
|
||||
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
|
||||
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
||||
fn kb_to_gb(kb: u64) -> f32 {
|
||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||
}
|
||||
|
||||
|
||||
/// Calculate memory metrics from parsed info
|
||||
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
|
||||
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
||||
let mut metrics = Vec::with_capacity(6);
|
||||
|
||||
|
||||
// Calculate derived values
|
||||
let used_kb = info.total_kb - info.available_kb;
|
||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||
let usage_status = self.calculate_usage_status(usage_percent);
|
||||
|
||||
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
||||
|
||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||
|
||||
|
||||
// Convert to GB for metrics
|
||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||
let used_gb = Self::kb_to_gb(used_kb);
|
||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||
|
||||
|
||||
// Memory usage percentage (primary metric with status)
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
).with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
)
|
||||
.with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()),
|
||||
);
|
||||
|
||||
// Total memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
).with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
)
|
||||
.with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Used memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
).with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Available memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
).with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Swap metrics (only if swap exists)
|
||||
if info.swap_total_kb > 0 {
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
).with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
).with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
metrics
|
||||
}
|
||||
}
|
||||
@@ -178,34 +199,39 @@ impl Collector for MemoryCollector {
|
||||
fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting memory metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
|
||||
// Parse memory info from /proc/meminfo
|
||||
let info = self.parse_meminfo().await?;
|
||||
|
||||
|
||||
// Calculate all metrics from parsed info
|
||||
let metrics = self.calculate_metrics(&info);
|
||||
|
||||
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
|
||||
|
||||
debug!(
|
||||
"Memory collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
|
||||
debug!(
|
||||
"Memory collection took {}ms - consider optimization",
|
||||
duration.as_millis()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,7 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::Metric;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod cpu;
|
||||
pub mod memory;
|
||||
pub mod disk;
|
||||
pub mod systemd;
|
||||
pub mod backup;
|
||||
pub mod error;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
/// Performance metrics for a collector
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PerformanceMetrics {
|
||||
@@ -18,69 +9,78 @@ pub struct PerformanceMetrics {
|
||||
pub collection_efficiency_percent: f32,
|
||||
}
|
||||
|
||||
pub mod backup;
|
||||
pub mod cpu;
|
||||
pub mod disk;
|
||||
pub mod error;
|
||||
pub mod memory;
|
||||
pub mod systemd;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
|
||||
/// Base trait for all collectors with extreme efficiency requirements
|
||||
#[async_trait]
|
||||
pub trait Collector: Send + Sync {
|
||||
/// Name of this collector
|
||||
fn name(&self) -> &str;
|
||||
|
||||
|
||||
/// Collect all metrics this collector provides
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
/// Get performance metrics for monitoring collector efficiency
|
||||
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
|
||||
None
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// CPU efficiency rules for all collectors
|
||||
pub mod efficiency {
|
||||
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
|
||||
/// 1. FILE READING RULES
|
||||
/// - Read entire files in single syscall when possible
|
||||
/// - Use BufReader only for very large files (>4KB)
|
||||
/// - Never read files character by character
|
||||
/// - Cache file descriptors when safe (immutable paths)
|
||||
|
||||
/// 2. PARSING RULES
|
||||
/// - Use split() instead of regex for simple patterns
|
||||
/// - Parse numbers with from_str() not complex parsing
|
||||
/// - Avoid string allocations in hot paths
|
||||
/// - Use str::trim() before parsing numbers
|
||||
|
||||
/// 3. MEMORY ALLOCATION RULES
|
||||
/// - Reuse Vec buffers when possible
|
||||
/// - Pre-allocate collections with known sizes
|
||||
/// - Use str slices instead of String when possible
|
||||
/// - Avoid clone() in hot paths
|
||||
|
||||
/// 4. SYSTEM CALL RULES
|
||||
/// - Minimize syscalls - prefer single reads over multiple
|
||||
/// - Use /proc filesystem efficiently
|
||||
/// - Avoid spawning processes when /proc data available
|
||||
/// - Cache static data (like CPU count)
|
||||
|
||||
/// 5. ERROR HANDLING RULES
|
||||
/// - Use Result<> but minimize allocation in error paths
|
||||
/// - Log errors at debug level only to avoid I/O overhead
|
||||
/// - Graceful degradation - missing metrics better than failing
|
||||
/// - Never panic in collectors
|
||||
|
||||
/// 6. CONCURRENCY RULES
|
||||
/// - Collectors must be thread-safe but avoid locks
|
||||
/// - Use atomic operations for simple counters
|
||||
/// - Avoid shared mutable state between collections
|
||||
/// - Each collection should be independent
|
||||
|
||||
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
|
||||
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
//!
|
||||
//! # FILE READING RULES
|
||||
//! - Read entire files in single syscall when possible
|
||||
//! - Use BufReader only for very large files (>4KB)
|
||||
//! - Never read files character by character
|
||||
//! - Cache file descriptors when safe (immutable paths)
|
||||
//!
|
||||
//! # PARSING RULES
|
||||
//! - Use split() instead of regex for simple patterns
|
||||
//! - Parse numbers with from_str() not complex parsing
|
||||
//! - Avoid string allocations in hot paths
|
||||
//! - Use str::trim() before parsing numbers
|
||||
//!
|
||||
//! # MEMORY ALLOCATION RULES
|
||||
//! - Reuse Vec buffers when possible
|
||||
//! - Pre-allocate collections with known sizes
|
||||
//! - Use str slices instead of String when possible
|
||||
//! - Avoid clone() in hot paths
|
||||
//!
|
||||
//! # SYSTEM CALL RULES
|
||||
//! - Minimize syscalls - prefer single reads over multiple
|
||||
//! - Use /proc filesystem efficiently
|
||||
//! - Avoid spawning processes when /proc data available
|
||||
//! - Cache static data (like CPU count)
|
||||
//!
|
||||
//! # ERROR HANDLING RULES
|
||||
//! - Use Result<> but minimize allocation in error paths
|
||||
//! - Log errors at debug level only to avoid I/O overhead
|
||||
//! - Graceful degradation - missing metrics better than failing
|
||||
//! - Never panic in collectors
|
||||
//!
|
||||
//! # CONCURRENCY RULES
|
||||
//! - Collectors must be thread-safe but avoid locks
|
||||
//! - Use atomic operations for simple counters
|
||||
//! - Avoid shared mutable state between collections
|
||||
//! - Each collection should be independent
|
||||
}
|
||||
|
||||
/// Utility functions for efficient system data collection
|
||||
pub mod utils {
|
||||
use std::fs;
|
||||
use super::CollectorError;
|
||||
|
||||
use std::fs;
|
||||
|
||||
/// Read entire file content efficiently
|
||||
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
||||
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
|
||||
@@ -88,25 +88,25 @@ pub mod utils {
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse float from string slice efficiently
|
||||
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse integer from string slice efficiently
|
||||
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Split string and get nth element safely
|
||||
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
|
||||
s.split(delimiter).nth(n)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
@@ -401,7 +401,7 @@ impl Collector for SystemdCollector {
|
||||
"systemd"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting systemd services metrics");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user