Implement real-time process monitoring and fix UI hardcoded data

This commit addresses several key issues identified during development:

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using ps command
- Fix disk metrics permission issues in systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering to prevent self-monitoring artifacts (ps commands)
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with btop-inspired multi-panel design
- Updated system panel to include real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
This commit is contained in:
2025-10-16 23:55:05 +02:00
parent 7a664ef0fb
commit 8a36472a3d
81 changed files with 7702 additions and 9608 deletions

171
shared/src/cache.rs Normal file
View File

@@ -0,0 +1,171 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Cache tier configuration.
///
/// A tier is looked up by name in `CacheConfig::tiers`; every metric assigned
/// to the tier shares its interval.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CacheTier {
/// Interval in seconds; this is the value `CacheConfig::get_cache_interval`
/// returns for a metric that resolves to this tier.
pub interval_seconds: u64,
/// Free-form, human-readable description of the tier.
pub description: String,
}
/// Cache configuration.
///
/// Holds global cache settings plus the tier table and the mapping from
/// metric-name patterns to tier names used by `get_tier_for_metric`.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CacheConfig {
/// NOTE(review): presumably the master on/off switch for caching; it is not
/// read anywhere in this file - confirm against the consumers.
pub enabled: bool,
/// Interval in seconds returned by `get_cache_interval` when a metric does
/// not resolve to any tier.
pub default_ttl_seconds: u64,
/// NOTE(review): presumably an upper bound on cached entries; not read in
/// this file - confirm.
pub max_entries: usize,
/// NOTE(review): presumably a timeout for cache warming, in seconds; not
/// read in this file - confirm.
pub warming_timeout_seconds: u64,
/// NOTE(review): presumably toggles background refresh; not read in this
/// file - confirm.
pub background_refresh_enabled: bool,
/// NOTE(review): presumably the period of a cleanup pass, in seconds; not
/// read in this file - confirm.
pub cleanup_interval_seconds: u64,
/// Tier definitions keyed by tier name.
pub tiers: HashMap<String, CacheTier>,
/// Metric-name pattern (may contain `*` wildcards) mapped to a tier name
/// that is looked up in `tiers`.
pub metric_assignments: HashMap<String, String>,
}
impl Default for CacheConfig {
fn default() -> Self {
let mut tiers = HashMap::new();
tiers.insert("realtime".to_string(), CacheTier {
interval_seconds: 2,
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
});
tiers.insert("disk_light".to_string(), CacheTier {
interval_seconds: 60,
description: "Light disk operations - 1 minute (service status checks)".to_string(),
});
tiers.insert("disk_medium".to_string(), CacheTier {
interval_seconds: 300,
description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
});
tiers.insert("disk_heavy".to_string(), CacheTier {
interval_seconds: 900,
description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
});
tiers.insert("static".to_string(), CacheTier {
interval_seconds: 3600,
description: "Hardware info that rarely changes - 1 hour".to_string(),
});
let mut metric_assignments = HashMap::new();
// REALTIME (5s) - Memory/CPU operations, no disk I/O
metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
metric_assignments.insert("cpu_frequency_*".to_string(), "realtime".to_string());
metric_assignments.insert("memory_*".to_string(), "realtime".to_string());
metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
metric_assignments.insert("network_*".to_string(), "realtime".to_string());
// DISK_LIGHT (1min) - Light disk operations: service status checks
metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
// DISK_MEDIUM (5min) - Medium disk operations: du commands, disk usage
metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
// DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
Self {
enabled: true,
default_ttl_seconds: 30,
max_entries: 10000,
warming_timeout_seconds: 3,
background_refresh_enabled: true,
cleanup_interval_seconds: 1800,
tiers,
metric_assignments,
}
}
}
impl CacheConfig {
    /// Get the cache tier for a metric name.
    ///
    /// Every pattern in `metric_assignments` is tested against `metric_name`.
    /// When several patterns match, the most specific one wins: the pattern
    /// with the most literal (non-`*`) characters, ties broken in favour of the
    /// lexicographically smallest pattern. This makes the result independent
    /// of `HashMap` iteration order.
    ///
    /// Returns `None` when no pattern matches, or when the winning pattern
    /// names a tier that is missing from `tiers`.
    pub fn get_tier_for_metric(&self, metric_name: &str) -> Option<&CacheTier> {
        self.metric_assignments
            .iter()
            .filter(|(pattern, _)| self.matches_pattern(metric_name, pattern))
            .max_by(|(a, _), (b, _)| {
                Self::literal_len(a)
                    .cmp(&Self::literal_len(b))
                    // Reversed on purpose: among equally specific patterns the
                    // lexicographically smallest must compare as the maximum.
                    .then_with(|| b.as_str().cmp(a.as_str()))
            })
            .and_then(|(_, tier_name)| self.tiers.get(tier_name))
    }

    /// Number of literal (non-wildcard) characters in `pattern`; used to rank
    /// competing patterns by specificity.
    fn literal_len(pattern: &str) -> usize {
        pattern.chars().filter(|&c| c != '*').count()
    }

    /// Check if metric name matches pattern (supports `*` wildcards).
    ///
    /// A pattern without `*` must equal the name exactly. Otherwise each `*`
    /// matches any (possibly empty) run of characters: the text before the
    /// first `*` must be a prefix of the name, the text after the last `*`
    /// must be a suffix that does not overlap the prefix, and any segments in
    /// between must occur in order in the remaining middle part.
    fn matches_pattern(&self, metric_name: &str, pattern: &str) -> bool {
        if !pattern.contains('*') {
            return metric_name == pattern;
        }

        // A pattern containing '*' always splits into at least two segments,
        // so both ends are present; `unwrap_or` is only a formality.
        let mut segments = pattern.split('*');
        let prefix = segments.next().unwrap_or("");
        let suffix = segments.next_back().unwrap_or("");

        // Stripping the prefix first and the suffix from what is left
        // guarantees the two cannot overlap inside the name.
        let mut remaining = match metric_name
            .strip_prefix(prefix)
            .and_then(|rest| rest.strip_suffix(suffix))
        {
            Some(inner) => inner,
            None => return false,
        };

        // Leftmost-first search is sufficient for patterns whose only
        // wildcard is '*'. Empty segments come from adjacent wildcards.
        for segment in segments.filter(|segment| !segment.is_empty()) {
            match remaining.find(segment) {
                Some(start) => remaining = &remaining[start + segment.len()..],
                None => return false,
            }
        }
        true
    }

    /// Get cache interval for a metric.
    ///
    /// Returns the matching tier's `interval_seconds`, or
    /// `default_ttl_seconds` when the metric does not resolve to a tier.
    pub fn get_cache_interval(&self, metric_name: &str) -> u64 {
        self.get_tier_for_metric(metric_name)
            .map_or(self.default_ttl_seconds, |tier| tier.interval_seconds)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wildcard patterns match names sharing their literal prefix/suffix and
    /// reject unrelated names.
    #[test]
    fn test_pattern_matching() {
        let config = CacheConfig::default();
        assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
        assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
        assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
    }

    /// Default metric assignments resolve to the intervals configured in
    /// `CacheConfig::default()`.
    #[test]
    fn test_tier_assignment() {
        let config = CacheConfig::default();
        // Realtime (2s) - CPU/Memory operations
        assert_eq!(config.get_cache_interval("cpu_load_1min"), 2);
        assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
        assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
        // Disk light (60s) - Service status
        assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
        // Disk medium (300s) - Disk usage
        assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
        assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
        // Disk heavy (900s) - SMART data
        assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
        assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
        // Unassigned metrics fall back to the default TTL
        assert_eq!(
            config.get_cache_interval("uptime_seconds"),
            config.default_ttl_seconds
        );
    }
}