Implement real-time process monitoring and fix UI hardcoded data

This commit addresses several key issues identified during development:

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using ps command
- Fix disk metrics permission issues in systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering to prevent self-monitoring artifacts (ps commands)
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with btop-inspired multi-panel design
- Updated system panel to include real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
This commit is contained in:
2025-10-16 23:55:05 +02:00
parent 7a664ef0fb
commit 8a36472a3d
81 changed files with 7702 additions and 9608 deletions

View File

@@ -0,0 +1,230 @@
use cm_dashboard_shared::{Metric, Status};
use std::collections::HashMap;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};
use super::{MetricDataPoint, WidgetType, subscriptions};
/// Central metric storage for the dashboard
pub struct MetricStore {
/// Current metrics: hostname -> metric_name -> metric
current_metrics: HashMap<String, HashMap<String, Metric>>,
/// Historical metrics for trending
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
/// Last update timestamp per host
last_update: HashMap<String, Instant>,
/// Configuration
max_metrics_per_host: usize,
history_retention: Duration,
}
impl MetricStore {
pub fn new(max_metrics_per_host: usize, history_retention_hours: u64) -> Self {
Self {
current_metrics: HashMap::new(),
historical_metrics: HashMap::new(),
last_update: HashMap::new(),
max_metrics_per_host,
history_retention: Duration::from_secs(history_retention_hours * 3600),
}
}
/// Update metrics for a specific host
pub fn update_metrics(&mut self, hostname: &str, metrics: Vec<Metric>) {
let now = Instant::now();
debug!("Updating {} metrics for host {}", metrics.len(), hostname);
// Get or create host entry
let host_metrics = self.current_metrics
.entry(hostname.to_string())
.or_insert_with(HashMap::new);
// Get or create historical entry
let host_history = self.historical_metrics
.entry(hostname.to_string())
.or_insert_with(Vec::new);
// Update current metrics and add to history
for metric in metrics {
let metric_name = metric.name.clone();
// Store current metric
host_metrics.insert(metric_name.clone(), metric.clone());
// Add to history
host_history.push(MetricDataPoint {
metric,
received_at: now,
});
}
// Update last update timestamp
self.last_update.insert(hostname.to_string(), now);
// Get metrics count before cleanup
let metrics_count = host_metrics.len();
// Cleanup old history and enforce limits
self.cleanup_host_data(hostname);
info!("Updated metrics for {}: {} current metrics",
hostname, metrics_count);
}
/// Get current metric for a specific host
pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric> {
self.current_metrics
.get(hostname)?
.get(metric_name)
}
/// Get all current metrics for a host
pub fn get_host_metrics(&self, hostname: &str) -> Option<&HashMap<String, Metric>> {
self.current_metrics.get(hostname)
}
/// Get all current metrics for a host as a vector
pub fn get_metrics_for_host(&self, hostname: &str) -> Vec<&Metric> {
if let Some(metrics_map) = self.current_metrics.get(hostname) {
metrics_map.values().collect()
} else {
Vec::new()
}
}
/// Get metrics for a specific widget type
pub fn get_metrics_for_widget(&self, hostname: &str, widget_type: WidgetType) -> Vec<&Metric> {
let subscriptions = subscriptions::get_widget_subscriptions(widget_type);
if let Some(host_metrics) = self.get_host_metrics(hostname) {
subscriptions
.iter()
.filter_map(|&metric_name| host_metrics.get(metric_name))
.collect()
} else {
Vec::new()
}
}
/// Get aggregated status for a widget
pub fn get_widget_status(&self, hostname: &str, widget_type: WidgetType) -> Status {
let metrics = self.get_metrics_for_widget(hostname, widget_type);
if metrics.is_empty() {
Status::Unknown
} else {
let statuses: Vec<Status> = metrics.iter().map(|m| m.status).collect();
Status::aggregate(&statuses)
}
}
/// Get list of all hosts with metrics
pub fn get_hosts(&self) -> Vec<String> {
self.current_metrics.keys().cloned().collect()
}
/// Get connected hosts (hosts with recent updates)
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
let now = Instant::now();
self.last_update
.iter()
.filter_map(|(hostname, &last_update)| {
if now.duration_since(last_update) <= timeout {
Some(hostname.clone())
} else {
None
}
})
.collect()
}
/// Get last update timestamp for a host
pub fn get_last_update(&self, hostname: &str) -> Option<Instant> {
self.last_update.get(hostname).copied()
}
/// Check if host is considered connected
pub fn is_host_connected(&self, hostname: &str, timeout: Duration) -> bool {
if let Some(&last_update) = self.last_update.get(hostname) {
Instant::now().duration_since(last_update) <= timeout
} else {
false
}
}
/// Get metric value as specific type (helper function)
pub fn get_metric_value_f32(&self, hostname: &str, metric_name: &str) -> Option<f32> {
self.get_metric(hostname, metric_name)?
.value
.as_f32()
}
/// Get metric value as string (helper function)
pub fn get_metric_value_string(&self, hostname: &str, metric_name: &str) -> Option<String> {
Some(self.get_metric(hostname, metric_name)?
.value
.as_string())
}
/// Get historical data for a metric
pub fn get_metric_history(&self, hostname: &str, metric_name: &str) -> Vec<&MetricDataPoint> {
if let Some(history) = self.historical_metrics.get(hostname) {
history
.iter()
.filter(|dp| dp.metric.name == metric_name)
.collect()
} else {
Vec::new()
}
}
/// Cleanup old data and enforce limits
fn cleanup_host_data(&mut self, hostname: &str) {
let now = Instant::now();
// Cleanup historical data
if let Some(history) = self.historical_metrics.get_mut(hostname) {
// Remove old entries
history.retain(|dp| now.duration_since(dp.received_at) <= self.history_retention);
// Enforce size limit
if history.len() > self.max_metrics_per_host {
let excess = history.len() - self.max_metrics_per_host;
history.drain(0..excess);
warn!("Trimmed {} old metrics for host {} (size limit: {})",
excess, hostname, self.max_metrics_per_host);
}
}
}
/// Get storage statistics
pub fn get_stats(&self) -> MetricStoreStats {
let total_current_metrics: usize = self.current_metrics
.values()
.map(|host_metrics| host_metrics.len())
.sum();
let total_historical_metrics: usize = self.historical_metrics
.values()
.map(|history| history.len())
.sum();
MetricStoreStats {
total_hosts: self.current_metrics.len(),
total_current_metrics,
total_historical_metrics,
connected_hosts: self.get_connected_hosts(Duration::from_secs(30)).len(),
}
}
}
/// Metric store statistics
#[derive(Debug, Clone)]
pub struct MetricStoreStats {
pub total_hosts: usize,
pub total_current_metrics: usize,
pub total_historical_metrics: usize,
pub connected_hosts: usize,
}