Implement real-time process monitoring and replace hardcoded UI data

This commit addresses several key issues identified during development:

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using ps command
- Fix disk metrics permission issues in systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering to prevent self-monitoring artifacts (ps commands)
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with btop-inspired multi-panel design
- Updated system panel to include real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
This commit is contained in:
2025-10-16 23:55:05 +02:00
parent 7a664ef0fb
commit 8a36472a3d
81 changed files with 7702 additions and 9608 deletions

View File

@@ -0,0 +1,173 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use std::process::Command;
use std::time::Instant;
use tracing::debug;
use super::{Collector, CollectorError, PerformanceMetrics};
/// Disk usage collector for monitoring filesystem sizes.
///
/// The collector holds no fields, so `DiskCollector::default()` and
/// `DiskCollector::new()` produce the same value.
#[derive(Debug, Default)]
pub struct DiskCollector {
    // Immutable collector for caching compatibility
}
impl DiskCollector {
    /// Creates a new disk collector.
    pub fn new() -> Self {
        Self {}
    }

    /// Returns the size of `path` in bytes as reported by `du -s --block-size=1`.
    ///
    /// # Errors
    ///
    /// Fails when `du` cannot be spawned, when its stdout is not valid UTF-8,
    /// when it prints nothing, or when the first token of its output is not an
    /// unsigned integer.
    fn get_directory_size(&self, path: &str) -> Result<u64> {
        let output = Command::new("du")
            .arg("-s")
            .arg("--block-size=1")
            // End-of-options marker: a path beginning with '-' must not be
            // interpreted as a flag.
            .arg("--")
            .arg(path)
            .output()?;

        // The exit status is deliberately not checked: du can complain about
        // unreadable entries on stderr and still print a usable total on
        // stdout. Only a missing total is treated as a failure.
        let output_str = String::from_utf8(output.stdout)?;

        // The first whitespace-separated token is the byte count; it is absent
        // exactly when du printed nothing.
        let size_str = output_str
            .split_whitespace()
            .next()
            .ok_or_else(|| anyhow::anyhow!("du command produced no output for {}", path))?;

        Ok(size_str.parse::<u64>()?)
    }

    /// Returns `(total_bytes, used_bytes)` for the filesystem containing `path`,
    /// as reported by `df`.
    ///
    /// # Errors
    ///
    /// Fails when `df` cannot be spawned or exits unsuccessfully, when its
    /// output is not valid UTF-8, or when the data line is missing or does not
    /// hold two unsigned integers.
    fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
        let output = Command::new("df")
            .arg("--block-size=1")
            // Ask for the two numeric columns only. Parsing the default table
            // by column position breaks when the device name contains spaces.
            .arg("--output=size,used")
            .arg("--")
            .arg(path)
            .output()?;

        if !output.status.success() {
            return Err(anyhow::anyhow!("df command failed for {}", path));
        }

        let output_str = String::from_utf8(output.stdout)?;

        // Line 0 is the column header; line 1 carries the values.
        let data_line = output_str
            .lines()
            .nth(1)
            .ok_or_else(|| anyhow::anyhow!("Unexpected df output format"))?;

        let mut fields = data_line.split_whitespace();
        let missing_field = || anyhow::anyhow!("Unexpected df fields count");
        let total_bytes = fields.next().ok_or_else(missing_field)?.parse::<u64>()?;
        let used_bytes = fields.next().ok_or_else(missing_field)?.parse::<u64>()?;

        Ok((total_bytes, used_bytes))
    }

    /// Maps a used/total byte pair to a [`Status`].
    ///
    /// Returns `Status::Unknown` when `total_bytes` is zero, `Status::Critical`
    /// at 95% usage or above, `Status::Warning` at 85% or above, and
    /// `Status::Ok` otherwise.
    fn calculate_usage_status(&self, used_bytes: u64, total_bytes: u64) -> Status {
        // Thresholds for disk usage, in percent.
        const CRITICAL_PERCENT: f64 = 95.0;
        const WARNING_PERCENT: f64 = 85.0;

        // A zero-sized filesystem has no meaningful usage ratio.
        if total_bytes == 0 {
            return Status::Unknown;
        }

        let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;

        if usage_percent >= CRITICAL_PERCENT {
            Status::Critical
        } else if usage_percent >= WARNING_PERCENT {
            Status::Warning
        } else {
            Status::Ok
        }
    }
}
#[async_trait]
impl Collector for DiskCollector {
    /// Name under which this collector is registered.
    fn name(&self) -> &str {
        "disk"
    }

    /// Collects size metrics for the `/tmp` directory.
    ///
    /// On success three metrics are returned: `disk_tmp_size_mb`,
    /// `disk_tmp_total_mb` and `disk_tmp_usage_percent`. When the directory
    /// size cannot be determined, a single `disk_tmp_size_mb` metric with the
    /// string value `"error"` and `Status::Unknown` is returned instead. The
    /// method itself always returns `Ok`.
    async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
        const BYTES_PER_MB: f64 = 1024.0 * 1024.0;
        // Assumed capacity when df cannot be queried (tmpfs with a 2GB limit).
        const FALLBACK_TMPFS_BYTES: u64 = 2 * 1024 * 1024 * 1024;

        let start_time = Instant::now();
        debug!("Collecting disk metrics");

        let mut metrics = Vec::new();

        // NOTE(review): du and df are run synchronously via std::process and
        // block this task's thread until they finish — confirm that is
        // acceptable for the runtime driving the collectors.
        // Monitor /tmp directory size
        match self.get_directory_size("/tmp") {
            Ok(tmp_size_bytes) => {
                let tmp_size_mb = tmp_size_bytes as f64 / BYTES_PER_MB;

                // Only the filesystem capacity is needed; usage is taken from
                // the du measurement above.
                let total_bytes = match self.get_filesystem_info("/tmp") {
                    Ok((total, _used)) => total,
                    Err(e) => {
                        debug!("Failed to get /tmp filesystem info, assuming 2GB limit: {}", e);
                        FALLBACK_TMPFS_BYTES
                    }
                };

                let total_mb = total_bytes as f64 / BYTES_PER_MB;

                // Guard against a zero-sized filesystem so the metric never
                // carries inf/NaN; the status is Unknown in that case.
                let usage_percent = if total_bytes == 0 {
                    0.0
                } else {
                    (tmp_size_bytes as f64 / total_bytes as f64) * 100.0
                };
                let status = self.calculate_usage_status(tmp_size_bytes, total_bytes);

                // One timestamp for the whole snapshot so the three metrics
                // of a collection always agree.
                let timestamp = chrono::Utc::now().timestamp() as u64;

                metrics.push(Metric {
                    name: "disk_tmp_size_mb".to_string(),
                    value: MetricValue::Float(tmp_size_mb as f32),
                    unit: Some("MB".to_string()),
                    description: Some(format!("Used: {:.1} MB", tmp_size_mb)),
                    status,
                    timestamp,
                });

                metrics.push(Metric {
                    name: "disk_tmp_total_mb".to_string(),
                    value: MetricValue::Float(total_mb as f32),
                    unit: Some("MB".to_string()),
                    description: Some(format!("Total: {:.1} MB", total_mb)),
                    status: Status::Ok,
                    timestamp,
                });

                metrics.push(Metric {
                    name: "disk_tmp_usage_percent".to_string(),
                    value: MetricValue::Float(usage_percent as f32),
                    unit: Some("%".to_string()),
                    description: Some(format!("Usage: {:.1}%", usage_percent)),
                    status,
                    timestamp,
                });
            }
            Err(e) => {
                debug!("Failed to get /tmp size: {}", e);
                metrics.push(Metric {
                    name: "disk_tmp_size_mb".to_string(),
                    value: MetricValue::String("error".to_string()),
                    unit: Some("MB".to_string()),
                    description: Some(format!("Error: {}", e)),
                    status: Status::Unknown,
                    timestamp: chrono::Utc::now().timestamp() as u64,
                });
            }
        }

        let collection_time = start_time.elapsed();
        debug!("Disk collection completed in {:?} with {} metrics",
            collection_time, metrics.len());

        Ok(metrics)
    }

    /// Always returns `None`; this collector records no performance data itself.
    fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
        None // Performance tracking handled by cache system
    }
}