From c68ccf023eb1be0dee950d5abdf30967ecd5dbde Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Mon, 13 Oct 2025 00:28:06 +0200 Subject: [PATCH] Testing --- CLAUDE.md | 4 + README.md | 348 ++++++++++++++++++++++++++++--------- agent/src/notifications.rs | 34 +++- agent/src/simple_agent.rs | 33 +++- 4 files changed, 335 insertions(+), 84 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d30d568..e2e9e51 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -273,6 +273,10 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta - [x] Updated dashboard to parse and display SystemCollector data - [x] Enhanced service notifications to include specific failure details - [x] CPU temperature thresholds set to 100°C (effectively disabled) +- [x] **SystemCollector bug fixes completed (2025-10-12)** +- [x] Fixed CPU load parsing for comma decimal separator locale (", " split) +- [x] Fixed CPU temperature to prioritize x86_pkg_temp over generic thermal zones +- [x] Fixed C-state collection to discover all available states (including C10) **Production Configuration:** - CPU load thresholds: Warning ≥ 5.0, Critical ≥ 8.0 diff --git a/README.md b/README.md index 4c51796..2adc721 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# CM Dashboard +# CM Dashboard - Infrastructure Monitoring TUI -CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC infrastructure hosts. It subscribes to the CMTEC ZMQ gossip network where lightweight agents publish SMART, service, and backup metrics, and presents them in an efficient, keyboard-driven interface built with `ratatui`. +A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for specific monitoring needs and API integrations. Features real-time monitoring of all infrastructure components with intelligent email notifications and automatic status calculation. 
``` ┌─────────────────────────────────────────────────────────────────────┐ @@ -28,41 +28,116 @@ CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC inf Keys: [←→] hosts [r]efresh [q]uit ``` -## Requirements +## Key Features -- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs)) -- Network access to the CMTEC metrics gossip agents (default `tcp://:6130`; install `zeromq`/`libzmq` on the host) -- Configuration files under `config/` describing hosts and dashboard preferences +### Real-time Monitoring +- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01 +- **Performance-focused** with minimal resource usage +- **Keyboard-driven interface** for power users +- **ZMQ gossip network** for efficient data distribution + +### Infrastructure Monitoring +- **NVMe health monitoring** with wear prediction and temperature tracking +- **CPU/Memory/GPU telemetry** with automatic thresholding +- **Service resource monitoring** with per-service CPU and RAM usage +- **Disk usage overview** for root filesystems +- **Backup status** with detailed metrics and history +- **C-state monitoring** for CPU power management analysis + +### Intelligent Alerting +- **Agent-calculated status** with predefined thresholds +- **Email notifications** via SMTP with rate limiting +- **Recovery notifications** with context about original issues +- **Stockholm timezone** support for email timestamps +- **Unified alert pipeline** summarizing host health + +## Architecture + +### Agent-Dashboard Separation +The system follows a strict separation of concerns: + +- **Agent**: Single source of truth for all status calculations using defined thresholds +- **Dashboard**: Display-only interface that shows agent-provided status +- **Data Flow**: Agent (calculations) → Status → Dashboard (display) → Colors + +### Agent Thresholds (Production) +- **CPU Load**: Warning ≥ 5.0, Critical ≥ 8.0 +- **Memory Usage**: Warning ≥ 80%, Critical ≥ 95% +- **CPU 
Temperature**: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled) + +### Email Notification System +- **From**: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se) +- **To**: `cm@cmtec.se` +- **SMTP**: localhost:25 (postfix) +- **Rate Limiting**: 30 minutes (configurable) +- **Triggers**: Status degradation and recovery with detailed context ## Installation -Clone the repository and build with Cargo: +### Requirements +- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs)) +- Root privileges for agent (hardware monitoring access) +- Network access for ZMQ communication (default port 6130) +- SMTP server for notifications (postfix recommended) +### Build from Source ```bash git clone https://github.com/cmtec/cm-dashboard.git cd cm-dashboard cargo build --release ``` -The optimized binary is available at `target/release/cm-dashboard`. To install into your Cargo bin directory: +Optimized binaries are available at: +- Dashboard: `target/release/cm-dashboard` +- Agent: `target/release/cm-dashboard-agent` + +### Install with Cargo +```bash +# Install dashboard +cargo install --path dashboard + +# Install agent (requires root for hardware access) +sudo cargo install --path agent +``` + +## Quick Start + +### Dashboard +```bash +# Run with default configuration +cm-dashboard + +# Specify host to monitor +cm-dashboard --host cmbox + +# Override ZMQ endpoints +cm-dashboard --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130 + +# Increase logging verbosity +cm-dashboard -v +``` + +### Agent (Pure Auto-Discovery) +The agent requires **no configuration files** and auto-discovers all system components: ```bash -cargo install --path dashboard +# Basic agent startup (auto-detects everything) +sudo cm-dashboard-agent + +# With verbose logging for troubleshooting +sudo cm-dashboard-agent -v ``` +The agent automatically: +- **Discovers storage devices** for SMART monitoring +- **Detects running systemd services** for resource tracking +- **Configures collection intervals** based on 
system capabilities +- **Sets up email notifications** using hostname@cmtec.se + ## Configuration -On first launch, the dashboard will create `config/dashboard.toml` and `config/hosts.toml` automatically if they do not exist. - -You can also generate starter configuration files manually with the built-in helper: - -```bash -cargo run -p cm-dashboard -- init-config -# or, once installed -cm-dashboard init-config --dir ./config --force -``` - -This produces `config/dashboard.toml` and `config/hosts.toml`. The primary dashboard config looks like: +### Dashboard Configuration +The dashboard creates `config/dashboard.toml` on first run: ```toml [hosts] @@ -73,21 +148,13 @@ name = "srv01" enabled = true [[hosts.hosts]] -name = "labbox" +name = "cmbox" enabled = true [dashboard] tick_rate_ms = 250 history_duration_minutes = 60 -[[dashboard.widgets]] -id = "nvme" -enabled = true - -[[dashboard.widgets]] -id = "alerts" -enabled = true - [data_source] kind = "zmq" @@ -95,73 +162,198 @@ kind = "zmq" endpoints = ["tcp://127.0.0.1:6130"] ``` -Adjust the host list and `data_source.zmq.endpoints` to match your CMTEC gossip network. If you prefer to manage hosts separately, edit the generated `hosts.toml` file. 
- -## Features - -- **Real-time monitoring** with ZMQ gossip network architecture -- **Storage health** with drive capacity, usage, temperature, and wear tracking -- **Per-service resource tracking** including memory and disk usage by service -- **CPU/Memory monitoring** with load averages, temperature, and GPU metrics -- **Alert system** with color-coded highlighting and threshold-based warnings -- **Multi-host support** with seamless host switching (`←`, `→`, `h`, `l`, `Tab`) -- **Backup status** monitoring with restic integration -- **Keyboard-driven interface** with help overlay (`?`) -- **Configuration management** via TOML files for hosts and dashboard settings - -## Getting Started +### Agent Configuration (Optional) +The agent works without configuration but supports optional settings: ```bash -cargo run -p cm-dashboard -- --config config/dashboard.toml -# specify a single host -cargo run -p cm-dashboard -- --host srv01 -# override ZMQ endpoints at runtime -cargo run -p cm-dashboard -- --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130 -# increase logging verbosity -cargo run -p cm-dashboard -- -v +# List available command-line options +cm-dashboard-agent --help + +# Override specific settings +sudo cm-dashboard-agent \ + --hostname cmbox \ + --bind tcp://*:6130 \ + --interval 5000 ``` -### Keyboard Shortcuts +## Monitoring Components + +### System Collector +- **CPU Load**: 1/5/15 minute averages with warning/critical thresholds +- **Memory Usage**: Used/total with percentage calculation +- **CPU Temperature**: x86_pkg_temp prioritized for accuracy +- **C-States**: Power management state distribution (C0-C10) + +### Service Collector +- **Systemd Services**: Auto-discovery of interesting services +- **Resource Usage**: Per-service memory and disk consumption +- **Service Health**: Running/stopped status with detailed failure info + +### SMART Collector +- **NVMe Health**: Temperature, wear leveling, spare blocks +- **Drive Capacity**: Total/used space with 
percentage +- **SMART Attributes**: Critical health indicators + +### Backup Collector +- **Restic Integration**: Backup status and history +- **Health Monitoring**: Success/failure tracking +- **Storage Metrics**: Backup size and retention + +## Keyboard Controls | Key | Action | -| --- | --- | +|-----|--------| | `←` / `h` | Previous host | | `→` / `l` / `Tab` | Next host | | `?` | Toggle help overlay | -| `r` | Update status message | +| `r` | Force refresh | | `q` / `Esc` | Quit | -## Agent +## Email Notifications -The metrics agent runs on each host and publishes SMART, service, and backup data to the ZMQ gossip network. The agent auto-detects system configuration and requires root privileges for hardware monitoring. +### Notification Triggers +- **Status Degradation**: Any status change to warning/critical +- **Recovery**: Warning/critical status returning to ok +- **Service Failures**: Individual service stop/start events -```bash -# Run agent with auto-detection -sudo cargo run -p cm-dashboard-agent +### Example Recovery Email +``` +✅ RESOLVED: system cpu on cmbox -# Run with specific configuration -sudo cargo run -p cm-dashboard-agent -- --config config/agent.toml +Status Change Alert -# Manual configuration -sudo cargo run -p cm-dashboard-agent -- \ - --hostname srv01 \ - --bind tcp://*:6130 \ - --smart-devices nvme0n1,sda \ - --services nginx,postgres +Host: cmbox +Component: system +Metric: cpu +Status Change: warning → ok +Time: 2025-10-12 22:15:30 CET + +Details: +Recovered from: CPU load (1/5/15min): 6.20 / 5.80 / 4.50 +Current status: CPU load (1/5/15min): 3.30 / 3.17 / 2.84 + +-- +CM Dashboard Agent +Generated at 2025-10-12 22:15:30 CET ``` -The agent automatically: -- Detects available storage devices for SMART monitoring -- Discovers running systemd services for resource tracking -- Configures appropriate collection intervals per host type -- Requires root access for `smartctl` and system metrics - -Use `--disable-smart`, `--disable-service`, or 
`--disable-backup` to skip specific collectors. +### Rate Limiting +- **Default**: 30 minutes between notifications per component +- **Testing**: Set to 0 for immediate notifications +- **Configurable**: Adjustable per deployment needs ## Development -- Format: `cargo fmt` -- Check workspace: `cargo check` -- Build release binaries: `cargo build --release` +### Project Structure +``` +cm-dashboard/ +├── agent/ # Monitoring agent +│ ├── src/ +│ │ ├── collectors/ # Data collection modules +│ │ ├── notifications.rs # Email notification system +│ │ └── simple_agent.rs # Main agent logic +├── dashboard/ # TUI dashboard +│ ├── src/ +│ │ ├── ui/ # Widget implementations +│ │ ├── data/ # Data structures +│ │ └── app.rs # Application state +├── shared/ # Common data structures +└── config/ # Configuration files +``` -The dashboard subscribes to the CMTEC ZMQ gossip network (default `tcp://127.0.0.1:6130`). Received metrics are cached per host and retained in an in-memory ring buffer for future trend analysis. 
+### Development Commands +```bash +# Format code +cargo fmt + +# Check all packages +cargo check + +# Run tests +cargo test + +# Build release +cargo build --release + +# Run with logging +RUST_LOG=debug cargo run -p cm-dashboard-agent +``` + +### Architecture Principles + +#### Status Calculation Rules +- **Agent calculates all status** using predefined thresholds +- **Dashboard never calculates status** - only displays agent data +- **No hardcoded thresholds in dashboard** widgets +- **Use "unknown" when agent status missing** (never default to "ok") + +#### Data Flow +``` +System Metrics → Agent Collectors → Status Calculation → ZMQ → Dashboard → Display + ↓ + Email Notifications +``` + +#### Pure Auto-Discovery +- **No config files required** for basic operation +- **Runtime discovery** of system capabilities +- **Service auto-detection** via systemd patterns +- **Storage device enumeration** via /sys filesystem + +## Troubleshooting + +### Common Issues + +#### Agent Won't Start +```bash +# Check permissions (agent requires root) +sudo cm-dashboard-agent -v + +# Verify ZMQ binding +sudo netstat -tulpn | grep 6130 + +# Check system access +sudo smartctl --scan +``` + +#### Dashboard Connection Issues +```bash +# Test ZMQ connectivity +cm-dashboard --zmq-endpoint tcp://target-host:6130 -v + +# Check network connectivity +telnet target-host 6130 +``` + +#### Email Notifications Not Working +```bash +# Check postfix status +sudo systemctl status postfix + +# Test SMTP manually +telnet localhost 25 + +# Verify notification settings +sudo cm-dashboard-agent -v | grep notification +``` + +### Logging +Set `RUST_LOG=debug` for detailed logging: +```bash +RUST_LOG=debug sudo cm-dashboard-agent +RUST_LOG=debug cm-dashboard +``` + +## License + +MIT License - see LICENSE file for details. + +## Contributing + +1. Fork the repository +2. Create feature branch (`git checkout -b feature/amazing-feature`) +3. Commit changes (`git commit -m 'Add amazing feature'`) +4. 
Push to branch (`git push origin feature/amazing-feature`) +5. Open Pull Request + +For bugs and feature requests, please use GitHub Issues. \ No newline at end of file diff --git a/agent/src/notifications.rs b/agent/src/notifications.rs index 9b947e0..7c00e68 100644 --- a/agent/src/notifications.rs +++ b/agent/src/notifications.rs @@ -41,6 +41,7 @@ pub struct StatusChange { pub struct NotificationManager { config: NotificationConfig, last_status: HashMap, // key: "component.metric", value: status + last_details: HashMap, // key: "component.metric", value: details from warning/critical last_notification: HashMap>, // Rate limiting } @@ -49,6 +50,7 @@ impl NotificationManager { Self { config, last_status: HashMap::new(), + last_details: HashMap::new(), last_notification: HashMap::new(), } } @@ -63,16 +65,39 @@ impl NotificationManager { if let Some(old) = &old_status { if old != status { + // For recovery notifications, include original problem details + let change_details = if status == "ok" && (old == "warning" || old == "critical") { + // Recovery: combine current status details with what we recovered from + let old_details = self.last_details.get(&key).cloned(); + match (old_details, &details) { + (Some(old_detail), Some(current_detail)) => Some(format!("Recovered from: {}\nCurrent status: {}", old_detail, current_detail)), + (Some(old_detail), None) => Some(format!("Recovered from: {}", old_detail)), + (None, current) => current.clone(), + } + } else { + details.clone() + }; + let change = StatusChange { component: component.to_string(), metric: metric.to_string(), old_status: old.clone(), new_status: status.to_string(), timestamp: Utc::now(), - details, + details: change_details, }; - self.last_status.insert(key, status.to_string()); + self.last_status.insert(key.clone(), status.to_string()); + + // Store details for warning/critical states (for future recovery notifications) + if status == "warning" || status == "critical" { + if let Some(ref detail) = 
details { + self.last_details.insert(key.clone(), detail.clone()); + } + } else if status == "ok" { + // Clear stored details after recovery + self.last_details.remove(&key); + } if self.should_notify(&change) { return Some(change); @@ -80,7 +105,10 @@ impl NotificationManager { } } else { // First time seeing this metric - store but don't notify - self.last_status.insert(key, status.to_string()); + self.last_status.insert(key.clone(), status.to_string()); + if (status == "warning" || status == "critical") && details.is_some() { + self.last_details.insert(key, details.unwrap()); + } } None diff --git a/agent/src/simple_agent.rs b/agent/src/simple_agent.rs index 96f63d9..7989097 100644 --- a/agent/src/simple_agent.rs +++ b/agent/src/simple_agent.rs @@ -193,7 +193,8 @@ impl SimpleAgent { if let Some(summary) = output.data.get("summary") { // Check CPU status if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) { + let cpu_details = self.build_cpu_details(summary); + if let Some(change) = self.notification_manager.update_status_with_details("system", "cpu", cpu_status, cpu_details) { info!("CPU status change detected: {} -> {}", change.old_status, change.new_status); self.notification_manager.send_notification(change).await; } @@ -201,7 +202,8 @@ impl SimpleAgent { // Check memory status if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) { + let memory_details = self.build_memory_details(summary); + if let Some(change) = self.notification_manager.update_status_with_details("system", "memory", memory_status, memory_details) { info!("Memory status change detected: {} -> {}", change.old_status, change.new_status); self.notification_manager.send_notification(change).await; } @@ -209,7 +211,8 @@ impl SimpleAgent { // Check 
CPU temp status (optional) if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) { + let temp_details = self.build_cpu_temp_details(summary); + if let Some(change) = self.notification_manager.update_status_with_details("system", "cpu_temp", cpu_temp_status, temp_details) { info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status); self.notification_manager.send_notification(change).await; } @@ -232,6 +235,30 @@ impl SimpleAgent { } } + fn build_cpu_details(&self, summary: &serde_json::Value) -> Option { + let cpu_load_1 = summary.get("cpu_load_1").and_then(|v| v.as_f64()).unwrap_or(0.0); + let cpu_load_5 = summary.get("cpu_load_5").and_then(|v| v.as_f64()).unwrap_or(0.0); + let cpu_load_15 = summary.get("cpu_load_15").and_then(|v| v.as_f64()).unwrap_or(0.0); + + Some(format!("CPU load (1/5/15min): {:.2} / {:.2} / {:.2}", cpu_load_1, cpu_load_5, cpu_load_15)) + } + + fn build_memory_details(&self, summary: &serde_json::Value) -> Option { + let used_mb = summary.get("memory_used_mb").and_then(|v| v.as_f64()).unwrap_or(0.0); + let total_mb = summary.get("memory_total_mb").and_then(|v| v.as_f64()).unwrap_or(1.0); + let usage_percent = summary.get("memory_usage_percent").and_then(|v| v.as_f64()).unwrap_or(0.0); + + Some(format!("Memory usage: {:.1} / {:.1} GB ({:.1}%)", used_mb / 1024.0, total_mb / 1024.0, usage_percent)) + } + + fn build_cpu_temp_details(&self, summary: &serde_json::Value) -> Option { + if let Some(temp_c) = summary.get("cpu_temp_c").and_then(|v| v.as_f64()) { + Some(format!("CPU temperature: {:.1}°C", temp_c)) + } else { + None + } + } + fn build_service_failure_details(&self, output: &crate::collectors::CollectorOutput) -> Option { if let Some(services) = output.data.get("services").and_then(|v| v.as_array()) { let mut failed_services = Vec::new();