From c68ccf023eb1be0dee950d5abdf30967ecd5dbde Mon Sep 17 00:00:00 2001
From: Christoffer Martinsson <cm@cmtec.se>
Date: Mon, 13 Oct 2025 00:28:06 +0200
Subject: [PATCH] Testing

---
 CLAUDE.md                  |   4 +
 README.md                  | 348 ++++++++++++++++++++++++++++---------
 agent/src/notifications.rs |  34 +++-
 agent/src/simple_agent.rs  |  33 +++-
 4 files changed, 335 insertions(+), 84 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index d30d568..e2e9e51 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -273,6 +273,10 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta
 - [x] Updated dashboard to parse and display SystemCollector data
 - [x] Enhanced service notifications to include specific failure details
 - [x] CPU temperature thresholds set to 100°C (effectively disabled)
+- [x] **SystemCollector bug fixes completed (2025-10-12)**
+- [x] Fixed CPU load parsing for comma decimal separator locale (", " split)
+- [x] Fixed CPU temperature to prioritize x86_pkg_temp over generic thermal zones
+- [x] Fixed C-state collection to discover all available states (including C10)
 
 **Production Configuration:**
 - CPU load thresholds: Warning ≥ 5.0, Critical ≥ 8.0
diff --git a/README.md b/README.md
index 4c51796..2adc721 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# CM Dashboard
+# CM Dashboard - Infrastructure Monitoring TUI
 
-CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC infrastructure hosts. It subscribes to the CMTEC ZMQ gossip network where lightweight agents publish SMART, service, and backup metrics, and presents them in an efficient, keyboard-driven interface built with `ratatui`.
+A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. It was built to replace Glance with a custom solution tailored to specific monitoring needs and API integrations, and it features real-time monitoring of all infrastructure components with intelligent email notifications and automatic status calculation.
 
 ```
 ┌─────────────────────────────────────────────────────────────────────┐
@@ -28,41 +28,116 @@ CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC inf
 Keys: [←→] hosts [r]efresh [q]uit
 ```
 
-## Requirements
+## Key Features
 
-- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs))
-- Network access to the CMTEC metrics gossip agents (default `tcp://<host>:6130`; install `zeromq`/`libzmq` on the host)
-- Configuration files under `config/` describing hosts and dashboard preferences
+### Real-time Monitoring
+- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01
+- **Performance-focused** with minimal resource usage
+- **Keyboard-driven interface** for power users
+- **ZMQ gossip network** for efficient data distribution
+
+### Infrastructure Monitoring
+- **NVMe health monitoring** with wear prediction and temperature tracking
+- **CPU/Memory/GPU telemetry** with automatic thresholding
+- **Service resource monitoring** with per-service CPU and RAM usage
+- **Disk usage overview** for root filesystems
+- **Backup status** with detailed metrics and history
+- **C-state monitoring** for CPU power management analysis
+
+### Intelligent Alerting
+- **Agent-calculated status** with predefined thresholds
+- **Email notifications** via SMTP with rate limiting
+- **Recovery notifications** with context about original issues
+- **Stockholm timezone** support for email timestamps
+- **Unified alert pipeline** summarizing host health
+
+## Architecture
+
+### Agent-Dashboard Separation
+The system follows a strict separation of concerns:
+
+- **Agent**: Single source of truth for all status calculations using defined thresholds
+- **Dashboard**: Display-only interface that shows agent-provided status
+- **Data Flow**: Agent (calculations) → Status → Dashboard (display) → Colors
+
+### Agent Thresholds (Production)
+- **CPU Load**: Warning ≥ 5.0, Critical ≥ 8.0
+- **Memory Usage**: Warning ≥ 80%, Critical ≥ 95%
+- **CPU Temperature**: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
+
+### Email Notification System
+- **From**: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se)
+- **To**: `cm@cmtec.se`
+- **SMTP**: localhost:25 (postfix)
+- **Rate Limiting**: 30 minutes (configurable)
+- **Triggers**: Status degradation and recovery with detailed context
 
 ## Installation
 
-Clone the repository and build with Cargo:
+### Requirements
+- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs))
+- Root privileges for agent (hardware monitoring access)
+- Network access for ZMQ communication (default port 6130)
+- SMTP server for notifications (postfix recommended)
 
+### Build from Source
 ```bash
 git clone https://github.com/cmtec/cm-dashboard.git
 cd cm-dashboard
 cargo build --release
 ```
 
-The optimized binary is available at `target/release/cm-dashboard`. To install into your Cargo bin directory:
+Optimized binaries are available at:
+- Dashboard: `target/release/cm-dashboard`
+- Agent: `target/release/cm-dashboard-agent`
+
+### Install with Cargo
+```bash
+# Install dashboard
+cargo install --path dashboard
+
+# Install agent (requires root for hardware access)
+sudo cargo install --path agent
+```
+
+## Quick Start
+
+### Dashboard
+```bash
+# Run with default configuration
+cm-dashboard
+
+# Specify host to monitor
+cm-dashboard --host cmbox
+
+# Override ZMQ endpoints
+cm-dashboard --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130
+
+# Increase logging verbosity
+cm-dashboard -v
+```
+
+### Agent (Pure Auto-Discovery)
+The agent requires **no configuration files** and auto-discovers all system components:
 
 ```bash
-cargo install --path dashboard
+# Basic agent startup (auto-detects everything)
+sudo cm-dashboard-agent
+
+# With verbose logging for troubleshooting
+sudo cm-dashboard-agent -v
 ```
 
+The agent automatically:
+- **Discovers storage devices** for SMART monitoring
+- **Detects running systemd services** for resource tracking
+- **Configures collection intervals** based on system capabilities
+- **Sets up email notifications** using hostname@cmtec.se
+
 ## Configuration
 
-On first launch, the dashboard will create `config/dashboard.toml` and `config/hosts.toml` automatically if they do not exist.
-
-You can also generate starter configuration files manually with the built-in helper:
-
-```bash
-cargo run -p cm-dashboard -- init-config
-# or, once installed
-cm-dashboard init-config --dir ./config --force
-```
-
-This produces `config/dashboard.toml` and `config/hosts.toml`. The primary dashboard config looks like:
+### Dashboard Configuration
+The dashboard creates `config/dashboard.toml` on first run:
 
 ```toml
 [hosts]
@@ -73,21 +148,13 @@ name = "srv01"
 enabled = true
 
 [[hosts.hosts]]
-name = "labbox"
+name = "cmbox"
 enabled = true
 
 [dashboard]
 tick_rate_ms = 250
 history_duration_minutes = 60
 
-[[dashboard.widgets]]
-id = "nvme"
-enabled = true
-
-[[dashboard.widgets]]
-id = "alerts"
-enabled = true
-
 [data_source]
 kind = "zmq"
 
@@ -95,73 +162,198 @@ kind = "zmq"
 endpoints = ["tcp://127.0.0.1:6130"]
 ```
 
-Adjust the host list and `data_source.zmq.endpoints` to match your CMTEC gossip network. If you prefer to manage hosts separately, edit the generated `hosts.toml` file.
-
-## Features
-
-- **Real-time monitoring** with ZMQ gossip network architecture
-- **Storage health** with drive capacity, usage, temperature, and wear tracking
-- **Per-service resource tracking** including memory and disk usage by service
-- **CPU/Memory monitoring** with load averages, temperature, and GPU metrics
-- **Alert system** with color-coded highlighting and threshold-based warnings
-- **Multi-host support** with seamless host switching (`←`, `→`, `h`, `l`, `Tab`)
-- **Backup status** monitoring with restic integration
-- **Keyboard-driven interface** with help overlay (`?`)
-- **Configuration management** via TOML files for hosts and dashboard settings
-
-## Getting Started
+### Agent Configuration (Optional)
+The agent works without configuration but supports optional settings:
 
 ```bash
-cargo run -p cm-dashboard -- --config config/dashboard.toml
-# specify a single host
-cargo run -p cm-dashboard -- --host srv01
-# override ZMQ endpoints at runtime
-cargo run -p cm-dashboard -- --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130
-# increase logging verbosity
-cargo run -p cm-dashboard -- -v
+# List the available command-line options
+cm-dashboard-agent --help
+
+# Override specific settings
+sudo cm-dashboard-agent \
+    --hostname cmbox \
+    --bind tcp://*:6130 \
+    --interval 5000
 ```
 
-### Keyboard Shortcuts
+## Monitoring Components
+
+### System Collector
+- **CPU Load**: 1/5/15 minute averages with warning/critical thresholds
+- **Memory Usage**: Used/total with percentage calculation
+- **CPU Temperature**: x86_pkg_temp prioritized for accuracy
+- **C-States**: Power management state distribution (C0-C10)
+
+### Service Collector
+- **Systemd Services**: Auto-discovery of interesting services
+- **Resource Usage**: Per-service memory and disk consumption
+- **Service Health**: Running/stopped status with detailed failure info
+
+### SMART Collector
+- **NVMe Health**: Temperature, wear leveling, spare blocks
+- **Drive Capacity**: Total/used space with percentage
+- **SMART Attributes**: Critical health indicators
+
+### Backup Collector
+- **Restic Integration**: Backup status and history
+- **Health Monitoring**: Success/failure tracking
+- **Storage Metrics**: Backup size and retention
+
+## Keyboard Controls
 
 | Key | Action |
-| --- | --- |
+|-----|--------|
 | `←` / `h` | Previous host |
 | `→` / `l` / `Tab` | Next host |
 | `?` | Toggle help overlay |
-| `r` | Update status message |
+| `r` | Force refresh |
 | `q` / `Esc` | Quit |
 
-## Agent
+## Email Notifications
 
-The metrics agent runs on each host and publishes SMART, service, and backup data to the ZMQ gossip network. The agent auto-detects system configuration and requires root privileges for hardware monitoring.
+### Notification Triggers
+- **Status Degradation**: Any status change to warning/critical
+- **Recovery**: Warning/critical status returning to ok
+- **Service Failures**: Individual service stop/start events
 
-```bash
-# Run agent with auto-detection
-sudo cargo run -p cm-dashboard-agent
+### Example Recovery Email
+```
+✅ RESOLVED: system cpu on cmbox
 
-# Run with specific configuration
-sudo cargo run -p cm-dashboard-agent -- --config config/agent.toml
+Status Change Alert
 
-# Manual configuration
-sudo cargo run -p cm-dashboard-agent -- \
-    --hostname srv01 \
-    --bind tcp://*:6130 \
-    --smart-devices nvme0n1,sda \
-    --services nginx,postgres
+Host: cmbox
+Component: system
+Metric: cpu
+Status Change: warning → ok
+Time: 2025-10-12 22:15:30 CET
+
+Details:
+Recovered from: CPU load (1/5/15min): 6.20 / 5.80 / 4.50
+Current status: CPU load (1/5/15min): 3.30 / 3.17 / 2.84
+
+--
+CM Dashboard Agent
+Generated at 2025-10-12 22:15:30 CET
 ```
 
-The agent automatically:
-- Detects available storage devices for SMART monitoring
-- Discovers running systemd services for resource tracking
-- Configures appropriate collection intervals per host type
-- Requires root access for `smartctl` and system metrics
-
-Use `--disable-smart`, `--disable-service`, or `--disable-backup` to skip specific collectors.
+### Rate Limiting
+- **Default**: 30 minutes between notifications per component
+- **Testing**: Set to 0 for immediate notifications
+- **Configurable**: Adjustable per deployment needs
 
 ## Development
 
-- Format: `cargo fmt`
-- Check workspace: `cargo check`
-- Build release binaries: `cargo build --release`
+### Project Structure
+```
+cm-dashboard/
+├── agent/                 # Monitoring agent
+│   ├── src/
+│   │   ├── collectors/    # Data collection modules
+│   │   ├── notifications.rs # Email notification system
+│   │   └── simple_agent.rs # Main agent logic
+├── dashboard/             # TUI dashboard
+│   ├── src/
+│   │   ├── ui/           # Widget implementations
+│   │   ├── data/         # Data structures
+│   │   └── app.rs        # Application state
+├── shared/               # Common data structures
+└── config/              # Configuration files
+```
 
-The dashboard subscribes to the CMTEC ZMQ gossip network (default `tcp://127.0.0.1:6130`). Received metrics are cached per host and retained in an in-memory ring buffer for future trend analysis.
+### Development Commands
+```bash
+# Format code
+cargo fmt
+
+# Check all packages
+cargo check
+
+# Run tests
+cargo test
+
+# Build release
+cargo build --release
+
+# Run with logging
+RUST_LOG=debug cargo run -p cm-dashboard-agent
+```
+
+### Architecture Principles
+
+#### Status Calculation Rules
+- **Agent calculates all status** using predefined thresholds
+- **Dashboard never calculates status** - only displays agent data
+- **No hardcoded thresholds in dashboard** widgets
+- **Use "unknown" when agent status is missing** (never default to "ok")
+
+#### Data Flow
+```
+System Metrics → Agent Collectors → Status Calculation → ZMQ → Dashboard → Display
+                                         ↓
+                                 Email Notifications
+```
+
+#### Pure Auto-Discovery
+- **No config files required** for basic operation
+- **Runtime discovery** of system capabilities
+- **Service auto-detection** via systemd patterns
+- **Storage device enumeration** via /sys filesystem
+
+## Troubleshooting
+
+### Common Issues
+
+#### Agent Won't Start
+```bash
+# Check permissions (agent requires root)
+sudo cm-dashboard-agent -v
+
+# Verify ZMQ binding
+sudo netstat -tulpn | grep 6130
+
+# Check system access
+sudo smartctl --scan
+```
+
+#### Dashboard Connection Issues
+```bash
+# Test ZMQ connectivity
+cm-dashboard --zmq-endpoint tcp://target-host:6130 -v
+
+# Check network connectivity
+telnet target-host 6130
+```
+
+#### Email Notifications Not Working
+```bash
+# Check postfix status
+sudo systemctl status postfix
+
+# Test SMTP manually
+telnet localhost 25
+
+# Verify notification settings
+sudo cm-dashboard-agent -v | grep notification
+```
+
+### Logging
+Set `RUST_LOG=debug` for detailed logging:
+```bash
+RUST_LOG=debug sudo cm-dashboard-agent
+RUST_LOG=debug cm-dashboard
+```
+
+## License
+
+MIT License - see LICENSE file for details.
+
+## Contributing
+
+1. Fork the repository
+2. Create feature branch (`git checkout -b feature/amazing-feature`)
+3. Commit changes (`git commit -m 'Add amazing feature'`)
+4. Push to branch (`git push origin feature/amazing-feature`)
+5. Open Pull Request
+
+For bugs and feature requests, please use GitHub Issues.
\ No newline at end of file
diff --git a/agent/src/notifications.rs b/agent/src/notifications.rs
index 9b947e0..7c00e68 100644
--- a/agent/src/notifications.rs
+++ b/agent/src/notifications.rs
@@ -41,6 +41,7 @@ pub struct StatusChange {
 pub struct NotificationManager {
     config: NotificationConfig,
     last_status: HashMap<String, String>, // key: "component.metric", value: status
+    last_details: HashMap<String, String>, // key: "component.metric", value: details from warning/critical
     last_notification: HashMap<String, DateTime<Utc>>, // Rate limiting
 }
 
@@ -49,6 +50,7 @@ impl NotificationManager {
         Self {
             config,
             last_status: HashMap::new(),
+            last_details: HashMap::new(),
             last_notification: HashMap::new(),
         }
     }
@@ -63,16 +65,39 @@ impl NotificationManager {
         
         if let Some(old) = &old_status {
             if old != status {
+                // For recovery notifications, include original problem details
+                let change_details = if status == "ok" && (old == "warning" || old == "critical") {
+                    // Recovery: combine current status details with what we recovered from
+                    let old_details = self.last_details.get(&key).cloned();
+                    match (old_details, &details) {
+                        (Some(old_detail), Some(current_detail)) => Some(format!("Recovered from: {}\nCurrent status: {}", old_detail, current_detail)),
+                        (Some(old_detail), None) => Some(format!("Recovered from: {}", old_detail)),
+                        (None, current) => current.clone(),
+                    }
+                } else {
+                    details.clone()
+                };
+                
                 let change = StatusChange {
                     component: component.to_string(),
                     metric: metric.to_string(),
                     old_status: old.clone(),
                     new_status: status.to_string(),
                     timestamp: Utc::now(),
-                    details,
+                    details: change_details,
                 };
                 
-                self.last_status.insert(key, status.to_string());
+                self.last_status.insert(key.clone(), status.to_string());
+                
+                // Store details for warning/critical states (for future recovery notifications)
+                if status == "warning" || status == "critical" {
+                    if let Some(ref detail) = details {
+                        self.last_details.insert(key.clone(), detail.clone());
+                    }
+                } else if status == "ok" {
+                    // Clear stored details after recovery
+                    self.last_details.remove(&key);
+                }
                 
                 if self.should_notify(&change) {
                     return Some(change);
@@ -80,7 +105,10 @@ impl NotificationManager {
             }
         } else {
             // First time seeing this metric - store but don't notify
-            self.last_status.insert(key, status.to_string());
+            self.last_status.insert(key.clone(), status.to_string());
+            if (status == "warning" || status == "critical") && details.is_some() {
+                self.last_details.insert(key, details.unwrap());
+            }
         }
         
         None
diff --git a/agent/src/simple_agent.rs b/agent/src/simple_agent.rs
index 96f63d9..7989097 100644
--- a/agent/src/simple_agent.rs
+++ b/agent/src/simple_agent.rs
@@ -193,7 +193,8 @@ impl SimpleAgent {
                 if let Some(summary) = output.data.get("summary") {
                     // Check CPU status
                     if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) {
-                        if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) {
+                        let cpu_details = self.build_cpu_details(summary);
+                        if let Some(change) = self.notification_manager.update_status_with_details("system", "cpu", cpu_status, cpu_details) {
                             info!("CPU status change detected: {} -> {}", change.old_status, change.new_status);
                             self.notification_manager.send_notification(change).await;
                         }
@@ -201,7 +202,8 @@ impl SimpleAgent {
 
                     // Check memory status
                     if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) {
-                        if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) {
+                        let memory_details = self.build_memory_details(summary);
+                        if let Some(change) = self.notification_manager.update_status_with_details("system", "memory", memory_status, memory_details) {
                             info!("Memory status change detected: {} -> {}", change.old_status, change.new_status);
                             self.notification_manager.send_notification(change).await;
                         }
@@ -209,7 +211,8 @@ impl SimpleAgent {
 
                     // Check CPU temp status (optional)
                     if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) {
-                        if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) {
+                        let temp_details = self.build_cpu_temp_details(summary);
+                        if let Some(change) = self.notification_manager.update_status_with_details("system", "cpu_temp", cpu_temp_status, temp_details) {
                             info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status);
                             self.notification_manager.send_notification(change).await;
                         }
@@ -232,6 +235,30 @@ impl SimpleAgent {
         }
     }
 
+    /// Formats the 1/5/15-minute load values read from `summary`; a field that is missing or yields no `f64` is shown as 0.00.
+    fn build_cpu_details(&self, summary: &serde_json::Value) -> Option<String> {
+        let load = |key: &str| summary.get(key).and_then(|v| v.as_f64()).unwrap_or(0.0);
+        let (one, five, fifteen) = (load("cpu_load_1"), load("cpu_load_5"), load("cpu_load_15"));
+        let text = format!("CPU load (1/5/15min): {:.2} / {:.2} / {:.2}", one, five, fifteen);
+        Some(text)
+    }
+
+    /// Formats memory use as "used / total GB (percent%)", dividing the MB fields by 1024; fallbacks are 0.0 (used, percent) and 1.0 (total).
+    fn build_memory_details(&self, summary: &serde_json::Value) -> Option<String> {
+        let field = |key: &str, fallback: f64| summary.get(key).and_then(|v| v.as_f64()).unwrap_or(fallback);
+        let used_gb = field("memory_used_mb", 0.0) / 1024.0;
+        let total_gb = field("memory_total_mb", 1.0) / 1024.0;
+        Some(format!("Memory usage: {:.1} / {:.1} GB ({:.1}%)", used_gb, total_gb, field("memory_usage_percent", 0.0)))
+    }
+
+    /// Describes the CPU temperature from the `cpu_temp_c` field of `summary`; `None` when the field is absent or `as_f64` yields nothing.
+    fn build_cpu_temp_details(&self, summary: &serde_json::Value) -> Option<String> {
+        summary
+            .get("cpu_temp_c")
+            .and_then(|v| v.as_f64())
+            .map(|temp_c| format!("CPU temperature: {:.1}°C", temp_c))
+    }
+
     fn build_service_failure_details(&self, output: &crate::collectors::CollectorOutput) -> Option<String> {
         if let Some(services) = output.data.get("services").and_then(|v| v.as_array()) {
             let mut failed_services = Vec::new();