Compare commits
84 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 505263cec6 | |||
| 61dd686fb9 | |||
| c0f7a97a6f | |||
| 9575077045 | |||
| 34a1f7b9dc | |||
| d11aa11f99 | |||
| 0ca06d2507 | |||
| 6693f3a05f | |||
| de252d27b9 | |||
| db0e41a7d3 | |||
| ec460496d8 | |||
| 33e700529e | |||
| d644b7d40a | |||
| f635ba9c75 | |||
| 76b6e3373e | |||
| 0a13cab897 | |||
| d33ec5d225 | |||
| d31c2384df | |||
| c8db463204 | |||
| e8e50ef9bb | |||
| 0faed9309e | |||
| c980346d05 | |||
| 3e3d3f0c2b | |||
| 9eb7444d56 | |||
| 278d1763aa | |||
| f874264e13 | |||
| 5f6e47ece5 | |||
| 0e7cf24dbb | |||
| 2d080a2f51 | |||
| 6179bd51a7 | |||
| 57de4c366a | |||
| e18778e962 | |||
| e4469a0ebf | |||
| 6fedf4c7fc | |||
| 3f6dffa66e | |||
| 1b64fbde3d | |||
| 4f4c3b0d6e | |||
| bd20f0cae1 | |||
| 11c9a5f9d2 | |||
| aeae60146d | |||
| a82c81e8e3 | |||
| c56e9d7be2 | |||
| c8f800a1e5 | |||
| fc6b3424cf | |||
| 35e06c6734 | |||
| 783d233319 | |||
| 6509a2b91a | |||
| 52f8c40b86 | |||
| a86b5ba8f9 | |||
| 1b964545be | |||
| 97aa1708c2 | |||
| d12689f3b5 | |||
| f22e3ee95e | |||
| e890c5e810 | |||
| 078c30a592 | |||
| a847674004 | |||
| 2618f6b62f | |||
| c3fc5a181d | |||
| 3f45a172b3 | |||
| 5b12c12228 | |||
| 651b801de3 | |||
| 71b9f93d7c | |||
| ae70946c61 | |||
| 2910b7d875 | |||
| 43242debce | |||
| a2519b2814 | |||
| 91f037aa3e | |||
| 627c533724 | |||
| b1bff4857b | |||
| f8a061d496 | |||
| e61a845965 | |||
| ac5d2d4db5 | |||
| 69892a2d84 | |||
| a928d73134 | |||
| af52d49194 | |||
| bc94f75328 | |||
| b6da71b7e7 | |||
| aaf7edfbce | |||
| bb72c42726 | |||
| af5f96ce2f | |||
| 8dffe18a23 | |||
| 0c544753f9 | |||
| c8e26b9bac | |||
| 60ef712fac |
@@ -108,17 +108,18 @@ jobs:
|
||||
|
||||
# Download tarball to get correct hash
|
||||
curl -L -o cm-dashboard.tar.gz "$TARBALL_URL"
|
||||
# Convert sha256 hex to base64 for Nix hash format using Python
|
||||
NEW_HASH=$(sha256sum cm-dashboard.tar.gz | cut -d' ' -f1)
|
||||
NIX_HASH="sha256-$(echo -n $NEW_HASH | xxd -r -p | base64)"
|
||||
NIX_HASH="sha256-$(python3 -c "import base64, binascii; print(base64.b64encode(binascii.unhexlify('$NEW_HASH')).decode())")"
|
||||
|
||||
# Update the NixOS configuration
|
||||
sed -i "s/version = \"v[^\"]*\"/version = \"$VERSION\"/" hosts/common/cm-dashboard.nix
|
||||
sed -i "s/sha256 = \"sha256-[^\"]*\"/sha256 = \"$NIX_HASH\"/" hosts/common/cm-dashboard.nix
|
||||
sed -i "s|version = \"v[^\"]*\"|version = \"$VERSION\"|" hosts/services/cm-dashboard.nix
|
||||
sed -i "s|sha256 = \"sha256-[^\"]*\"|sha256 = \"$NIX_HASH\"|" hosts/services/cm-dashboard.nix
|
||||
|
||||
# Commit and push changes
|
||||
git config user.name "Gitea Actions"
|
||||
git config user.email "actions@gitea.cmtec.se"
|
||||
git add hosts/common/cm-dashboard.nix
|
||||
git add hosts/services/cm-dashboard.nix
|
||||
git commit -m "Auto-update cm-dashboard to $VERSION
|
||||
|
||||
- Update version to $VERSION with automated release
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
# Agent Guide
|
||||
|
||||
Agents working in this repo must follow the instructions in `CLAUDE.md`.
|
||||
383
CLAUDE.md
383
CLAUDE.md
@@ -2,207 +2,80 @@
|
||||
|
||||
## Overview
|
||||
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for our specific monitoring needs and ZMQ-based metric collection.
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||
|
||||
## Implementation Strategy
|
||||
## Current Features
|
||||
|
||||
### Current Implementation Status
|
||||
### Core Functionality
|
||||
- **Real-time Monitoring**: CPU, RAM, Storage, and Service status
|
||||
- **Service Management**: Start/stop services with user-stopped tracking
|
||||
- **Multi-host Support**: Monitor multiple servers from single dashboard
|
||||
- **NixOS Integration**: System rebuild via SSH + tmux popup
|
||||
- **Backup Monitoring**: Borgbackup status and scheduling
|
||||
|
||||
**System Panel Enhancement - COMPLETED** ✅
|
||||
### User-Stopped Service Tracking
|
||||
- Services stopped via dashboard are marked as "user-stopped"
|
||||
- User-stopped services report Status::OK instead of Warning
|
||||
- Prevents false alerts during intentional maintenance
|
||||
- Persistent storage survives agent restarts
|
||||
- Automatic flag clearing when services are restarted via dashboard
|
||||
|
||||
All system panel features successfully implemented:
|
||||
- ✅ **NixOS Collector**: Created collector for version and active users
|
||||
- ✅ **System Widget**: Unified widget combining NixOS, CPU, RAM, and Storage
|
||||
- ✅ **Build Display**: Shows NixOS build information without codename
|
||||
- ✅ **Active Users**: Displays currently logged in users
|
||||
- ✅ **Tmpfs Monitoring**: Added /tmp usage to RAM section
|
||||
- ✅ **Agent Deployment**: NixOS collector working in production
|
||||
|
||||
**Keyboard Navigation and Service Management - COMPLETED** ✅
|
||||
|
||||
All keyboard navigation and service selection features successfully implemented:
|
||||
- ✅ **Panel Navigation**: Shift+Tab cycles through visible panels only (System → Services → Backup)
|
||||
- ✅ **Service Selection**: Up/Down arrows navigate through parent services with visual cursor
|
||||
- ✅ **Focus Management**: Selection highlighting only visible when Services panel focused
|
||||
- ✅ **Status Preservation**: Service health colors maintained during selection (green/red icons)
|
||||
- ✅ **Smart Panel Switching**: Only cycles through panels with data (backup panel conditional)
|
||||
- ✅ **Scroll Support**: All panels support content scrolling with proper overflow indicators
|
||||
|
||||
**Current Status - October 25, 2025:**
|
||||
- All keyboard navigation features working correctly ✅
|
||||
- Service selection cursor implemented with focus-aware highlighting ✅
|
||||
- Panel scrolling fixed for System, Services, and Backup panels ✅
|
||||
- Build display working: "Build: 25.05.20251004.3bcc93c" ✅
|
||||
- Configuration hash display: Currently shows git hash, needs to be fixed ❌
|
||||
|
||||
**Target Layout:**
|
||||
```
|
||||
NixOS:
|
||||
Build: 25.05.20251004.3bcc93c
|
||||
Config: d8ivwiar # Should show nix store hash (8 chars) from deployed system
|
||||
Active users: cm, simon
|
||||
CPU:
|
||||
● Load: 0.02 0.31 0.86 • 3000MHz
|
||||
RAM:
|
||||
● Usage: 33% 2.6GB/7.6GB
|
||||
● /tmp: 0% 0B/2.0GB
|
||||
Storage:
|
||||
● root (Single):
|
||||
├─ ● nvme0n1 W: 1%
|
||||
└─ ● 18% 167.4GB/928.2GB
|
||||
### Custom Service Logs
|
||||
- Configure service-specific log file paths per host in dashboard config
|
||||
- Press `L` on any service to view custom log files via `tail -f`
|
||||
- Configuration format in dashboard config:
|
||||
```toml
|
||||
[service_logs]
|
||||
hostname1 = [
|
||||
{ service_name = "nginx", log_file_path = "/var/log/nginx/access.log" },
|
||||
{ service_name = "app", log_file_path = "/var/log/myapp/app.log" }
|
||||
]
|
||||
hostname2 = [
|
||||
{ service_name = "database", log_file_path = "/var/log/postgres/postgres.log" }
|
||||
]
|
||||
```
|
||||
|
||||
**System panel layout fully implemented with blue tree symbols ✅**
|
||||
**Tree symbols now use consistent blue theming across all panels ✅**
|
||||
**Overflow handling restored for all widgets ("... and X more") ✅**
|
||||
**Agent hash display working correctly ✅**
|
||||
### Service Management
|
||||
- **Direct Control**: Arrow keys (↑↓) or vim keys (j/k) navigate services
|
||||
- **Service Actions**:
|
||||
- `s` - Start service (sends UserStart command)
|
||||
- `S` - Stop service (sends UserStop command)
|
||||
- `J` - Show service logs (journalctl in tmux popup)
|
||||
- `L` - Show custom log files (tail -f custom paths in tmux popup)
|
||||
- `R` - Rebuild current host
|
||||
- **Visual Status**: Green ● (active), Yellow ◐ (inactive), Red ◯ (failed)
|
||||
- **Transitional Icons**: Blue arrows during operations
|
||||
|
||||
### Current Keyboard Navigation Implementation
|
||||
|
||||
**Navigation Controls:**
|
||||
- **Tab**: Switch between hosts (cmbox, srv01, srv02, steambox, etc.)
|
||||
- **Shift+Tab**: Cycle through visible panels (System → Services → Backup → System)
|
||||
- **Up/Down (System/Backup)**: Scroll through panel content
|
||||
- **Up/Down (Services)**: Move service selection cursor between parent services
|
||||
### Navigation
|
||||
- **Tab**: Switch between hosts
|
||||
- **↑↓ or j/k**: Select services
|
||||
- **s**: Start selected service (UserStart)
|
||||
- **S**: Stop selected service (UserStop)
|
||||
- **J**: Show service logs (journalctl)
|
||||
- **L**: Show custom log files
|
||||
- **R**: Rebuild current host
|
||||
- **B**: Run backup on current host
|
||||
- **q**: Quit dashboard
|
||||
|
||||
**Panel-Specific Features:**
|
||||
- **System Panel**: Scrollable content with CPU, RAM, Storage details
|
||||
- **Services Panel**: Service selection cursor for parent services only (docker, nginx, postgresql, etc.)
|
||||
- **Backup Panel**: Scrollable repository list with proper overflow handling
|
||||
|
||||
**Visual Feedback:**
|
||||
- **Focused Panel**: Blue border and title highlighting
|
||||
- **Service Selection**: Blue background with preserved status icon colors (green ● for active, red ● for failed)
|
||||
- **Focus-Aware Selection**: Selection highlighting only visible when Services panel focused
|
||||
- **Dynamic Statusbar**: Context-aware shortcuts based on focused panel
|
||||
|
||||
### Remote Command Execution - WORKING ✅
|
||||
|
||||
**All Issues Resolved (as of 2025-10-24):**
|
||||
- ✅ **ZMQ Command Protocol**: Extended with ServiceControl and SystemRebuild variants
|
||||
- ✅ **Agent Handlers**: systemctl and nixos-rebuild execution with maintenance mode
|
||||
- ✅ **Dashboard Integration**: Keyboard shortcuts execute commands
|
||||
- ✅ **Service Control**: Fixed toggle logic - replaced with separate 's' (start) and 'S' (stop)
|
||||
- ✅ **System Rebuild**: Fixed permission issues and sandboxing problems
|
||||
- ✅ **Git Clone Approach**: Implemented for nixos-rebuild to avoid directory permissions
|
||||
- ✅ **Visual Feedback**: Directional arrows for service status (↑ starting, ↓ stopping, ↻ restarting)
|
||||
|
||||
**Keyboard Controls Status:**
|
||||
- **Services Panel**:
|
||||
- R (restart) ✅ Working
|
||||
- s (start) ✅ Working
|
||||
- S (stop) ✅ Working
|
||||
- **System Panel**: R (nixos-rebuild) ✅ Working with --option sandbox false
|
||||
- **Backup Panel**: B (trigger backup) ❓ Not implemented
|
||||
|
||||
**Visual Feedback Implementation - IN PROGRESS:**
|
||||
|
||||
Context-appropriate progress indicators for each panel:
|
||||
|
||||
**Services Panel** (Service status transitions):
|
||||
```
|
||||
● nginx active → ⏳ nginx restarting → ● nginx active
|
||||
● docker active → ⏳ docker stopping → ● docker inactive
|
||||
```
|
||||
|
||||
**System Panel** (Build progress in NixOS section):
|
||||
```
|
||||
NixOS:
|
||||
Build: 25.05.20251004.3bcc93c → Build: [████████████ ] 65%
|
||||
Active users: cm, simon Active users: cm, simon
|
||||
```
|
||||
|
||||
**Backup Panel** (OnGoing status with progress):
|
||||
```
|
||||
Latest backup: → Latest backup:
|
||||
● 2024-10-23 14:32:15 ● OnGoing
|
||||
└─ Duration: 1.3m └─ [██████ ] 60%
|
||||
```
|
||||
|
||||
**Critical Configuration Hash Fix - HIGH PRIORITY:**
|
||||
|
||||
**Problem:** Configuration hash currently shows git commit hash instead of actual deployed system hash.
|
||||
|
||||
**Current (incorrect):**
|
||||
- Shows git hash: `db11f82` (source repository commit)
|
||||
- Not accurate - doesn't reflect what's actually deployed
|
||||
|
||||
**Target (correct):**
|
||||
- Show nix store hash: `d8ivwiar` (first 8 chars from deployed system)
|
||||
- Source: `/nix/store/d8ivwiarhwhgqzskj6q2482r58z46qjf-nixos-system-cmbox-25.05.20251004.3bcc93c`
|
||||
- Pattern: Extract hash from `/nix/store/HASH-nixos-system-HOSTNAME-VERSION`
|
||||
|
||||
**Benefits:**
|
||||
1. **Deployment Verification:** Confirms rebuild actually succeeded
|
||||
2. **Accurate Status:** Shows what's truly running, not just source
|
||||
3. **Rebuild Completion Detection:** Hash change = rebuild completed
|
||||
4. **Rollback Tracking:** Each deployment has unique identifier
|
||||
|
||||
**Implementation Required:**
|
||||
1. Agent extracts nix store hash from `ls -la /run/current-system`
|
||||
2. Reports this as `system_config_hash` metric instead of git hash
|
||||
3. Dashboard displays first 8 characters: `Config: d8ivwiar`
|
||||
|
||||
**Next Session Priority Tasks:**
|
||||
|
||||
**Remaining Features:**
|
||||
1. **Fix Configuration Hash Display (CRITICAL)**:
|
||||
- Use nix store hash instead of git commit hash
|
||||
- Extract from `/run/current-system` -> `/nix/store/HASH-nixos-system-*`
|
||||
- Enables proper rebuild completion detection
|
||||
|
||||
2. **Command Response Protocol**:
|
||||
- Agent sends command completion/failure back to dashboard via ZMQ
|
||||
- Dashboard updates UI status from ⏳ to ● when commands complete
|
||||
- Clear success/failure status after timeout
|
||||
|
||||
3. **Backup Panel Features**:
|
||||
- Implement backup trigger functionality (B key)
|
||||
- Complete visual feedback for backup operations
|
||||
- Add backup progress indicators
|
||||
|
||||
**Enhancement Tasks:**
|
||||
- Add confirmation dialogs for destructive actions (stop/restart/rebuild)
|
||||
- Implement command history/logging
|
||||
- Add keyboard shortcuts help overlay
|
||||
|
||||
**Future Enhanced Navigation:**
|
||||
- Add Page Up/Down for faster scrolling through long service lists
|
||||
- Implement search/filter functionality for services
|
||||
- Add jump-to-service shortcuts (first letter navigation)
|
||||
|
||||
**Future Advanced Features:**
|
||||
- Service dependency visualization
|
||||
- Historical service status tracking
|
||||
- Real-time log viewing integration
|
||||
|
||||
## Core Architecture Principles - CRITICAL
|
||||
## Core Architecture Principles
|
||||
|
||||
### Individual Metrics Philosophy
|
||||
|
||||
**NEW ARCHITECTURE**: Agent collects individual metrics, dashboard composes widgets from those metrics.
|
||||
- Agent collects individual metrics, dashboard composes widgets
|
||||
- Each metric collected, transmitted, and stored individually
|
||||
- Agent calculates status for each metric using thresholds
|
||||
- Dashboard aggregates individual metric statuses for widget status
|
||||
|
||||
### Maintenance Mode
|
||||
|
||||
**Purpose:**
|
||||
|
||||
- Suppress email notifications during planned maintenance or backups
|
||||
- Prevents false alerts when services are intentionally stopped
|
||||
|
||||
**Implementation:**
|
||||
|
||||
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
||||
- File presence suppresses all email notifications while continuing monitoring
|
||||
- Dashboard continues to show real status, only notifications are blocked
|
||||
|
||||
**Usage:**
|
||||
|
||||
Usage:
|
||||
```bash
|
||||
# Enable maintenance mode
|
||||
touch /tmp/cm-maintenance
|
||||
|
||||
# Run maintenance tasks (backups, service restarts, etc.)
|
||||
# Run maintenance tasks
|
||||
systemctl stop service
|
||||
# ... maintenance work ...
|
||||
systemctl start service
|
||||
@@ -211,102 +84,29 @@ systemctl start service
|
||||
rm /tmp/cm-maintenance
|
||||
```
|
||||
|
||||
**NixOS Integration:**
|
||||
|
||||
- Borgbackup script automatically creates/removes maintenance file
|
||||
- Automatic cleanup via trap ensures maintenance mode doesn't stick
|
||||
- All cinfiguration are shall be done from nixos config
|
||||
|
||||
**ARCHITECTURE ENFORCEMENT**:
|
||||
|
||||
- **ZERO legacy code reuse** - Fresh implementation following ARCHITECT.md exactly
|
||||
- **Individual metrics only** - NO grouped metric structures
|
||||
- **Reference-only legacy** - Study old functionality, implement new architecture
|
||||
- **Clean slate mindset** - Build as if legacy codebase never existed
|
||||
|
||||
**Implementation Rules**:
|
||||
|
||||
1. **Individual Metrics**: Each metric is collected, transmitted, and stored individually
|
||||
2. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||
3. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||
4. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||
**Testing & Building**:
|
||||
|
||||
- **Workspace builds**: `cargo build --workspace` for all testing
|
||||
- **Clean compilation**: Remove `target/` between architecture changes
|
||||
- **ZMQ testing**: Test agent-dashboard communication independently
|
||||
- **Widget testing**: Verify UI layout matches legacy appearance exactly
|
||||
|
||||
**NEVER in New Implementation**:
|
||||
|
||||
- Copy/paste ANY code from legacy backup
|
||||
- Calculate status in dashboard widgets
|
||||
- Hardcode metric names in widgets (use const arrays)
|
||||
|
||||
# Important Communication Guidelines
|
||||
|
||||
NEVER write that you have "successfully implemented" something or generate extensive summary text without first verifying with the user that the implementation is correct. This wastes tokens. Keep responses concise.
|
||||
|
||||
NEVER implement code without first getting explicit user agreement on the approach. Always ask for confirmation before proceeding with implementation.
|
||||
|
||||
## Commit Message Guidelines
|
||||
|
||||
**NEVER mention:**
|
||||
|
||||
- Claude or any AI assistant names
|
||||
- Automation or AI-generated content
|
||||
- Any reference to automated code generation
|
||||
|
||||
**ALWAYS:**
|
||||
|
||||
- Focus purely on technical changes and their purpose
|
||||
- Use standard software development commit message format
|
||||
- Describe what was changed and why, not how it was created
|
||||
- Write from the perspective of a human developer
|
||||
|
||||
**Examples:**
|
||||
|
||||
- ❌ "Generated with Claude Code"
|
||||
- ❌ "AI-assisted implementation"
|
||||
- ❌ "Automated refactoring"
|
||||
- ✅ "Implement maintenance mode for backup operations"
|
||||
- ✅ "Restructure storage widget with improved layout"
|
||||
- ✅ "Update CPU thresholds to production values"
|
||||
|
||||
## Development and Deployment Architecture
|
||||
|
||||
**CRITICAL:** Development and deployment paths are completely separate:
|
||||
|
||||
### Development Path
|
||||
- **Location:** `~/projects/nixosbox`
|
||||
- **Purpose:** Development workflow only - for committing new cm-dashboard code
|
||||
- **Location:** `~/projects/cm-dashboard`
|
||||
- **Purpose:** Development workflow only - for committing new code
|
||||
- **Access:** Only for developers to commit changes
|
||||
- **Code Access:** Running cm-dashboard code shall NEVER access this path
|
||||
|
||||
### Deployment Path
|
||||
- **Location:** `/var/lib/cm-dashboard/nixos-config`
|
||||
- **Purpose:** Production deployment only - agent clones/pulls from git
|
||||
- **Access:** Only cm-dashboard agent for deployment operations
|
||||
- **Workflow:** git pull → `/var/lib/cm-dashboard/nixos-config` → nixos-rebuild
|
||||
|
||||
### Git Flow
|
||||
```
|
||||
Development: ~/projects/nixosbox → git commit → git push
|
||||
Development: ~/projects/cm-dashboard → git commit → git push
|
||||
Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||
```
|
||||
|
||||
## Automated Binary Release System
|
||||
|
||||
**IMPLEMENTED:** cm-dashboard now uses automated binary releases instead of source builds.
|
||||
CM Dashboard uses automated binary releases instead of source builds.
|
||||
|
||||
### Release Workflow
|
||||
|
||||
1. **Automated Release Creation**
|
||||
- Gitea Actions workflow builds static binaries on tag push
|
||||
- Creates release with `cm-dashboard-linux-x86_64.tar.gz` tarball
|
||||
- No manual intervention required for binary generation
|
||||
|
||||
2. **Creating New Releases**
|
||||
### Creating New Releases
|
||||
```bash
|
||||
cd ~/projects/cm-dashboard
|
||||
git tag v0.1.X
|
||||
@@ -318,8 +118,8 @@ Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||
- Creates GitHub-style release with tarball
|
||||
- Uploads binaries via Gitea API
|
||||
|
||||
3. **NixOS Configuration Updates**
|
||||
Edit `~/projects/nixosbox/hosts/common/cm-dashboard.nix`:
|
||||
### NixOS Configuration Updates
|
||||
Edit `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
||||
|
||||
```nix
|
||||
version = "v0.1.X";
|
||||
@@ -329,7 +129,7 @@ Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||
};
|
||||
```
|
||||
|
||||
4. **Get Release Hash**
|
||||
### Get Release Hash
|
||||
```bash
|
||||
cd ~/projects/nixosbox
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||
@@ -338,18 +138,53 @@ Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
|
||||
5. **Commit and Deploy**
|
||||
```bash
|
||||
cd ~/projects/nixosbox
|
||||
git add hosts/common/cm-dashboard.nix
|
||||
git commit -m "Update cm-dashboard to v0.1.X with static binaries"
|
||||
git push
|
||||
```
|
||||
### Building
|
||||
|
||||
### Benefits
|
||||
**Testing & Building:**
|
||||
- **Workspace builds**: `nix-shell -p openssl pkg-config --run "cargo build --workspace"`
|
||||
- **Clean compilation**: Remove `target/` between major changes
|
||||
|
||||
- **No compilation overhead** on each host
|
||||
- **Consistent static binaries** across all hosts
|
||||
- **Faster deployments** - download vs compile
|
||||
- **No library dependency issues** - static linking
|
||||
- **Automated pipeline** - tag push triggers everything
|
||||
## Important Communication Guidelines
|
||||
|
||||
Keep responses concise and focused. Avoid extensive implementation summaries unless requested.
|
||||
|
||||
## Commit Message Guidelines
|
||||
|
||||
**NEVER mention:**
|
||||
- Claude or any AI assistant names
|
||||
- Automation or AI-generated content
|
||||
- Any reference to automated code generation
|
||||
|
||||
**ALWAYS:**
|
||||
- Focus purely on technical changes and their purpose
|
||||
- Use standard software development commit message format
|
||||
- Describe what was changed and why, not how it was created
|
||||
- Write from the perspective of a human developer
|
||||
|
||||
**Examples:**
|
||||
- ❌ "Generated with Claude Code"
|
||||
- ❌ "AI-assisted implementation"
|
||||
- ❌ "Automated refactoring"
|
||||
- ✅ "Implement maintenance mode for backup operations"
|
||||
- ✅ "Restructure storage widget with improved layout"
|
||||
- ✅ "Update CPU thresholds to production values"
|
||||
|
||||
## Implementation Rules
|
||||
|
||||
1. **Individual Metrics**: Each metric is collected, transmitted, and stored individually
|
||||
2. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||
3. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||
4. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||
|
||||
**NEVER:**
|
||||
- Copy/paste ANY code from legacy implementations
|
||||
- Calculate status in dashboard widgets
|
||||
- Hardcode metric names in widgets (use const arrays)
|
||||
- Create files unless absolutely necessary for achieving goals
|
||||
- Create documentation files unless explicitly requested
|
||||
|
||||
**ALWAYS:**
|
||||
- Prefer editing existing files to creating new ones
|
||||
- Follow existing code conventions and patterns
|
||||
- Use existing libraries and utilities
|
||||
- Follow security best practices
|
||||
13
Cargo.lock
generated
13
Cargo.lock
generated
@@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.0"
|
||||
version = "0.1.81"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@@ -286,12 +286,13 @@ dependencies = [
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"wake-on-lan",
|
||||
"zmq",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.0"
|
||||
version = "0.1.81"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -314,7 +315,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.0"
|
||||
version = "0.1.81"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
@@ -2064,6 +2065,12 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "wake-on-lan"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ccf60b60ad7e5b1b37372c5134cbcab4db0706c231d212e0c643a077462bc8f"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.5.0"
|
||||
|
||||
501
README.md
501
README.md
@@ -1,88 +1,108 @@
|
||||
# CM Dashboard
|
||||
|
||||
A real-time infrastructure monitoring system with intelligent status aggregation and email notifications, built with Rust and ZMQ.
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||
|
||||
## Current Implementation
|
||||
## Features
|
||||
|
||||
This is a complete rewrite implementing an **individual metrics architecture** where:
|
||||
### Core Monitoring
|
||||
- **Real-time metrics**: CPU, RAM, Storage, and Service status
|
||||
- **Multi-host support**: Monitor multiple servers from single dashboard
|
||||
- **Service management**: Start/stop services with intelligent status tracking
|
||||
- **NixOS integration**: System rebuild via SSH + tmux popup
|
||||
- **Backup monitoring**: Borgbackup status and scheduling
|
||||
- **Email notifications**: Intelligent batching prevents spam
|
||||
|
||||
- **Agent** collects individual metrics (e.g., `cpu_load_1min`, `memory_usage_percent`) and calculates status
|
||||
- **Dashboard** subscribes to specific metrics and composes widgets
|
||||
- **Status Aggregation** provides intelligent email notifications with batching
|
||||
- **Persistent Cache** prevents false notifications on restart
|
||||
### User-Stopped Service Tracking
|
||||
Services stopped via the dashboard are intelligently tracked to prevent false alerts:
|
||||
|
||||
## Dashboard Interface
|
||||
- **Smart status reporting**: User-stopped services show as Status::OK instead of Warning
|
||||
- **Persistent storage**: Tracking survives agent restarts via JSON storage
|
||||
- **Automatic management**: Flags cleared when services restarted via dashboard
|
||||
- **Maintenance friendly**: No false alerts during intentional service operations
|
||||
|
||||
## Architecture
|
||||
|
||||
### Individual Metrics Philosophy
|
||||
- **Agent**: Collects individual metrics, calculates status using thresholds
|
||||
- **Dashboard**: Subscribes to specific metrics, composes widgets from individual data
|
||||
- **ZMQ Communication**: Efficient real-time metric transmission
|
||||
- **Status Aggregation**: Host-level status calculated from all service metrics
|
||||
|
||||
### Components
|
||||
|
||||
```
|
||||
┌─────────────────┐ ZMQ ┌─────────────────┐
|
||||
│ │◄──────────►│ │
|
||||
│ Agent │ Metrics │ Dashboard │
|
||||
│ - Collectors │ │ - TUI │
|
||||
│ - Status │ │ - Widgets │
|
||||
│ - Tracking │ │ - Commands │
|
||||
│ │ │ │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ JSON Storage │ │ SSH + tmux │
|
||||
│ - User-stopped │ │ - Remote rebuild│
|
||||
│ - Cache │ │ - Process │
|
||||
│ - State │ │ isolation │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
### Service Control Flow
|
||||
|
||||
1. **User Action**: Dashboard sends `UserStart`/`UserStop` commands
|
||||
2. **Agent Processing**:
|
||||
- Marks service as user-stopped (if stopping)
|
||||
- Executes `systemctl start/stop service`
|
||||
- Syncs state to global tracker
|
||||
3. **Status Calculation**:
|
||||
- Systemd collector checks user-stopped flag
|
||||
- Reports Status::OK for user-stopped inactive services
|
||||
- Normal Warning status for system failures
|
||||
|
||||
## Interface
|
||||
|
||||
```
|
||||
cm-dashboard • ● cmbox ● srv01 ● srv02 ● steambox
|
||||
┌system──────────────────────────────┐┌services─────────────────────────────────────────┐
|
||||
│CPU: ││Service: Status: RAM: Disk: │
|
||||
│● Load: 0.10 0.52 0.88 • 400.0 MHz ││● docker active 27M 496MB │
|
||||
│RAM: ││● docker-registry active 19M 496MB │
|
||||
│● Used: 30% 2.3GB/7.6GB ││● gitea active 579M 2.6GB │
|
||||
│● tmp: 0.0% 0B/2.0GB ││● gitea-runner-default active 11M 2.6GB │
|
||||
│Disk nvme0n1: ││● haasp-core active 9M 1MB │
|
||||
│● Health: PASSED ││● haasp-mqtt active 3M 1MB │
|
||||
│● Usage @root: 8.3% • 75.4/906.2 GB ││● haasp-webgrid active 10M 1MB │
|
||||
│● Usage @boot: 5.9% • 0.1/1.0 GB ││● immich-server active 240M 45.1GB │
|
||||
│ ││● mosquitto active 1M 1MB │
|
||||
│ ││● mysql active 38M 225MB │
|
||||
│ ││● nginx active 28M 24MB │
|
||||
│ ││ ├─ ● gitea.cmtec.se 51ms │
|
||||
│ ││ ├─ ● haasp.cmtec.se 43ms │
|
||||
│ ││ ├─ ● haasp.net 43ms │
|
||||
│ ││ ├─ ● pages.cmtec.se 45ms │
|
||||
└────────────────────────────────────┘│ ├─ ● photos.cmtec.se 41ms │
|
||||
┌backup──────────────────────────────┐│ ├─ ● unifi.cmtec.se 46ms │
|
||||
│Latest backup: ││ ├─ ● vault.cmtec.se 47ms │
|
||||
│● Status: OK ││ ├─ ● www.kryddorten.se 81ms │
|
||||
│Duration: 54s • Last: 4h ago ││ ├─ ● www.mariehall2.se 86ms │
|
||||
│Disk usage: 48.2GB/915.8GB ││● postgresql active 112M 357MB │
|
||||
│P/N: Samsung SSD 870 QVO 1TB ││● redis-immich active 8M 45.1GB │
|
||||
│S/N: S5RRNF0W800639Y ││● sshd active 2M 0 │
|
||||
│● gitea 2 archives 2.7GB ││● unifi active 594M 495MB │
|
||||
│● immich 2 archives 45.0GB ││● vaultwarden active 12M 1MB │
|
||||
│● kryddorten 2 archives 67.6MB ││ │
|
||||
│● mariehall2 2 archives 321.8MB ││ │
|
||||
│● nixosbox 2 archives 4.5MB ││ │
|
||||
│● unifi 2 archives 2.9MB ││ │
|
||||
│● vaultwarden 2 archives 305kB ││ │
|
||||
│NixOS: ││Service: Status: RAM: Disk: │
|
||||
│Build: 25.05.20251004.3bcc93c ││● docker active 27M 496MB │
|
||||
│Agent: v0.1.43 ││● gitea active 579M 2.6GB │
|
||||
│Active users: cm, simon ││● nginx active 28M 24MB │
|
||||
│CPU: ││ ├─ ● gitea.cmtec.se 51ms │
|
||||
│● Load: 0.10 0.52 0.88 • 3000MHz ││ ├─ ● photos.cmtec.se 41ms │
|
||||
│RAM: ││● postgresql active 112M 357MB │
|
||||
│● Usage: 33% 2.6GB/7.6GB ││● redis-immich user-stopped │
|
||||
│● /tmp: 0% 0B/2.0GB ││● sshd active 2M 0 │
|
||||
│Storage: ││● unifi active 594M 495MB │
|
||||
│● root (Single): ││ │
|
||||
│ ├─ ● nvme0n1 W: 1% ││ │
|
||||
│ └─ ● 18% 167.4GB/928.2GB ││ │
|
||||
└────────────────────────────────────┘└─────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Navigation**: `←→` switch hosts, `r` refresh, `q` quit
|
||||
### Navigation
|
||||
- **Tab**: Switch between hosts
|
||||
- **↑↓ or j/k**: Navigate services
|
||||
- **s**: Start selected service (UserStart)
|
||||
- **S**: Stop selected service (UserStop)
|
||||
- **J**: Show service logs (journalctl in tmux popup)
|
||||
- **L**: Show custom log files (tail -f custom paths in tmux popup)
|
||||
- **R**: Rebuild current host
|
||||
- **B**: Run backup on current host
|
||||
- **q**: Quit
|
||||
|
||||
## Features
|
||||
|
||||
- **Real-time monitoring** - Dashboard updates every 1-2 seconds
|
||||
- **Individual metric collection** - Granular data for flexible dashboard composition
|
||||
- **Intelligent status aggregation** - Host-level status calculated from all services
|
||||
- **Smart email notifications** - Batched, detailed alerts with service groupings
|
||||
- **Persistent state** - Prevents false notifications on restarts
|
||||
- **ZMQ communication** - Efficient agent-to-dashboard messaging
|
||||
- **Clean TUI** - Terminal-based dashboard with color-coded status indicators
|
||||
|
||||
## Architecture
|
||||
|
||||
### Core Components
|
||||
|
||||
- **Agent** (`cm-dashboard-agent`) - Collects metrics and sends via ZMQ
|
||||
- **Dashboard** (`cm-dashboard`) - Real-time TUI display consuming metrics
|
||||
- **Shared** (`cm-dashboard-shared`) - Common types and protocol
|
||||
- **Status Aggregation** - Intelligent batching and notification management
|
||||
- **Persistent Cache** - Maintains state across restarts
|
||||
|
||||
### Status Levels
|
||||
|
||||
- **🟢 Ok** - Service running normally
|
||||
- **🔵 Pending** - Service starting/stopping/reloading
|
||||
- **🟡 Warning** - Service issues (high load, memory, disk usage)
|
||||
- **🔴 Critical** - Service failed or critical thresholds exceeded
|
||||
- **❓ Unknown** - Service state cannot be determined
|
||||
### Status Indicators
|
||||
- **Green ●**: Active service
|
||||
- **Yellow ◐**: Inactive service (system issue)
|
||||
- **Red ◯**: Failed service
|
||||
- **Blue arrows**: Service transitioning (↑ starting, ↓ stopping, ↻ restarting)
|
||||
- **"user-stopped"**: Service stopped via dashboard (Status::OK)
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Build
|
||||
### Building
|
||||
|
||||
```bash
|
||||
# With Nix (recommended)
|
||||
@@ -93,21 +113,20 @@ sudo apt install libssl-dev pkg-config # Ubuntu/Debian
|
||||
cargo build --workspace
|
||||
```
|
||||
|
||||
### Run
|
||||
### Running
|
||||
|
||||
```bash
|
||||
# Start agent (requires configuration file)
|
||||
# Start agent (requires configuration)
|
||||
./target/debug/cm-dashboard-agent --config /etc/cm-dashboard/agent.toml
|
||||
|
||||
# Start dashboard
|
||||
./target/debug/cm-dashboard --config /path/to/dashboard.toml
|
||||
# Start dashboard (inside tmux session)
|
||||
tmux
|
||||
./target/debug/cm-dashboard --config /etc/cm-dashboard/dashboard.toml
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Agent Configuration (`agent.toml`)
|
||||
|
||||
The agent requires a comprehensive TOML configuration file:
|
||||
### Agent Configuration
|
||||
|
||||
```toml
|
||||
collection_interval_seconds = 2
|
||||
@@ -116,47 +135,27 @@ collection_interval_seconds = 2
|
||||
publisher_port = 6130
|
||||
command_port = 6131
|
||||
bind_address = "0.0.0.0"
|
||||
timeout_ms = 5000
|
||||
heartbeat_interval_ms = 30000
|
||||
transmission_interval_seconds = 2
|
||||
|
||||
[collectors.cpu]
|
||||
enabled = true
|
||||
interval_seconds = 2
|
||||
load_warning_threshold = 9.0
|
||||
load_warning_threshold = 5.0
|
||||
load_critical_threshold = 10.0
|
||||
temperature_warning_threshold = 100.0
|
||||
temperature_critical_threshold = 110.0
|
||||
|
||||
[collectors.memory]
|
||||
enabled = true
|
||||
interval_seconds = 2
|
||||
usage_warning_percent = 80.0
|
||||
usage_critical_percent = 95.0
|
||||
|
||||
[collectors.disk]
|
||||
enabled = true
|
||||
interval_seconds = 300
|
||||
usage_warning_percent = 80.0
|
||||
usage_critical_percent = 90.0
|
||||
|
||||
[[collectors.disk.filesystems]]
|
||||
name = "root"
|
||||
uuid = "4cade5ce-85a5-4a03-83c8-dfd1d3888d79"
|
||||
mount_point = "/"
|
||||
fs_type = "ext4"
|
||||
monitor = true
|
||||
|
||||
[collectors.systemd]
|
||||
enabled = true
|
||||
interval_seconds = 10
|
||||
memory_warning_mb = 1000.0
|
||||
memory_critical_mb = 2000.0
|
||||
service_name_filters = [
|
||||
"nginx", "postgresql", "redis", "docker", "sshd"
|
||||
]
|
||||
excluded_services = [
|
||||
"nginx-config-reload", "sshd-keygen"
|
||||
]
|
||||
service_name_filters = ["nginx*", "postgresql*", "docker*", "sshd*"]
|
||||
excluded_services = ["nginx-config-reload", "systemd-", "getty@"]
|
||||
nginx_latency_critical_ms = 1000.0
|
||||
http_timeout_seconds = 10
|
||||
|
||||
[notifications]
|
||||
enabled = true
|
||||
@@ -164,251 +163,203 @@ smtp_host = "localhost"
|
||||
smtp_port = 25
|
||||
from_email = "{hostname}@example.com"
|
||||
to_email = "admin@example.com"
|
||||
rate_limit_minutes = 0
|
||||
trigger_on_warnings = true
|
||||
trigger_on_failures = true
|
||||
recovery_requires_all_ok = true
|
||||
suppress_individual_recoveries = true
|
||||
|
||||
[status_aggregation]
|
||||
enabled = true
|
||||
aggregation_method = "worst_case"
|
||||
notification_interval_seconds = 30
|
||||
|
||||
[cache]
|
||||
persist_path = "/var/lib/cm-dashboard/cache.json"
|
||||
aggregation_interval_seconds = 30
|
||||
```
|
||||
|
||||
### Dashboard Configuration (`dashboard.toml`)
|
||||
### Dashboard Configuration
|
||||
|
||||
```toml
|
||||
[zmq]
|
||||
hosts = [
|
||||
{ name = "server1", address = "192.168.1.100", port = 6130 },
|
||||
{ name = "server2", address = "192.168.1.101", port = 6130 }
|
||||
]
|
||||
connection_timeout_ms = 5000
|
||||
reconnect_interval_ms = 10000
|
||||
subscriber_ports = [6130]
|
||||
|
||||
[ui]
|
||||
refresh_interval_ms = 1000
|
||||
theme = "dark"
|
||||
[hosts]
|
||||
predefined_hosts = ["cmbox", "srv01", "srv02"]
|
||||
|
||||
[ssh]
|
||||
rebuild_user = "cm"
|
||||
rebuild_alias = "nixos-rebuild-cmtec"
|
||||
backup_alias = "cm-backup-run"
|
||||
```
|
||||
|
||||
## Collectors
|
||||
## Technical Implementation
|
||||
|
||||
The agent implements several specialized collectors:
|
||||
### Collectors
|
||||
|
||||
### CPU Collector (`cpu.rs`)
|
||||
#### Systemd Collector
|
||||
- **Service Discovery**: Uses `systemctl list-unit-files` + `list-units --all`
|
||||
- **Status Calculation**: Checks user-stopped flag before assigning Warning status
|
||||
- **Memory Tracking**: Per-service memory usage via `systemctl show`
|
||||
- **Sub-services**: Nginx site latency, Docker containers
|
||||
- **User-stopped Integration**: `UserStoppedServiceTracker::is_service_user_stopped()`
|
||||
|
||||
- Load average (1, 5, 15 minute)
|
||||
- CPU temperature monitoring
|
||||
- Real-time process monitoring (top CPU consumers)
|
||||
- Status calculation with configurable thresholds
|
||||
#### User-Stopped Service Tracker
|
||||
- **Storage**: `/var/lib/cm-dashboard/user-stopped-services.json`
|
||||
- **Thread Safety**: Global singleton with `Arc<Mutex<>>`
|
||||
- **Persistence**: Automatic save on state changes
|
||||
- **Global Access**: Static methods for collector integration
|
||||
|
||||
### Memory Collector (`memory.rs`)
|
||||
#### Other Collectors
|
||||
- **CPU**: Load average, temperature, frequency monitoring
|
||||
- **Memory**: RAM/swap usage, tmpfs monitoring
|
||||
- **Disk**: Filesystem usage, SMART health data
|
||||
- **NixOS**: Build version, active users, agent version
|
||||
- **Backup**: Borgbackup repository status and metrics
|
||||
|
||||
- RAM usage (total, used, available)
|
||||
- Swap monitoring
|
||||
- Real-time process monitoring (top RAM consumers)
|
||||
- Memory pressure detection
|
||||
### ZMQ Protocol
|
||||
|
||||
### Disk Collector (`disk.rs`)
|
||||
```rust
|
||||
// Metric Message
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub hostname: String,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
- Filesystem usage per mount point
|
||||
- SMART health monitoring
|
||||
- Temperature and wear tracking
|
||||
- Configurable filesystem monitoring
|
||||
// Service Commands
|
||||
pub enum AgentCommand {
|
||||
ServiceControl {
|
||||
service_name: String,
|
||||
action: ServiceAction,
|
||||
},
|
||||
SystemRebuild { /* SSH config */ },
|
||||
CollectNow,
|
||||
}
|
||||
|
||||
### Systemd Collector (`systemd.rs`)
|
||||
pub enum ServiceAction {
|
||||
Start, // System-initiated
|
||||
Stop, // System-initiated
|
||||
UserStart, // User via dashboard (clears user-stopped)
|
||||
UserStop, // User via dashboard (marks user-stopped)
|
||||
Status,
|
||||
}
|
||||
```
|
||||
|
||||
- Service status monitoring (`active`, `inactive`, `failed`)
|
||||
- Memory usage per service
|
||||
- Service filtering and exclusions
|
||||
- Handles transitional states (`Status::Pending`)
|
||||
### Maintenance Mode
|
||||
|
||||
### Backup Collector (`backup.rs`)
|
||||
Suppress notifications during planned maintenance:
|
||||
|
||||
- Reads TOML status files from backup systems
|
||||
- Archive age verification
|
||||
- Disk usage tracking
|
||||
- Repository health monitoring
|
||||
```bash
|
||||
# Enable maintenance mode
|
||||
touch /tmp/cm-maintenance
|
||||
|
||||
# Perform maintenance
|
||||
systemctl stop service
|
||||
# ... work ...
|
||||
systemctl start service
|
||||
|
||||
# Disable maintenance mode
|
||||
rm /tmp/cm-maintenance
|
||||
```
|
||||
|
||||
## Email Notifications
|
||||
|
||||
### Intelligent Batching
|
||||
- **Real-time dashboard**: Immediate status updates
|
||||
- **Batched emails**: Aggregated every 30 seconds
|
||||
- **Smart grouping**: Services organized by severity
|
||||
- **Recovery suppression**: Reduces notification spam
|
||||
|
||||
The system implements smart notification batching to prevent email spam:
|
||||
|
||||
- **Real-time dashboard updates** - Status changes appear immediately
|
||||
- **Batched email notifications** - Aggregated every 30 seconds
|
||||
- **Detailed groupings** - Services organized by severity
|
||||
|
||||
### Example Alert Email
|
||||
|
||||
### Example Alert
|
||||
```
|
||||
Subject: Status Alert: 2 critical, 1 warning, 15 started
|
||||
Subject: Status Alert: 1 critical, 2 warnings, 0 recoveries
|
||||
|
||||
Status Summary (30s duration)
|
||||
Host Status: Ok → Warning
|
||||
|
||||
🔴 CRITICAL ISSUES (2):
|
||||
postgresql: Ok → Critical
|
||||
nginx: Warning → Critical
|
||||
🔴 CRITICAL ISSUES (1):
|
||||
postgresql: Ok → Critical (memory usage 95%)
|
||||
|
||||
🟡 WARNINGS (1):
|
||||
redis: Ok → Warning (memory usage 85%)
|
||||
🟡 WARNINGS (2):
|
||||
nginx: Ok → Warning (high load 8.5)
|
||||
redis: user-stopped → Warning (restarted by system)
|
||||
|
||||
✅ RECOVERIES (0):
|
||||
|
||||
🟢 SERVICE STARTUPS (15):
|
||||
docker: Unknown → Ok
|
||||
sshd: Unknown → Ok
|
||||
...
|
||||
|
||||
--
|
||||
CM Dashboard Agent
|
||||
Generated at 2025-10-21 19:42:42 CET
|
||||
CM Dashboard Agent v0.1.43
|
||||
```
|
||||
|
||||
## Individual Metrics Architecture
|
||||
|
||||
The system follows a **metrics-first architecture**:
|
||||
|
||||
### Agent Side
|
||||
|
||||
```rust
|
||||
// Agent collects individual metrics
|
||||
vec![
|
||||
Metric::new("cpu_load_1min".to_string(), MetricValue::Float(2.5), Status::Ok),
|
||||
Metric::new("memory_usage_percent".to_string(), MetricValue::Float(78.5), Status::Warning),
|
||||
Metric::new("service_nginx_status".to_string(), MetricValue::String("active".to_string()), Status::Ok),
|
||||
]
|
||||
```
|
||||
|
||||
### Dashboard Side
|
||||
|
||||
```rust
|
||||
// Widgets subscribe to specific metrics
|
||||
impl Widget for CpuWidget {
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||
for metric in metrics {
|
||||
match metric.name.as_str() {
|
||||
"cpu_load_1min" => self.load_1min = metric.value.as_f32(),
|
||||
"cpu_load_5min" => self.load_5min = metric.value.as_f32(),
|
||||
"cpu_temperature_celsius" => self.temperature = metric.value.as_f32(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Persistent Cache
|
||||
|
||||
The cache system prevents false notifications:
|
||||
|
||||
- **Automatic saving** - Saves when service status changes
|
||||
- **Persistent storage** - Maintains state across agent restarts
|
||||
- **Simple design** - No complex TTL or cleanup logic
|
||||
- **Status preservation** - Prevents duplicate notifications
|
||||
|
||||
## Development
|
||||
|
||||
### Project Structure
|
||||
|
||||
```
|
||||
cm-dashboard/
|
||||
├── agent/ # Metrics collection agent
|
||||
│ ├── src/
|
||||
│ │ ├── collectors/ # CPU, memory, disk, systemd, backup
|
||||
│ │ ├── collectors/ # CPU, memory, disk, systemd, backup, nixos
|
||||
│ │ ├── service_tracker.rs # User-stopped service tracking
|
||||
│ │ ├── status/ # Status aggregation and notifications
|
||||
│ │ ├── cache/ # Persistent metric caching
|
||||
│ │ ├── config/ # TOML configuration loading
|
||||
│ │ └── notifications/ # Email notification system
|
||||
│ │ └── communication/ # ZMQ message handling
|
||||
├── dashboard/ # TUI dashboard application
|
||||
│ ├── src/
|
||||
│ │ ├── ui/widgets/ # CPU, memory, services, backup widgets
|
||||
│ │ ├── metrics/ # Metric storage and filtering
|
||||
│ │ └── communication/ # ZMQ metric consumption
|
||||
│ │ ├── ui/widgets/ # CPU, memory, services, backup, system
|
||||
│ │ ├── communication/ # ZMQ consumption and commands
|
||||
│ │ └── app.rs # Main application loop
|
||||
├── shared/ # Shared types and utilities
|
||||
│ └── src/
|
||||
│ ├── metrics.rs # Metric, Status, and Value types
|
||||
│ ├── metrics.rs # Metric, Status, StatusTracker types
|
||||
│ ├── protocol.rs # ZMQ message format
|
||||
│ └── cache.rs # Cache configuration
|
||||
└── README.md # This file
|
||||
└── CLAUDE.md # Development guidelines and rules
|
||||
```
|
||||
|
||||
### Building
|
||||
|
||||
### Testing
|
||||
```bash
|
||||
# Debug build
|
||||
cargo build --workspace
|
||||
# Build and test
|
||||
nix-shell -p openssl pkg-config --run "cargo build --workspace"
|
||||
nix-shell -p openssl pkg-config --run "cargo test --workspace"
|
||||
|
||||
# Release build
|
||||
cargo build --workspace --release
|
||||
|
||||
# Run tests
|
||||
cargo test --workspace
|
||||
|
||||
# Check code formatting
|
||||
cargo fmt --all -- --check
|
||||
|
||||
# Run clippy linter
|
||||
# Code quality
|
||||
cargo fmt --all
|
||||
cargo clippy --workspace -- -D warnings
|
||||
```
|
||||
|
||||
### Dependencies
|
||||
## Deployment
|
||||
|
||||
- **tokio** - Async runtime
|
||||
- **zmq** - Message passing between agent and dashboard
|
||||
- **ratatui** - Terminal user interface
|
||||
- **serde** - Serialization for metrics and config
|
||||
- **anyhow/thiserror** - Error handling
|
||||
- **tracing** - Structured logging
|
||||
- **lettre** - SMTP email notifications
|
||||
- **clap** - Command-line argument parsing
|
||||
- **toml** - Configuration file parsing
|
||||
### Automated Binary Releases
|
||||
```bash
|
||||
# Create new release
|
||||
cd ~/projects/cm-dashboard
|
||||
git tag v0.1.X
|
||||
git push origin v0.1.X
|
||||
```
|
||||
|
||||
## NixOS Integration
|
||||
This triggers automated:
|
||||
- Static binary compilation with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||
- GitHub-style release creation
|
||||
- Tarball upload to Gitea
|
||||
|
||||
This project is designed for declarative deployment via NixOS:
|
||||
|
||||
### Configuration Generation
|
||||
|
||||
The NixOS module automatically generates the agent configuration:
|
||||
### NixOS Integration
|
||||
Update `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
||||
|
||||
```nix
|
||||
# hosts/common/cm-dashboard.nix
|
||||
services.cm-dashboard-agent = {
|
||||
enable = true;
|
||||
port = 6130;
|
||||
version = "v0.1.43";
|
||||
src = pkgs.fetchurl {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/${version}/cm-dashboard-linux-x86_64.tar.gz";
|
||||
sha256 = "sha256-HASH";
|
||||
};
|
||||
```
|
||||
|
||||
### Deployment
|
||||
|
||||
Get hash via:
|
||||
```bash
|
||||
# Update NixOS configuration
|
||||
git add hosts/common/cm-dashboard.nix
|
||||
git commit -m "Update cm-dashboard configuration"
|
||||
git push
|
||||
|
||||
# Rebuild system (user-performed)
|
||||
sudo nixos-rebuild switch --flake .
|
||||
cd ~/projects/nixosbox
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||
url = "URL_HERE";
|
||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
|
||||
## Monitoring Intervals
|
||||
|
||||
- **CPU/Memory**: 2 seconds (real-time monitoring)
|
||||
- **Disk usage**: 300 seconds (5 minutes)
|
||||
- **Systemd services**: 10 seconds
|
||||
- **SMART health**: 600 seconds (10 minutes)
|
||||
- **Backup status**: 60 seconds (1 minute)
|
||||
- **Email notifications**: 30 seconds (batched)
|
||||
- **Dashboard updates**: 1 second (real-time display)
|
||||
- **Metrics Collection**: 2 seconds (CPU, memory, services)
|
||||
- **Metric Transmission**: 2 seconds (ZMQ publish)
|
||||
- **Dashboard Updates**: 1 second (UI refresh)
|
||||
- **Email Notifications**: 30 seconds (batched)
|
||||
- **Disk Monitoring**: 300 seconds (5 minutes)
|
||||
- **Service Discovery**: 300 seconds (5 minutes cache)
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see LICENSE file for details
|
||||
|
||||
MIT License - see LICENSE file for details.
|
||||
63
TODO.md
63
TODO.md
@@ -1,63 +0,0 @@
|
||||
# TODO
|
||||
|
||||
## Systemd filtering (agent)
|
||||
|
||||
- remove user systemd collection
|
||||
- reduce number of systemctl call
|
||||
- Cahnge so only services in include list are detected
|
||||
- Filter on exact name
|
||||
- Add support for "\*" in filtering
|
||||
|
||||
## System panel (agent/dashboard)
|
||||
|
||||
use following layout:
|
||||
'''
|
||||
NixOS:
|
||||
Build: xxxxxx
|
||||
Agen: xxxxxx
|
||||
CPU:
|
||||
● Load: 0.02 0.31 0.86
|
||||
└─ Freq: 3000MHz
|
||||
RAM:
|
||||
● Usage: 33% 2.6GB/7.6GB
|
||||
└─ ● /tmp: 0% 0B/2.0GB
|
||||
Storage:
|
||||
● /:
|
||||
├─ ● nvme0n1 T: 40C • W: 4%
|
||||
└─ ● 8% 75.0GB/906.2GB
|
||||
'''
|
||||
|
||||
- Add support to show login/active users
|
||||
- Add support to show timestamp/version for latest nixos rebuild
|
||||
|
||||
## Backup panel (dashboard)
|
||||
|
||||
use following layout:
|
||||
'''
|
||||
Latest backup:
|
||||
● <timestamp>
|
||||
└─ Duration: 1.3m
|
||||
Disk:
|
||||
● Samsung SSD 870 QVO 1TB
|
||||
├─ S/N: S5RRNF0W800639Y
|
||||
└─ Usage: 50.5GB/915.8GB
|
||||
Repos:
|
||||
● gitea (4) 5.1GB
|
||||
● immich (4) 45.0GB
|
||||
● kryddorten (4) 67.8MB
|
||||
● mariehall2 (4) 322.7MB
|
||||
● nixosbox (4) 5.5MB
|
||||
● unifi (4) 5.7MB
|
||||
● vaultwarden (4) 508kB
|
||||
'''
|
||||
|
||||
## Keyboard navigation and scrolling (dashboard)
|
||||
|
||||
- Add keyboard navigation between panels "Shift-Tab"
|
||||
- Add lower statusbar with dynamic updated shortcuts when switchng between panels
|
||||
|
||||
## Remote execution (agent/dashboard)
|
||||
|
||||
- Add support for send command via dashboard to agent to do nixos rebuid
|
||||
- Add support for navigating services in dashboard and trigger start/stop/restart
|
||||
- Add support for trigger backup
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.0"
|
||||
version = "0.1.82"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -4,12 +4,13 @@ use std::time::Duration;
|
||||
use tokio::time::interval;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
|
||||
use crate::communication::{AgentCommand, ZmqHandler};
|
||||
use crate::config::AgentConfig;
|
||||
use crate::metrics::MetricCollectionManager;
|
||||
use crate::notifications::NotificationManager;
|
||||
use crate::service_tracker::UserStoppedServiceTracker;
|
||||
use crate::status::HostStatusManager;
|
||||
use cm_dashboard_shared::{Metric, MetricMessage};
|
||||
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
|
||||
|
||||
pub struct Agent {
|
||||
hostname: String,
|
||||
@@ -18,6 +19,7 @@ pub struct Agent {
|
||||
metric_manager: MetricCollectionManager,
|
||||
notification_manager: NotificationManager,
|
||||
host_status_manager: HostStatusManager,
|
||||
service_tracker: UserStoppedServiceTracker,
|
||||
}
|
||||
|
||||
impl Agent {
|
||||
@@ -50,6 +52,10 @@ impl Agent {
|
||||
let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
|
||||
info!("Host status manager initialized");
|
||||
|
||||
// Initialize user-stopped service tracker
|
||||
let service_tracker = UserStoppedServiceTracker::init_global()?;
|
||||
info!("User-stopped service tracker initialized");
|
||||
|
||||
Ok(Self {
|
||||
hostname,
|
||||
config,
|
||||
@@ -57,6 +63,7 @@ impl Agent {
|
||||
metric_manager,
|
||||
notification_manager,
|
||||
host_status_manager,
|
||||
service_tracker,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -71,11 +78,12 @@ impl Agent {
|
||||
info!("Initial metric collection completed - all data cached and ready");
|
||||
}
|
||||
|
||||
// Separate intervals for collection and transmission
|
||||
// Separate intervals for collection, transmission, heartbeat, and email notifications
|
||||
let mut collection_interval =
|
||||
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
|
||||
let mut notification_interval = interval(Duration::from_secs(self.config.status_aggregation.notification_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
|
||||
let mut heartbeat_interval = interval(Duration::from_secs(self.config.zmq.heartbeat_interval_seconds));
|
||||
let mut notification_interval = interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds));
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -86,13 +94,19 @@ impl Agent {
|
||||
}
|
||||
}
|
||||
_ = transmission_interval.tick() => {
|
||||
// Send all metrics via ZMQ every 1 second
|
||||
// Send all metrics via ZMQ (dashboard updates only)
|
||||
if let Err(e) = self.broadcast_all_metrics().await {
|
||||
error!("Failed to broadcast metrics: {}", e);
|
||||
}
|
||||
}
|
||||
_ = heartbeat_interval.tick() => {
|
||||
// Send standalone heartbeat for host connectivity detection
|
||||
if let Err(e) = self.send_heartbeat().await {
|
||||
error!("Failed to send heartbeat: {}", e);
|
||||
}
|
||||
}
|
||||
_ = notification_interval.tick() => {
|
||||
// Process batched notifications
|
||||
// Process batched email notifications (separate from dashboard updates)
|
||||
if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
|
||||
error!("Failed to process pending notifications: {}", e);
|
||||
}
|
||||
@@ -127,8 +141,8 @@ impl Agent {
|
||||
|
||||
info!("Force collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Process metrics through status manager
|
||||
self.process_metrics(&metrics).await;
|
||||
// Process metrics through status manager (collect status data at startup)
|
||||
let _status_changed = self.process_metrics(&metrics).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -146,28 +160,46 @@ impl Agent {
|
||||
|
||||
debug!("Collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Process metrics through status manager
|
||||
self.process_metrics(&metrics).await;
|
||||
// Process metrics through status manager and trigger immediate transmission if status changed
|
||||
let status_changed = self.process_metrics(&metrics).await;
|
||||
|
||||
if status_changed {
|
||||
info!("Status change detected - triggering immediate metric transmission");
|
||||
if let Err(e) = self.broadcast_all_metrics().await {
|
||||
error!("Failed to broadcast metrics after status change: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn broadcast_all_metrics(&mut self) -> Result<()> {
|
||||
debug!("Broadcasting all metrics via ZMQ");
|
||||
debug!("Broadcasting cached metrics via ZMQ");
|
||||
|
||||
// Get all current metrics from collectors
|
||||
let mut metrics = self.metric_manager.collect_all_metrics().await?;
|
||||
// Get cached metrics (no fresh collection)
|
||||
let mut metrics = self.metric_manager.get_cached_metrics();
|
||||
|
||||
// Add the host status summary metric from status manager
|
||||
let host_status_metric = self.host_status_manager.get_host_status_metric();
|
||||
metrics.push(host_status_metric);
|
||||
|
||||
// Add agent version metric for cross-host version comparison
|
||||
let version_metric = self.get_agent_version_metric();
|
||||
metrics.push(version_metric);
|
||||
|
||||
// Add heartbeat metric for host connectivity detection
|
||||
let heartbeat_metric = self.get_heartbeat_metric();
|
||||
metrics.push(heartbeat_metric);
|
||||
|
||||
// Check for user-stopped services that are now active and clear their flags
|
||||
self.clear_user_stopped_flags_for_active_services(&metrics);
|
||||
|
||||
if metrics.is_empty() {
|
||||
debug!("No metrics to broadcast");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
debug!("Broadcasting {} metrics (including host status summary)", metrics.len());
|
||||
debug!("Broadcasting {} cached metrics (including host status summary)", metrics.len());
|
||||
|
||||
// Create and send message with all current data
|
||||
let message = MetricMessage::new(self.hostname.clone(), metrics);
|
||||
@@ -177,10 +209,67 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_metrics(&mut self, metrics: &[Metric]) {
|
||||
async fn process_metrics(&mut self, metrics: &[Metric]) -> bool {
|
||||
let mut status_changed = false;
|
||||
for metric in metrics {
|
||||
self.host_status_manager.process_metric(metric, &mut self.notification_manager).await;
|
||||
// Filter excluded metrics from email notification processing only
|
||||
if self.config.notifications.exclude_email_metrics.contains(&metric.name) {
|
||||
debug!("Excluding metric '{}' from email notification processing", metric.name);
|
||||
continue;
|
||||
}
|
||||
|
||||
if self.host_status_manager.process_metric(metric, &mut self.notification_manager).await {
|
||||
status_changed = true;
|
||||
}
|
||||
}
|
||||
status_changed
|
||||
}
|
||||
|
||||
/// Create agent version metric for cross-host version comparison
|
||||
fn get_agent_version_metric(&self) -> Metric {
|
||||
// Get version from executable path (same logic as main.rs get_version)
|
||||
let version = self.get_agent_version();
|
||||
|
||||
Metric::new(
|
||||
"agent_version".to_string(),
|
||||
MetricValue::String(version),
|
||||
Status::Ok,
|
||||
)
|
||||
}
|
||||
|
||||
/// Get agent version from Cargo package version
|
||||
fn get_agent_version(&self) -> String {
|
||||
// Use the version from Cargo.toml (e.g., "0.1.11")
|
||||
format!("v{}", env!("CARGO_PKG_VERSION"))
|
||||
}
|
||||
|
||||
/// Create heartbeat metric for host connectivity detection
|
||||
fn get_heartbeat_metric(&self) -> Metric {
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
let timestamp = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
Metric::new(
|
||||
"agent_heartbeat".to_string(),
|
||||
MetricValue::Integer(timestamp as i64),
|
||||
Status::Ok,
|
||||
)
|
||||
}
|
||||
|
||||
/// Send standalone heartbeat for connectivity detection
|
||||
async fn send_heartbeat(&mut self) -> Result<()> {
|
||||
let heartbeat_metric = self.get_heartbeat_metric();
|
||||
let message = MetricMessage::new(
|
||||
self.hostname.clone(),
|
||||
vec![heartbeat_metric],
|
||||
);
|
||||
|
||||
self.zmq_handler.publish_metrics(&message).await?;
|
||||
debug!("Sent standalone heartbeat for connectivity detection");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_commands(&mut self) -> Result<()> {
|
||||
@@ -226,191 +315,38 @@ impl Agent {
|
||||
info!("Processing Ping command - agent is alive");
|
||||
// Could send a response back via ZMQ if needed
|
||||
}
|
||||
AgentCommand::ServiceControl { service_name, action } => {
|
||||
info!("Processing ServiceControl command: {} {:?}", service_name, action);
|
||||
if let Err(e) = self.handle_service_control(&service_name, &action).await {
|
||||
error!("Failed to execute service control: {}", e);
|
||||
}
|
||||
}
|
||||
AgentCommand::SystemRebuild { git_url, git_branch, working_dir, api_key_file } => {
|
||||
info!("Processing SystemRebuild command: {} @ {} -> {}", git_url, git_branch, working_dir);
|
||||
if let Err(e) = self.handle_system_rebuild(&git_url, &git_branch, &working_dir, api_key_file.as_deref()).await {
|
||||
error!("Failed to execute system rebuild: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle systemd service control commands
|
||||
async fn handle_service_control(&self, service_name: &str, action: &ServiceAction) -> Result<()> {
|
||||
let action_str = match action {
|
||||
ServiceAction::Start => "start",
|
||||
ServiceAction::Stop => "stop",
|
||||
ServiceAction::Restart => "restart",
|
||||
ServiceAction::Status => "status",
|
||||
};
|
||||
|
||||
info!("Executing systemctl {} {}", action_str, service_name);
|
||||
/// Check metrics for user-stopped services that are now active and clear their flags
|
||||
fn clear_user_stopped_flags_for_active_services(&mut self, metrics: &[Metric]) {
|
||||
for metric in metrics {
|
||||
// Look for service status metrics that are active
|
||||
if metric.name.starts_with("service_") && metric.name.ends_with("_status") {
|
||||
if let MetricValue::String(status) = &metric.value {
|
||||
if status == "active" {
|
||||
// Extract service name from metric name (service_nginx_status -> nginx)
|
||||
let service_name = metric.name
|
||||
.strip_prefix("service_")
|
||||
.and_then(|s| s.strip_suffix("_status"))
|
||||
.unwrap_or("");
|
||||
|
||||
let output = tokio::process::Command::new("sudo")
|
||||
.arg("systemctl")
|
||||
.arg(action_str)
|
||||
.arg(service_name)
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if output.status.success() {
|
||||
info!("Service {} {} completed successfully", service_name, action_str);
|
||||
if !output.stdout.is_empty() {
|
||||
debug!("stdout: {}", String::from_utf8_lossy(&output.stdout));
|
||||
}
|
||||
if !service_name.is_empty() && UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||
info!("Service '{}' is now active - clearing user-stopped flag", service_name);
|
||||
if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
|
||||
error!("Failed to clear user-stopped flag for '{}': {}", service_name, e);
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
error!("Service {} {} failed: {}", service_name, action_str, stderr);
|
||||
return Err(anyhow::anyhow!("systemctl {} {} failed: {}", action_str, service_name, stderr));
|
||||
// Sync to global tracker
|
||||
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
||||
debug!("Cleared user-stopped flag for service '{}'", service_name);
|
||||
}
|
||||
|
||||
// Force refresh metrics after service control to update service status
|
||||
if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::Restart) {
|
||||
info!("Triggering metric refresh after service control");
|
||||
// Note: We can't call self.collect_metrics_only() here due to borrowing issues
|
||||
// The next metric collection cycle will pick up the changes
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle NixOS system rebuild commands with git clone approach
|
||||
async fn handle_system_rebuild(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
|
||||
info!("Starting NixOS system rebuild: {} @ {} -> {}", git_url, git_branch, working_dir);
|
||||
|
||||
// Enable maintenance mode before rebuild
|
||||
let maintenance_file = "/tmp/cm-maintenance";
|
||||
if let Err(e) = tokio::fs::File::create(maintenance_file).await {
|
||||
error!("Failed to create maintenance mode file: {}", e);
|
||||
} else {
|
||||
info!("Maintenance mode enabled");
|
||||
}
|
||||
|
||||
// Clone or update repository
|
||||
let git_result = self.ensure_git_repository(git_url, git_branch, working_dir, api_key_file).await;
|
||||
|
||||
// Execute nixos-rebuild if git operation succeeded - run detached but log output
|
||||
let rebuild_result = if git_result.is_ok() {
|
||||
info!("Git repository ready, executing nixos-rebuild in detached mode");
|
||||
let log_file = std::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("/var/log/cm-dashboard/nixos-rebuild.log")
|
||||
.map_err(|e| anyhow::anyhow!("Failed to open rebuild log: {}", e))?;
|
||||
|
||||
tokio::process::Command::new("nohup")
|
||||
.arg("sudo")
|
||||
.arg("/run/current-system/sw/bin/nixos-rebuild")
|
||||
.arg("switch")
|
||||
.arg("--option")
|
||||
.arg("sandbox")
|
||||
.arg("false")
|
||||
.arg("--flake")
|
||||
.arg(".")
|
||||
.current_dir(working_dir)
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stdout(std::process::Stdio::from(log_file.try_clone().unwrap()))
|
||||
.stderr(std::process::Stdio::from(log_file))
|
||||
.spawn()
|
||||
} else {
|
||||
return git_result.and_then(|_| unreachable!());
|
||||
};
|
||||
|
||||
// Always try to remove maintenance mode file
|
||||
if let Err(e) = tokio::fs::remove_file(maintenance_file).await {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
error!("Failed to remove maintenance mode file: {}", e);
|
||||
}
|
||||
} else {
|
||||
info!("Maintenance mode disabled");
|
||||
}
|
||||
|
||||
// Check rebuild start result
|
||||
match rebuild_result {
|
||||
Ok(_child) => {
|
||||
info!("NixOS rebuild started successfully in background");
|
||||
// Don't wait for completion to avoid agent being killed during rebuild
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to start nixos-rebuild: {}", e);
|
||||
return Err(anyhow::anyhow!("Failed to start nixos-rebuild: {}", e));
|
||||
}
|
||||
}
|
||||
|
||||
info!("System rebuild completed, triggering metric refresh");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ensure git repository is cloned and up to date with force clone approach
|
||||
async fn ensure_git_repository(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
|
||||
use std::path::Path;
|
||||
|
||||
// Read API key if provided
|
||||
let auth_url = if let Some(key_file) = api_key_file {
|
||||
match tokio::fs::read_to_string(key_file).await {
|
||||
Ok(api_key) => {
|
||||
let api_key = api_key.trim();
|
||||
if !api_key.is_empty() {
|
||||
// Convert https://gitea.cmtec.se/cm/nixosbox.git to https://token@gitea.cmtec.se/cm/nixosbox.git
|
||||
if git_url.starts_with("https://") {
|
||||
let url_without_protocol = &git_url[8..]; // Remove "https://"
|
||||
format!("https://{}@{}", api_key, url_without_protocol)
|
||||
} else {
|
||||
info!("API key provided but URL is not HTTPS, using original URL");
|
||||
git_url.to_string()
|
||||
}
|
||||
} else {
|
||||
info!("API key file is empty, using original URL");
|
||||
git_url.to_string()
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
info!("Could not read API key file {}: {}, using original URL", key_file, e);
|
||||
git_url.to_string()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
git_url.to_string()
|
||||
};
|
||||
|
||||
// Always remove existing directory and do fresh clone for consistent state
|
||||
let working_path = Path::new(working_dir);
|
||||
if working_path.exists() {
|
||||
info!("Removing existing repository directory: {}", working_dir);
|
||||
if let Err(e) = tokio::fs::remove_dir_all(working_path).await {
|
||||
error!("Failed to remove existing directory: {}", e);
|
||||
return Err(anyhow::anyhow!("Failed to remove existing directory: {}", e));
|
||||
}
|
||||
}
|
||||
|
||||
info!("Force cloning git repository from {} (branch: {})", git_url, git_branch);
|
||||
|
||||
// Force clone with depth 1 for efficiency (no history needed for deployment)
|
||||
let output = tokio::process::Command::new("git")
|
||||
.arg("clone")
|
||||
.arg("--depth")
|
||||
.arg("1")
|
||||
.arg("--branch")
|
||||
.arg(git_branch)
|
||||
.arg(&auth_url)
|
||||
.arg(working_dir)
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
error!("Git clone failed: {}", stderr);
|
||||
return Err(anyhow::anyhow!("Git clone failed: {}", stderr));
|
||||
}
|
||||
|
||||
info!("Git repository cloned successfully with latest state");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -136,10 +136,12 @@ impl Collector for BackupCollector {
|
||||
name: "backup_overall_status".to_string(),
|
||||
value: MetricValue::String(match overall_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Inactive => "inactive".to_string(),
|
||||
Status::Pending => "pending".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
Status::Offline => "offline".to_string(),
|
||||
}),
|
||||
status: overall_status,
|
||||
timestamp,
|
||||
@@ -198,10 +200,12 @@ impl Collector for BackupCollector {
|
||||
name: format!("backup_service_{}_status", service_name),
|
||||
value: MetricValue::String(match service_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Inactive => "inactive".to_string(),
|
||||
Status::Pending => "pending".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
Status::Offline => "offline".to_string(),
|
||||
}),
|
||||
status: service_status,
|
||||
timestamp,
|
||||
|
||||
@@ -41,11 +41,11 @@ pub struct DiskCollector {
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
// Create hysteresis thresholds for disk temperature
|
||||
// Create hysteresis thresholds for disk temperature from config
|
||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
60.0, // warning at 60°C
|
||||
config.temperature_warning_celsius,
|
||||
5.0, // 5°C gap for recovery
|
||||
70.0, // critical at 70°C
|
||||
config.temperature_critical_celsius,
|
||||
5.0, // 5°C gap for recovery
|
||||
);
|
||||
|
||||
@@ -219,18 +219,12 @@ impl DiskCollector {
|
||||
}
|
||||
|
||||
/// Parse wear level from SMART output (SSD wear leveling)
|
||||
/// Supports both NVMe and SATA SSD wear indicators
|
||||
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||
for line in smart_output.lines() {
|
||||
// Look for wear leveling indicators
|
||||
if line.contains("Wear_Leveling_Count") || line.contains("Media_Wearout_Indicator") {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
if let Ok(wear) = parts[9].parse::<f32>() {
|
||||
return Some(100.0 - wear); // Convert to percentage used
|
||||
}
|
||||
}
|
||||
}
|
||||
// NVMe drives might show percentage used directly
|
||||
let line = line.trim();
|
||||
|
||||
// NVMe drives - direct percentage used
|
||||
if line.contains("Percentage Used:") {
|
||||
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
||||
if let Some(wear_str) = wear_part.split('%').next() {
|
||||
@@ -240,6 +234,38 @@ impl DiskCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SATA SSD attributes - parse SMART table format
|
||||
// Format: ID ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
// SSD Life Left / Percent Lifetime Remaining (higher = less wear)
|
||||
if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
|
||||
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||
return Some(100.0 - remaining); // Convert remaining to used
|
||||
}
|
||||
}
|
||||
|
||||
// Media Wearout Indicator (lower = more wear, normalize to 0-100)
|
||||
if line.contains("Media_Wearout_Indicator") {
|
||||
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||
return Some(100.0 - remaining); // Convert remaining to used
|
||||
}
|
||||
}
|
||||
|
||||
// Wear Leveling Count (higher = less wear, but varies by manufacturer)
|
||||
if line.contains("Wear_Leveling_Count") {
|
||||
if let Ok(wear_count) = parts[3].parse::<f32>() { // VALUE column
|
||||
// Most SSDs: 100 = new, decreases with wear
|
||||
if wear_count <= 100.0 {
|
||||
return Some(100.0 - wear_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Total LBAs Written - calculate against typical endurance if available
|
||||
// This is more complex and manufacturer-specific, so we skip for now
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
@@ -530,8 +556,8 @@ impl Collector for DiskCollector {
|
||||
|
||||
// Drive wear level (for SSDs)
|
||||
if let Some(wear) = drive.wear_level {
|
||||
let wear_status = if wear >= 90.0 { Status::Critical }
|
||||
else if wear >= 80.0 { Status::Warning }
|
||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
||||
else { Status::Ok };
|
||||
|
||||
metrics.push(Metric {
|
||||
|
||||
@@ -187,7 +187,7 @@ impl MemoryCollector {
|
||||
}
|
||||
|
||||
// Monitor tmpfs (/tmp) usage
|
||||
if let Ok(tmpfs_metrics) = self.get_tmpfs_metrics() {
|
||||
if let Ok(tmpfs_metrics) = self.get_tmpfs_metrics(status_tracker) {
|
||||
metrics.extend(tmpfs_metrics);
|
||||
}
|
||||
|
||||
@@ -195,7 +195,7 @@ impl MemoryCollector {
|
||||
}
|
||||
|
||||
/// Get tmpfs (/tmp) usage metrics
|
||||
fn get_tmpfs_metrics(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
fn get_tmpfs_metrics(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
use std::process::Command;
|
||||
|
||||
let output = Command::new("df")
|
||||
@@ -249,12 +249,15 @@ impl MemoryCollector {
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Calculate status using same thresholds as main memory
|
||||
let tmp_status = self.calculate_usage_status("memory_tmp_usage_percent", usage_percent, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "memory_tmp_usage_percent".to_string(),
|
||||
value: MetricValue::Float(usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some("tmpfs /tmp usage percentage".to_string()),
|
||||
status: Status::Ok,
|
||||
status: tmp_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ pub mod disk;
|
||||
pub mod error;
|
||||
pub mod memory;
|
||||
pub mod nixos;
|
||||
pub mod smart;
|
||||
pub mod systemd;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::config::NixOSConfig;
|
||||
///
|
||||
/// Collects NixOS-specific system information including:
|
||||
/// - NixOS version and build information
|
||||
/// - Currently active/logged in users
|
||||
pub struct NixOSCollector {
|
||||
}
|
||||
|
||||
@@ -19,31 +18,6 @@ impl NixOSCollector {
|
||||
Self {}
|
||||
}
|
||||
|
||||
/// Get NixOS build information
|
||||
fn get_nixos_build_info(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Get nixos-version output directly
|
||||
let output = Command::new("nixos-version").output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err("nixos-version command failed".into());
|
||||
}
|
||||
|
||||
let version_line = String::from_utf8_lossy(&output.stdout);
|
||||
let version = version_line.trim();
|
||||
|
||||
if version.is_empty() {
|
||||
return Err("Empty nixos-version output".into());
|
||||
}
|
||||
|
||||
// Remove codename part (e.g., "(Warbler)")
|
||||
let clean_version = if let Some(pos) = version.find(" (") {
|
||||
version[..pos].to_string()
|
||||
} else {
|
||||
version.to_string()
|
||||
};
|
||||
|
||||
Ok(clean_version)
|
||||
}
|
||||
|
||||
/// Get agent hash from binary path
|
||||
fn get_agent_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
@@ -63,6 +37,22 @@ impl NixOSCollector {
|
||||
}
|
||||
|
||||
/// Get configuration hash from deployed nix store system
|
||||
/// Get git commit hash from rebuild process
|
||||
fn get_git_commit(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
||||
match std::fs::read_to_string(commit_file) {
|
||||
Ok(content) => {
|
||||
let commit_hash = content.trim();
|
||||
if commit_hash.len() >= 7 {
|
||||
Ok(commit_hash.to_string())
|
||||
} else {
|
||||
Err("Git commit hash too short".into())
|
||||
}
|
||||
}
|
||||
Err(e) => Err(format!("Failed to read git commit file: {}", e).into())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_config_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Read the symlink target of /run/current-system to get nix store path
|
||||
let output = Command::new("readlink")
|
||||
@@ -90,27 +80,6 @@ impl NixOSCollector {
|
||||
Err("Could not extract hash from nix store path".into())
|
||||
}
|
||||
|
||||
/// Get currently active users
|
||||
fn get_active_users(&self) -> Result<Vec<String>, Box<dyn std::error::Error>> {
|
||||
let output = Command::new("who").output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err("who command failed".into());
|
||||
}
|
||||
|
||||
let who_output = String::from_utf8_lossy(&output.stdout);
|
||||
let mut users = std::collections::HashSet::new();
|
||||
|
||||
for line in who_output.lines() {
|
||||
if let Some(username) = line.split_whitespace().next() {
|
||||
if !username.is_empty() {
|
||||
users.insert(username.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(users.into_iter().collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -121,56 +90,31 @@ impl Collector for NixOSCollector {
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Collect NixOS build information
|
||||
match self.get_nixos_build_info() {
|
||||
Ok(build_info) => {
|
||||
// Collect git commit information (shows what's actually deployed)
|
||||
match self.get_git_commit() {
|
||||
Ok(git_commit) => {
|
||||
metrics.push(Metric {
|
||||
name: "system_nixos_build".to_string(),
|
||||
value: MetricValue::String(build_info),
|
||||
value: MetricValue::String(git_commit),
|
||||
unit: None,
|
||||
description: Some("NixOS build information".to_string()),
|
||||
description: Some("Git commit hash of deployed configuration".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get NixOS build info: {}", e);
|
||||
debug!("Failed to get git commit: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "system_nixos_build".to_string(),
|
||||
value: MetricValue::String("unknown".to_string()),
|
||||
unit: None,
|
||||
description: Some("NixOS build (failed to detect)".to_string()),
|
||||
description: Some("Git commit hash (failed to detect)".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Collect active users
|
||||
match self.get_active_users() {
|
||||
Ok(users) => {
|
||||
let users_str = users.join(", ");
|
||||
metrics.push(Metric {
|
||||
name: "system_active_users".to_string(),
|
||||
value: MetricValue::String(users_str),
|
||||
unit: None,
|
||||
description: Some("Currently active users".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get active users: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "system_active_users".to_string(),
|
||||
value: MetricValue::String("unknown".to_string()),
|
||||
unit: None,
|
||||
description: Some("Active users (failed to detect)".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Collect config hash
|
||||
match self.get_config_hash() {
|
||||
|
||||
@@ -1,178 +0,0 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use std::process::Stdio;
|
||||
use tokio::process::Command;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
pub struct SmartCollector;
|
||||
|
||||
impl SmartCollector {
|
||||
/// Get list of storage devices to monitor
|
||||
async fn get_devices(&self) -> Result<Vec<String>, CollectorError> {
|
||||
let output = Command::new("lsblk")
|
||||
.args(["-d", "-n", "-o", "NAME,TYPE"])
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::null())
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "lsblk".to_string(),
|
||||
error: e.to_string()
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Ok(Vec::new()); // Return empty if lsblk fails
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let mut devices = Vec::new();
|
||||
|
||||
for line in stdout.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 && parts[1] == "disk" {
|
||||
let device_name = parts[0];
|
||||
if device_name.starts_with("nvme") || device_name.starts_with("sd") {
|
||||
devices.push(format!("/dev/{}", device_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(devices)
|
||||
}
|
||||
|
||||
/// Collect SMART data for a single device
|
||||
async fn collect_device_smart(&self, device: &str, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting SMART data for device: {}", device);
|
||||
|
||||
let output = Command::new("sudo")
|
||||
.args(["smartctl", "-H", "-A", device]) // Health and attributes only
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::null())
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "lsblk".to_string(),
|
||||
error: e.to_string()
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
warn!("smartctl failed for device: {}", device);
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
self.parse_smart_output(device, &stdout, status_tracker)
|
||||
}
|
||||
|
||||
/// Parse smartctl output and create metrics
|
||||
fn parse_smart_output(&self, device: &str, output: &str, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let mut metrics = Vec::new();
|
||||
let device_name = device.trim_start_matches("/dev/");
|
||||
|
||||
let mut health_ok = true;
|
||||
let mut temperature: Option<f32> = None;
|
||||
|
||||
for line in output.lines() {
|
||||
let line = line.trim();
|
||||
|
||||
// Parse health status
|
||||
if line.contains("SMART overall-health self-assessment") {
|
||||
if line.contains("FAILED") {
|
||||
health_ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Parse temperature from various formats
|
||||
if (line.contains("Temperature") || line.contains("Airflow_Temperature")) && temperature.is_none() {
|
||||
if let Some(temp) = self.extract_temperature(line) {
|
||||
temperature = Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create health metric
|
||||
let health_status = if health_ok {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
};
|
||||
|
||||
metrics.push(Metric::new(
|
||||
format!("smart_health_{}", device_name),
|
||||
MetricValue::String(if health_ok { "PASSED".to_string() } else { "FAILED".to_string() }),
|
||||
health_status,
|
||||
));
|
||||
|
||||
// Create temperature metric if available
|
||||
if let Some(temp) = temperature {
|
||||
let temp_status = if temp >= 70.0 {
|
||||
Status::Critical
|
||||
} else if temp >= 60.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
metrics.push(Metric::new(
|
||||
format!("smart_temperature_{}", device_name),
|
||||
MetricValue::Float(temp),
|
||||
temp_status,
|
||||
).with_unit("celsius".to_string()));
|
||||
}
|
||||
|
||||
debug!("Collected {} SMART metrics for {}", metrics.len(), device);
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
/// Extract temperature value from smartctl output line
|
||||
fn extract_temperature(&self, line: &str) -> Option<f32> {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
if let Ok(temp) = part.parse::<f32>() {
|
||||
// Check if this looks like a temperature value (reasonable range)
|
||||
if temp > 0.0 && temp < 150.0 {
|
||||
// Check context around the number
|
||||
if i + 1 < parts.len() {
|
||||
let next = parts[i + 1].to_lowercase();
|
||||
if next.contains("celsius") || next.contains("°c") || next == "c" {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
// For SMART attribute lines, temperature is often the 10th column
|
||||
if parts.len() >= 10 && (line.contains("Temperature") || line.contains("Airflow_Temperature")) {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for SmartCollector {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Starting SMART data collection");
|
||||
|
||||
let devices = self.get_devices().await?;
|
||||
let mut all_metrics = Vec::new();
|
||||
|
||||
for device in devices {
|
||||
match self.collect_device_smart(&device, status_tracker).await {
|
||||
Ok(mut metrics) => {
|
||||
all_metrics.append(&mut metrics);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to collect SMART data for {}: {}", device, e);
|
||||
// Continue with other devices
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Collected {} total SMART metrics", all_metrics.len());
|
||||
Ok(all_metrics)
|
||||
}
|
||||
}
|
||||
@@ -32,7 +32,7 @@ struct ServiceCacheState {
|
||||
nginx_site_metrics: Vec<Metric>,
|
||||
/// Last time nginx sites were checked
|
||||
last_nginx_check_time: Option<Instant>,
|
||||
/// How often to check nginx site latency (30 seconds)
|
||||
/// How often to check nginx site latency (configurable)
|
||||
nginx_check_interval_seconds: u64,
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ impl SystemdCollector {
|
||||
discovery_interval_seconds: config.interval_seconds,
|
||||
nginx_site_metrics: Vec::new(),
|
||||
last_nginx_check_time: None,
|
||||
nginx_check_interval_seconds: 30, // 30 seconds for nginx sites
|
||||
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
||||
}),
|
||||
config,
|
||||
}
|
||||
@@ -136,8 +136,21 @@ impl SystemdCollector {
|
||||
/// Auto-discover interesting services to monitor (internal version that doesn't update state)
|
||||
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
||||
debug!("Starting systemd service discovery with status caching");
|
||||
// Get all services (includes inactive, running, failed - everything)
|
||||
let units_output = Command::new("systemctl")
|
||||
|
||||
// First: Get all service unit files (includes services that have never been started)
|
||||
let unit_files_output = Command::new("systemctl")
|
||||
.arg("list-unit-files")
|
||||
.arg("--type=service")
|
||||
.arg("--no-pager")
|
||||
.arg("--plain")
|
||||
.output()?;
|
||||
|
||||
if !unit_files_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
|
||||
}
|
||||
|
||||
// Second: Get runtime status of all units
|
||||
let units_status_output = Command::new("systemctl")
|
||||
.arg("list-units")
|
||||
.arg("--type=service")
|
||||
.arg("--all")
|
||||
@@ -145,22 +158,33 @@ impl SystemdCollector {
|
||||
.arg("--plain")
|
||||
.output()?;
|
||||
|
||||
if !units_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl system command failed"));
|
||||
if !units_status_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-units command failed"));
|
||||
}
|
||||
|
||||
let units_str = String::from_utf8(units_output.stdout)?;
|
||||
let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
|
||||
let units_status_str = String::from_utf8(units_status_output.stdout)?;
|
||||
let mut services = Vec::new();
|
||||
|
||||
// Use configuration instead of hardcoded values
|
||||
let excluded_services = &self.config.excluded_services;
|
||||
let service_name_filters = &self.config.service_name_filters;
|
||||
|
||||
// Parse all services and cache their status information
|
||||
// Parse all service unit files to get complete service list
|
||||
let mut all_service_names = std::collections::HashSet::new();
|
||||
let mut status_cache = std::collections::HashMap::new();
|
||||
|
||||
for line in units_str.lines() {
|
||||
for line in unit_files_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
all_service_names.insert(service_name.to_string());
|
||||
debug!("Found service unit file: {}", service_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse runtime status for all units
|
||||
let mut status_cache = std::collections::HashMap::new();
|
||||
for line in units_status_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
@@ -177,8 +201,19 @@ impl SystemdCollector {
|
||||
sub_state: sub_state.clone(),
|
||||
});
|
||||
|
||||
all_service_names.insert(service_name.to_string());
|
||||
debug!("Parsed service: {} (load:{}, active:{}, sub:{})", service_name, load_state, active_state, sub_state);
|
||||
debug!("Got runtime status for service: {} (load:{}, active:{}, sub:{})", service_name, load_state, active_state, sub_state);
|
||||
}
|
||||
}
|
||||
|
||||
// For services found in unit files but not in runtime status, set default inactive status
|
||||
for service_name in &all_service_names {
|
||||
if !status_cache.contains_key(service_name) {
|
||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||
load_state: "not-loaded".to_string(),
|
||||
active_state: "inactive".to_string(),
|
||||
sub_state: "dead".to_string(),
|
||||
});
|
||||
debug!("Service {} found in unit files but not runtime - marked as inactive", service_name);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,13 +353,19 @@ impl SystemdCollector {
|
||||
Ok((active_status, detailed_info))
|
||||
}
|
||||
|
||||
/// Calculate service status
|
||||
fn calculate_service_status(&self, active_status: &str) -> Status {
|
||||
/// Calculate service status, taking user-stopped services into account
|
||||
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||
match active_status.to_lowercase().as_str() {
|
||||
"active" => Status::Ok,
|
||||
"inactive" | "dead" => Status::Warning,
|
||||
"inactive" | "dead" => {
|
||||
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
||||
Status::Inactive
|
||||
},
|
||||
"failed" | "error" => Status::Critical,
|
||||
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending,
|
||||
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => {
|
||||
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
||||
Status::Pending
|
||||
},
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
@@ -445,7 +486,7 @@ impl Collector for SystemdCollector {
|
||||
for service in &monitored_services {
|
||||
match self.get_service_status(service) {
|
||||
Ok((active_status, _detailed_info)) => {
|
||||
let status = self.calculate_service_status(&active_status);
|
||||
let status = self.calculate_service_status(service, &active_status);
|
||||
|
||||
// Individual service status metric
|
||||
metrics.push(Metric {
|
||||
@@ -520,10 +561,8 @@ impl SystemdCollector {
|
||||
for (site_name, url) in &sites {
|
||||
match self.check_site_latency(url) {
|
||||
Ok(latency_ms) => {
|
||||
let status = if latency_ms < 500.0 {
|
||||
let status = if latency_ms < self.config.nginx_latency_critical_ms {
|
||||
Status::Ok
|
||||
} else if latency_ms < 2000.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Critical
|
||||
};
|
||||
@@ -615,10 +654,10 @@ impl SystemdCollector {
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
// Create HTTP client with timeouts (similar to legacy implementation)
|
||||
// Create HTTP client with timeouts from configuration
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.connect_timeout(Duration::from_secs(10))
|
||||
.timeout(Duration::from_secs(self.config.http_timeout_seconds))
|
||||
.connect_timeout(Duration::from_secs(self.config.http_connect_timeout_seconds))
|
||||
.redirect(reqwest::redirect::Policy::limited(10))
|
||||
.build()?;
|
||||
|
||||
|
||||
@@ -65,7 +65,6 @@ impl ZmqHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send heartbeat (placeholder for future use)
|
||||
|
||||
/// Try to receive a command (non-blocking)
|
||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||
@@ -99,25 +98,4 @@ pub enum AgentCommand {
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
/// Control systemd service
|
||||
ServiceControl {
|
||||
service_name: String,
|
||||
action: ServiceAction,
|
||||
},
|
||||
/// Rebuild NixOS system
|
||||
SystemRebuild {
|
||||
git_url: String,
|
||||
git_branch: String,
|
||||
working_dir: String,
|
||||
api_key_file: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Service control actions
|
||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||
pub enum ServiceAction {
|
||||
Start,
|
||||
Stop,
|
||||
Restart,
|
||||
Status,
|
||||
}
|
||||
|
||||
@@ -25,8 +25,10 @@ pub struct ZmqConfig {
|
||||
pub publisher_port: u16,
|
||||
pub command_port: u16,
|
||||
pub bind_address: String,
|
||||
pub timeout_ms: u64,
|
||||
pub heartbeat_interval_ms: u64,
|
||||
pub transmission_interval_seconds: u64,
|
||||
/// Heartbeat transmission interval in seconds for host connectivity detection
|
||||
#[serde(default = "default_heartbeat_interval_seconds")]
|
||||
pub heartbeat_interval_seconds: u64,
|
||||
}
|
||||
|
||||
/// Collector configuration
|
||||
@@ -36,7 +38,6 @@ pub struct CollectorConfig {
|
||||
pub memory: MemoryConfig,
|
||||
pub disk: DiskConfig,
|
||||
pub systemd: SystemdConfig,
|
||||
pub smart: SmartConfig,
|
||||
pub backup: BackupConfig,
|
||||
pub network: NetworkConfig,
|
||||
pub nixos: NixOSConfig,
|
||||
@@ -75,6 +76,11 @@ pub struct DiskConfig {
|
||||
pub usage_critical_percent: f32,
|
||||
/// Filesystem configurations
|
||||
pub filesystems: Vec<FilesystemConfig>,
|
||||
/// SMART monitoring thresholds
|
||||
pub temperature_warning_celsius: f32,
|
||||
pub temperature_critical_celsius: f32,
|
||||
pub wear_warning_percent: f32,
|
||||
pub wear_critical_percent: f32,
|
||||
}
|
||||
|
||||
/// Filesystem configuration entry
|
||||
@@ -100,18 +106,12 @@ pub struct SystemdConfig {
|
||||
pub memory_critical_mb: f32,
|
||||
pub service_directories: std::collections::HashMap<String, Vec<String>>,
|
||||
pub host_user_mapping: String,
|
||||
pub nginx_check_interval_seconds: u64,
|
||||
pub http_timeout_seconds: u64,
|
||||
pub http_connect_timeout_seconds: u64,
|
||||
pub nginx_latency_critical_ms: f32,
|
||||
}
|
||||
|
||||
/// SMART collector configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SmartConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
pub temperature_warning_celsius: f32,
|
||||
pub temperature_critical_celsius: f32,
|
||||
pub wear_warning_percent: f32,
|
||||
pub wear_critical_percent: f32,
|
||||
}
|
||||
|
||||
/// NixOS collector configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -145,6 +145,23 @@ pub struct NotificationConfig {
|
||||
pub from_email: String,
|
||||
pub to_email: String,
|
||||
pub rate_limit_minutes: u64,
|
||||
/// Email notification batching interval in seconds (default: 60)
|
||||
pub aggregation_interval_seconds: u64,
|
||||
/// List of metric names to exclude from email notifications
|
||||
#[serde(default)]
|
||||
pub exclude_email_metrics: Vec<String>,
|
||||
/// Path to maintenance mode file that suppresses email notifications when present
|
||||
#[serde(default = "default_maintenance_mode_file")]
|
||||
pub maintenance_mode_file: String,
|
||||
}
|
||||
|
||||
|
||||
fn default_heartbeat_interval_seconds() -> u64 {
|
||||
5
|
||||
}
|
||||
|
||||
fn default_maintenance_mode_file() -> String {
|
||||
"/tmp/cm-maintenance".to_string()
|
||||
}
|
||||
|
||||
impl AgentConfig {
|
||||
|
||||
@@ -19,10 +19,6 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
bail!("ZMQ bind address cannot be empty");
|
||||
}
|
||||
|
||||
if config.zmq.timeout_ms == 0 {
|
||||
bail!("ZMQ timeout cannot be 0");
|
||||
}
|
||||
|
||||
// Validate collection interval
|
||||
if config.collection_interval_seconds == 0 {
|
||||
bail!("Collection interval cannot be 0");
|
||||
@@ -83,6 +79,13 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Validate systemd configuration
|
||||
if config.collectors.systemd.enabled {
|
||||
if config.collectors.systemd.nginx_latency_critical_ms <= 0.0 {
|
||||
bail!("Nginx latency critical threshold must be positive");
|
||||
}
|
||||
}
|
||||
|
||||
// Validate SMTP configuration
|
||||
if config.notifications.enabled {
|
||||
if config.notifications.smtp_host.is_empty() {
|
||||
|
||||
@@ -9,14 +9,31 @@ mod communication;
|
||||
mod config;
|
||||
mod metrics;
|
||||
mod notifications;
|
||||
mod service_tracker;
|
||||
mod status;
|
||||
|
||||
use agent::Agent;
|
||||
|
||||
/// Get version showing cm-dashboard-agent package hash for easy deployment verification
|
||||
fn get_version() -> &'static str {
|
||||
// Get the path of the current executable
|
||||
let exe_path = std::env::current_exe().expect("Failed to get executable path");
|
||||
let exe_str = exe_path.to_string_lossy();
|
||||
|
||||
// Extract Nix store hash from path like /nix/store/HASH-cm-dashboard-v0.1.8/bin/cm-dashboard-agent
|
||||
let hash_part = exe_str.strip_prefix("/nix/store/").expect("Not a nix store path");
|
||||
let hash = hash_part.split('-').next().expect("Invalid nix store path format");
|
||||
assert!(hash.len() >= 8, "Hash too short");
|
||||
|
||||
// Return first 8 characters of nix store hash
|
||||
let short_hash = hash[..8].to_string();
|
||||
Box::leak(short_hash.into_boxed_str())
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "cm-dashboard-agent")]
|
||||
#[command(about = "CM Dashboard metrics agent with individual metric collection")]
|
||||
#[command(version)]
|
||||
#[command(version = get_version())]
|
||||
struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
|
||||
@@ -1,22 +1,32 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use tracing::{error, info};
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::collectors::{
|
||||
backup::BackupCollector, cpu::CpuCollector, disk::DiskCollector, memory::MemoryCollector,
|
||||
nixos::NixOSCollector, smart::SmartCollector, systemd::SystemdCollector, Collector,
|
||||
nixos::NixOSCollector, systemd::SystemdCollector, Collector,
|
||||
};
|
||||
use crate::config::{AgentConfig, CollectorConfig};
|
||||
|
||||
/// Manages all metric collectors
|
||||
/// Collector with timing information
|
||||
struct TimedCollector {
|
||||
collector: Box<dyn Collector>,
|
||||
interval: Duration,
|
||||
last_collection: Option<Instant>,
|
||||
name: String,
|
||||
}
|
||||
|
||||
/// Manages all metric collectors with individual intervals
|
||||
pub struct MetricCollectionManager {
|
||||
collectors: Vec<Box<dyn Collector>>,
|
||||
collectors: Vec<TimedCollector>,
|
||||
status_tracker: StatusTracker,
|
||||
cached_metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
impl MetricCollectionManager {
|
||||
pub async fn new(config: &CollectorConfig, _agent_config: &AgentConfig) -> Result<Self> {
|
||||
let mut collectors: Vec<Box<dyn Collector>> = Vec::new();
|
||||
let mut collectors: Vec<TimedCollector> = Vec::new();
|
||||
|
||||
// Benchmark mode - only enable specific collector based on env var
|
||||
let benchmark_mode = std::env::var("BENCHMARK_COLLECTOR").ok();
|
||||
@@ -26,7 +36,12 @@ impl MetricCollectionManager {
|
||||
// CPU collector only
|
||||
if config.cpu.enabled {
|
||||
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||
collectors.push(Box::new(cpu_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(cpu_collector),
|
||||
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "CPU".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: CPU collector only");
|
||||
}
|
||||
}
|
||||
@@ -34,20 +49,35 @@ impl MetricCollectionManager {
|
||||
// Memory collector only
|
||||
if config.memory.enabled {
|
||||
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||
collectors.push(Box::new(memory_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(memory_collector),
|
||||
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Memory".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Memory collector only");
|
||||
}
|
||||
}
|
||||
Some("disk") => {
|
||||
// Disk collector only
|
||||
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||
collectors.push(Box::new(disk_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(disk_collector),
|
||||
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Disk".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Disk collector only");
|
||||
}
|
||||
Some("systemd") => {
|
||||
// Systemd collector only
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(systemd_collector),
|
||||
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Systemd".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Systemd collector only");
|
||||
}
|
||||
Some("backup") => {
|
||||
@@ -57,18 +87,15 @@ impl MetricCollectionManager {
|
||||
config.backup.backup_paths.first().cloned(),
|
||||
config.backup.max_age_hours,
|
||||
);
|
||||
collectors.push(Box::new(backup_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(backup_collector),
|
||||
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Backup".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Backup collector only");
|
||||
}
|
||||
}
|
||||
Some("smart") => {
|
||||
// SMART collector only
|
||||
if config.smart.enabled {
|
||||
let smart_collector = SmartCollector;
|
||||
collectors.push(Box::new(smart_collector));
|
||||
info!("BENCHMARK: SMART collector only");
|
||||
}
|
||||
}
|
||||
Some("none") => {
|
||||
// No collectors - test agent loop only
|
||||
info!("BENCHMARK: No collectors enabled");
|
||||
@@ -77,44 +104,69 @@ impl MetricCollectionManager {
|
||||
// Normal mode - all collectors
|
||||
if config.cpu.enabled {
|
||||
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||
collectors.push(Box::new(cpu_collector));
|
||||
info!("CPU collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(cpu_collector),
|
||||
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "CPU".to_string(),
|
||||
});
|
||||
info!("CPU collector initialized with {}s interval", config.cpu.interval_seconds);
|
||||
}
|
||||
|
||||
if config.memory.enabled {
|
||||
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||
collectors.push(Box::new(memory_collector));
|
||||
info!("Memory collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(memory_collector),
|
||||
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Memory".to_string(),
|
||||
});
|
||||
info!("Memory collector initialized with {}s interval", config.memory.interval_seconds);
|
||||
}
|
||||
|
||||
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||
collectors.push(Box::new(disk_collector));
|
||||
info!("Disk collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(disk_collector),
|
||||
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Disk".to_string(),
|
||||
});
|
||||
info!("Disk collector initialized with {}s interval", config.disk.interval_seconds);
|
||||
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
info!("Systemd collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(systemd_collector),
|
||||
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Systemd".to_string(),
|
||||
});
|
||||
info!("Systemd collector initialized with {}s interval", config.systemd.interval_seconds);
|
||||
|
||||
if config.backup.enabled {
|
||||
let backup_collector = BackupCollector::new(
|
||||
config.backup.backup_paths.first().cloned(),
|
||||
config.backup.max_age_hours,
|
||||
);
|
||||
collectors.push(Box::new(backup_collector));
|
||||
info!("Backup collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(backup_collector),
|
||||
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Backup".to_string(),
|
||||
});
|
||||
info!("Backup collector initialized with {}s interval", config.backup.interval_seconds);
|
||||
}
|
||||
|
||||
if config.nixos.enabled {
|
||||
let nixos_collector = NixOSCollector::new(config.nixos.clone());
|
||||
collectors.push(Box::new(nixos_collector));
|
||||
info!("NixOS collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(nixos_collector),
|
||||
interval: Duration::from_secs(config.nixos.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "NixOS".to_string(),
|
||||
});
|
||||
info!("NixOS collector initialized with {}s interval", config.nixos.interval_seconds);
|
||||
}
|
||||
|
||||
if config.smart.enabled {
|
||||
let smart_collector = SmartCollector;
|
||||
collectors.push(Box::new(smart_collector));
|
||||
info!("SMART collector initialized");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,29 +178,87 @@ impl MetricCollectionManager {
|
||||
Ok(Self {
|
||||
collectors,
|
||||
status_tracker: StatusTracker::new(),
|
||||
cached_metrics: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Force collection from ALL collectors immediately (used at startup)
|
||||
pub async fn collect_all_metrics_force(&mut self) -> Result<Vec<Metric>> {
|
||||
self.collect_all_metrics().await
|
||||
}
|
||||
|
||||
/// Collect metrics from all collectors
|
||||
pub async fn collect_all_metrics(&mut self) -> Result<Vec<Metric>> {
|
||||
let mut all_metrics = Vec::new();
|
||||
let now = Instant::now();
|
||||
|
||||
for collector in &self.collectors {
|
||||
match collector.collect(&mut self.status_tracker).await {
|
||||
for timed_collector in &mut self.collectors {
|
||||
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
let metric_count = metrics.len();
|
||||
all_metrics.extend(metrics);
|
||||
timed_collector.last_collection = Some(now);
|
||||
debug!("Force collected {} metrics from {}", metric_count, timed_collector.name);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Collector failed: {}", e);
|
||||
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Cache the collected metrics
|
||||
self.cached_metrics = all_metrics.clone();
|
||||
Ok(all_metrics)
|
||||
}
|
||||
|
||||
/// Collect metrics from collectors whose intervals have elapsed
|
||||
pub async fn collect_metrics_timed(&mut self) -> Result<Vec<Metric>> {
|
||||
let mut all_metrics = Vec::new();
|
||||
let now = Instant::now();
|
||||
|
||||
for timed_collector in &mut self.collectors {
|
||||
let should_collect = match timed_collector.last_collection {
|
||||
None => true, // First collection
|
||||
Some(last_time) => now.duration_since(last_time) >= timed_collector.interval,
|
||||
};
|
||||
|
||||
if should_collect {
|
||||
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
let metric_count = metrics.len();
|
||||
all_metrics.extend(metrics);
|
||||
timed_collector.last_collection = Some(now);
|
||||
debug!(
|
||||
"Collected {} metrics from {} ({}s interval)",
|
||||
metric_count,
|
||||
timed_collector.name,
|
||||
timed_collector.interval.as_secs()
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update cache with newly collected metrics
|
||||
if !all_metrics.is_empty() {
|
||||
// Merge new metrics with cached metrics (replace by name)
|
||||
for new_metric in &all_metrics {
|
||||
// Remove any existing metric with the same name
|
||||
self.cached_metrics.retain(|cached| cached.name != new_metric.name);
|
||||
// Add the new metric
|
||||
self.cached_metrics.push(new_metric.clone());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_metrics)
|
||||
}
|
||||
|
||||
/// Collect metrics from all collectors (legacy method for compatibility)
|
||||
pub async fn collect_all_metrics(&mut self) -> Result<Vec<Metric>> {
|
||||
self.collect_metrics_timed().await
|
||||
}
|
||||
|
||||
/// Get cached metrics without triggering fresh collection
|
||||
pub fn get_cached_metrics(&self) -> Vec<Metric> {
|
||||
self.cached_metrics.clone()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -59,6 +59,6 @@ impl NotificationManager {
|
||||
}
|
||||
|
||||
fn is_maintenance_mode(&self) -> bool {
|
||||
std::fs::metadata("/tmp/cm-maintenance").is_ok()
|
||||
std::fs::metadata(&self.config.maintenance_mode_file).is_ok()
|
||||
}
|
||||
}
|
||||
164
agent/src/service_tracker.rs
Normal file
164
agent/src/service_tracker.rs
Normal file
@@ -0,0 +1,164 @@
|
||||
use anyhow::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Shared instance for global access
|
||||
static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new();
|
||||
|
||||
/// Tracks services that have been stopped by user action
|
||||
/// These services should be treated as OK status instead of Warning
|
||||
#[derive(Debug)]
|
||||
pub struct UserStoppedServiceTracker {
|
||||
/// Set of services stopped by user action
|
||||
user_stopped_services: HashSet<String>,
|
||||
/// Path to persistent storage file
|
||||
storage_path: String,
|
||||
}
|
||||
|
||||
/// Serializable data structure for persistence
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct UserStoppedData {
|
||||
services: Vec<String>,
|
||||
}
|
||||
|
||||
impl UserStoppedServiceTracker {
|
||||
/// Create new tracker with default storage path
|
||||
pub fn new() -> Self {
|
||||
Self::with_storage_path("/var/lib/cm-dashboard/user-stopped-services.json")
|
||||
}
|
||||
|
||||
/// Initialize global instance (called by agent)
|
||||
pub fn init_global() -> Result<Self> {
|
||||
let tracker = Self::new();
|
||||
|
||||
// Set global instance
|
||||
let global_instance = Arc::new(Mutex::new(tracker));
|
||||
if GLOBAL_TRACKER.set(global_instance).is_err() {
|
||||
warn!("Global service tracker was already initialized");
|
||||
}
|
||||
|
||||
// Return a new instance for the agent to use
|
||||
Ok(Self::new())
|
||||
}
|
||||
|
||||
/// Check if a service is user-stopped (global access for collectors)
|
||||
pub fn is_service_user_stopped(service_name: &str) -> bool {
|
||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||
if let Ok(tracker) = global.lock() {
|
||||
tracker.is_user_stopped(service_name)
|
||||
} else {
|
||||
debug!("Failed to lock global service tracker");
|
||||
false
|
||||
}
|
||||
} else {
|
||||
debug!("Global service tracker not initialized");
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Update global tracker (called by agent when tracker state changes)
|
||||
pub fn update_global(updated_tracker: &UserStoppedServiceTracker) {
|
||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||
if let Ok(mut tracker) = global.lock() {
|
||||
tracker.user_stopped_services = updated_tracker.user_stopped_services.clone();
|
||||
} else {
|
||||
debug!("Failed to lock global service tracker for update");
|
||||
}
|
||||
} else {
|
||||
debug!("Global service tracker not initialized for update");
|
||||
}
|
||||
}
|
||||
|
||||
/// Create new tracker with custom storage path
|
||||
pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self {
|
||||
let storage_path = storage_path.as_ref().to_string_lossy().to_string();
|
||||
let mut tracker = Self {
|
||||
user_stopped_services: HashSet::new(),
|
||||
storage_path,
|
||||
};
|
||||
|
||||
// Load existing data from storage
|
||||
if let Err(e) = tracker.load_from_storage() {
|
||||
warn!("Failed to load user-stopped services from storage: {}", e);
|
||||
info!("Starting with empty user-stopped services list");
|
||||
}
|
||||
|
||||
tracker
|
||||
}
|
||||
|
||||
|
||||
/// Clear user-stopped flag for a service (when user starts it)
|
||||
pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
||||
if self.user_stopped_services.remove(service_name) {
|
||||
info!("Cleared user-stopped flag for service '{}'", service_name);
|
||||
self.save_to_storage()?;
|
||||
debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name);
|
||||
} else {
|
||||
debug!("Service '{}' was not marked as user-stopped", service_name);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if a service is marked as user-stopped
|
||||
pub fn is_user_stopped(&self, service_name: &str) -> bool {
|
||||
let is_stopped = self.user_stopped_services.contains(service_name);
|
||||
debug!("Service '{}' user-stopped status: {}", service_name, is_stopped);
|
||||
is_stopped
|
||||
}
|
||||
|
||||
|
||||
/// Save current state to persistent storage
|
||||
fn save_to_storage(&self) -> Result<()> {
|
||||
// Create parent directory if it doesn't exist
|
||||
if let Some(parent_dir) = Path::new(&self.storage_path).parent() {
|
||||
if !parent_dir.exists() {
|
||||
fs::create_dir_all(parent_dir)?;
|
||||
debug!("Created parent directory: {}", parent_dir.display());
|
||||
}
|
||||
}
|
||||
|
||||
let data = UserStoppedData {
|
||||
services: self.user_stopped_services.iter().cloned().collect(),
|
||||
};
|
||||
|
||||
let json_data = serde_json::to_string_pretty(&data)?;
|
||||
fs::write(&self.storage_path, json_data)?;
|
||||
|
||||
debug!(
|
||||
"Saved {} user-stopped services to {}",
|
||||
data.services.len(),
|
||||
self.storage_path
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load state from persistent storage
|
||||
fn load_from_storage(&mut self) -> Result<()> {
|
||||
if !Path::new(&self.storage_path).exists() {
|
||||
debug!("Storage file {} does not exist, starting fresh", self.storage_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let json_data = fs::read_to_string(&self.storage_path)?;
|
||||
let data: UserStoppedData = serde_json::from_str(&json_data)?;
|
||||
|
||||
self.user_stopped_services = data.services.into_iter().collect();
|
||||
|
||||
info!(
|
||||
"Loaded {} user-stopped services from {}",
|
||||
self.user_stopped_services.len(),
|
||||
self.storage_path
|
||||
);
|
||||
|
||||
if !self.user_stopped_services.is_empty() {
|
||||
debug!("User-stopped services: {:?}", self.user_stopped_services);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use chrono::Utc;
|
||||
pub struct HostStatusConfig {
|
||||
pub enabled: bool,
|
||||
pub aggregation_method: String, // "worst_case"
|
||||
pub notification_interval_seconds: u64,
|
||||
}
|
||||
|
||||
impl Default for HostStatusConfig {
|
||||
@@ -17,7 +16,6 @@ impl Default for HostStatusConfig {
|
||||
Self {
|
||||
enabled: true,
|
||||
aggregation_method: "worst_case".to_string(),
|
||||
notification_interval_seconds: 30,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -160,25 +158,62 @@ impl HostStatusManager {
|
||||
|
||||
|
||||
|
||||
/// Process a metric - updates status (notifications handled separately via batching)
|
||||
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) {
|
||||
// Just update status - notifications are handled by process_pending_notifications
|
||||
self.update_service_status(metric.name.clone(), metric.status);
|
||||
/// Process a metric - updates status and queues for aggregated notifications if status changed
|
||||
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) -> bool {
|
||||
let old_service_status = self.service_statuses.get(&metric.name).copied();
|
||||
let old_host_status = self.current_host_status;
|
||||
let new_service_status = metric.status;
|
||||
|
||||
// Update status (this recalculates host status internally)
|
||||
self.update_service_status(metric.name.clone(), new_service_status);
|
||||
|
||||
let new_host_status = self.current_host_status;
|
||||
let mut status_changed = false;
|
||||
|
||||
// Check if service status actually changed (ignore first-time status setting)
|
||||
if let Some(old_service_status) = old_service_status {
|
||||
if old_service_status != new_service_status {
|
||||
debug!("Service status change detected for {}: {:?} -> {:?}", metric.name, old_service_status, new_service_status);
|
||||
|
||||
// Queue change for aggregated notification (not immediate)
|
||||
self.queue_status_change(&metric.name, old_service_status, new_service_status);
|
||||
|
||||
status_changed = true;
|
||||
}
|
||||
} else {
|
||||
debug!("Initial status set for {}: {:?}", metric.name, new_service_status);
|
||||
}
|
||||
|
||||
/// Process pending notifications - call this at notification intervals
|
||||
// Check if host status changed (this should trigger immediate transmission)
|
||||
if old_host_status != new_host_status {
|
||||
debug!("Host status change detected: {:?} -> {:?}", old_host_status, new_host_status);
|
||||
status_changed = true;
|
||||
}
|
||||
|
||||
status_changed // Return true if either service or host status changed
|
||||
}
|
||||
|
||||
/// Queue status change for aggregated notification
|
||||
fn queue_status_change(&mut self, metric_name: &str, old_status: Status, new_status: Status) {
|
||||
// Add to pending changes for aggregated notification
|
||||
let entry = self.pending_changes.entry(metric_name.to_string()).or_insert((old_status, old_status, 0));
|
||||
entry.1 = new_status; // Update final status
|
||||
entry.2 += 1; // Increment change count
|
||||
|
||||
// Set batch start time if this is the first change
|
||||
if self.batch_start_time.is_none() {
|
||||
self.batch_start_time = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Process pending notifications - legacy method, now rarely used
|
||||
pub async fn process_pending_notifications(&mut self, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
if !self.config.enabled || self.pending_changes.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let batch_start = self.batch_start_time.unwrap_or_else(Instant::now);
|
||||
let batch_duration = batch_start.elapsed();
|
||||
|
||||
// Only process if enough time has passed
|
||||
if batch_duration.as_secs() < self.config.notification_interval_seconds {
|
||||
return Ok(());
|
||||
}
|
||||
// Process notifications immediately without interval batching
|
||||
|
||||
// Create aggregated status changes
|
||||
let aggregated = self.create_aggregated_changes();
|
||||
@@ -237,11 +272,13 @@ impl HostStatusManager {
|
||||
/// Check if a status change is significant enough for notification
|
||||
fn is_significant_change(&self, old_status: Status, new_status: Status) -> bool {
|
||||
match (old_status, new_status) {
|
||||
// Always notify on problems
|
||||
// Don't notify on transitions from Unknown (startup/restart scenario)
|
||||
(Status::Unknown, _) => false,
|
||||
// Always notify on problems (but not from Unknown)
|
||||
(_, Status::Warning) | (_, Status::Critical) => true,
|
||||
// Only notify on recovery if it's from a problem state to OK and all services are OK
|
||||
(Status::Warning | Status::Critical, Status::Ok) => self.current_host_status == Status::Ok,
|
||||
// Don't notify on startup or other transitions
|
||||
// Don't notify on other transitions
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -339,8 +376,8 @@ impl HostStatusManager {
|
||||
details.push('\n');
|
||||
}
|
||||
|
||||
// Show recoveries
|
||||
if !recovery_changes.is_empty() {
|
||||
// Show recoveries only if host status is now OK (all services recovered)
|
||||
if !recovery_changes.is_empty() && aggregated.host_status_final == Status::Ok {
|
||||
details.push_str(&format!("✅ RECOVERIES ({}):\n", recovery_changes.len()));
|
||||
for change in recovery_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.0"
|
||||
version = "0.1.82"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
@@ -19,3 +19,4 @@ ratatui = { workspace = true }
|
||||
crossterm = { workspace = true }
|
||||
toml = { workspace = true }
|
||||
gethostname = { workspace = true }
|
||||
wake-on-lan = "0.2"
|
||||
@@ -9,14 +9,13 @@ use std::io;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::communication::{AgentCommand, ServiceAction, ZmqCommandSender, ZmqConsumer};
|
||||
use crate::communication::{ZmqConsumer};
|
||||
use crate::config::DashboardConfig;
|
||||
use crate::metrics::MetricStore;
|
||||
use crate::ui::{TuiApp, UiCommand};
|
||||
use crate::ui::TuiApp;
|
||||
|
||||
pub struct Dashboard {
|
||||
zmq_consumer: ZmqConsumer,
|
||||
zmq_command_sender: ZmqCommandSender,
|
||||
metric_store: MetricStore,
|
||||
tui_app: Option<TuiApp>,
|
||||
terminal: Option<Terminal<CrosstermBackend<io::Stdout>>>,
|
||||
@@ -58,20 +57,9 @@ impl Dashboard {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize ZMQ command sender
|
||||
let zmq_command_sender = match ZmqCommandSender::new(&config.zmq) {
|
||||
Ok(sender) => sender,
|
||||
Err(e) => {
|
||||
error!("Failed to initialize ZMQ command sender: {}", e);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
// Connect to predefined hosts from configuration
|
||||
let hosts = config.hosts.predefined_hosts.clone();
|
||||
|
||||
// Try to connect to hosts but don't fail if none are available
|
||||
match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
|
||||
match zmq_consumer.connect_to_predefined_hosts(&config.hosts).await {
|
||||
Ok(_) => info!("Successfully connected to ZMQ hosts"),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
@@ -91,7 +79,7 @@ impl Dashboard {
|
||||
(None, None)
|
||||
} else {
|
||||
// Initialize TUI app
|
||||
let tui_app = TuiApp::new();
|
||||
let tui_app = TuiApp::new(config.clone());
|
||||
|
||||
// Setup terminal
|
||||
if let Err(e) = enable_raw_mode() {
|
||||
@@ -127,7 +115,6 @@ impl Dashboard {
|
||||
|
||||
Ok(Self {
|
||||
zmq_consumer,
|
||||
zmq_command_sender,
|
||||
metric_store,
|
||||
tui_app,
|
||||
terminal,
|
||||
@@ -137,18 +124,14 @@ impl Dashboard {
|
||||
})
|
||||
}
|
||||
|
||||
/// Send a command to a specific agent
|
||||
pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||
self.zmq_command_sender
|
||||
.send_command(hostname, command)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn run(&mut self) -> Result<()> {
|
||||
info!("Starting dashboard main loop");
|
||||
|
||||
let mut last_metrics_check = Instant::now();
|
||||
let metrics_check_interval = Duration::from_millis(100); // Check for metrics every 100ms
|
||||
let mut last_heartbeat_check = Instant::now();
|
||||
let heartbeat_check_interval = Duration::from_secs(1); // Check for host connectivity every 1 second
|
||||
|
||||
loop {
|
||||
// Handle terminal events (keyboard input) only if not headless
|
||||
@@ -158,16 +141,10 @@ impl Dashboard {
|
||||
match event::read() {
|
||||
Ok(event) => {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
// Handle input and check for commands
|
||||
// Handle input
|
||||
match tui_app.handle_input(event) {
|
||||
Ok(Some(command)) => {
|
||||
// Execute the command
|
||||
if let Err(e) = self.execute_ui_command(command).await {
|
||||
error!("Failed to execute UI command: {}", e);
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
// No command, check if we should quit
|
||||
Ok(_) => {
|
||||
// Check if we should quit
|
||||
if tui_app.should_quit() {
|
||||
info!("Quit requested, exiting dashboard");
|
||||
break;
|
||||
@@ -191,6 +168,17 @@ impl Dashboard {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Render UI immediately after handling input for responsive feedback
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
error!("Error rendering TUI after input: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for new metrics
|
||||
@@ -202,73 +190,69 @@ impl Dashboard {
|
||||
metric_message.metrics.len()
|
||||
);
|
||||
|
||||
// Check if this is the first time we've seen this host
|
||||
// Track first contact with host (no command needed - agent sends data every 2s)
|
||||
let is_new_host = !self
|
||||
.initial_commands_sent
|
||||
.contains(&metric_message.hostname);
|
||||
|
||||
if is_new_host {
|
||||
info!(
|
||||
"First contact with host {}, sending initial CollectNow command",
|
||||
metric_message.hostname
|
||||
);
|
||||
|
||||
// Send CollectNow command for immediate refresh
|
||||
if let Err(e) = self
|
||||
.send_command(&metric_message.hostname, AgentCommand::CollectNow)
|
||||
.await
|
||||
{
|
||||
error!(
|
||||
"Failed to send initial CollectNow command to {}: {}",
|
||||
metric_message.hostname, e
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"✓ Sent initial CollectNow command to {}",
|
||||
"First contact with host {} - data will update automatically",
|
||||
metric_message.hostname
|
||||
);
|
||||
self.initial_commands_sent
|
||||
.insert(metric_message.hostname.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Update metric store
|
||||
self.metric_store
|
||||
.update_metrics(&metric_message.hostname, metric_message.metrics);
|
||||
|
||||
// Update TUI with new hosts and metrics (only if not headless)
|
||||
// Check for agent version mismatches across hosts
|
||||
if let Some((current_version, outdated_hosts)) = self.metric_store.get_version_mismatches() {
|
||||
for outdated_host in &outdated_hosts {
|
||||
warn!("Host {} has outdated agent version (current: {})", outdated_host, current_version);
|
||||
}
|
||||
}
|
||||
|
||||
// Update TUI with new metrics (only if not headless)
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let mut connected_hosts = self
|
||||
.metric_store
|
||||
.get_connected_hosts(Duration::from_secs(30));
|
||||
|
||||
// Add hosts that are rebuilding but may be temporarily disconnected
|
||||
// Use extended timeout (5 minutes) for rebuilding hosts
|
||||
let rebuilding_hosts = self
|
||||
.metric_store
|
||||
.get_connected_hosts(Duration::from_secs(300));
|
||||
|
||||
for host in rebuilding_hosts {
|
||||
if !connected_hosts.contains(&host) {
|
||||
// Check if this host is rebuilding in the UI
|
||||
if tui_app.is_host_rebuilding(&host) {
|
||||
connected_hosts.push(host);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
tui_app.update_metrics(&self.metric_store);
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for command output messages
|
||||
if let Ok(Some(cmd_output)) = self.zmq_consumer.receive_command_output().await {
|
||||
debug!(
|
||||
"Received command output from {}: {}",
|
||||
cmd_output.hostname,
|
||||
cmd_output.output_line
|
||||
);
|
||||
|
||||
// Command output (terminal popup removed - output not displayed)
|
||||
}
|
||||
|
||||
last_metrics_check = Instant::now();
|
||||
}
|
||||
|
||||
// Check for host connectivity changes (heartbeat timeouts) periodically
|
||||
if last_heartbeat_check.elapsed() >= heartbeat_check_interval {
|
||||
let timeout = Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds);
|
||||
|
||||
// Clean up metrics for offline hosts
|
||||
self.metric_store.cleanup_offline_hosts(timeout);
|
||||
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self.metric_store.get_connected_hosts(timeout);
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
}
|
||||
last_heartbeat_check = Instant::now();
|
||||
}
|
||||
|
||||
// Render TUI (only if not headless)
|
||||
if !self.headless {
|
||||
if let (Some(ref mut terminal), Some(ref mut tui_app)) =
|
||||
(&mut self.terminal, &mut self.tui_app)
|
||||
{
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
@@ -277,6 +261,7 @@ impl Dashboard {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Small sleep to prevent excessive CPU usage
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
@@ -286,51 +271,6 @@ impl Dashboard {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute a UI command by sending it to the appropriate agent
|
||||
async fn execute_ui_command(&self, command: UiCommand) -> Result<()> {
|
||||
match command {
|
||||
UiCommand::ServiceRestart { hostname, service_name } => {
|
||||
info!("Sending restart command for service {} on {}", service_name, hostname);
|
||||
let agent_command = AgentCommand::ServiceControl {
|
||||
service_name,
|
||||
action: ServiceAction::Restart,
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
UiCommand::ServiceStart { hostname, service_name } => {
|
||||
info!("Sending start command for service {} on {}", service_name, hostname);
|
||||
let agent_command = AgentCommand::ServiceControl {
|
||||
service_name: service_name.clone(),
|
||||
action: ServiceAction::Start,
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
UiCommand::ServiceStop { hostname, service_name } => {
|
||||
info!("Sending stop command for service {} on {}", service_name, hostname);
|
||||
let agent_command = AgentCommand::ServiceControl {
|
||||
service_name: service_name.clone(),
|
||||
action: ServiceAction::Stop,
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
UiCommand::SystemRebuild { hostname } => {
|
||||
info!("Sending system rebuild command to {}", hostname);
|
||||
let agent_command = AgentCommand::SystemRebuild {
|
||||
git_url: self.config.system.nixos_config_git_url.clone(),
|
||||
git_branch: self.config.system.nixos_config_branch.clone(),
|
||||
working_dir: self.config.system.nixos_config_working_dir.clone(),
|
||||
api_key_file: self.config.system.nixos_config_api_key_file.clone(),
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
UiCommand::TriggerBackup { hostname } => {
|
||||
info!("Trigger backup requested for {}", hostname);
|
||||
// TODO: Implement backup trigger command
|
||||
info!("Backup trigger not yet implemented");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,43 +1,10 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{MessageEnvelope, MessageType, MetricMessage};
|
||||
use cm_dashboard_shared::{CommandOutputMessage, MessageEnvelope, MessageType, MetricMessage};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use zmq::{Context, Socket, SocketType};
|
||||
|
||||
use crate::config::ZmqConfig;
|
||||
|
||||
/// Commands that can be sent to agents
|
||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||
pub enum AgentCommand {
|
||||
/// Request immediate metric collection
|
||||
CollectNow,
|
||||
/// Change collection interval
|
||||
SetInterval { seconds: u64 },
|
||||
/// Enable/disable a collector
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
/// Control systemd service
|
||||
ServiceControl {
|
||||
service_name: String,
|
||||
action: ServiceAction,
|
||||
},
|
||||
/// Rebuild NixOS system
|
||||
SystemRebuild {
|
||||
git_url: String,
|
||||
git_branch: String,
|
||||
working_dir: String,
|
||||
api_key_file: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Service control actions
|
||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||
pub enum ServiceAction {
|
||||
Start,
|
||||
Stop,
|
||||
Restart,
|
||||
Status,
|
||||
}
|
||||
|
||||
/// ZMQ consumer for receiving metrics from agents
|
||||
pub struct ZmqConsumer {
|
||||
@@ -83,13 +50,14 @@ impl ZmqConsumer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to predefined hosts
|
||||
pub async fn connect_to_predefined_hosts(&mut self, hosts: &[String]) -> Result<()> {
|
||||
|
||||
/// Connect to predefined hosts using their configuration
|
||||
pub async fn connect_to_predefined_hosts(&mut self, hosts: &std::collections::HashMap<String, crate::config::HostDetails>) -> Result<()> {
|
||||
let default_port = self.config.subscriber_ports[0];
|
||||
|
||||
for hostname in hosts {
|
||||
// Try to connect, but don't fail if some hosts are unreachable
|
||||
if let Err(e) = self.connect_to_host(hostname, default_port).await {
|
||||
for (hostname, host_details) in hosts {
|
||||
// Try to connect using configured IP, but don't fail if some hosts are unreachable
|
||||
if let Err(e) = self.connect_to_host_with_details(hostname, host_details, default_port).await {
|
||||
warn!("Could not connect to {}: {}", hostname, e);
|
||||
}
|
||||
}
|
||||
@@ -103,6 +71,52 @@ impl ZmqConsumer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Connect to a host using its configuration details
|
||||
pub async fn connect_to_host_with_details(&mut self, hostname: &str, host_details: &crate::config::HostDetails, port: u16) -> Result<()> {
|
||||
// Get primary connection IP only - no fallbacks
|
||||
let primary_ip = host_details.get_connection_ip(hostname);
|
||||
|
||||
// Connect directly without fallback attempts
|
||||
self.connect_to_host(&primary_ip, port).await
|
||||
}
|
||||
|
||||
/// Receive command output from any connected agent (non-blocking)
|
||||
pub async fn receive_command_output(&mut self) -> Result<Option<CommandOutputMessage>> {
|
||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(data) => {
|
||||
// Deserialize envelope
|
||||
let envelope: MessageEnvelope = serde_json::from_slice(&data)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;
|
||||
|
||||
// Check message type
|
||||
match envelope.message_type {
|
||||
MessageType::CommandOutput => {
|
||||
let cmd_output = envelope
|
||||
.decode_command_output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to decode command output: {}", e))?;
|
||||
|
||||
debug!(
|
||||
"Received command output from {}: {}",
|
||||
cmd_output.hostname,
|
||||
cmd_output.output_line
|
||||
);
|
||||
|
||||
Ok(Some(cmd_output))
|
||||
}
|
||||
_ => Ok(None), // Not a command output message
|
||||
}
|
||||
}
|
||||
Err(zmq::Error::EAGAIN) => {
|
||||
// No message available (non-blocking mode)
|
||||
Ok(None)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("ZMQ receive error: {}", e);
|
||||
Err(anyhow::anyhow!("ZMQ receive error: {}", e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Receive metrics from any connected agent (non-blocking)
|
||||
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
|
||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||
@@ -132,6 +146,10 @@ impl ZmqConsumer {
|
||||
debug!("Received heartbeat");
|
||||
Ok(None) // Don't return heartbeats as metrics
|
||||
}
|
||||
MessageType::CommandOutput => {
|
||||
debug!("Received command output (will be handled by receive_command_output)");
|
||||
Ok(None) // Command output handled by separate method
|
||||
}
|
||||
_ => {
|
||||
debug!("Received non-metrics message: {:?}", envelope.message_type);
|
||||
Ok(None)
|
||||
@@ -150,42 +168,3 @@ impl ZmqConsumer {
|
||||
}
|
||||
}
|
||||
|
||||
/// ZMQ command sender for sending commands to agents
|
||||
pub struct ZmqCommandSender {
|
||||
context: Context,
|
||||
}
|
||||
|
||||
impl ZmqCommandSender {
|
||||
pub fn new(_config: &ZmqConfig) -> Result<Self> {
|
||||
let context = Context::new();
|
||||
|
||||
info!("ZMQ command sender initialized");
|
||||
|
||||
Ok(Self { context })
|
||||
}
|
||||
|
||||
/// Send a command to a specific agent
|
||||
pub async fn send_command(&self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||
// Create a new PUSH socket for this command (ZMQ best practice)
|
||||
let socket = self.context.socket(SocketType::PUSH)?;
|
||||
|
||||
// Set socket options
|
||||
socket.set_linger(1000)?; // Wait up to 1 second on close
|
||||
socket.set_sndtimeo(5000)?; // 5 second send timeout
|
||||
|
||||
// Connect to agent's command port (6131)
|
||||
let address = format!("tcp://{}:6131", hostname);
|
||||
socket.connect(&address)?;
|
||||
|
||||
// Serialize command
|
||||
let serialized = serde_json::to_vec(&command)?;
|
||||
|
||||
// Send command
|
||||
socket.send(&serialized, 0)?;
|
||||
|
||||
info!("Sent command {:?} to agent at {}", command, hostname);
|
||||
|
||||
// Socket will be automatically closed when dropped
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,20 +6,40 @@ use std::path::Path;
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DashboardConfig {
|
||||
pub zmq: ZmqConfig,
|
||||
pub hosts: HostsConfig,
|
||||
pub hosts: std::collections::HashMap<String, HostDetails>,
|
||||
pub system: SystemConfig,
|
||||
pub ssh: SshConfig,
|
||||
pub service_logs: std::collections::HashMap<String, Vec<ServiceLogConfig>>,
|
||||
}
|
||||
|
||||
/// ZMQ consumer configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ZmqConfig {
|
||||
pub subscriber_ports: Vec<u16>,
|
||||
/// Heartbeat timeout in seconds - hosts considered offline if no heartbeat received within this time
|
||||
#[serde(default = "default_heartbeat_timeout_seconds")]
|
||||
pub heartbeat_timeout_seconds: u64,
|
||||
}
|
||||
|
||||
/// Hosts configuration
|
||||
fn default_heartbeat_timeout_seconds() -> u64 {
|
||||
10 // Default to 10 seconds - allows for multiple missed heartbeats
|
||||
}
|
||||
|
||||
/// Individual host configuration details
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HostsConfig {
|
||||
pub predefined_hosts: Vec<String>,
|
||||
pub struct HostDetails {
|
||||
pub mac_address: Option<String>,
|
||||
/// Primary IP address (local network)
|
||||
pub ip: Option<String>,
|
||||
}
|
||||
|
||||
|
||||
impl HostDetails {
|
||||
/// Get the IP address for connection (uses ip field or hostname as fallback)
|
||||
pub fn get_connection_ip(&self, hostname: &str) -> String {
|
||||
self.ip.as_ref().unwrap_or(&hostname.to_string()).clone()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// System configuration
|
||||
@@ -31,6 +51,21 @@ pub struct SystemConfig {
|
||||
pub nixos_config_api_key_file: Option<String>,
|
||||
}
|
||||
|
||||
/// SSH configuration for rebuild and backup operations
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SshConfig {
|
||||
pub rebuild_user: String,
|
||||
pub rebuild_alias: String,
|
||||
pub backup_alias: String,
|
||||
}
|
||||
|
||||
/// Service log file configuration per host
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ServiceLogConfig {
|
||||
pub service_name: String,
|
||||
pub log_file_path: String,
|
||||
}
|
||||
|
||||
impl DashboardConfig {
|
||||
pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let path = path.as_ref();
|
||||
@@ -52,8 +87,3 @@ impl Default for ZmqConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HostsConfig {
|
||||
fn default() -> Self {
|
||||
panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use std::process;
|
||||
use tracing::{error, info};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
@@ -11,26 +12,33 @@ mod ui;
|
||||
|
||||
use app::Dashboard;
|
||||
|
||||
/// Get version showing cm-dashboard package hash for easy rebuild verification
|
||||
fn get_version() -> &'static str {
|
||||
// Get the path of the current executable
|
||||
let exe_path = std::env::current_exe().expect("Failed to get executable path");
|
||||
let exe_str = exe_path.to_string_lossy();
|
||||
|
||||
// Extract Nix store hash from path like /nix/store/HASH-cm-dashboard-0.1.0/bin/cm-dashboard
|
||||
let hash_part = exe_str.strip_prefix("/nix/store/").expect("Not a nix store path");
|
||||
let hash = hash_part.split('-').next().expect("Invalid nix store path format");
|
||||
assert!(hash.len() >= 8, "Hash too short");
|
||||
|
||||
// Return first 8 characters of nix store hash
|
||||
let short_hash = hash[..8].to_string();
|
||||
Box::leak(short_hash.into_boxed_str())
|
||||
/// Check if running inside tmux session
|
||||
fn check_tmux_session() {
|
||||
// Check for TMUX environment variable which is set when inside a tmux session
|
||||
if std::env::var("TMUX").is_err() {
|
||||
eprintln!("╭─────────────────────────────────────────────────────────────╮");
|
||||
eprintln!("│ ⚠️ TMUX REQUIRED │");
|
||||
eprintln!("├─────────────────────────────────────────────────────────────┤");
|
||||
eprintln!("│ CM Dashboard must be run inside a tmux session for proper │");
|
||||
eprintln!("│ terminal handling and remote operation functionality. │");
|
||||
eprintln!("│ │");
|
||||
eprintln!("│ Please start a tmux session first: │");
|
||||
eprintln!("│ tmux new-session -d -s dashboard cm-dashboard │");
|
||||
eprintln!("│ tmux attach-session -t dashboard │");
|
||||
eprintln!("│ │");
|
||||
eprintln!("│ Or simply: │");
|
||||
eprintln!("│ tmux │");
|
||||
eprintln!("│ cm-dashboard │");
|
||||
eprintln!("╰─────────────────────────────────────────────────────────────╯");
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "cm-dashboard")]
|
||||
#[command(about = "CM Dashboard TUI with individual metric consumption")]
|
||||
#[command(version = get_version())]
|
||||
#[command(version)]
|
||||
struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
@@ -68,6 +76,11 @@ async fn main() -> Result<()> {
|
||||
.init();
|
||||
}
|
||||
|
||||
// Check for tmux session requirement (only for TUI mode)
|
||||
if !cli.headless {
|
||||
check_tmux_session();
|
||||
}
|
||||
|
||||
if cli.headless || cli.verbose > 0 {
|
||||
info!("CM Dashboard starting with individual metrics architecture...");
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@ pub struct MetricStore {
|
||||
current_metrics: HashMap<String, HashMap<String, Metric>>,
|
||||
/// Historical metrics for trending
|
||||
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
|
||||
/// Last update timestamp per host
|
||||
last_update: HashMap<String, Instant>,
|
||||
/// Last heartbeat timestamp per host
|
||||
last_heartbeat: HashMap<String, Instant>,
|
||||
/// Configuration
|
||||
max_metrics_per_host: usize,
|
||||
history_retention: Duration,
|
||||
@@ -23,7 +23,7 @@ impl MetricStore {
|
||||
Self {
|
||||
current_metrics: HashMap::new(),
|
||||
historical_metrics: HashMap::new(),
|
||||
last_update: HashMap::new(),
|
||||
last_heartbeat: HashMap::new(),
|
||||
max_metrics_per_host,
|
||||
history_retention: Duration::from_secs(history_retention_hours * 3600),
|
||||
}
|
||||
@@ -56,10 +56,13 @@ impl MetricStore {
|
||||
|
||||
// Add to history
|
||||
host_history.push(MetricDataPoint { received_at: now });
|
||||
}
|
||||
|
||||
// Update last update timestamp
|
||||
self.last_update.insert(hostname.to_string(), now);
|
||||
// Track heartbeat metrics for connectivity detection
|
||||
if metric_name == "agent_heartbeat" {
|
||||
self.last_heartbeat.insert(hostname.to_string(), now);
|
||||
debug!("Updated heartbeat for host {}", hostname);
|
||||
}
|
||||
}
|
||||
|
||||
// Get metrics count before cleanup
|
||||
let metrics_count = host_metrics.len();
|
||||
@@ -88,22 +91,46 @@ impl MetricStore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get connected hosts (hosts with recent updates)
|
||||
/// Get connected hosts (hosts with recent heartbeats)
|
||||
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
||||
let now = Instant::now();
|
||||
|
||||
self.last_update
|
||||
self.last_heartbeat
|
||||
.iter()
|
||||
.filter_map(|(hostname, &last_update)| {
|
||||
if now.duration_since(last_update) <= timeout {
|
||||
.filter_map(|(hostname, &last_heartbeat)| {
|
||||
if now.duration_since(last_heartbeat) <= timeout {
|
||||
Some(hostname.clone())
|
||||
} else {
|
||||
debug!("Host {} considered offline - last heartbeat was {:?} ago",
|
||||
hostname, now.duration_since(last_heartbeat));
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Clean up data for offline hosts
|
||||
pub fn cleanup_offline_hosts(&mut self, timeout: Duration) {
|
||||
let now = Instant::now();
|
||||
let mut hosts_to_cleanup = Vec::new();
|
||||
|
||||
// Find hosts that are offline (no recent heartbeat)
|
||||
for (hostname, &last_heartbeat) in &self.last_heartbeat {
|
||||
if now.duration_since(last_heartbeat) > timeout {
|
||||
hosts_to_cleanup.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Clear metrics for offline hosts
|
||||
for hostname in hosts_to_cleanup {
|
||||
if let Some(metrics) = self.current_metrics.remove(&hostname) {
|
||||
info!("Cleared {} metrics for offline host: {}", metrics.len(), hostname);
|
||||
}
|
||||
// Keep heartbeat timestamp for reconnection detection
|
||||
// Don't remove from last_heartbeat to track when host was last seen
|
||||
}
|
||||
}
|
||||
|
||||
/// Cleanup old data and enforce limits
|
||||
fn cleanup_host_data(&mut self, hostname: &str) {
|
||||
let now = Instant::now();
|
||||
@@ -124,4 +151,52 @@ impl MetricStore {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get agent versions from all hosts for cross-host comparison
|
||||
pub fn get_agent_versions(&self) -> HashMap<String, String> {
|
||||
let mut versions = HashMap::new();
|
||||
|
||||
for (hostname, metrics) in &self.current_metrics {
|
||||
if let Some(version_metric) = metrics.get("agent_version") {
|
||||
if let cm_dashboard_shared::MetricValue::String(version) = &version_metric.value {
|
||||
versions.insert(hostname.clone(), version.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
versions
|
||||
}
|
||||
|
||||
/// Check for agent version mismatches across hosts
|
||||
pub fn get_version_mismatches(&self) -> Option<(String, Vec<String>)> {
|
||||
let versions = self.get_agent_versions();
|
||||
|
||||
if versions.len() < 2 {
|
||||
return None; // Need at least 2 hosts to compare
|
||||
}
|
||||
|
||||
// Find the most common version (assume it's the "current" version)
|
||||
let mut version_counts = HashMap::new();
|
||||
for version in versions.values() {
|
||||
*version_counts.entry(version.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
let most_common_version = version_counts
|
||||
.iter()
|
||||
.max_by_key(|(_, count)| *count)
|
||||
.map(|(version, _)| version.clone())?;
|
||||
|
||||
// Find hosts with different versions
|
||||
let outdated_hosts: Vec<String> = versions
|
||||
.iter()
|
||||
.filter(|(_, version)| *version != &most_common_version)
|
||||
.map(|(hostname, _)| hostname.clone())
|
||||
.collect();
|
||||
|
||||
if outdated_hosts.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some((most_common_version, outdated_hosts))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use crossterm::event::{Event, KeyCode, KeyModifiers};
|
||||
use crossterm::event::{Event, KeyCode};
|
||||
use ratatui::{
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
style::Style,
|
||||
@@ -7,56 +7,23 @@ use ratatui::{
|
||||
Frame,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Instant;
|
||||
use tracing::info;
|
||||
use wake_on_lan::MagicPacket;
|
||||
|
||||
pub mod theme;
|
||||
pub mod widgets;
|
||||
|
||||
use crate::config::DashboardConfig;
|
||||
use crate::metrics::MetricStore;
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use cm_dashboard_shared::Status;
|
||||
use theme::{Components, Layout as ThemeLayout, Theme, Typography};
|
||||
use widgets::{BackupWidget, ServicesWidget, SystemWidget, Widget};
|
||||
|
||||
/// Commands that can be triggered from the UI
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum UiCommand {
|
||||
ServiceRestart { hostname: String, service_name: String },
|
||||
ServiceStart { hostname: String, service_name: String },
|
||||
ServiceStop { hostname: String, service_name: String },
|
||||
SystemRebuild { hostname: String },
|
||||
TriggerBackup { hostname: String },
|
||||
}
|
||||
|
||||
/// Command execution status for visual feedback
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum CommandStatus {
|
||||
/// Command is executing
|
||||
InProgress { command_type: CommandType, target: String, start_time: std::time::Instant },
|
||||
/// Command completed successfully
|
||||
Success { command_type: CommandType, completed_at: std::time::Instant },
|
||||
}
|
||||
|
||||
/// Types of commands for status tracking
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum CommandType {
|
||||
ServiceRestart,
|
||||
ServiceStart,
|
||||
ServiceStop,
|
||||
SystemRebuild,
|
||||
BackupTrigger,
|
||||
}
|
||||
|
||||
/// Panel types for focus management
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum PanelType {
|
||||
System,
|
||||
Services,
|
||||
Backup,
|
||||
}
|
||||
|
||||
impl PanelType {
|
||||
}
|
||||
|
||||
/// Widget states for a specific host
|
||||
#[derive(Clone)]
|
||||
@@ -67,14 +34,8 @@ pub struct HostWidgets {
|
||||
pub services_widget: ServicesWidget,
|
||||
/// Backup widget state
|
||||
pub backup_widget: BackupWidget,
|
||||
/// Scroll offsets for each panel
|
||||
pub system_scroll_offset: usize,
|
||||
pub services_scroll_offset: usize,
|
||||
pub backup_scroll_offset: usize,
|
||||
/// Last update time for this host
|
||||
pub last_update: Option<Instant>,
|
||||
/// Active command status for visual feedback
|
||||
pub command_status: Option<CommandStatus>,
|
||||
}
|
||||
|
||||
impl HostWidgets {
|
||||
@@ -83,15 +44,12 @@ impl HostWidgets {
|
||||
system_widget: SystemWidget::new(),
|
||||
services_widget: ServicesWidget::new(),
|
||||
backup_widget: BackupWidget::new(),
|
||||
system_scroll_offset: 0,
|
||||
services_scroll_offset: 0,
|
||||
backup_scroll_offset: 0,
|
||||
last_update: None,
|
||||
command_status: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Main TUI application
|
||||
pub struct TuiApp {
|
||||
/// Widget states per host (hostname -> HostWidgets)
|
||||
@@ -102,25 +60,39 @@ pub struct TuiApp {
|
||||
available_hosts: Vec<String>,
|
||||
/// Host index for navigation
|
||||
host_index: usize,
|
||||
/// Currently focused panel
|
||||
focused_panel: PanelType,
|
||||
/// Should quit application
|
||||
should_quit: bool,
|
||||
/// Track if user manually navigated away from localhost
|
||||
user_navigated_away: bool,
|
||||
/// Dashboard configuration
|
||||
config: DashboardConfig,
|
||||
/// Cached localhost hostname to avoid repeated system calls
|
||||
localhost: String,
|
||||
}
|
||||
|
||||
impl TuiApp {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
pub fn new(config: DashboardConfig) -> Self {
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
let mut app = Self {
|
||||
host_widgets: HashMap::new(),
|
||||
current_host: None,
|
||||
available_hosts: Vec::new(),
|
||||
available_hosts: config.hosts.keys().cloned().collect(),
|
||||
host_index: 0,
|
||||
focused_panel: PanelType::System, // Start with System panel focused
|
||||
should_quit: false,
|
||||
user_navigated_away: false,
|
||||
config,
|
||||
localhost,
|
||||
};
|
||||
|
||||
// Sort predefined hosts
|
||||
app.available_hosts.sort();
|
||||
|
||||
// Initialize with first host if available
|
||||
if !app.available_hosts.is_empty() {
|
||||
app.current_host = Some(app.available_hosts[0].clone());
|
||||
}
|
||||
|
||||
app
|
||||
}
|
||||
|
||||
/// Get or create host widgets for the given hostname
|
||||
@@ -132,41 +104,39 @@ impl TuiApp {
|
||||
|
||||
/// Update widgets with metrics from store (only for current host)
|
||||
pub fn update_metrics(&mut self, metric_store: &MetricStore) {
|
||||
// Check for command timeouts first
|
||||
self.check_command_timeouts();
|
||||
|
||||
// Check for rebuild completion by agent hash change
|
||||
self.check_rebuild_completion(metric_store);
|
||||
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
// Only update widgets if we have metrics for this host
|
||||
let all_metrics = metric_store.get_metrics_for_host(&hostname);
|
||||
if !all_metrics.is_empty() {
|
||||
// Get metrics first while hostname is borrowed
|
||||
let cpu_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| {
|
||||
m.name.starts_with("cpu_")
|
||||
|| m.name.contains("c_state_")
|
||||
|| m.name.starts_with("process_top_")
|
||||
})
|
||||
.copied()
|
||||
.collect();
|
||||
let memory_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("memory_") || m.name.starts_with("disk_tmp_"))
|
||||
.copied()
|
||||
.collect();
|
||||
let service_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("service_"))
|
||||
.copied()
|
||||
.collect();
|
||||
let all_backup_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("backup_"))
|
||||
.copied()
|
||||
.collect();
|
||||
// Single pass metric categorization for better performance
|
||||
let mut cpu_metrics = Vec::new();
|
||||
let mut memory_metrics = Vec::new();
|
||||
let mut service_metrics = Vec::new();
|
||||
let mut backup_metrics = Vec::new();
|
||||
let mut nixos_metrics = Vec::new();
|
||||
let mut disk_metrics = Vec::new();
|
||||
|
||||
for metric in all_metrics {
|
||||
if metric.name.starts_with("cpu_")
|
||||
|| metric.name.contains("c_state_")
|
||||
|| metric.name.starts_with("process_top_") {
|
||||
cpu_metrics.push(metric);
|
||||
} else if metric.name.starts_with("memory_") || metric.name.starts_with("disk_tmp_") {
|
||||
memory_metrics.push(metric);
|
||||
} else if metric.name.starts_with("service_") {
|
||||
service_metrics.push(metric);
|
||||
} else if metric.name.starts_with("backup_") {
|
||||
backup_metrics.push(metric);
|
||||
} else if metric.name == "system_nixos_build" || metric.name == "system_active_users" || metric.name == "agent_version" {
|
||||
nixos_metrics.push(metric);
|
||||
} else if metric.name.starts_with("disk_") {
|
||||
disk_metrics.push(metric);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now get host widgets and update them
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
@@ -174,21 +144,7 @@ impl TuiApp {
|
||||
// Collect all system metrics (CPU, memory, NixOS, disk/storage)
|
||||
let mut system_metrics = cpu_metrics;
|
||||
system_metrics.extend(memory_metrics);
|
||||
|
||||
// Add NixOS metrics - using exact matching for build display fix
|
||||
let nixos_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name == "system_nixos_build" || m.name == "system_active_users" || m.name == "system_agent_hash")
|
||||
.copied()
|
||||
.collect();
|
||||
system_metrics.extend(nixos_metrics);
|
||||
|
||||
// Add disk/storage metrics
|
||||
let disk_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("disk_"))
|
||||
.copied()
|
||||
.collect();
|
||||
system_metrics.extend(disk_metrics);
|
||||
|
||||
host_widgets.system_widget.update_from_metrics(&system_metrics);
|
||||
@@ -197,7 +153,7 @@ impl TuiApp {
|
||||
.update_from_metrics(&service_metrics);
|
||||
host_widgets
|
||||
.backup_widget
|
||||
.update_from_metrics(&all_backup_metrics);
|
||||
.update_from_metrics(&backup_metrics);
|
||||
|
||||
host_widgets.last_update = Some(Instant::now());
|
||||
}
|
||||
@@ -205,30 +161,28 @@ impl TuiApp {
|
||||
}
|
||||
|
||||
/// Update available hosts with localhost prioritization
|
||||
pub fn update_hosts(&mut self, hosts: Vec<String>) {
|
||||
// Sort hosts alphabetically
|
||||
let mut sorted_hosts = hosts.clone();
|
||||
pub fn update_hosts(&mut self, discovered_hosts: Vec<String>) {
|
||||
// Start with configured hosts (always visible)
|
||||
let mut all_hosts: Vec<String> = self.config.hosts.keys().cloned().collect();
|
||||
|
||||
// Keep hosts that are undergoing SystemRebuild even if they're offline
|
||||
for (hostname, host_widgets) in &self.host_widgets {
|
||||
if let Some(CommandStatus::InProgress { command_type: CommandType::SystemRebuild, .. }) = &host_widgets.command_status {
|
||||
if !sorted_hosts.contains(hostname) {
|
||||
sorted_hosts.push(hostname.clone());
|
||||
}
|
||||
// Add any discovered hosts that aren't already configured
|
||||
for host in discovered_hosts {
|
||||
if !all_hosts.contains(&host) {
|
||||
all_hosts.push(host);
|
||||
}
|
||||
}
|
||||
|
||||
sorted_hosts.sort();
|
||||
self.available_hosts = sorted_hosts;
|
||||
|
||||
all_hosts.sort();
|
||||
self.available_hosts = all_hosts;
|
||||
|
||||
// Get the current hostname (localhost) for auto-selection
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
if !self.available_hosts.is_empty() {
|
||||
if self.available_hosts.contains(&localhost) && !self.user_navigated_away {
|
||||
if self.available_hosts.contains(&self.localhost) && !self.user_navigated_away {
|
||||
// Localhost is available and user hasn't navigated away - switch to it
|
||||
self.current_host = Some(localhost.clone());
|
||||
self.current_host = Some(self.localhost.clone());
|
||||
// Find the actual index of localhost in the sorted list
|
||||
self.host_index = self.available_hosts.iter().position(|h| h == &localhost).unwrap_or(0);
|
||||
self.host_index = self.available_hosts.iter().position(|h| h == &self.localhost).unwrap_or(0);
|
||||
} else if self.current_host.is_none() {
|
||||
// No current host - select first available (which is localhost if available)
|
||||
self.current_host = Some(self.available_hosts[0].clone());
|
||||
@@ -248,7 +202,7 @@ impl TuiApp {
|
||||
}
|
||||
|
||||
/// Handle keyboard input
|
||||
pub fn handle_input(&mut self, event: Event) -> Result<Option<UiCommand>> {
|
||||
pub fn handle_input(&mut self, event: Event) -> Result<()> {
|
||||
if let Event::Key(key) = event {
|
||||
match key.code {
|
||||
KeyCode::Char('q') => {
|
||||
@@ -261,78 +215,224 @@ impl TuiApp {
|
||||
self.navigate_host(1);
|
||||
}
|
||||
KeyCode::Char('r') => {
|
||||
match self.focused_panel {
|
||||
PanelType::System => {
|
||||
// System rebuild command
|
||||
// System rebuild command - works on any panel for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
self.start_command(&hostname, CommandType::SystemRebuild, hostname.clone());
|
||||
return Ok(Some(UiCommand::SystemRebuild { hostname }));
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
// Create command that shows logo, rebuilds, and waits for user input
|
||||
let logo_and_rebuild = format!(
|
||||
"bash -c 'cat << \"EOF\"\nNixOS System Rebuild\nTarget: {} ({})\n\nEOF\nssh -tt {}@{} \"bash -ic {}\"\necho\necho \"========================================\"\necho \"Rebuild completed. Press any key to close...\"\necho \"========================================\"\nread -n 1 -s\nexit'",
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_alias
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&logo_and_rebuild)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
PanelType::Services => {
|
||||
// Service restart command
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
self.start_command(&hostname, CommandType::ServiceRestart, service_name.clone());
|
||||
return Ok(Some(UiCommand::ServiceRestart { hostname, service_name }));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
info!("Manual refresh requested");
|
||||
}
|
||||
KeyCode::Char('B') => {
|
||||
// Backup command - works on any panel for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
// Create command that shows logo, runs backup, and waits for user input
|
||||
let logo_and_backup = format!(
|
||||
"bash -c 'cat << \"EOF\"\nBackup Operation\nTarget: {} ({})\n\nEOF\nssh -tt {}@{} \"bash -ic {}\"\necho\necho \"========================================\"\necho \"Backup completed. Press any key to close...\"\necho \"========================================\"\nread -n 1 -s\nexit'",
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.backup_alias
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&logo_and_backup)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('s') => {
|
||||
if self.focused_panel == PanelType::Services {
|
||||
// Service start command
|
||||
// Service start command via SSH with progress display
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
self.start_command(&hostname, CommandType::ServiceStart, service_name.clone());
|
||||
return Ok(Some(UiCommand::ServiceStart { hostname, service_name }));
|
||||
}
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let service_start_command = format!(
|
||||
"bash -c 'cat << \"EOF\"\nService Start: {}\nTarget: {} ({})\n\nEOF\nssh -tt {}@{} \"echo \\\"Starting service and following logs (Ctrl+C to stop):\\\" && echo \\\"========================================\\\" && sudo systemctl start {} & sudo journalctl -fu {} --since=\\\"1 second ago\\\" --no-pager\"\necho\necho \"========================================\"\necho \"Operation completed. Press any key to close...\"\necho \"========================================\"\nread -n 1 -s\nexit'",
|
||||
service_name,
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
service_name,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&service_start_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('S') => {
|
||||
if self.focused_panel == PanelType::Services {
|
||||
// Service stop command
|
||||
// Service stop command via SSH with progress display
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
self.start_command(&hostname, CommandType::ServiceStop, service_name.clone());
|
||||
return Ok(Some(UiCommand::ServiceStop { hostname, service_name }));
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let service_stop_command = format!(
|
||||
"bash -c 'cat << \"EOF\"\nService Stop: {}.service\nTarget: {} ({})\n\nEOF\nssh -tt {}@{} \"echo \\\"Stopping service...\\\" && sudo systemctl stop {}.service && echo \\\"Service stopped! Final logs:\\\" && echo \\\"========================================\\\" && sudo journalctl -u {}.service --no-pager -n 10 && echo \\\"========================================\\\" && sudo systemctl status {}.service --no-pager -l\"\necho\necho \"========================================\"\necho \"Operation completed. Press any key to close...\"\necho \"========================================\"\nread -n 1 -s\nexit'",
|
||||
service_name,
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
service_name,
|
||||
service_name,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&service_stop_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('J') => {
|
||||
// Show service logs via journalctl in tmux split window
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let journalctl_command = format!(
|
||||
"bash -c \"ssh -tt {}@{} 'sudo journalctl -u {}.service -f --no-pager -n 50'; exit\"",
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&journalctl_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('L') => {
|
||||
// Show custom service log file in tmux split window
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
// Check if this service has a custom log file configured
|
||||
if let Some(host_logs) = self.config.service_logs.get(&hostname) {
|
||||
if let Some(log_config) = host_logs.iter().find(|config| config.service_name == service_name) {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let tail_command = format!(
|
||||
"bash -c \"ssh -tt {}@{} 'sudo tail -n 50 -f {}'; exit\"",
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
log_config.log_file_path
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&tail_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Char('b') => {
|
||||
if self.focused_panel == PanelType::Backup {
|
||||
// Trigger backup
|
||||
}
|
||||
KeyCode::Char('w') => {
|
||||
// Wake on LAN for offline hosts
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
self.start_command(&hostname, CommandType::BackupTrigger, hostname.clone());
|
||||
return Ok(Some(UiCommand::TriggerBackup { hostname }));
|
||||
// Check if host has MAC address configured
|
||||
if let Some(host_details) = self.config.hosts.get(&hostname) {
|
||||
if let Some(mac_address) = &host_details.mac_address {
|
||||
// Parse MAC address and send WoL packet
|
||||
let mac_bytes = Self::parse_mac_address(mac_address);
|
||||
match mac_bytes {
|
||||
Ok(mac) => {
|
||||
match MagicPacket::new(&mac).send() {
|
||||
Ok(_) => {
|
||||
info!("WakeOnLAN packet sent successfully to {} ({})", hostname, mac_address);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Failed to send WakeOnLAN packet to {}: {}", hostname, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::error!("Invalid MAC address format for {}: {}", hostname, mac_address);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Char('t') => {
|
||||
// Open SSH terminal session in tmux window
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let ssh_command = format!(
|
||||
"ssh -tt {}@{}",
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30") // Use 30% like other commands
|
||||
.arg(&ssh_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
if key.modifiers.contains(KeyModifiers::SHIFT) {
|
||||
// Shift+Tab cycles through panels
|
||||
self.next_panel();
|
||||
} else {
|
||||
// Tab cycles to next host
|
||||
self.navigate_host(1);
|
||||
}
|
||||
KeyCode::Up | KeyCode::Char('k') => {
|
||||
// Move service selection up
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.select_previous();
|
||||
}
|
||||
KeyCode::BackTab => {
|
||||
// BackTab (Shift+Tab on some terminals) also cycles panels
|
||||
self.next_panel();
|
||||
}
|
||||
KeyCode::Up => {
|
||||
// Scroll up in focused panel
|
||||
self.scroll_focused_panel(-1);
|
||||
KeyCode::Down | KeyCode::Char('j') => {
|
||||
// Move service selection down
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let total_services = {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.get_total_services_count()
|
||||
};
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.select_next(total_services);
|
||||
}
|
||||
KeyCode::Down => {
|
||||
// Scroll down in focused panel
|
||||
self.scroll_focused_panel(1);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Navigate between hosts
|
||||
@@ -355,9 +455,8 @@ impl TuiApp {
|
||||
self.current_host = Some(self.available_hosts[self.host_index].clone());
|
||||
|
||||
// Check if user navigated away from localhost
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
if let Some(ref current) = self.current_host {
|
||||
if current != &localhost {
|
||||
if current != &self.localhost {
|
||||
self.user_navigated_away = true;
|
||||
} else {
|
||||
self.user_navigated_away = false; // User navigated back to localhost
|
||||
@@ -367,37 +466,7 @@ impl TuiApp {
|
||||
info!("Switched to host: {}", self.current_host.as_ref().unwrap());
|
||||
}
|
||||
|
||||
/// Check if a host is currently rebuilding
|
||||
pub fn is_host_rebuilding(&self, hostname: &str) -> bool {
|
||||
if let Some(host_widgets) = self.host_widgets.get(hostname) {
|
||||
matches!(
|
||||
&host_widgets.command_status,
|
||||
Some(CommandStatus::InProgress { command_type: CommandType::SystemRebuild, .. })
|
||||
)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Switch to next panel (Shift+Tab) - only cycles through visible panels
|
||||
pub fn next_panel(&mut self) {
|
||||
let visible_panels = self.get_visible_panels();
|
||||
if visible_panels.len() <= 1 {
|
||||
return; // Can't switch if only one or no panels visible
|
||||
}
|
||||
|
||||
// Find current panel index in visible panels
|
||||
if let Some(current_index) = visible_panels.iter().position(|&p| p == self.focused_panel) {
|
||||
// Move to next visible panel
|
||||
let next_index = (current_index + 1) % visible_panels.len();
|
||||
self.focused_panel = visible_panels[next_index];
|
||||
} else {
|
||||
// Current panel not visible, switch to first visible panel
|
||||
self.focused_panel = visible_panels[0];
|
||||
}
|
||||
|
||||
info!("Switched to panel: {:?}", self.focused_panel);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -417,144 +486,11 @@ impl TuiApp {
|
||||
self.should_quit
|
||||
}
|
||||
|
||||
/// Start command execution and track status for visual feedback
|
||||
pub fn start_command(&mut self, hostname: &str, command_type: CommandType, target: String) {
|
||||
if let Some(host_widgets) = self.host_widgets.get_mut(hostname) {
|
||||
host_widgets.command_status = Some(CommandStatus::InProgress {
|
||||
command_type,
|
||||
target,
|
||||
start_time: Instant::now(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark command as completed successfully
|
||||
pub fn complete_command(&mut self, hostname: &str) {
|
||||
if let Some(host_widgets) = self.host_widgets.get_mut(hostname) {
|
||||
if let Some(CommandStatus::InProgress { command_type, .. }) = &host_widgets.command_status {
|
||||
host_widgets.command_status = Some(CommandStatus::Success {
|
||||
command_type: command_type.clone(),
|
||||
completed_at: Instant::now(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Check for command timeouts and automatically clear them
|
||||
pub fn check_command_timeouts(&mut self) {
|
||||
let now = Instant::now();
|
||||
let mut hosts_to_clear = Vec::new();
|
||||
|
||||
for (hostname, host_widgets) in &self.host_widgets {
|
||||
if let Some(CommandStatus::InProgress { command_type, start_time, .. }) = &host_widgets.command_status {
|
||||
let timeout_duration = match command_type {
|
||||
CommandType::SystemRebuild => Duration::from_secs(300), // 5 minutes for rebuilds
|
||||
_ => Duration::from_secs(30), // 30 seconds for service commands
|
||||
};
|
||||
|
||||
if now.duration_since(*start_time) > timeout_duration {
|
||||
hosts_to_clear.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
// Also clear success/failed status after display time
|
||||
else if let Some(CommandStatus::Success { completed_at, .. }) = &host_widgets.command_status {
|
||||
if now.duration_since(*completed_at) > Duration::from_secs(3) {
|
||||
hosts_to_clear.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clear timed out commands
|
||||
for hostname in hosts_to_clear {
|
||||
if let Some(host_widgets) = self.host_widgets.get_mut(&hostname) {
|
||||
host_widgets.command_status = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check for rebuild completion by detecting agent hash changes
|
||||
pub fn check_rebuild_completion(&mut self, metric_store: &MetricStore) {
|
||||
let mut hosts_to_complete = Vec::new();
|
||||
|
||||
for (hostname, host_widgets) in &self.host_widgets {
|
||||
if let Some(CommandStatus::InProgress { command_type: CommandType::SystemRebuild, .. }) = &host_widgets.command_status {
|
||||
// Check if agent hash has changed (indicating successful rebuild)
|
||||
if let Some(agent_hash_metric) = metric_store.get_metric(hostname, "system_agent_hash") {
|
||||
if let cm_dashboard_shared::MetricValue::String(current_hash) = &agent_hash_metric.value {
|
||||
// Compare with stored hash (if we have one)
|
||||
if let Some(stored_hash) = host_widgets.system_widget.get_agent_hash() {
|
||||
if current_hash != stored_hash {
|
||||
// Agent hash changed - rebuild completed successfully
|
||||
hosts_to_complete.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mark rebuilds as completed
|
||||
for hostname in hosts_to_complete {
|
||||
self.complete_command(&hostname);
|
||||
}
|
||||
}
|
||||
|
||||
/// Scroll the focused panel up or down
|
||||
pub fn scroll_focused_panel(&mut self, direction: i32) {
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let focused_panel = self.focused_panel; // Get the value before borrowing
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
|
||||
match focused_panel {
|
||||
PanelType::System => {
|
||||
if direction > 0 {
|
||||
host_widgets.system_scroll_offset = host_widgets.system_scroll_offset.saturating_add(1);
|
||||
} else {
|
||||
host_widgets.system_scroll_offset = host_widgets.system_scroll_offset.saturating_sub(1);
|
||||
}
|
||||
info!("System panel scroll offset: {}", host_widgets.system_scroll_offset);
|
||||
}
|
||||
PanelType::Services => {
|
||||
// For services panel, Up/Down moves selection cursor, not scroll
|
||||
let total_services = host_widgets.services_widget.get_total_services_count();
|
||||
|
||||
if direction > 0 {
|
||||
host_widgets.services_widget.select_next(total_services);
|
||||
info!("Services selection moved down");
|
||||
} else {
|
||||
host_widgets.services_widget.select_previous();
|
||||
info!("Services selection moved up");
|
||||
}
|
||||
}
|
||||
PanelType::Backup => {
|
||||
if direction > 0 {
|
||||
host_widgets.backup_scroll_offset = host_widgets.backup_scroll_offset.saturating_add(1);
|
||||
} else {
|
||||
host_widgets.backup_scroll_offset = host_widgets.backup_scroll_offset.saturating_sub(1);
|
||||
}
|
||||
info!("Backup panel scroll offset: {}", host_widgets.backup_scroll_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get list of currently visible panels
|
||||
fn get_visible_panels(&self) -> Vec<PanelType> {
|
||||
let mut visible_panels = vec![PanelType::System, PanelType::Services];
|
||||
|
||||
// Check if backup panel should be shown
|
||||
if let Some(hostname) = &self.current_host {
|
||||
if let Some(host_widgets) = self.host_widgets.get(hostname) {
|
||||
if host_widgets.backup_widget.has_data() {
|
||||
visible_panels.push(PanelType::Backup);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
visible_panels
|
||||
}
|
||||
|
||||
/// Render the dashboard (real btop-style multi-panel layout)
|
||||
pub fn render(&mut self, frame: &mut Frame, metric_store: &MetricStore) {
|
||||
@@ -586,6 +522,21 @@ impl TuiApp {
|
||||
])
|
||||
.split(main_chunks[1]); // main_chunks[1] is now the content area (between title and statusbar)
|
||||
|
||||
// Check if current host is offline
|
||||
let current_host_offline = if let Some(hostname) = self.current_host.clone() {
|
||||
self.calculate_host_status(&hostname, metric_store) == Status::Offline
|
||||
} else {
|
||||
true // No host selected is considered offline
|
||||
};
|
||||
|
||||
// If host is offline, render wake-up message instead of panels
|
||||
if current_host_offline {
|
||||
self.render_offline_host_message(frame, main_chunks[1]);
|
||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||
self.render_statusbar(frame, main_chunks[2]);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if backup panel should be shown
|
||||
let show_backup = if let Some(hostname) = self.current_host.clone() {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
@@ -623,19 +574,16 @@ impl TuiApp {
|
||||
|
||||
// Render services widget for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let is_focused = self.focused_panel == PanelType::Services;
|
||||
let (scroll_offset, command_status) = {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
(host_widgets.services_scroll_offset, host_widgets.command_status.clone())
|
||||
};
|
||||
let is_focused = true; // Always show service selection
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets
|
||||
.services_widget
|
||||
.render_with_command_status(frame, content_chunks[1], is_focused, scroll_offset, command_status.as_ref()); // Services takes full right side
|
||||
.render(frame, content_chunks[1], is_focused); // Services takes full right side
|
||||
}
|
||||
|
||||
// Render statusbar at the bottom
|
||||
self.render_statusbar(frame, main_chunks[2]); // main_chunks[2] is the statusbar area
|
||||
|
||||
}
|
||||
|
||||
/// Render btop-style minimal title with host status colors
|
||||
@@ -646,67 +594,90 @@ impl TuiApp {
|
||||
|
||||
if self.available_hosts.is_empty() {
|
||||
let title_text = "cm-dashboard • no hosts discovered";
|
||||
let title = Paragraph::new(title_text).style(Typography::title());
|
||||
let title = Paragraph::new(title_text)
|
||||
.style(Style::default().fg(Theme::background()).bg(Theme::status_color(Status::Unknown)));
|
||||
frame.render_widget(title, area);
|
||||
return;
|
||||
}
|
||||
|
||||
// Create spans for each host with status indicators
|
||||
let mut spans = vec![Span::styled("cm-dashboard • ", Typography::title())];
|
||||
// Calculate worst-case status across all hosts (excluding offline)
|
||||
let mut worst_status = Status::Ok;
|
||||
for host in &self.available_hosts {
|
||||
let host_status = self.calculate_host_status(host, metric_store);
|
||||
// Don't include offline hosts in status aggregation
|
||||
if host_status != Status::Offline {
|
||||
worst_status = Status::aggregate(&[worst_status, host_status]);
|
||||
}
|
||||
}
|
||||
|
||||
// Use the worst status color as background
|
||||
let background_color = Theme::status_color(worst_status);
|
||||
|
||||
// Split the title bar into left and right sections
|
||||
let chunks = Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([Constraint::Length(15), Constraint::Min(0)])
|
||||
.split(area);
|
||||
|
||||
// Left side: "cm-dashboard" text
|
||||
let left_span = Span::styled(
|
||||
" cm-dashboard",
|
||||
Style::default().fg(Theme::background()).bg(background_color).add_modifier(Modifier::BOLD)
|
||||
);
|
||||
let left_title = Paragraph::new(Line::from(vec![left_span]))
|
||||
.style(Style::default().bg(background_color));
|
||||
frame.render_widget(left_title, chunks[0]);
|
||||
|
||||
// Right side: hosts with status indicators
|
||||
let mut host_spans = Vec::new();
|
||||
|
||||
for (i, host) in self.available_hosts.iter().enumerate() {
|
||||
if i > 0 {
|
||||
spans.push(Span::styled(" ", Typography::title()));
|
||||
host_spans.push(Span::styled(
|
||||
" ",
|
||||
Style::default().fg(Theme::background()).bg(background_color)
|
||||
));
|
||||
}
|
||||
|
||||
// Check if this host has a command status that affects the icon
|
||||
let (status_icon, status_color) = if let Some(host_widgets) = self.host_widgets.get(host) {
|
||||
match &host_widgets.command_status {
|
||||
Some(CommandStatus::InProgress { command_type: CommandType::SystemRebuild, .. }) => {
|
||||
// Show blue circular arrow during rebuild
|
||||
("↻", Theme::highlight())
|
||||
}
|
||||
Some(CommandStatus::Success { command_type: CommandType::SystemRebuild, .. }) => {
|
||||
// Show green checkmark for successful rebuild
|
||||
("✓", Theme::success())
|
||||
}
|
||||
_ => {
|
||||
// Normal status icon based on metrics
|
||||
// Always show normal status icon based on metrics (no command status at host level)
|
||||
let host_status = self.calculate_host_status(host, metric_store);
|
||||
(StatusIcons::get_icon(host_status), Theme::status_color(host_status))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No host widgets yet, use normal status
|
||||
let host_status = self.calculate_host_status(host, metric_store);
|
||||
(StatusIcons::get_icon(host_status), Theme::status_color(host_status))
|
||||
};
|
||||
let status_icon = StatusIcons::get_icon(host_status);
|
||||
|
||||
// Add status icon
|
||||
spans.push(Span::styled(
|
||||
// Add status icon with background color as foreground against status background
|
||||
host_spans.push(Span::styled(
|
||||
format!("{} ", status_icon),
|
||||
Style::default().fg(status_color),
|
||||
Style::default().fg(Theme::background()).bg(background_color),
|
||||
));
|
||||
|
||||
if Some(host) == self.current_host.as_ref() {
|
||||
// Selected host in bold bright white
|
||||
spans.push(Span::styled(
|
||||
// Selected host in bold background color against status background
|
||||
host_spans.push(Span::styled(
|
||||
host.clone(),
|
||||
Typography::title().add_modifier(Modifier::BOLD),
|
||||
Style::default()
|
||||
.fg(Theme::background())
|
||||
.bg(background_color)
|
||||
.add_modifier(Modifier::BOLD),
|
||||
));
|
||||
} else {
|
||||
// Other hosts in normal style with status color
|
||||
spans.push(Span::styled(
|
||||
// Other hosts in normal background color against status background
|
||||
host_spans.push(Span::styled(
|
||||
host.clone(),
|
||||
Style::default().fg(status_color),
|
||||
Style::default().fg(Theme::background()).bg(background_color),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let title_line = Line::from(spans);
|
||||
let title = Paragraph::new(vec![title_line]);
|
||||
// Add right padding
|
||||
host_spans.push(Span::styled(
|
||||
" ",
|
||||
Style::default().fg(Theme::background()).bg(background_color)
|
||||
));
|
||||
|
||||
frame.render_widget(title, area);
|
||||
let host_line = Line::from(host_spans);
|
||||
let host_title = Paragraph::new(vec![host_line])
|
||||
.style(Style::default().bg(background_color))
|
||||
.alignment(ratatui::layout::Alignment::Right);
|
||||
frame.render_widget(host_title, chunks[1]);
|
||||
}
|
||||
|
||||
/// Calculate overall status for a host based on its metrics
|
||||
@@ -714,7 +685,7 @@ impl TuiApp {
|
||||
let metrics = metric_store.get_metrics_for_host(hostname);
|
||||
|
||||
if metrics.is_empty() {
|
||||
return Status::Unknown;
|
||||
return Status::Offline;
|
||||
}
|
||||
|
||||
// First check if we have the aggregated host status summary from the agent
|
||||
@@ -734,7 +705,9 @@ impl TuiApp {
|
||||
Status::Warning => has_warning = true,
|
||||
Status::Pending => has_pending = true,
|
||||
Status::Ok => ok_count += 1,
|
||||
Status::Unknown => {} // Ignore unknown for aggregation
|
||||
Status::Inactive => ok_count += 1, // Treat inactive as OK for aggregation
|
||||
Status::Unknown => {}, // Ignore unknown for aggregation
|
||||
Status::Offline => {}, // Ignore offline for aggregation
|
||||
}
|
||||
}
|
||||
|
||||
@@ -769,70 +742,139 @@ impl TuiApp {
|
||||
let mut shortcuts = Vec::new();
|
||||
|
||||
// Global shortcuts
|
||||
shortcuts.push("Tab: Switch Host".to_string());
|
||||
shortcuts.push("Shift+Tab: Switch Panel".to_string());
|
||||
|
||||
// Scroll shortcuts (always available)
|
||||
shortcuts.push("↑↓: Scroll".to_string());
|
||||
|
||||
// Panel-specific shortcuts
|
||||
match self.focused_panel {
|
||||
PanelType::System => {
|
||||
shortcuts.push("R: Rebuild".to_string());
|
||||
}
|
||||
PanelType::Services => {
|
||||
shortcuts.push("S: Start".to_string());
|
||||
shortcuts.push("Shift+S: Stop".to_string());
|
||||
shortcuts.push("R: Restart".to_string());
|
||||
}
|
||||
PanelType::Backup => {
|
||||
shortcuts.push("B: Trigger Backup".to_string());
|
||||
}
|
||||
}
|
||||
shortcuts.push("Tab: Host".to_string());
|
||||
shortcuts.push("↑↓/jk: Select".to_string());
|
||||
shortcuts.push("r: Rebuild".to_string());
|
||||
shortcuts.push("s/S: Start/Stop".to_string());
|
||||
shortcuts.push("J: Logs".to_string());
|
||||
shortcuts.push("L: Custom".to_string());
|
||||
shortcuts.push("w: Wake".to_string());
|
||||
|
||||
// Always show quit
|
||||
shortcuts.push("Q: Quit".to_string());
|
||||
shortcuts.push("q: Quit".to_string());
|
||||
|
||||
shortcuts
|
||||
}
|
||||
|
||||
fn render_system_panel(&mut self, frame: &mut Frame, area: Rect, _metric_store: &MetricStore) {
|
||||
let system_block = if self.focused_panel == PanelType::System {
|
||||
Components::focused_widget_block("system")
|
||||
} else {
|
||||
Components::widget_block("system")
|
||||
};
|
||||
let system_block = Components::widget_block("system");
|
||||
let inner_area = system_block.inner(area);
|
||||
frame.render_widget(system_block, area);
|
||||
// Get current host widgets, create if none exist
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let scroll_offset = {
|
||||
// Clone the config to avoid borrowing issues
|
||||
let config = self.config.clone();
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.system_scroll_offset
|
||||
};
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.system_widget.render_with_scroll(frame, inner_area, scroll_offset);
|
||||
host_widgets.system_widget.render(frame, inner_area, &hostname, Some(&config));
|
||||
}
|
||||
}
|
||||
|
||||
fn render_backup_panel(&mut self, frame: &mut Frame, area: Rect) {
|
||||
let backup_block = if self.focused_panel == PanelType::Backup {
|
||||
Components::focused_widget_block("backup")
|
||||
} else {
|
||||
Components::widget_block("backup")
|
||||
};
|
||||
let backup_block = Components::widget_block("backup");
|
||||
let inner_area = backup_block.inner(area);
|
||||
frame.render_widget(backup_block, area);
|
||||
|
||||
// Get current host widgets for backup widget
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let scroll_offset = {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.backup_scroll_offset
|
||||
};
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.backup_widget.render_with_scroll(frame, inner_area, scroll_offset);
|
||||
host_widgets.backup_widget.render(frame, inner_area);
|
||||
}
|
||||
}
|
||||
|
||||
/// Render offline host message with wake-up option
|
||||
fn render_offline_host_message(&self, frame: &mut Frame, area: Rect) {
|
||||
use ratatui::layout::Alignment;
|
||||
use ratatui::style::Modifier;
|
||||
use ratatui::text::{Line, Span};
|
||||
use ratatui::widgets::{Block, Borders, Paragraph};
|
||||
|
||||
// Get hostname for message
|
||||
let hostname = self.current_host.as_ref()
|
||||
.map(|h| h.as_str())
|
||||
.unwrap_or("Unknown");
|
||||
|
||||
// Check if host has MAC address for wake-on-LAN
|
||||
let has_mac = self.current_host.as_ref()
|
||||
.and_then(|hostname| self.config.hosts.get(hostname))
|
||||
.and_then(|details| details.mac_address.as_ref())
|
||||
.is_some();
|
||||
|
||||
// Create message content
|
||||
let mut lines = vec![
|
||||
Line::from(Span::styled(
|
||||
format!("Host '{}' is offline", hostname),
|
||||
Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD),
|
||||
)),
|
||||
Line::from(""),
|
||||
];
|
||||
|
||||
if has_mac {
|
||||
lines.push(Line::from(Span::styled(
|
||||
"Press 'w' to wake up host",
|
||||
Style::default().fg(Theme::primary_text()).add_modifier(Modifier::BOLD),
|
||||
)));
|
||||
} else {
|
||||
lines.push(Line::from(Span::styled(
|
||||
"No MAC address configured - cannot wake up",
|
||||
Style::default().fg(Theme::muted_text()),
|
||||
)));
|
||||
}
|
||||
|
||||
// Create centered message
|
||||
let message = Paragraph::new(lines)
|
||||
.block(Block::default()
|
||||
.borders(Borders::ALL)
|
||||
.border_style(Style::default().fg(Theme::muted_text()))
|
||||
.title(" Offline Host ")
|
||||
.title_style(Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD)))
|
||||
.style(Style::default().bg(Theme::background()).fg(Theme::primary_text()))
|
||||
.alignment(Alignment::Center);
|
||||
|
||||
// Center the message in the available area
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Percentage(40),
|
||||
Constraint::Length(6),
|
||||
Constraint::Percentage(40),
|
||||
])
|
||||
.split(area)[1];
|
||||
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([
|
||||
Constraint::Percentage(25),
|
||||
Constraint::Percentage(50),
|
||||
Constraint::Percentage(25),
|
||||
])
|
||||
.split(popup_area)[1];
|
||||
|
||||
frame.render_widget(message, popup_area);
|
||||
}
|
||||
|
||||
/// Parse MAC address string (e.g., "AA:BB:CC:DD:EE:FF") to [u8; 6]
|
||||
/// Get the connection IP for a hostname based on host configuration
|
||||
fn get_connection_ip(&self, hostname: &str) -> String {
|
||||
if let Some(host_details) = self.config.hosts.get(hostname) {
|
||||
host_details.get_connection_ip(hostname)
|
||||
} else {
|
||||
hostname.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_mac_address(mac_str: &str) -> Result<[u8; 6], &'static str> {
|
||||
let parts: Vec<&str> = mac_str.split(':').collect();
|
||||
if parts.len() != 6 {
|
||||
return Err("MAC address must have 6 parts separated by colons");
|
||||
}
|
||||
|
||||
let mut mac = [0u8; 6];
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
match u8::from_str_radix(part, 16) {
|
||||
Ok(byte) => mac[i] = byte,
|
||||
Err(_) => return Err("Invalid hexadecimal byte in MAC address"),
|
||||
}
|
||||
}
|
||||
Ok(mac)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,10 +143,12 @@ impl Theme {
|
||||
pub fn status_color(status: Status) -> Color {
|
||||
match status {
|
||||
Status::Ok => Self::success(),
|
||||
Status::Inactive => Self::muted_text(), // Gray for inactive services in service list
|
||||
Status::Pending => Self::highlight(), // Blue for pending
|
||||
Status::Warning => Self::warning(),
|
||||
Status::Critical => Self::error(),
|
||||
Status::Unknown => Self::muted_text(),
|
||||
Status::Offline => Self::muted_text(), // Dark gray for offline
|
||||
}
|
||||
}
|
||||
|
||||
@@ -242,10 +244,12 @@ impl StatusIcons {
|
||||
pub fn get_icon(status: Status) -> &'static str {
|
||||
match status {
|
||||
Status::Ok => "●",
|
||||
Status::Inactive => "○", // Empty circle for inactive services
|
||||
Status::Pending => "◉", // Hollow circle for pending
|
||||
Status::Warning => "◐",
|
||||
Status::Critical => "◯",
|
||||
Status::Critical => "!",
|
||||
Status::Unknown => "?",
|
||||
Status::Offline => "○", // Empty circle for offline
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,10 +258,12 @@ impl StatusIcons {
|
||||
let icon = Self::get_icon(status);
|
||||
let status_color = match status {
|
||||
Status::Ok => Theme::success(), // Green
|
||||
Status::Inactive => Theme::muted_text(), // Gray for inactive services
|
||||
Status::Pending => Theme::highlight(), // Blue
|
||||
Status::Warning => Theme::warning(), // Yellow
|
||||
Status::Critical => Theme::error(), // Red
|
||||
Status::Unknown => Theme::muted_text(), // Gray
|
||||
Status::Offline => Theme::muted_text(), // Dark gray for offline
|
||||
};
|
||||
|
||||
vec![
|
||||
@@ -289,27 +295,9 @@ impl Components {
|
||||
)
|
||||
}
|
||||
|
||||
/// Widget block with focus indicator (blue border)
|
||||
pub fn focused_widget_block(title: &str) -> Block<'_> {
|
||||
Block::default()
|
||||
.title(title)
|
||||
.borders(Borders::ALL)
|
||||
.style(Style::default().fg(Theme::highlight()).bg(Theme::background())) // Blue border for focus
|
||||
.title_style(
|
||||
Style::default()
|
||||
.fg(Theme::highlight()) // Blue title for focus
|
||||
.bg(Theme::background()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Typography {
|
||||
/// Main title style (dashboard header)
|
||||
pub fn title() -> Style {
|
||||
Style::default()
|
||||
.fg(Theme::primary_text())
|
||||
.bg(Theme::background())
|
||||
}
|
||||
|
||||
/// Widget title style (panel headers) - bold bright white
|
||||
pub fn widget_title() -> Style {
|
||||
|
||||
@@ -259,7 +259,12 @@ impl Widget for BackupWidget {
|
||||
services.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
self.service_metrics = services;
|
||||
|
||||
self.has_data = !metrics.is_empty();
|
||||
// Only show backup panel if we have meaningful backup data
|
||||
self.has_data = !metrics.is_empty() && (
|
||||
self.last_run_timestamp.is_some() ||
|
||||
self.total_repo_size_gb.is_some() ||
|
||||
!self.service_metrics.is_empty()
|
||||
);
|
||||
|
||||
debug!(
|
||||
"Backup widget updated: status={:?}, services={}, total_size={:?}GB",
|
||||
@@ -280,8 +285,8 @@ impl Widget for BackupWidget {
|
||||
}
|
||||
|
||||
impl BackupWidget {
|
||||
/// Render with scroll offset support
|
||||
pub fn render_with_scroll(&mut self, frame: &mut Frame, area: Rect, scroll_offset: usize) {
|
||||
/// Render backup widget
|
||||
pub fn render(&mut self, frame: &mut Frame, area: Rect) {
|
||||
let mut lines = Vec::new();
|
||||
|
||||
// Latest backup section
|
||||
@@ -361,42 +366,20 @@ impl BackupWidget {
|
||||
let total_lines = lines.len();
|
||||
let available_height = area.height as usize;
|
||||
|
||||
// Calculate scroll boundaries
|
||||
let max_scroll = if total_lines > available_height {
|
||||
total_lines - available_height
|
||||
} else {
|
||||
total_lines.saturating_sub(1)
|
||||
};
|
||||
let effective_scroll = scroll_offset.min(max_scroll);
|
||||
|
||||
// Apply scrolling if needed
|
||||
if scroll_offset > 0 || total_lines > available_height {
|
||||
// Show only what fits, with "X more below" if needed
|
||||
if total_lines > available_height {
|
||||
let lines_for_content = available_height.saturating_sub(1); // Reserve one line for "more below"
|
||||
let mut visible_lines: Vec<_> = lines
|
||||
.into_iter()
|
||||
.skip(effective_scroll)
|
||||
.take(available_height)
|
||||
.take(lines_for_content)
|
||||
.collect();
|
||||
|
||||
// Add scroll indicator if there are hidden lines
|
||||
if total_lines > available_height {
|
||||
let hidden_above = effective_scroll;
|
||||
let hidden_below = total_lines.saturating_sub(effective_scroll + available_height);
|
||||
|
||||
if (hidden_above > 0 || hidden_below > 0) && !visible_lines.is_empty() {
|
||||
let scroll_text = if hidden_above > 0 && hidden_below > 0 {
|
||||
format!("... {} above, {} below", hidden_above, hidden_below)
|
||||
} else if hidden_above > 0 {
|
||||
format!("... {} more above", hidden_above)
|
||||
} else {
|
||||
format!("... {} more below", hidden_below)
|
||||
};
|
||||
|
||||
// Replace last line with scroll indicator
|
||||
visible_lines.pop();
|
||||
visible_lines.push(ratatui::text::Line::from(vec![
|
||||
ratatui::text::Span::styled(scroll_text, Typography::muted())
|
||||
]));
|
||||
}
|
||||
let hidden_below = total_lines.saturating_sub(lines_for_content);
|
||||
if hidden_below > 0 {
|
||||
let more_line = ratatui::text::Line::from(vec![
|
||||
ratatui::text::Span::styled(format!("... {} more below", hidden_below), Typography::muted())
|
||||
]);
|
||||
visible_lines.push(more_line);
|
||||
}
|
||||
|
||||
let paragraph = Paragraph::new(ratatui::text::Text::from(visible_lines));
|
||||
|
||||
@@ -9,7 +9,6 @@ use tracing::debug;
|
||||
|
||||
use super::Widget;
|
||||
use crate::ui::theme::{Components, StatusIcons, Theme, Typography};
|
||||
use crate::ui::{CommandStatus, CommandType};
|
||||
use ratatui::style::Style;
|
||||
|
||||
/// Services widget displaying hierarchical systemd service statuses
|
||||
@@ -113,13 +112,10 @@ impl ServicesWidget {
|
||||
name.to_string()
|
||||
};
|
||||
|
||||
// Parent services always show active/inactive status
|
||||
// Parent services always show actual systemctl status
|
||||
let status_str = match info.widget_status {
|
||||
Status::Ok => "active".to_string(),
|
||||
Status::Pending => "pending".to_string(),
|
||||
Status::Warning => "inactive".to_string(),
|
||||
Status::Critical => "failed".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
_ => info.status.clone(), // Use actual status from agent (active/inactive/failed)
|
||||
};
|
||||
|
||||
format!(
|
||||
@@ -128,49 +124,14 @@ impl ServicesWidget {
|
||||
)
|
||||
}
|
||||
|
||||
/// Get status icon for service, considering command status for visual feedback
|
||||
fn get_service_icon_and_status(&self, service_name: &str, info: &ServiceInfo, command_status: Option<&CommandStatus>) -> (String, String, ratatui::prelude::Color) {
|
||||
// Check if this service is currently being operated on
|
||||
if let Some(status) = command_status {
|
||||
match status {
|
||||
CommandStatus::InProgress { command_type, target, .. } => {
|
||||
if target == service_name {
|
||||
// Only show special icons for service commands
|
||||
if let Some((icon, status_text)) = match command_type {
|
||||
CommandType::ServiceRestart => Some(("↻", "restarting")),
|
||||
CommandType::ServiceStart => Some(("↑", "starting")),
|
||||
CommandType::ServiceStop => Some(("↓", "stopping")),
|
||||
_ => None, // Don't handle non-service commands here
|
||||
} {
|
||||
return (icon.to_string(), status_text.to_string(), Theme::highlight());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {} // Success/Failed states will show normal status
|
||||
}
|
||||
}
|
||||
|
||||
// Normal status display
|
||||
let icon = StatusIcons::get_icon(info.widget_status);
|
||||
let status_color = match info.widget_status {
|
||||
Status::Ok => Theme::success(),
|
||||
Status::Pending => Theme::highlight(),
|
||||
Status::Warning => Theme::warning(),
|
||||
Status::Critical => Theme::error(),
|
||||
Status::Unknown => Theme::muted_text(),
|
||||
};
|
||||
|
||||
(icon.to_string(), info.status.clone(), status_color)
|
||||
}
|
||||
|
||||
|
||||
/// Create spans for sub-service with icon next to name, considering command status
|
||||
fn create_sub_service_spans_with_status(
|
||||
/// Create spans for sub-service with icon next to name
|
||||
fn create_sub_service_spans(
|
||||
&self,
|
||||
name: &str,
|
||||
info: &ServiceInfo,
|
||||
is_last: bool,
|
||||
command_status: Option<&CommandStatus>,
|
||||
) -> Vec<ratatui::text::Span<'static>> {
|
||||
// Truncate long sub-service names to fit layout (accounting for indentation)
|
||||
let short_name = if name.len() > 18 {
|
||||
@@ -179,19 +140,28 @@ impl ServicesWidget {
|
||||
name.to_string()
|
||||
};
|
||||
|
||||
// Get status icon and text, considering command status
|
||||
let (icon, mut status_str, status_color) = self.get_service_icon_and_status(name, info, command_status);
|
||||
// Get status icon and text
|
||||
let icon = StatusIcons::get_icon(info.widget_status);
|
||||
let status_color = match info.widget_status {
|
||||
Status::Ok => Theme::success(),
|
||||
Status::Inactive => Theme::muted_text(),
|
||||
Status::Pending => Theme::highlight(),
|
||||
Status::Warning => Theme::warning(),
|
||||
Status::Critical => Theme::error(),
|
||||
Status::Unknown => Theme::muted_text(),
|
||||
Status::Offline => Theme::muted_text(),
|
||||
};
|
||||
|
||||
// For sub-services, prefer latency if available (unless command is in progress)
|
||||
if command_status.is_none() {
|
||||
if let Some(latency) = info.latency_ms {
|
||||
status_str = if latency < 0.0 {
|
||||
// For sub-services, prefer latency if available
|
||||
let status_str = if let Some(latency) = info.latency_ms {
|
||||
if latency < 0.0 {
|
||||
"timeout".to_string()
|
||||
} else {
|
||||
format!("{:.0}ms", latency)
|
||||
}
|
||||
} else {
|
||||
info.status.clone()
|
||||
};
|
||||
}
|
||||
}
|
||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
||||
|
||||
vec![
|
||||
@@ -241,13 +211,14 @@ impl ServicesWidget {
|
||||
/// Get currently selected service name (for actions)
|
||||
pub fn get_selected_service(&self) -> Option<String> {
|
||||
// Build the same display list to find the selected service
|
||||
let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>)> = Vec::new();
|
||||
let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>, String)> = Vec::new();
|
||||
|
||||
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
for (parent_name, parent_info) in parent_services {
|
||||
display_lines.push((parent_name.clone(), parent_info.widget_status, false, None));
|
||||
let parent_line = self.format_parent_service_line(parent_name, parent_info);
|
||||
display_lines.push((parent_line, parent_info.widget_status, false, None, parent_name.clone()));
|
||||
|
||||
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||
let mut sorted_subs = sub_list.clone();
|
||||
@@ -255,17 +226,19 @@ impl ServicesWidget {
|
||||
|
||||
for (i, (sub_name, sub_info)) in sorted_subs.iter().enumerate() {
|
||||
let is_last_sub = i == sorted_subs.len() - 1;
|
||||
let full_sub_name = format!("{}_{}", parent_name, sub_name);
|
||||
display_lines.push((
|
||||
format!("{}_{}", parent_name, sub_name), // Use parent_sub format for sub-services
|
||||
sub_name.clone(),
|
||||
sub_info.widget_status,
|
||||
true,
|
||||
Some((sub_info.clone(), is_last_sub)),
|
||||
full_sub_name,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
display_lines.get(self.selected_index).map(|(name, _, _, _)| name.clone())
|
||||
display_lines.get(self.selected_index).map(|(_, _, _, _, raw_name)| raw_name.clone())
|
||||
}
|
||||
|
||||
/// Get total count of selectable services (parent services only, not sub-services)
|
||||
@@ -274,6 +247,7 @@ impl ServicesWidget {
|
||||
self.parent_services.len()
|
||||
}
|
||||
|
||||
|
||||
/// Calculate which parent service index corresponds to a display line index
|
||||
fn calculate_parent_service_index(&self, display_line_index: &usize) -> usize {
|
||||
// Build the same display list to map line index to parent service index
|
||||
@@ -427,13 +401,9 @@ impl Widget for ServicesWidget {
|
||||
|
||||
impl ServicesWidget {
|
||||
|
||||
/// Render with focus, scroll, and command status for visual feedback
|
||||
pub fn render_with_command_status(&mut self, frame: &mut Frame, area: Rect, is_focused: bool, scroll_offset: usize, command_status: Option<&CommandStatus>) {
|
||||
let services_block = if is_focused {
|
||||
Components::focused_widget_block("services")
|
||||
} else {
|
||||
Components::widget_block("services")
|
||||
};
|
||||
/// Render with focus
|
||||
pub fn render(&mut self, frame: &mut Frame, area: Rect, is_focused: bool) {
|
||||
let services_block = Components::widget_block("services");
|
||||
let inner_area = services_block.inner(area);
|
||||
frame.render_widget(services_block, area);
|
||||
|
||||
@@ -457,13 +427,13 @@ impl ServicesWidget {
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the existing render logic but with command status
|
||||
self.render_services_with_status(frame, content_chunks[1], is_focused, scroll_offset, command_status);
|
||||
// Render the services list
|
||||
self.render_services(frame, content_chunks[1], is_focused);
|
||||
}
|
||||
|
||||
/// Render services list with command status awareness
|
||||
fn render_services_with_status(&mut self, frame: &mut Frame, area: Rect, is_focused: bool, scroll_offset: usize, command_status: Option<&CommandStatus>) {
|
||||
// Build hierarchical service list for display (same as existing logic)
|
||||
/// Render services list
|
||||
fn render_services(&mut self, frame: &mut Frame, area: Rect, is_focused: bool) {
|
||||
// Build hierarchical service list for display
|
||||
let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>)> = Vec::new();
|
||||
|
||||
// Sort parent services alphabetically for consistent order
|
||||
@@ -473,7 +443,7 @@ impl ServicesWidget {
|
||||
for (parent_name, parent_info) in parent_services {
|
||||
// Add parent service line
|
||||
let parent_line = self.format_parent_service_line(parent_name, parent_info);
|
||||
display_lines.push((parent_line, parent_info.widget_status, false, None)); // false = not sub-service
|
||||
display_lines.push((parent_line, parent_info.widget_status, false, None));
|
||||
|
||||
// Add sub-services for this parent (if any)
|
||||
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||
@@ -494,36 +464,37 @@ impl ServicesWidget {
|
||||
}
|
||||
}
|
||||
|
||||
// Apply scroll offset and render visible lines (same as existing logic)
|
||||
// Show only what fits, with "X more below" if needed
|
||||
let available_lines = area.height as usize;
|
||||
let total_lines = display_lines.len();
|
||||
|
||||
// Calculate scroll boundaries
|
||||
let max_scroll = if total_lines > available_lines {
|
||||
total_lines - available_lines
|
||||
// Reserve one line for "X more below" if needed
|
||||
let lines_for_content = if total_lines > available_lines {
|
||||
available_lines.saturating_sub(1)
|
||||
} else {
|
||||
total_lines.saturating_sub(1)
|
||||
available_lines
|
||||
};
|
||||
let effective_scroll = scroll_offset.min(max_scroll);
|
||||
|
||||
// Get visible lines after scrolling
|
||||
let visible_lines: Vec<_> = display_lines
|
||||
.iter()
|
||||
.skip(effective_scroll)
|
||||
.take(available_lines)
|
||||
.take(lines_for_content)
|
||||
.collect();
|
||||
|
||||
let hidden_below = total_lines.saturating_sub(lines_for_content);
|
||||
|
||||
let lines_to_show = visible_lines.len();
|
||||
|
||||
if lines_to_show > 0 {
|
||||
// Add space for "X more below" message if needed
|
||||
let total_chunks_needed = if hidden_below > 0 { lines_to_show + 1 } else { lines_to_show };
|
||||
let service_chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints(vec![Constraint::Length(1); lines_to_show])
|
||||
.constraints(vec![Constraint::Length(1); total_chunks_needed])
|
||||
.split(area);
|
||||
|
||||
for (i, (line_text, line_status, is_sub, sub_info)) in visible_lines.iter().enumerate()
|
||||
{
|
||||
let actual_index = effective_scroll + i; // Real index in the full list
|
||||
let actual_index = i; // Simple index since we're not scrolling
|
||||
|
||||
// Only parent services can be selected - calculate parent service index
|
||||
let is_selected = if !*is_sub {
|
||||
@@ -535,47 +506,23 @@ impl ServicesWidget {
|
||||
};
|
||||
|
||||
let mut spans = if *is_sub && sub_info.is_some() {
|
||||
// Use custom sub-service span creation WITH command status
|
||||
// Use custom sub-service span creation
|
||||
let (service_info, is_last) = sub_info.as_ref().unwrap();
|
||||
self.create_sub_service_spans_with_status(line_text, service_info, *is_last, command_status)
|
||||
} else {
|
||||
// Parent services - check if this parent service has a command in progress
|
||||
let service_spans = if let Some(status) = command_status {
|
||||
match status {
|
||||
CommandStatus::InProgress { target, .. } => {
|
||||
if target == line_text {
|
||||
// Create spans with progress status
|
||||
let (icon, status_text, status_color) = self.get_service_icon_and_status(line_text, &ServiceInfo {
|
||||
status: "".to_string(),
|
||||
memory_mb: None,
|
||||
disk_gb: None,
|
||||
latency_ms: None,
|
||||
widget_status: *line_status
|
||||
}, command_status);
|
||||
vec![
|
||||
ratatui::text::Span::styled(format!("{} ", icon), Style::default().fg(status_color)),
|
||||
ratatui::text::Span::styled(line_text.clone(), Style::default().fg(Theme::primary_text())),
|
||||
ratatui::text::Span::styled(format!(" {}", status_text), Style::default().fg(status_color)),
|
||||
]
|
||||
self.create_sub_service_spans(line_text, service_info, *is_last)
|
||||
} else {
|
||||
// Parent services - use normal status spans
|
||||
StatusIcons::create_status_spans(*line_status, line_text)
|
||||
}
|
||||
}
|
||||
_ => StatusIcons::create_status_spans(*line_status, line_text)
|
||||
}
|
||||
} else {
|
||||
StatusIcons::create_status_spans(*line_status, line_text)
|
||||
};
|
||||
service_spans
|
||||
};
|
||||
|
||||
// Apply selection highlighting to parent services only, preserving status icon color
|
||||
// Apply selection highlighting to parent services only
|
||||
// Only show selection when Services panel is focused
|
||||
if is_selected && !*is_sub && is_focused {
|
||||
for (i, span) in spans.iter_mut().enumerate() {
|
||||
if i == 0 {
|
||||
// First span is the status icon - preserve its color
|
||||
span.style = span.style.bg(Theme::highlight());
|
||||
// First span is the status icon - use background color for visibility against blue selection
|
||||
span.style = span.style
|
||||
.bg(Theme::highlight())
|
||||
.fg(Theme::background());
|
||||
} else {
|
||||
// Other spans (text) get full selection highlighting
|
||||
span.style = span.style
|
||||
@@ -589,33 +536,12 @@ impl ServicesWidget {
|
||||
|
||||
frame.render_widget(service_para, service_chunks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Show scroll indicator if there are more services than we can display (same as existing)
|
||||
if total_lines > available_lines {
|
||||
let hidden_above = effective_scroll;
|
||||
let hidden_below = total_lines.saturating_sub(effective_scroll + available_lines);
|
||||
|
||||
if hidden_above > 0 || hidden_below > 0 {
|
||||
let scroll_text = if hidden_above > 0 && hidden_below > 0 {
|
||||
format!("... {} above, {} below", hidden_above, hidden_below)
|
||||
} else if hidden_above > 0 {
|
||||
format!("... {} more above", hidden_above)
|
||||
} else {
|
||||
format!("... {} more below", hidden_below)
|
||||
};
|
||||
|
||||
if available_lines > 0 && lines_to_show > 0 {
|
||||
let last_line_area = Rect {
|
||||
x: area.x,
|
||||
y: area.y + (lines_to_show - 1) as u16,
|
||||
width: area.width,
|
||||
height: 1,
|
||||
};
|
||||
|
||||
let scroll_para = Paragraph::new(scroll_text).style(Typography::muted());
|
||||
frame.render_widget(scroll_para, last_line_area);
|
||||
}
|
||||
// Show "X more below" message if content was truncated
|
||||
if hidden_below > 0 {
|
||||
let more_text = format!("... {} more below", hidden_below);
|
||||
let more_para = Paragraph::new(more_text).style(Typography::muted());
|
||||
frame.render_widget(more_para, service_chunks[lines_to_show]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ pub struct SystemWidget {
|
||||
// NixOS information
|
||||
nixos_build: Option<String>,
|
||||
config_hash: Option<String>,
|
||||
active_users: Option<String>,
|
||||
agent_hash: Option<String>,
|
||||
|
||||
// CPU metrics
|
||||
@@ -33,6 +32,7 @@ pub struct SystemWidget {
|
||||
tmp_used_gb: Option<f32>,
|
||||
tmp_total_gb: Option<f32>,
|
||||
memory_status: Status,
|
||||
tmp_status: Status,
|
||||
|
||||
// Storage metrics (collected from disk metrics)
|
||||
storage_pools: Vec<StoragePool>,
|
||||
@@ -66,7 +66,6 @@ impl SystemWidget {
|
||||
Self {
|
||||
nixos_build: None,
|
||||
config_hash: None,
|
||||
active_users: None,
|
||||
agent_hash: None,
|
||||
cpu_load_1min: None,
|
||||
cpu_load_5min: None,
|
||||
@@ -80,6 +79,7 @@ impl SystemWidget {
|
||||
tmp_used_gb: None,
|
||||
tmp_total_gb: None,
|
||||
memory_status: Status::Unknown,
|
||||
tmp_status: Status::Unknown,
|
||||
storage_pools: Vec::new(),
|
||||
has_data: false,
|
||||
}
|
||||
@@ -129,7 +129,7 @@ impl SystemWidget {
|
||||
}
|
||||
|
||||
/// Get the current agent hash for rebuild completion detection
|
||||
pub fn get_agent_hash(&self) -> Option<&String> {
|
||||
pub fn _get_agent_hash(&self) -> Option<&String> {
|
||||
self.agent_hash.as_ref()
|
||||
}
|
||||
|
||||
@@ -230,20 +230,49 @@ impl SystemWidget {
|
||||
|
||||
/// Extract pool name from disk metric name
|
||||
fn extract_pool_name(&self, metric_name: &str) -> Option<String> {
|
||||
if let Some(captures) = metric_name.strip_prefix("disk_") {
|
||||
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
||||
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
||||
if metric_name.starts_with("disk_") {
|
||||
// First try drive-specific metrics that have device names
|
||||
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
||||
.or_else(|| metric_name.rfind("_wear_percent"))
|
||||
.or_else(|| metric_name.rfind("_health")) {
|
||||
// Find the second-to-last underscore to get pool name
|
||||
let before_suffix = &metric_name[..suffix_pos];
|
||||
if let Some(drive_start) = before_suffix.rfind('_') {
|
||||
return Some(metric_name[5..drive_start].to_string()); // Skip "disk_"
|
||||
}
|
||||
}
|
||||
// For pool-level metrics (usage_percent, used_gb, total_gb), take everything before the metric suffix
|
||||
else if let Some(suffix_pos) = metric_name.rfind("_usage_percent")
|
||||
.or_else(|| metric_name.rfind("_used_gb"))
|
||||
.or_else(|| metric_name.rfind("_total_gb")) {
|
||||
return Some(metric_name[5..suffix_pos].to_string()); // Skip "disk_"
|
||||
}
|
||||
// Fallback to old behavior for unknown patterns
|
||||
else if let Some(captures) = metric_name.strip_prefix("disk_") {
|
||||
if let Some(pos) = captures.find('_') {
|
||||
return Some(captures[..pos].to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract drive name from disk metric name
|
||||
fn extract_drive_name(&self, metric_name: &str) -> Option<String> {
|
||||
// Pattern: disk_pool_drive_metric
|
||||
let parts: Vec<&str> = metric_name.split('_').collect();
|
||||
if parts.len() >= 3 && parts[0] == "disk" {
|
||||
return Some(parts[2].to_string());
|
||||
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
||||
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
||||
if metric_name.starts_with("disk_") {
|
||||
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
||||
.or_else(|| metric_name.rfind("_wear_percent"))
|
||||
.or_else(|| metric_name.rfind("_health")) {
|
||||
// Find the second-to-last underscore to get the drive name
|
||||
let before_suffix = &metric_name[..suffix_pos];
|
||||
if let Some(drive_start) = before_suffix.rfind('_') {
|
||||
return Some(before_suffix[drive_start + 1..].to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
@@ -334,14 +363,9 @@ impl Widget for SystemWidget {
|
||||
self.config_hash = Some(hash.clone());
|
||||
}
|
||||
}
|
||||
"system_active_users" => {
|
||||
if let MetricValue::String(users) = &metric.value {
|
||||
self.active_users = Some(users.clone());
|
||||
}
|
||||
}
|
||||
"system_agent_hash" => {
|
||||
if let MetricValue::String(hash) = &metric.value {
|
||||
self.agent_hash = Some(hash.clone());
|
||||
"agent_version" => {
|
||||
if let MetricValue::String(version) = &metric.value {
|
||||
self.agent_hash = Some(version.clone());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,6 +414,7 @@ impl Widget for SystemWidget {
|
||||
"memory_tmp_usage_percent" => {
|
||||
if let MetricValue::Float(usage) = metric.value {
|
||||
self.tmp_usage_percent = Some(usage);
|
||||
self.tmp_status = metric.status.clone();
|
||||
}
|
||||
}
|
||||
"memory_tmp_used_gb" => {
|
||||
@@ -413,30 +438,36 @@ impl Widget for SystemWidget {
|
||||
}
|
||||
|
||||
impl SystemWidget {
|
||||
/// Render with scroll offset support
|
||||
pub fn render_with_scroll(&mut self, frame: &mut Frame, area: Rect, scroll_offset: usize) {
|
||||
/// Render system widget
|
||||
pub fn render(&mut self, frame: &mut Frame, area: Rect, hostname: &str, config: Option<&crate::config::DashboardConfig>) {
|
||||
let mut lines = Vec::new();
|
||||
|
||||
// NixOS section
|
||||
lines.push(Line::from(vec![
|
||||
Span::styled("NixOS:", Typography::widget_title())
|
||||
Span::styled(format!("NixOS {}:", hostname), Typography::widget_title())
|
||||
]));
|
||||
|
||||
let config_text = self.config_hash.as_deref().unwrap_or("unknown");
|
||||
let build_text = self.nixos_build.as_deref().unwrap_or("unknown");
|
||||
lines.push(Line::from(vec![
|
||||
Span::styled(format!("Build: {}", config_text), Typography::secondary())
|
||||
Span::styled(format!("Build: {}", build_text), Typography::secondary())
|
||||
]));
|
||||
|
||||
let agent_hash_text = self.agent_hash.as_deref().unwrap_or("unknown");
|
||||
let short_hash = if agent_hash_text.len() > 8 && agent_hash_text != "unknown" {
|
||||
&agent_hash_text[..8]
|
||||
} else {
|
||||
agent_hash_text
|
||||
};
|
||||
let agent_version_text = self.agent_hash.as_deref().unwrap_or("unknown");
|
||||
lines.push(Line::from(vec![
|
||||
Span::styled(format!("Agent: {}", short_hash), Typography::secondary())
|
||||
Span::styled(format!("Agent: {}", agent_version_text), Typography::secondary())
|
||||
]));
|
||||
|
||||
// Display detected connection IP
|
||||
if let Some(config) = config {
|
||||
if let Some(host_details) = config.hosts.get(hostname) {
|
||||
let detected_ip = host_details.get_connection_ip(hostname);
|
||||
lines.push(Line::from(vec![
|
||||
Span::styled(format!("IP: {}", detected_ip), Typography::secondary())
|
||||
]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// CPU section
|
||||
lines.push(Line::from(vec![
|
||||
Span::styled("CPU:", Typography::widget_title())
|
||||
@@ -472,7 +503,7 @@ impl SystemWidget {
|
||||
Span::styled(" └─ ", Typography::tree()),
|
||||
];
|
||||
tmp_spans.extend(StatusIcons::create_status_spans(
|
||||
self.memory_status.clone(),
|
||||
self.tmp_status.clone(),
|
||||
&format!("/tmp: {}", tmp_text)
|
||||
));
|
||||
lines.push(Line::from(tmp_spans));
|
||||
@@ -529,22 +560,22 @@ impl SystemWidget {
|
||||
let total_lines = lines.len();
|
||||
let available_height = area.height as usize;
|
||||
|
||||
// Always apply scrolling if scroll_offset > 0, even if content fits
|
||||
if scroll_offset > 0 || total_lines > available_height {
|
||||
let max_scroll = if total_lines > available_height {
|
||||
total_lines - available_height
|
||||
} else {
|
||||
total_lines.saturating_sub(1)
|
||||
};
|
||||
let effective_scroll = scroll_offset.min(max_scroll);
|
||||
|
||||
// Take only the visible portion after scrolling
|
||||
let visible_lines: Vec<Line> = lines
|
||||
// Show only what fits, with "X more below" if needed
|
||||
if total_lines > available_height {
|
||||
let lines_for_content = available_height.saturating_sub(1); // Reserve one line for "more below"
|
||||
let mut visible_lines: Vec<Line> = lines
|
||||
.into_iter()
|
||||
.skip(effective_scroll)
|
||||
.take(available_height)
|
||||
.take(lines_for_content)
|
||||
.collect();
|
||||
|
||||
let hidden_below = total_lines.saturating_sub(lines_for_content);
|
||||
if hidden_below > 0 {
|
||||
let more_line = Line::from(vec![
|
||||
Span::styled(format!("... {} more below", hidden_below), Typography::muted())
|
||||
]);
|
||||
visible_lines.push(more_line);
|
||||
}
|
||||
|
||||
let paragraph = Paragraph::new(Text::from(visible_lines));
|
||||
frame.render_widget(paragraph, area);
|
||||
} else {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.0"
|
||||
version = "0.1.82"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -82,11 +82,13 @@ impl MetricValue {
|
||||
/// Health status for metrics
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum Status {
|
||||
Ok,
|
||||
Inactive, // Lowest priority - treated as good
|
||||
Ok, // Second lowest - also good
|
||||
Unknown,
|
||||
Offline,
|
||||
Pending,
|
||||
Warning,
|
||||
Critical,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl Status {
|
||||
@@ -180,6 +182,16 @@ impl HysteresisThresholds {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Inactive => {
|
||||
// Inactive services use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Pending => {
|
||||
// Service transitioning, use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
@@ -190,6 +202,16 @@ impl HysteresisThresholds {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Offline => {
|
||||
// Host coming back online, use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,17 @@ pub struct MetricMessage {
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
/// Command output streaming message
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CommandOutputMessage {
|
||||
pub hostname: String,
|
||||
pub command_id: String,
|
||||
pub command_type: String,
|
||||
pub output_line: String,
|
||||
pub is_complete: bool,
|
||||
pub timestamp: u64,
|
||||
}
|
||||
|
||||
impl MetricMessage {
|
||||
pub fn new(hostname: String, metrics: Vec<Metric>) -> Self {
|
||||
Self {
|
||||
@@ -19,6 +30,19 @@ impl MetricMessage {
|
||||
}
|
||||
}
|
||||
|
||||
impl CommandOutputMessage {
|
||||
pub fn new(hostname: String, command_id: String, command_type: String, output_line: String, is_complete: bool) -> Self {
|
||||
Self {
|
||||
hostname,
|
||||
command_id,
|
||||
command_type,
|
||||
output_line,
|
||||
is_complete,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Commands that can be sent from dashboard to agent
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum Command {
|
||||
@@ -55,6 +79,7 @@ pub enum MessageType {
|
||||
Metrics,
|
||||
Command,
|
||||
CommandResponse,
|
||||
CommandOutput,
|
||||
Heartbeat,
|
||||
}
|
||||
|
||||
@@ -80,6 +105,13 @@ impl MessageEnvelope {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn command_output(message: CommandOutputMessage) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::CommandOutput,
|
||||
payload: serde_json::to_vec(&message)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn heartbeat() -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Heartbeat,
|
||||
@@ -113,4 +145,13 @@ impl MessageEnvelope {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_command_output(&self) -> Result<CommandOutputMessage, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::CommandOutput => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected command output message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user